In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import ipywidgets as widgets
from IPython.display import display

import re

In [2]:
# Directory holding the MovieLens CSV exports. Kept in one place so the
# notebook can be pointed at a different copy of the dataset with a single edit.
DATA_DIR = "~/Desktop/MovieLens-resources"

# movies.csv supplies the movieId / title / genres columns used further down;
# ratings.csv supplies the userId / movieId / rating columns.
movies_df = pd.read_csv(f"{DATA_DIR}/movies.csv")
ratings_df = pd.read_csv(f"{DATA_DIR}/ratings.csv")

In [3]:
# ratings_df = ratings_df.loc[ratings_df['userId'] <= 1050]

In [4]:
# ratings_df = ratings_df.drop('timestamp',axis=1)
# ratings_df

In [5]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation, brackets and accented characters are dropped outright (not
    replaced), so ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)


In [6]:
# Store a punctuation-free copy of every title; the TF-IDF index and the
# search queries are both built from this normalised form.
movies_df["clean_title"] = movies_df["title"].map(clean_title)

In [8]:
# Index the cleaned titles with TF-IDF over single words and word pairs
# (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# One row per movie, in the same row order as movies_df; search() relies on
# that alignment when it maps similarity positions back to rows with .iloc.
tfidf = vectorizer.fit_transform(movies_df["clean_title"])

In [9]:
def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is normalised with ``clean_title``, vectorised with the
    module-level ``vectorizer`` and compared against the module-level ``tfidf``
    matrix using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies_df``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    k = min(top_n, len(similarity))
    if k <= 0:
        return movies_df.iloc[0:0]

    # argpartition only guarantees that the last k positions hold the k largest
    # scores, not that they are ordered, so rank those k candidates explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies_df.iloc[top]

In [10]:
# Interactive title search: a text box plus an output area that is refreshed
# with the search() results on every edit.
movie_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()


def on_search_type(change):
    """Refresh ``movie_list`` with the search results for the text box's new value.

    Named distinctly from the recommendation handler defined later in the
    notebook so that neither definition shadows the other.

    ``change`` is the change dictionary delivered by ``observe``; its ``"new"``
    entry holds the current text.
    """
    with movie_list:
        movie_list.clear_output()
        title = change["new"]
        # Only search once more than five characters have been typed, so the
        # index is not queried for every short prefix.
        if len(title) > 5:
            display(search(title))


movie_input.observe(on_search_type, names='value')

display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [11]:
# movie_id = 89745

# #def find_similar_movies(movie_id):
# movie = movies_df[movies_df["movieId"] == movie_id]

In [12]:
# similar_users = ratings_df[(ratings_df["movieId"] == movie_id) & (ratings_df["rating"] > 4)]["userId"].unique()

# similar_user_recs = ratings_df[(ratings_df["userId"].isin(similar_users)) & (ratings_df["rating"] > 4)]["movieId"]

In [13]:
# similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# similar_user_recs = similar_user_recs[similar_user_recs > .10]

# similar_user_recs

In [14]:
# all_users = ratings_df[(ratings_df["movieId"].isin(similar_user_recs.index)) & (ratings_df["rating"] > 4)]

# all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

# all_user_recs

In [15]:
# rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)

# rec_percentages.columns = ["similar", "all"]

# rec_percentages

In [16]:
# rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# rec_percentages = rec_percentages.sort_values("score", ascending=False)

# rec_percentages.head(10).merge(movies_df, left_index=True, right_on="movieId")

In [17]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` are the "similar" users.
    Each candidate movie is scored as

        (share of similar users who liked it) / (share of all likers who liked it)

    where "all likers" are the users who liked at least one candidate. A higher
    score means the movie is liked more among similar users than in general.

    Parameters
    ----------
    movie_id
        ``movieId`` of the reference movie.
    min_rating : default 4
        Ratings strictly above this value count as likes.
    min_share : float, default 0.10
        A movie is a candidate only if more than this fraction of similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Reads the module-level ``ratings_df`` and ``movies_df``.
    """
    # Filter the ratings once; every later step only looks at liked rows.
    liked = ratings_df[ratings_df["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Fraction of similar users who liked each movie, keeping the common ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: how popular the same candidates are among everyone who liked
    # at least one of them.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId (the index) side by side.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n)
    return top.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [18]:
# Recommendation UI: typing a title looks it up with search() and shows the
# output of find_similar_movies() for the first row that search() returned.
recommendation_list = widgets.Output()
movie_name_input = widgets.Text(value='', description='Movie Title:', disabled=False)


def on_type(data):
    """Redraw ``recommendation_list`` for the text box's new value.

    ``data`` is the change dictionary delivered by ``observe``; its ``"new"``
    entry holds the current text. Nothing is shown until more than five
    characters have been typed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


movie_name_input.observe(on_type, names='value')
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()