In [2]:
import pandas as pd

In [3]:
movies = pd.read_csv("movies.csv")

In [4]:
import re

def clean_title(title):
   return re.sub("[^a-zA-Z0-9 ]","", title)

In [5]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec,tfidf).flatten()
    indices = np.argpartition(similarity,-5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [8]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)
movie_list = widgets.Output()
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))
movie_input.observe(on_type,names = 'value')
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [9]:
ratings = pd.read_csv("ratings.csv")

In [10]:
similar_users = ratings[(ratings["movieId"] == 1) & (ratings["rating"]>=4)]["userId"].unique()

In [11]:
similar_users_reqs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4 )]

In [12]:
similar_users_reqs

Unnamed: 0,userId,movieId,rating,timestamp
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
261,3,214,5.0,1484753888
263,3,293,5.0,1484753912
...,...,...,...,...
24999248,162534,101962,4.5,1526734434
24999269,162534,109487,4.5,1526714913
24999326,162534,164179,5.0,1526712632
24999329,162534,165549,5.0,1526713272


In [13]:
similar_users_reqs = similar_users_reqs["movieId"].value_counts() / len(similar_users)

similar_users_reqs = similar_users_reqs[similar_users_reqs>.1]

In [14]:
all_users = ratings[(ratings["movieId"].isin(similar_users_reqs.index)) & (ratings["rating"] > 4)]

In [15]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [16]:
all_users_recs

movieId
318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: count, Length: 90, dtype: float64

In [17]:
rec_percentages = pd.concat([similar_users_reqs,all_users_recs],axis =1)
rec_percentages.columns = ["similar","all"]

In [18]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.499483,0.125923
318,0.421226,0.345497
260,0.367817,0.224334
296,0.353337,0.287399
356,0.322708,0.237518
...,...,...
1148,0.103609,0.047922
1527,0.102867,0.066762
4995,0.102522,0.076403
778,0.102495,0.075473


In [19]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [20]:
rec_percentages = rec_percentages.sort_values("score",ascending = False)

In [21]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


In [22]:
rec_percentages.head(10).merge(movies,left_index=True,right_index=True)

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
1,0.499483,0.125923,3.966586,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
3114,0.170357,0.05422,3.141967,3207,"Snows of Kilimanjaro, The (1952)",Adventure,Snows of Kilimanjaro The 1952
4886,0.166645,0.071489,2.33106,4992,Kate & Leopold (2001),Comedy|Romance,Kate Leopold 2001
6377,0.166565,0.07296,2.282977,6500,"Satanic Rites of Dracula, The (1974)",Horror,Satanic Rites of Dracula The 1974
1073,0.111591,0.049232,2.266621,1100,Days of Thunder (1990),Action|Drama|Romance,Days of Thunder 1990
8961,0.154207,0.069109,2.231373,26732,Johnny Stecchino (1991),Comedy,Johnny Stecchino 1991
588,0.151449,0.068159,2.221989,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,Pinocchio 1940
1148,0.103609,0.047922,2.162033,1176,"Double Life of Veronique, The (Double Vie de V...",Drama|Fantasy|Romance,Double Life of Veronique The Double Vie de Vro...
364,0.18473,0.086585,2.133522,369,Mrs. Parker and the Vicious Circle (1994),Drama,Mrs Parker and the Vicious Circle 1994
595,0.12806,0.060551,2.1149,603,"Bye Bye, Love (1995)",Comedy,Bye Bye Love 1995


In [23]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"]>4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4 )]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs>.10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar","all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score",ascending = False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score","title","genres"]]

In [24]:
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type,names = "value")
display(movie_input_name,recommendation_list)


Text(value='Toy Story', description='Movie Title:')

Output()