In [9]:
import pandas as pd

# Load the movie metadata table from the local CSV file.
movies = pd.read_csv("movies.csv")

In [10]:
# Show the loaded movies DataFrame as the cell output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [11]:
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as the parentheses around the release year is dropped,
    so ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [12]:
# Store a punctuation-free copy of every title in a new "clean_title" column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [13]:
# Display movies again, now including the clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [14]:
!pip install scikit-learn




In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from single words and adjacent word pairs (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vectorizer on the cleaned titles and encode every title as one row of `tfidf`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title, e.g. as typed into the search box.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``movies`` ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition rejects an out-of-range kth.
    top_n = min(5, similarity.size)

    # argpartition only separates the top_n largest scores from the rest; it does
    # not order them, so rank the selected candidates explicitly (best first).
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [21]:
import ipywidgets as widgets
from IPython.display import display

# Output area that holds the current search results.
movie_list = widgets.Output()

# Text box the user types a movie title into.
movie_input = widgets.Text(
    description="Movie Title:",
    value="Toy Story",
    disabled=False,
)


def on_type(data):
    """Refresh the results area whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; its
    ``"new"`` entry is read as the current text of the box.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Inputs of five characters or fewer are ignored; the area stays cleared.
        if len(typed) <= 5:
            return
        display(search(typed))


movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [22]:
# Load the user ratings table from the local CSV file.
ratings = pd.read_csv("ratings.csv")

In [23]:
# Show the loaded ratings DataFrame as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
3972229,26146,4701,3.5,1473551300
3972230,26146,4720,4.0,1473551314
3972231,26146,4754,4.0,1473552597
3972232,26146,4848,4.0,1473552557


In [30]:
# Inspect the dtype pandas assigned to each ratings column.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [31]:
# Movie used for the step-by-step walkthrough below (movieId 1 is "Toy Story (1995)" in movies).
movie_id=1

In [32]:
# Ids of the users who rated the chosen movie above 4.
is_target_movie = ratings["movieId"] == movie_id
rated_above_four = ratings["rating"] > 4
similar_users = ratings[is_target_movie & rated_above_four]["userId"].unique()

In [33]:
# Show the array of user ids collected above.
similar_users

array([   36,    75,    86, ..., 26134, 26140, 26141])

In [38]:
# Movie ids of every rating above 4 given by those users (one entry per rating row, so ids repeat).
from_similar_user = ratings["userId"].isin(similar_users)
rated_above_four = ratings["rating"] > 4
similar_user_recs = ratings[from_similar_user & rated_above_four]["movieId"]

In [39]:
# Show the movie ids rated above 4 by the similar users.
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
3971668     58559
3971669     60069
3971671     68954
3971672     79132
3971677    109487
Name: movieId, Length: 217357, dtype: int64

In [44]:
# Rebuild the liked-movie ids from `ratings` instead of from the previous value of
# `similar_user_recs`, so that re-running this cell always gives the same result.
liked_by_similar = ratings[(ratings["userId"].isin(similar_users))&(ratings["rating"]>4)]["movieId"]

# Number of >4 ratings per movie divided by the number of similar users.
similar_user_recs = liked_by_similar.value_counts()/len(similar_users)

# Keep only the movies whose share among the similar users is above 10%.
similar_user_recs = similar_user_recs[similar_user_recs>.1]

In [45]:
# Show the per-movie shares that passed the 10% cut-off.
similar_user_recs

movieId
1       1.000000
318     0.431205
260     0.385122
296     0.352205
356     0.350889
          ...   
1732    0.101712
1208    0.101382
5418    0.101382
1307    0.100724
778     0.100395
Name: count, Length: 105, dtype: float64

In [48]:
# Ratings above 4, from any user, for the movies kept in similar_user_recs.
is_candidate_movie = ratings["movieId"].isin(similar_user_recs.index)
rated_above_four = ratings["rating"] > 4
all_users = ratings[is_candidate_movie & rated_above_four]

In [54]:
# Per-movie count of those ratings divided by the number of distinct users who gave them.
distinct_user_count = len(all_users["userId"].unique())
all_users_recs = all_users["movieId"].value_counts() / distinct_user_count

In [55]:
# Show the per-movie shares computed over all users.
all_users_recs

movieId
318      0.341039
296      0.284831
2571     0.245189
356      0.231631
593      0.227758
           ...   
1148     0.047101
1307     0.046318
50872    0.038942
78499    0.034574
2355     0.024684
Name: count, Length: 105, dtype: float64

In [58]:
# Put the two share Series side by side, aligned on movieId.
rec_percentages = pd.concat([similar_user_recs,all_users_recs],axis=1)
# Name the columns after the population each share was computed over.
rec_percentages.columns=["similar","all"]

In [59]:
# Show the combined "similar" / "all" table.
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.125191
318,0.431205,0.341039
260,0.385122,0.219145
296,0.352205,0.284831
356,0.350889,0.231631
...,...,...
1732,0.101712,0.077183
1208,0.101382,0.080068
5418,0.101382,0.062183
1307,0.100724,0.046318


In [60]:
# Score = share among similar users / share among all users; values above 1 mean the
# movie is liked proportionally more often by the similar users.
rec_percentages["score"] = rec_percentages["similar"]/rec_percentages["all"]

In [61]:
# Order the candidates from highest to lowest score.
rec_percentages = rec_percentages.sort_values("score",ascending = False)

In [62]:
# Show the score-sorted table.
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.125191,7.987821
3114,0.282093,0.054354,5.189964
2355,0.109941,0.024684,4.453977
78499,0.142857,0.034574,4.131960
588,0.223173,0.068200,3.272352
...,...,...,...
296,0.352205,0.284831,1.236541
2329,0.115207,0.093378,1.233776
79132,0.158657,0.130795,1.213021
4973,0.131995,0.110026,1.199669


In [64]:
# Attach title and genres to the ten highest-scoring movies by matching the
# index (movieId) against movies["movieId"].
rec_percentages.head(10).merge(movies,left_index = True,right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125191,7.987821,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.282093,0.054354,5.189964,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.109941,0.024684,4.453977,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.142857,0.034574,4.13196,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.223173,0.0682,3.272352,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
4780,0.220211,0.07026,3.134224,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.211652,0.069189,3.059064,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.187953,0.061771,3.042726,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.194207,0.0682,2.847622,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.24786,0.08732,2.838522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [65]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as "liked" when it is strictly greater than ``min_rating``.
    Users who liked ``movie_id`` form the similar group. Every movie is scored
    by how much more often it is liked inside that group than among all users
    who liked any of the candidate movies.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as liked.
    min_share : float, default 0.10
        A movie is kept as a candidate only when more than this fraction of
        the similar users liked it.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n`` movies,
        ordered by descending score.
    """
    # Computed once and reused by all three filters below.
    liked = ratings["rating"] > min_rating

    # Users who liked the target movie, and everything else they liked.
    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = ratings[ratings["userId"].isin(similar_users) & liked]["movieId"]

    # Count of likes per movie relative to the size of the similar group.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # The same share, computed over every user who liked one of the candidates.
    all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A ratio above 1 means the movie is over-represented among the similar users.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [68]:
# Text box for the title of the movie to base the recommendations on.
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title",
    disabled = False  # `disabled` is the Text option that controls whether the box is editable
)

# Output area that holds the current recommendation table.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; its
    ``"new"`` entry is read as the current text of the box.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Inputs of five characters or fewer are ignored; the area stays cleared.
        if len(title)>5:
            results = search(title)
            # The first row returned by `search` is used as the movie to recommend from.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type,names = "value")
display(movie_name_input,recommendation_list)

Text(value='Toy Story', description='Movie Title')

Output()