In [2]:
import pandas as pd

movies = pd.read_csv('movies.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [12]:
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9]", " ", title)

In [13]:
movies['clean_title'] = movies['title'].apply(clean_title)

In [14]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram: groups of two words that are consecutive to make search more accurate than single grouping
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies['clean_title'])

In [27]:
# Compute similarity between word we enter and all the movies in our list
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # Compare query term to each of the titles in the dataset and return similarity
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    # Make most similar result the first one in the list
    results = movies.iloc[indices][::-1]
    return results

In [28]:
query_vec
similarity
indices
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [26]:
# Build an interactive search box
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
# Hook it with output widget
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data['new']
        
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [30]:
# Find movies similar to what we searched
ratings = pd.read_csv('ratings.csv')
ratings
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [31]:
movie_id = 1

In [35]:
similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] >= 4)]['userId'].unique()

In [36]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [40]:
similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] >= 4)]['movieId']
similar_user_recs

5101           1
5104          11
5105          34
5106          46
5108          60
            ... 
24998389    3735
24998390    3751
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 1783202, dtype: int64

In [44]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

1       1.000000
260     0.521102
318     0.516733
356     0.502443
296     0.458315
          ...   
1358    0.101733
1485    0.101659
16      0.101140
899     0.100326
8874    0.100252
Name: movieId, Length: 256, dtype: float64

In [47]:
all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
19,1,2692,5.0,1147869100
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [48]:
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())
all_users_recs

318     0.332559
296     0.276637
2571    0.237144
356     0.228624
593     0.219531
          ...   
2078    0.014215
1485    0.013025
2080    0.012825
1022    0.012214
708     0.010045
Name: movieId, Length: 256, dtype: float64

In [51]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ['similar', 'all']
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.121207
260,0.521102,0.215934
318,0.516733,0.332559
356,0.502443,0.228624
296,0.458315,0.276637
...,...,...
1358,0.101733,0.029705
1485,0.101659,0.013025
16,0.101140,0.033154
899,0.100326,0.030747


In [53]:
rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
rec_percentages = rec_percentages.sort_values('score', ascending=False)
rec_percentages

Unnamed: 0,similar,all,score
708,0.107878,0.010045,10.739077
2355,0.249889,0.024383,10.248480
2080,0.109211,0.012825,8.515203
1022,0.102473,0.012214,8.389772
1,1.000000,0.121207,8.250332
...,...,...,...
58559,0.201392,0.142334,1.414925
7361,0.140234,0.101297,1.384388
79132,0.174219,0.127675,1.364553
4973,0.143492,0.109231,1.313651


In [54]:
# Take top 10 recommendations and merge with movie data to get the title of the movies
rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
693,0.107878,0.010045,10.739077,708,"Truth About Cats & Dogs, The (1996)",Comedy|Romance,Truth About Cats Dogs The 1996
2264,0.249889,0.024383,10.24848,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
1991,0.109211,0.012825,8.515203,2080,Lady and the Tramp (1955),Animation|Children|Comedy|Romance,Lady and the Tramp 1955
999,0.102473,0.012214,8.389772,1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,Cinderella 1950
0,1.0,0.121207,8.250332,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
435,0.135125,0.016558,8.160812,440,Dave (1993),Comedy|Romance,Dave 1993
721,0.169776,0.02141,7.929787,736,Twister (1996),Action|Adventure|Romance|Thriller,Twister 1996
1989,0.112691,0.014215,7.927372,2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical,Jungle Book The 1967
1,0.13594,0.017163,7.920634,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
588,0.117355,0.014827,7.915112,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,Pinocchio 1940


In [55]:
def find_similar_movies(movie_id):
    # Finding recommendations with users similar to us
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    
    # Adjusting so we only have recommendations where over 10% of users recommended that movie
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    # Finding how common the recommendations were among all the users
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    # Creating our score
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    # Sorting it
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [58]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()