In [2]:
import pandas as pd
# Load the movie catalogue from movies.csv and preview it
# (pandas shortens the display of a long frame to its first and last rows).
movies = pd.read_csv('movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with punctuation and other symbols removed,
        e.g. ``"Toy Story 1995"``. Spaces are kept as they are.
    """
    # The upper-case range must be spelled A-Z. The range A-z also spans the
    # ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # symbols would be treated as allowed and left in the title.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)
    

In [4]:
# Store a cleaned copy of every title in a new clean_title column, then preview the frame.
movies["clean_title"] = movies["title"].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
# Building the search engine.
# Titles are turned into TF-IDF vectors (term frequency x inverse document frequency),
# so words that are rare across the data set weigh more than words that appear in many titles.
# This lets the search engine compute similarity scores between what we type in and the movie titles.

from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) indexes pairs of consecutive words as well as single words, which improves accuracy.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies['clean_title'])


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title to look for.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from the highest
        similarity score to the lowest.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # One similarity score per movie, in the same order as the rows of `movies`.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if k is out of bounds.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only separates the top_n scores from the rest; the order
    # inside that group is unspecified, so rank the candidates explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

# Try the search on a well-known franchise.
search("Harry Potter")

Unnamed: 0,movieId,title,genres,clean_title
13512,69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX,Harry Potter and the HalfBlood Prince 2009
4790,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,Harry Potter and the Sorcerers Stone aka Harry...
5704,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,Harry Potter and the Chamber of Secrets 2002
10408,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,Harry Potter and the Goblet of Fire 2005
11700,54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX,Harry Potter and the Order of the Phoenix 2007


In [7]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into; it starts out filled with "Toy Story".
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)

# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results shown in ``movie_list``.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry is read as the current text. The previous results are always
    cleared, and a new search is only displayed when the text is longer
    than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Too short to search on: leave the output area empty.
            return
        display(search(query))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names="value")

# Show the text box together with the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [8]:
movie_id = 1
ratings = pd.read_csv('ratings.csv')

# Users who rated the chosen movie above 4: the set of people who like the same movie as us.
similar_users = ratings.loc[
    (ratings["rating"] > 4) & (ratings["movieId"] == movie_id),
    "userId",
].unique()
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [9]:
# Similar-user recommendations: one movieId entry for every rating above 4 given by a similar user.
similar_user_recs = ratings.loc[
    (ratings["rating"] > 4) & ratings["userId"].isin(similar_users),
    "movieId",
]


In [10]:
# Narrow the list down to movies that more than 10% of the users similar to us also liked.
# This keeps the favourites those users share, instead of every movie any one of them rated above 4.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))

similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.1)]

similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [11]:
# Some of these movies are specific to our niche, others are liked by almost everybody.
# To tell them apart we need to know what percentage of people in general
# (not only the similar users) liked each of the movies above.
#     That leaves us with movies that are similar to the movie we like
#     AND that people similar to us like.

# Every rating above 4, from any user, for one of the recommended movies.
all_users = ratings.loc[
    (ratings["rating"] > 4) & ratings["movieId"].isin(similar_user_recs.index)
]


In [12]:
# Now find, for each recommended movie, the percentage of all these users who liked it.
# The denominator is the number of distinct users who liked at least one of the recommended movies.
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

In [13]:

# With this information, we want movies that a higher percentage of the people similar to us like
# than the percentage of all people who like them.
#     This is how we find movies that are similar to the movie we like.
#     For example, if I'm looking for movies like The Avengers, suppose 100% of the people similar to me
#     like Toy Story. Since Toy Story is a very popular movie, suppose 100% of all people like it too.
#     Because there is little (or no) difference between these two percentages, it is not a good movie to recommend.
#         If instead 100% of the people similar to me like Thor but only 30% of all people like Thor,
#         then Thor is a good movie to recommend: people specific to us like it, not just everybody.
# Put the two percentages side by side so we can build a recommendation score.

rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ['similar', 'all']

In [14]:
# Recommendation score: how many times more popular a movie is among similar users than among all users.
rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
# Highest score first.
rec_percentages = rec_percentages.sort_values('score', ascending=False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [15]:
# Look up the title and genres of the ten highest-scoring movies.
rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [16]:
# combine everything above into one function
def find_similar_movies(movie_id):
    """Recommend up to ten movies for someone who likes ``movie_id``.

    "Liking" a movie means rating it above 4 in ``ratings``. A movie scores
    highly when the share of similar users who like it is large compared
    with the share of users overall who like it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the (at most) ten
        best-scoring movies, highest score first. The queried movie itself
        is not filtered out.
    """
    liked = ratings["rating"] > 4

    # Users who liked the requested movie, and everything else those users liked.
    fans = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    fan_likes = ratings.loc[liked & ratings["userId"].isin(fans), "movieId"]

    # Share of the fans that liked each movie; keep those liked by more than 10% of them.
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]

    # Share of users overall (anyone who liked one of the shortlisted movies) that liked each movie.
    shortlist_likes = ratings[liked & ratings["movieId"].isin(fan_share.index)]
    overall_share = shortlist_likes["movieId"].value_counts() / len(shortlist_likes["userId"].unique())

    table = pd.concat([fan_share, overall_share], axis=1)
    table.columns = ["similar", "all"]
    table["score"] = table["similar"] / table["all"]

    best = table.sort_values("score", ascending=False).head(10)
    with_titles = best.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]

In [18]:
# Text box for the movie title to base recommendations on; it starts out empty.
movie_name_input = widgets.Text(
    value="",
    description="Movie Title:",
    disabled=False
)

# Output area that the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations shown in ``recommendation_list``.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry is read as the current text. The previous output is always
    cleared. When the text is longer than five characters, the first row
    returned by ``search`` is taken as the intended movie and its
    recommendations are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data['new']
        if len(query) <= 5:
            # Too short to search on: leave the output area empty.
            return
        matches = search(query)
        first_match_id = matches.iloc[0]['movieId']
        display(find_similar_movies(first_match_id))

# Call on_type whenever the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Show the text box together with the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()