In [335]:
import pandas as pd
import re  # python regular expressin library
import numpy as np

In [336]:
movies= pd.read_csv(r"C:\Users\User\OneDrive\Desktop\Boot Camp\recommender system\data\ml-25m\movies.csv")

In [337]:
movies.head(50)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


With movies data set we will make a movie search engine, that when we type in the name of the movie we will find the movie we are loking for. and with ratings data set we will actually build the recommendation engine.

CLEANING MOVIES TITLES WITH REGEX

In [338]:
def clean_title(title):  # it will take a movie title as an input and clean that title.
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [339]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [340]:
movies.head(2)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995


CREATING A TFIDF MATRIX

Building a search engine

In [341]:
from sklearn.feature_extraction.text import TfidfVectorizer

#  tfidf turns titles into numbers

In [342]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))

# ngram_range parameter instead of just looking at individual words in the title, its also goinig to look what is called 
# ngrams i.e words of two groups that are consecutive. so instead of just loking at Toy Story 1995, its also going to lok at 
# toy story together and Story 1995 together.
# so its just make our search a little more accurate

In [343]:
tfidf = vectorizer.fit_transform(movies["clean_title"])

#here we actually use that vectorizer to turn our set of title into matrix, into sets of numbers.

Creating a search functin

The next thing we need to do is to compute similarity between a term that we enter and all of the movies in our list.We will do it using cosine similarity.

In [344]:
# import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

title = "Harry Potter"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # it turns the search term that we enter into a set of numbers.
similarity = cosine_similarity(query_vec, tfidf).flatten()
# ie similarity betweeen query_vec and tfidf

In [345]:
similarity

array([0., 0., 0., ..., 0., 0., 0.])

The answer above is Numpy vector, and the first title was Toy Story and there is actually no difference between Toy Story and 
Harry Potter, Jumanji etc

In [346]:
query_vec

<1x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

We are going to compute the similarity between the term that we enter e.g Harry Potter and all of the movies in our list. 

title = "Men 1995"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()

In [347]:
similarity

array([0., 0., 0., ..., 0., 0., 0.])

we can see this is more similar to some of our results. we entered 1995 that is similar to first title that have 1995,similar 
to jumangi and its very similar to the third title that is Grumpier Old Men

In [348]:
title = "Heat"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()

In [349]:
similarity

array([0., 0., 0., ..., 0., 0., 0.])

The next thing we need to is we need to find the titles that have the greatest similarity to our search term
To do that we are going to use np.argpartition and we are going to pass in our similarity data and -5, and it will find out the
five most similar title to our search term.

In [350]:
title = "Men 1995"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:] # it will find the five most similar titles to our search term.

In [351]:
indices    

array([ 7071, 11003, 28489,     2, 60174], dtype=int64)

indices is going to give us the indices in our vector of the five most similar search terms

Then we need to do is actually index our movies data by these indices to actually get titles that we care abut.

In [352]:
results = movies.iloc[indices]
results
# so this gives us five movies that are most similar to Men 1995

Unnamed: 0,movieId,title,genres,clean_title
7071,7196,"Men, The (1950)",Drama,Men The 1950
11003,47484,G Men (1935),Crime|Drama,G Men 1935
28489,131824,Men... (1985),Comedy,Men 1985
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
60174,202701,Любить по-русски (1995),Drama|Romance,1995


...

In [353]:
title = "Men"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
41389,161586,Little Men (2016),Drama,Little Men 2016
55673,192451,Dead Men (2018),Action|Drama|Western,Dead Men 2018
28489,131824,Men... (1985),Comedy,Men 1985
7071,7196,"Men, The (1950)",Drama,Men The 1950
11003,47484,G Men (1935),Crime|Drama,G Men 1935


In [354]:
title = "Dangerous"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
54258,189167,A Dangerous Son (2018),Documentary,A Dangerous Son 2018
19809,102908,"Dangerous Place, A (2012)",Thriller,Dangerous Place A 2012
434,439,Dangerous Game (1993),Drama,Dangerous Game 1993
14301,74263,Dangerous (1935),Drama,Dangerous 1935
34866,146417,A Dangerous Game (2014),Documentary,A Dangerous Game 2014


In [355]:
title = "Dead"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
1228,1261,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,Evil Dead II Dead by Dawn 1987
17799,92923,"Dead, The (2010)",Horror,Dead The 2010
40907,160515,Dead 7 (2016),Horror|Western,Dead 7 2016
3994,4098,"Dead, The (1987)",Drama,Dead The 1987
22962,116668,Dead Snow 2: Red vs. Dead (2014),Action|Comedy|Horror,Dead Snow 2 Red vs Dead 2014


In [356]:
title = "Die"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
38297,154508,"Die Frau, die im Wald verschwand (2009)",Drama,Die Frau die im Wald verschwand 2009
11804,54964,Die! Die! My Darling! (Fanatic) (1965),Horror|Thriller,Die Die My Darling Fanatic 1965
19228,100056,Die (2010),Thriller,Die 2010
6803,6928,"Die, Mommie, Die (2003)",Comedy,Die Mommie Die 2003
4092,4196,"Die, Monster, Die! (1965)",Horror|Mystery|Sci-Fi,Die Monster Die 1965


In [357]:
title = "Toy Story 1995"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


If we type in [::-1] it will reverse the results, in the above list the most similar result was in the last, and we need it to be first.i.e  if we dont need the five most similar movies but only one similar movie.

In [358]:
title = "Toy Story 1995"
title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

query_vec = vectorizer.transform([title]) # 
similarity = cosine_similarity(query_vec, tfidf).flatten()
indices = np.argpartition(similarity, -5)[-5:]
results = movies.iloc[indices][::-1]
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [359]:
# so the most similar movie is returned at the top

Turning above into a function

In [360]:
def search(title):
    title = "Toy Story 1995"
    title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

    query_vec = vectorizer.transform([title]) # 
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [361]:
search("Toy Story 1995")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [362]:
def search(title):
    title = "Dangerous"
    title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

    query_vec = vectorizer.transform([title]) # 
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [363]:
search("Dangerous")

Unnamed: 0,movieId,title,genres,clean_title
34866,146417,A Dangerous Game (2014),Documentary,A Dangerous Game 2014
14301,74263,Dangerous (1935),Drama,Dangerous 1935
434,439,Dangerous Game (1993),Drama,Dangerous Game 1993
19809,102908,"Dangerous Place, A (2012)",Thriller,Dangerous Place A 2012
54258,189167,A Dangerous Son (2018),Documentary,A Dangerous Son 2018


In [364]:
def search(title):
    title = "Dead"
    title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

    query_vec = vectorizer.transform([title]) # 
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [365]:
search("Dead")

Unnamed: 0,movieId,title,genres,clean_title
22962,116668,Dead Snow 2: Red vs. Dead (2014),Action|Comedy|Horror,Dead Snow 2 Red vs Dead 2014
3994,4098,"Dead, The (1987)",Drama,Dead The 1987
40907,160515,Dead 7 (2016),Horror|Western,Dead 7 2016
17799,92923,"Dead, The (2010)",Horror,Dead The 2010
1228,1261,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,Evil Dead II Dead by Dawn 1987


In [366]:
def search(title):
    title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

    query_vec = vectorizer.transform([title]) # 
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [367]:
search("Dead")

Unnamed: 0,movieId,title,genres,clean_title
22962,116668,Dead Snow 2: Red vs. Dead (2014),Action|Comedy|Horror,Dead Snow 2 Red vs Dead 2014
3994,4098,"Dead, The (1987)",Drama,Dead The 1987
40907,160515,Dead 7 (2016),Horror|Western,Dead 7 2016
17799,92923,"Dead, The (2010)",Horror,Dead The 2010
1228,1261,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,Evil Dead II Dead by Dawn 1987


In [368]:
def search(title):
    title = clean_title(title) # it cleans the title that we enter in the same way that we cleaned earlier.

    query_vec = vectorizer.transform([title]) # 
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results
search("Old")

Unnamed: 0,movieId,title,genres,clean_title
57777,197135,Old Man,(no genres listed),Old Man
60368,203148,Old Boys (2018),Comedy,Old Boys 2018
9336,27773,Old Boy (2003),Mystery|Thriller,Old Boy 2003
47068,173741,The Old Man (2012),Action|Adventure|Drama,The Old Man 2012
43650,166536,Old Stone (2016),Drama,Old Stone 2016


## BUILDING AN INTERACTIVE SEARCH BOX WITH JUPYTER

We now build an interactive Jupyter NoteBook vidgit, where we can actually type in the name of the movie and see the search results in jupyter Note Book
widgets are little interactive things, we can imbed into noteboks.that let us enter input and then use that input.
Display is a functin that we can use to actually show different things as output from jupyter cells 

In [369]:
# !pip install ipython
# import IPython

In [370]:

from IPython.display import display, update_display

In [371]:
import ipywidgets as widgets
from IPython.display import display

Things to keep in mind:-

'I' and 'P' in IPython are uppercase.

In [372]:
movie_input = widgets.Text(
       descriptin = "Movie Title",
       disabled = False
)
# creating output widget to show result
output_widget = widgets.Output()  # movie_list is the name of output widget

def on_type(data): # this function is going to be called when we type in some thing in box
    with output_widget:   #  i.e with our output widget
        output_widget.clear_output()   # it will clear anthing if its present in output widget
        title = data["new"]  # then we will grab our title from our input, so our input will be a dictionary and the new field 
                             # will give us new value that is entered into the input
        if len(title) > 5:
            display(search(title)) # its gonna search our set of titles for a title and its actually going to display it into 
                                   # our output widget which is movie list

# there are different types of events we can observe, we can observe value events
movie_input.observe(on_type, names = "value")

# we can display movie_input and movie list
display(movie_input, output_widget)



Text(value='')

Output()

By creating an output widget we finished the first half of our project, in 2nd half we will actually build a recommendation system.

## Reading in Movies Rating Data

Now we are going find recommendtions for movies that we like

In [397]:
ratings = pd.read_csv(r"C:\Users\User\OneDrive\Desktop\Boot Camp\recommender system\data\ml-25m\ratings.csv")

In [398]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [399]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

We are now going to find all of the users who also liked the movie that we type in, and then we want to find the other movies that they liked. And those movies are probably going to be good recommendations for us.

## Finding users who liked the same movie as us

In [400]:
movie_id = 1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

In [401]:
similar_users
# These are the users that liked the same movie as us

array([     3,      5,      8, ..., 162530, 162533, 162534], dtype=int64)

Now we are going to find the other movies that they liked

In [402]:
similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]
similar_users_recs

Unnamed: 0,userId,movieId,rating,timestamp
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
261,3,214,5.0,1484753888
263,3,293,5.0,1484753912
...,...,...,...,...
24999248,162534,101962,4.5,1526734434
24999269,162534,109487,4.5,1526714913
24999326,162534,164179,5.0,1526712632
24999329,162534,165549,5.0,1526713272


In [403]:
similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_users_recs

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

Next we are going to find only the movies that greater than 10% of the users who are similar to us liked, because we want to narrow down the recommendations. So we are gonna look for movies that 10% or more of the users who are similar to us also liked.

In [404]:
similar_users_recs.value_counts()
# it counts how many times each movie appears in our particular data set.

1         18835
318       15884
260       13870
296       13324
356       12169
          ...  
59290         1
44317         1
188811        1
188685        1
88934         1
Name: movieId, Length: 22464, dtype: int64

Now we find only the movies that are greater than 10%

In [405]:
similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
# By dividing it to total number of users, it is converted to percentage.
similar_users_recs

1         0.499483
318       0.421226
260       0.367817
296       0.353337
356       0.322708
            ...   
59290     0.000027
44317     0.000027
188811    0.000027
188685    0.000027
88934     0.000027
Name: movieId, Length: 22464, dtype: float64

In [406]:
similar_users_recs = similar_users_recs[similar_users_recs > .10]
# it gives the movies that are greater than 10% liked
similar_users_recs


1       0.499483
318     0.421226
260     0.367817
296     0.353337
356     0.322708
          ...   
1148    0.103609
1527    0.102867
4995    0.102522
778     0.102495
34      0.100162
Name: movieId, Length: 90, dtype: float64

So now we have 90 of the movies whom the users similar to us liked.

So the next thning we need to do is sum of these movies are movies that are specific to our ... so they are movies that people like us liked more than they generally liked the movies

## Finding how much all users like movies

In [409]:
# We now find how much all of the users in our dataset liked these movies

all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings["rating"] > 4)]
all_users
# These are all of the users who watched the movies that were recommended to us. so the movies in the above set of 90.

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000058,162541,4995,5.0,1240951903
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


In [414]:
# we are going to find what percentage of all users recommend each of these movie i.e all of the users and not only the ones
# that are similar to us.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
# so this is finding the percentage of all users who recommended the movies in similar_user_recs


what we want is the movies that are in high percentage in similar_users_recs,i.e the users similar to us liked that movie.
i.e higher percentage in similar_users_recs than the other set.
If a hundred percent of all users like Toy Story and a hundred percent of users similar to us also liked Toy Story,
Toy Story is probably not a good recommendation if you watched the avengers because even though Toy Story is highly recommended every body like it, but what you want is everyone who liked the avengers rated thor very highly, hundred percent of the people like the movie thor but of all of the users who watched thor only 40 % liked it,you want movies that have big differential in how they are recommended b/w people similar to you who have similar taste to you versus just the general set of everybody  so thats what we are doing, s lets take a look at all_users_recs

In [416]:
all_users_recs
#it basically gives us the percentage of all users who liked these mvies

318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: movieId, Length: 90, dtype: float64

## Creating a recomendation score

In [418]:
# we will now compare the percentages and for that we will use pandas concatenate methd to combine these two series together
#so each series will essentially be a column and then we will name our columns similar and all

rec_percentages = pd.concat([similar_users_recs, all_users_recs ], axis = 1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,0.499483,0.125923
318,0.421226,0.345497
260,0.367817,0.224334
296,0.353337,0.287399
356,0.322708,0.237518
...,...,...
1148,0.103609,0.047922
1527,0.102867,0.066762
4995,0.102522,0.076403
778,0.102495,0.075473


so this gives us each of the movies that were recommended to us and how much users similar to us like them and how much just the average person liked them. so we want movies that have a big difference between these two numbers. 
So we create a score which is just dividing one by the other

In [419]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# after creating score we sort these recommendations

In [420]:
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [421]:
rec_percentages

Unnamed: 0,similar,all,score
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


So we can see the score is the ratio between how much users similar to us like the movie and how much just the average user liked the movie,that gives us the score, so higher is the score the better is the recomendation

We take our top ten recommendations and we merge them with our movies data, so that we can get the titles of these movies and we say left index equals true so for rec_percentages the index is the movie_id so we are saying use the index and merge that on the right with movie_id from the movies data set

In [422]:
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.499483,0.125923,3.966586,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.170357,0.05422,3.141967,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4780,0.166645,0.071489,2.33106,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.166565,0.07296,2.282977,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.111591,0.049232,2.266621,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.154207,0.069109,2.231373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
580,0.151449,0.068159,2.221989,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
1120,0.103609,0.047922,2.162033,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
359,0.18473,0.086585,2.133522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
587,0.12806,0.060551,2.1149,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991


## Building a recommendatin function

In [428]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
    similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    # this is finding our recommendatins from users similar to us
    
    
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > .10]
    # This is adjusting so we only have recommendations where over 10 percent of users recommended that movie
    
    all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    # This is finding how commen the recommendations were among all of the users
    
    rec_percentages = pd.concat([similar_users_recs, all_users_recs ], axis = 1)
    rec_percentages.columns = ["similar", "all"]
    # Finally we are creating our score.we first concatenate the two
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending = False)
    
    return rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]
    #  by adding score, title, genres we only selected the columns that we need.



## Creating an interactive recommendation widget

In [433]:
movie_name_input = widgets.Text(
value = "Toy Story",
description = "Movie Title:",
disabled = False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]  # This grabs our title from the input widget
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names = "value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()