In [1]:
import pandas as pd
# Load the movie catalogue; "movies.csv" is resolved relative to the working directory.
movies = pd.read_csv("movies.csv")

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
#remove brackets, punctuation and other special characters from a title (spaces are kept)
import re
def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit, or a space.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        The title with all other characters removed, e.g. ``"Toy Story 1995"``.
    """
    # The upper-case range must be spelled A-Z: the range A-z also spans the
    # ASCII punctuation that sits between "Z" and "a" ([ \ ] ^ _ `), so those
    # characters would survive the clean-up.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)

In [4]:
# Add a cleaned copy of every title by running clean_title over the "title" column.
movies["clean_title"] = movies["title"].map(clean_title)

In [5]:
#for checking the title differently i.e toy story1995 or toystory 1995
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): the vocabulary holds single words as well as two-word phrases.
Vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the cleaned titles and encode them as a TF-IDF matrix.
tfidf = Vectorizer.fit_transform(movies["clean_title"])

In [6]:
#search function that ranks titles by cosine similarity to the query
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``Vectorizer`` and compared against the ``tfidf`` matrix with cosine
    similarity.

    Args:
        title: Free-text query typed by the user.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = Vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than there are titles; argpartition would raise.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition picks the top_n candidates cheaply but leaves them in no
    # particular order, so rank just those candidates, best match first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]


In [7]:
#creating a search box
import ipywidgets as widgets
#used to render rich output (such as DataFrames) from within Jupyter
from IPython.display import display

# Free-text box, pre-filled with an example title.
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False,
)

# Output area that the search hits are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search hits for the newly typed text.

    Args:
        data: Change notification; only its ``"new"`` entry (the current
            text) is read.
    """
    with movie_list:
        # Always wipe the previous hits, even if the new text is too short.
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Queries of five characters or fewer are not searched.
            return
        display(search(query))


# Call on_type whenever the text box's "value" trait changes.
movie_input.observe(on_type, names="value")

# Render the text box with its result area underneath.
display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [8]:
# Load the user ratings; "ratings.csv" is resolved relative to the working directory.
ratings = pd.read_csv("ratings.csv")

In [9]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [10]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [11]:
# movieId used for the step-by-step walkthrough below (1 is "Toy Story (1995)" in movies).
movie_id = 1

In [12]:
# Users who rated the chosen movie 4 or higher; they serve as the "similar users".
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 4), "userId"
].unique()

In [13]:
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534], dtype=int64)

In [14]:
# movieId of every rating of 4 or higher given by one of the similar users.
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] >= 4), "movieId"
]

In [15]:
similar_user_recs

254              1
255             29
256             32
257             50
258            111
             ...  
24999332    166643
24999342    171763
24999348    177593
24999351    177765
24999378    198609
Name: movieId, Length: 5101989, dtype: int64

In [16]:
# Fraction of the similar users who liked each movie.
# NOTE: this cell overwrites similar_user_recs, so re-run the cell that builds
# it before re-running this one.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
# Keep only the movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs.loc[similar_user_recs > .1]

In [17]:
similar_user_recs

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: count, Length: 273, dtype: float64

In [18]:
# Every rating of 4 or higher, from any user, for the shortlisted movies.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] >= 4)
]

In [19]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
11,1,1653,4.0,1147868097
19,1,2692,5.0,1147869100
23,1,3949,5.0,1147868678
29,1,4973,4.5,1147869080
...,...,...,...,...
25000081,162541,7361,4.5,1240953484
25000084,162541,8961,4.0,1240953338
25000087,162541,33794,4.0,1240951792
25000090,162541,50872,4.5,1240953372


In [20]:
# Fraction of users who liked each shortlisted movie. The denominator is the
# number of distinct users in all_users (those who liked at least one
# shortlisted movie), not every user in ratings.
all_users_recs = all_users["movieId"].value_counts().div(
    len(all_users["userId"].unique())
)

In [21]:
all_users_recs

movieId
318     0.440215
296     0.389659
356     0.367553
593     0.361897
2571    0.347994
          ...   
3175    0.049325
2081    0.047128
1282    0.044712
2761    0.039855
1907    0.039805
Name: count, Length: 273, dtype: float64

In [22]:
# Put both fractions side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs, all_users_recs],
    axis=1,
)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.235415
318,0.549604,0.440215
260,0.531518,0.325251
356,0.517224,0.367553
296,0.495744,0.389659
...,...,...
235,0.101249,0.055281
1242,0.100931,0.050805
1907,0.100772,0.039805
3527,0.100613,0.056879


In [23]:
# score > 1 means the similar users like the movie more often than users overall.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [24]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [25]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.235415,4.247819
3114,0.328914,0.102241,3.217054
78499,0.161924,0.057710,2.805840
2355,0.191095,0.068978,2.770367
2081,0.120714,0.047128,2.561408
...,...,...,...
99114,0.112732,0.091209,1.235967
2959,0.351826,0.292519,1.202745
6016,0.118380,0.099007,1.195678
109487,0.117426,0.102603,1.144469


In [26]:
# Attach title and genres to the five best-scoring movies (index movieId -> movies["movieId"]).
rec_percentages.head(5).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.235415,4.247819,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.328914,0.102241,3.217054,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.161924,0.05771,2.80584,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2264,0.191095,0.068978,2.770367,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
1992,0.120714,0.047128,2.561408,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,Little Mermaid The 1989


In [27]:
#building a recommendation system
def find_similar_movies(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Args:
        movie_id: ``movieId`` value of the reference movie.
        min_rating: Lowest rating that counts as liking a movie (default 4).
        min_share: A movie is kept only when more than this fraction of the
            similar users liked it (default 0.1).
        top_n: Maximum number of rows to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, highest
        score first. ``score`` is the fraction of similar users who liked the
        movie divided by the fraction among all users who liked at least one
        shortlisted movie. The frame is empty when nobody rated ``movie_id``
        at ``min_rating`` or above.
    """
    # Build the "liked" mask once instead of rescanning the rating column for
    # each of the three filters below.
    liked = ratings["rating"] >= min_rating

    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: return an empty result explicitly rather
        # than dividing by zero further down.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    best = rec_percentages.head(top_n)
    return best.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [28]:
# Second text box: typing a title here shows recommendations rather than search hits.
movie_name_input = widgets.Text(
    value="",
    description="Movie Title:",
    disabled=False,
)

# Output area that the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the typed title.

    This rebinds the name ``on_type`` used by the earlier search box; that
    widget keeps the function object it was given when it was registered.

    Args:
        data: Change notification; only its ``"new"`` entry (the current
            text) is read.
    """
    with recommendation_list:
        # Always wipe the previous recommendations first.
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Queries of five characters or fewer are not searched.
            return
        # The first row returned by search() is taken as the intended movie.
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


# Call on_type whenever the text box's "value" trait changes, then render both widgets.
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)


Text(value='', description='Movie Title:')

Output()