<a href="https://colab.research.google.com/github/Claudbest1/Big-data-Management/blob/main/Movie%20Suggestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Load the movie data (movieId, title, genres) from CSV into a pandas DataFrame
import pandas as pd

movies=pd.read_csv("movies.csv")

In [3]:
#Preview the loaded movies DataFrame
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
#cleaning movie title with regex
import re

def clean_title(title):
    """Return ``title`` with every character other than an ASCII letter or digit replaced by a space.

    The string keeps its length: each removed character becomes exactly one
    space, so punctuation such as parentheses leaves extra whitespace behind.
    """
    not_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return not_alphanumeric.sub(" ", title)

In [5]:
#Store a cleaned copy of every title next to the original "title" column, then preview
movies["clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
#Build search engine using TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

#ngram_range=(1, 2): index single words as well as pairs of consecutive words
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
#Learn the vocabulary from the cleaned titles and turn each title into a TF-IDF row
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from the most similar
        title to the least similar one.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    #Never ask for more rows than exist: argpartition raises when kth is out of bounds
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    #argpartition only *selects* the top_n highest scores, it does not order them,
    #so rank those few candidates explicitly to put the best match first
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

# Building an interactive search box

In [8]:
import ipywidgets as widgets
from IPython.display import display

#Text box the movie title is typed into
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

#Output area that shows the current search results
movie_list = widgets.Output()


def on_type(data):
    """Replace the contents of ``movie_list`` with search results for the new text.

    ``data`` is the change notification passed by ``observe``; its "new" entry
    is read as the text to search for.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        #Nothing is shown for inputs of 3 characters or fewer
        if len(typed) <= 3:
            return
        display(search(typed))


#Call on_type on every change of the text box value, then render both widgets
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# Reading in the movie ratings data

In [9]:
#Load the user ratings (userId, movieId, rating, timestamp) from CSV and preview them
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
250443,1755,90647,4.0,1.555935e+09
250444,1755,91542,4.5,1.556107e+09
250445,1755,95105,4.0,1.555936e+09
250446,1755,95543,4.0,1.555936e+09


In [10]:
#Check the dtype of every column in the ratings DataFrame
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

# Finding all users who liked the movie we searched for or watched

In [11]:
#movieId used for the step-by-step walkthrough below (1 is "Toy Story (1995)" in the movies table)
movie_id = 1

In [12]:
#Users "similar to us": everyone who rated the chosen movie above 4
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()
similar_users



array([  36,   75,   86,   90,   93,   95,   96,   98,  111,  120,  127,
        143,  152,  158,  160,  162,  171,  186,  188,  211,  217,  229,
        230,  235,  249,  257,  259,  297,  298,  302,  323,  329,  355,
        359,  369,  371,  381,  392,  402,  411,  428,  435,  439,  447,
        449,  468,  469,  477,  484,  513,  519,  537,  540,  541,  548,
        551,  553,  561,  567,  573,  582,  593,  607,  609,  611,  623,
        624,  626,  628,  631,  644,  653,  654,  670,  683,  686,  694,
        697,  702,  709,  727,  733,  741,  749,  752,  765,  768,  773,
        785,  791,  793,  796,  803,  805,  807,  811,  830,  834,  839,
        848,  856,  896,  904,  905,  911,  927,  947,  950,  956,  966,
        969,  986,  997, 1007, 1010, 1013, 1036, 1038, 1042, 1065, 1079,
       1092, 1096, 1101, 1118, 1123, 1131, 1138, 1140, 1141, 1143, 1146,
       1150, 1159, 1166, 1167, 1169, 1171, 1176, 1179, 1192, 1196, 1198,
       1199, 1200, 1228, 1230, 1232, 1240, 1242, 12

In [13]:
#Every movie that one of those similar users rated above 4 (one entry per rating)
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]
similar_user_recs


5101          1
5105         34
5111        110
5114        150
5127        260
          ...  
250433    79293
250439    87869
250440    87876
250442    89745
250444    91542
Name: movieId, Length: 14897, dtype: int64

In [14]:
#Goal: keep only the movies that more than 10% of the users similar to us also liked

#First, count how many of the similar users rated each movie above 4
similar_user_recs.value_counts()

1        211
318       89
260       76
593       70
527       68
        ... 
1401       1
8254       1
8221       1
66203      1
87876      1
Name: movieId, Length: 3850, dtype: int64

In [15]:
#Now let's find the share of similar users who rated each movie above 4.
#The liked-movie list is rebuilt from `ratings` instead of being read from
#`similar_user_recs`: this cell rebinds that name, so reading it here would
#silently compute shares of shares if the cell were run a second time.
liked_by_similar = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
#Keep only the movies liked by more than 10 percent of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [16]:
#Movies liked by more than 10% of the similar users, with the share for each movie
similar_user_recs

1        1.000000
318      0.421801
260      0.360190
593      0.331754
527      0.322275
           ...   
1304     0.104265
1206     0.104265
89745    0.104265
2918     0.104265
4973     0.104265
Name: movieId, Length: 104, dtype: float64

# Finding out how much all users in our dataset like these movies

In [17]:
#Every rating above 4, by any user, for one of the candidate movies
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
29,1,4973,4.5,1.147869e+09
48,1,7361,5.0,1.147880e+09
72,2,110,5.0,1.141417e+09
76,2,260,5.0,1.141417e+09
...,...,...,...,...
250402,1754,33794,5.0,1.120152e+09
250404,1755,1,4.5,1.556107e+09
250419,1755,8961,4.5,1.555935e+09
250424,1755,60069,5.0,1.555935e+09


In [18]:
#Share of users who liked each candidate movie. The denominator is the number of
#distinct users who rated at least one candidate movie above 4.
liking_user_count = all_users["userId"].nunique()
all_users_recs = all_users["movieId"].value_counts() / liking_user_count
all_users_recs

318      0.339286
296      0.282020
2571     0.238916
593      0.222906
527      0.221059
           ...   
50872    0.035714
1954     0.032635
78499    0.028941
2355     0.025246
1028     0.025246
Name: movieId, Length: 104, dtype: float64

# Creating a recommendation score

In [19]:
#Put both shares side by side, aligned on movieId: "similar" is the share among
#similar users, "all" is the share among every user who liked a candidate movie
rec_percentages = pd.concat(
    [similar_user_recs, all_users_recs], axis=1
).set_axis(["similar", "all"], axis="columns")
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.129926
32,0.127962,0.097906
34,0.132701,0.049261
47,0.218009,0.134236
50,0.251185,0.211207
...,...,...
68157,0.104265,0.060961
68954,0.170616,0.064655
78499,0.132701,0.028941
79132,0.161137,0.130542


In [20]:
#score = how much more often similar users liked a movie than users overall;
#the highest scores come first
rec_percentages = (
    rec_percentages
    .assign(score=rec_percentages["similar"] / rec_percentages["all"])
    .sort_values("score", ascending=False)
)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.129926,7.696682
3114,0.236967,0.048030,4.933771
78499,0.132701,0.028941,4.585258
2355,0.109005,0.025246,4.317651
1028,0.104265,0.025246,4.129927
...,...,...,...
296,0.312796,0.282020,1.109129
7361,0.118483,0.108990,1.087102
4973,0.104265,0.100369,1.038816
2959,0.213270,0.205665,1.036978


In [21]:
#Attach title and genres to the ten highest-scoring movies (index of the left frame is the movieId)
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.129926,7.696682,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.236967,0.04803,4.933771,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.132701,0.028941,4.585258,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2264,0.109005,0.025246,4.317651,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
1005,0.104265,0.025246,4.129927,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
1865,0.109005,0.032635,3.34007,1954,Rocky (1976),Drama,Rocky 1976
4780,0.218009,0.065887,3.308854,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.14218,0.04803,2.960262,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
11361,0.104265,0.035714,2.919431,50872,Ratatouille (2007),Animation|Children|Drama,Ratatouille 2007
1226,0.132701,0.046182,2.873428,1259,Stand by Me (1986),Adventure|Drama,Stand by Me 1986


# Building a recommendation function

In [22]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A user "likes" a movie when their rating is strictly above ``min_rating``.
    Users who liked ``movie_id`` are treated as similar users. Each movie liked
    by more than ``min_share`` of them is scored as

        score = (share of similar users who liked it)
                / (share of users who liked it among everyone who liked
                   at least one of the candidate movies)

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.10
        A movie is kept only when more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody liked ``movie_id`` or no movie passes ``min_share``.
    """
    result_columns = ["score", "title", "genres"]

    #Apply the "liked" filter once; every later step only looks at liked ratings
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        #Nobody liked this movie, so there is nothing to base a recommendation on
        return pd.DataFrame(columns=result_columns)

    #Share of similar users who liked each movie
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return pd.DataFrame(columns=result_columns)

    #Share of users who liked each candidate, among all users who liked any candidate
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    #The index of rec_percentages holds the movieId, which is matched against movies["movieId"]
    top_recs = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top_recs[result_columns]


# Creating an interactive recommendation widget

In [23]:
#Text box the movie title is typed into
movie_name_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

#Output area that shows the current recommendations
recommendation_list = widgets.Output()


def on_type(data):
    """Replace the contents of ``recommendation_list`` with recommendations for the new text.

    The typed text is passed to ``search``; the ``movieId`` of the first row it
    returns is then handed to ``find_similar_movies``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        #Nothing is shown for inputs of 2 characters or fewer
        if len(typed) <= 2:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


#Call on_type on every change of the text box value, then render both widgets
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()