<a href="https://colab.research.google.com/github/BruceTsai0909/Machine-Learning-Projects/blob/master/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# List the working directory to see which data files are available.
!ls

ml-25m.zip  sample_data


In [None]:
import pandas as pd

In [None]:
# Load the movie catalogue (movieId, title, genres).
# NOTE(review): the directory listing above shows only ml-25m.zip, so movies.csv
# was presumably extracted or uploaded in a step this notebook does not record — confirm.
movies = pd.read_csv("movies.csv")

In [None]:
# Preview the loaded movie table.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
import re

def clean_title(title):
  """Return ``title`` with every character except ASCII letters, digits and spaces removed."""
  disallowed = re.compile('[^a-zA-Z0-9 ]')
  return disallowed.sub('', title)


In [None]:
# Add a punctuation-free copy of every title; the TF-IDF search is built on this column.
movies['clean_title'] = movies['title'].apply(clean_title)

In [None]:
# Check the new clean_title column next to the original title.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): the vocabulary covers single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit on the cleaned titles and keep the resulting matrix for search() to compare against.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [None]:
#define search movie function
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the ``top_n`` movies whose cleaned titles best match ``title``.

  The query is normalised with ``clean_title``, vectorised with the fitted
  ``vectorizer`` and compared with every row of ``tfidf`` by cosine similarity.

  Args:
    title: free-text movie title typed by the user.
    top_n: maximum number of matches to return (default 5).

  Returns:
    Rows of ``movies`` ordered from best to worst match, so ``.iloc[0]`` is
    the closest title. Empty when there is nothing to return.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # argpartition raises when asked for more elements than exist.
  top_n = min(top_n, similarity.size)
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the top_n largest scores end up in the
  # last top_n slots; their relative order there is arbitrary.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  # Rank the candidates explicitly, highest similarity first.
  order = np.argsort(-similarity[candidates], kind='stable')
  return movies.iloc[candidates[order]]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Free-text box for the title query. The hint is a placeholder rather than the
# initial value, so it never becomes part of the text that gets searched.
movie_input = widgets.Text(
    placeholder = '#search here',
    description = 'Movie Title:',
    disabled = False
)
# Output area in which the matching movies are rendered.
movie_list = widgets.Output()

def on_type(data):
  """Widget callback: refresh ``movie_list`` with search results for the new text.

  ``data['new']`` is the current content of the text box. Queries of five
  characters or fewer only clear the output.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data['new']
    if len(typed) <= 5:
      return
    display(search(typed))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Show the input box followed by the results area.
display(movie_input, movie_list)

In [None]:
# Load the user ratings (userId, movieId, rating, timestamp).
# NOTE(review): timestamp is displayed as a float in the preview below, which may
# indicate missing values in that column — confirm the file was read completely.
ratings = pd.read_csv('ratings.csv')

In [None]:
# Preview the loaded ratings table.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
18444647,119483,357,5.0,1.462048e+09
18444648,119483,597,4.5,1.462128e+09
18444649,119483,1307,4.5,1.462048e+09
18444650,119483,1569,4.5,1.462129e+09


In [None]:
# Worked example: movieId 1 is Toy Story (1995) in the movies table.
movie_id = 1

In [None]:
# Users who rated the chosen movie above 4: the "similar" audience.
similar_users = ratings.loc[
    (ratings['movieId'] == movie_id) & (ratings['rating'] > 4),
    'userId',
].unique()

In [None]:
# Inspect the ids of the similar users.
similar_users

array([    36,     75,     86, ..., 119452, 119460, 119475])

In [None]:
# Every movieId that a similar user rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings['userId'].isin(similar_users) & (ratings['rating'] > 4),
    'movieId',
]

In [None]:
# Inspect the raw list of liked movieIds.
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
18443393     7143
18443426     8360
18443451     8957
18443467    33166
18443470    34162
Name: movieId, Length: 1000711, dtype: int64

In [None]:
# Per movie: fraction of the similar users who rated it above 4.
# NOTE: this overwrites similar_user_recs, so re-running this cell without first
# re-running the cell that builds the movieId list would count the fractions instead.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

In [None]:
# Inspect the shortlisted movies and their share among similar users.
similar_user_recs

1        1.000000
318      0.444116
260      0.399091
356      0.369291
296      0.367343
           ...   
59315    0.104120
953      0.103543
48780    0.100873
551      0.100729
745      0.100368
Name: movieId, Length: 112, dtype: float64

In [None]:
# All ratings above 4, from any user, on the shortlisted movies.
all_users = ratings.loc[
    ratings['movieId'].isin(similar_user_recs.index) & (ratings['rating'] > 4)
]

In [None]:
# Per shortlisted movie: share of the comparison group that rated it above 4.
# The denominator is the number of distinct users with at least one such rating.
all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

In [None]:
# Inspect the baseline share for each shortlisted movie.
all_users_recs

318      0.341603
296      0.285297
2571     0.243897
356      0.234371
593      0.225729
           ...   
551      0.041545
50872    0.039742
745      0.037057
78499    0.035363
2355     0.025323
Name: movieId, Length: 112, dtype: float64

In [None]:
# Align the two shares on movieId, one column each.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ['similar', 'all']


In [None]:
# Inspect the side-by-side shares.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124895
318,0.444116,0.341603
260,0.399091,0.221818
356,0.369291,0.234371
296,0.367343,0.285297
...,...,...
59315,0.104120,0.054071
953,0.103543,0.046213
48780,0.100873,0.068751
551,0.100729,0.041545


In [None]:
# score = share among similar users divided by share in the comparison group;
# values above 1 mean the movie is liked more often by the similar users.
rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

In [None]:
# Inspect the new score column.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124895,8.006710
318,0.444116,0.341603,1.300092
260,0.399091,0.221818,1.799184
356,0.369291,0.234371,1.575666
296,0.367343,0.285297,1.287579
...,...,...,...
59315,0.104120,0.054071,1.925614
953,0.103543,0.046213,2.240567
48780,0.100873,0.068751,1.467215
551,0.100729,0.041545,2.424592


In [None]:
# Rank movies from highest to lowest score.
rec_percentages = rec_percentages.sort_values('score', ascending=False)

In [None]:
# Inspect the ranked table.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124895,8.006710
3114,0.283426,0.054269,5.222577
2355,0.111624,0.025323,4.407965
78499,0.153546,0.035363,4.342069
4886,0.234938,0.070491,3.332888
...,...,...,...
296,0.367343,0.285297,1.287579
2858,0.215456,0.167431,1.286831
79132,0.167905,0.131122,1.280523
4973,0.143950,0.112576,1.278689


In [None]:
# Attach titles and genres to the ten highest-scoring movieIds.
rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124895,8.00671,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.283426,0.054269,5.222577,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.111624,0.025323,4.407965,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.153546,0.035363,4.342069,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.234938,0.070491,3.332888,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.228011,0.071842,3.173759,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
580,0.213075,0.067291,3.166441,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.179234,0.059893,2.992577,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.204488,0.068418,2.988806,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.249946,0.085694,2.916736,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [82]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10, *, ratings_df=None, movies_df=None):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A "like" is a rating strictly above ``min_rating``. Users who liked
    ``movie_id`` form the similar group. Each movie liked by more than
    ``min_share`` of that group is scored as its share of likes within the
    group divided by its share among all users who liked any candidate movie.
    The queried movie is itself a candidate (its similar share is 1.0).

    Args:
        movie_id: movieId of the film to base the recommendations on.
        min_rating: a rating must be strictly greater than this to count as a like.
        min_share: a candidate must be liked by more than this fraction of the similar users.
        top_n: maximum number of rows to return.
        ratings_df: frame with userId, movieId and rating columns;
            defaults to the notebook-level ``ratings``.
        movies_df: frame with movieId, title and genres columns;
            defaults to the notebook-level ``movies``.

    Returns:
        DataFrame with columns score, title and genres, ordered by descending
        score. Empty when no user liked ``movie_id``.
    """
    ratings_df = ratings if ratings_df is None else ratings_df
    movies_df = movies if movies_df is None else movies_df

    # The like mask is the same for all three selections below, so build it once.
    liked = ratings_df["rating"] > min_rating

    similar_users = ratings_df[(ratings_df["movieId"] == movie_id) & liked]["userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: return an explicit empty result instead of
        # pushing empty frames through divisions by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = ratings_df[(ratings_df["userId"].isin(similar_users)) & liked]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: how often the same candidates are liked by everyone who liked any of them.
    all_users = ratings_df[(ratings_df["movieId"].isin(similar_user_recs.index)) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [81]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area in which the recommendations are rendered.
recommendation_list = widgets.Output()

def on_type(data):
    """Widget callback: refresh ``recommendation_list`` for the newly typed title.

    ``data["new"]`` is the current content of the text box. Queries of five
    characters or fewer only clear the output. Otherwise the first row returned
    by ``search`` is taken as the matched movie and its recommendations shown.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matched = search(query).iloc[0]
        display(find_similar_movies(matched["movieId"]))

# Call on_type whenever the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Show the input box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()