# New Section

In [8]:
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
movies=pd.read_csv('/content/drive/MyDrive/movies.csv')
ratings=pd.read_csv('/content/drive/MyDrive/ratings.csv')
movies.head()
ratings.head()
movies_name=movies.set_index('movieId')['title'].to_dict()
n_users = len(ratings.userId.unique())
n_items = len(ratings.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of unique users: 88237
Number of unique movies: 54860


In [7]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
movies = pd.read_csv('/content/drive/MyDrive/movies.csv')
if "title" not in movies.columns:
    raise KeyError("The dataset does not contain a 'title' column.")
def clean_title(title):
    title = re.sub(r"[^a-zA-Z0-9 ]", "", title)
    return title
movies["clean_title"] = movies["title"].apply(clean_title)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])
print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                        clean_title  
0                    Toy Story 1995  
1                      Jumanji 1995  
2             Grumpier Old Men 1995  
3            Waiting to Exhale 1995  
4  Father of the Bride Part II 1995  


In [34]:
movies


{'action': ['Mad Max: Fury Road', 'John Wick', 'Die Hard'],
 'comedy': ['Superbad', 'The Hangover', 'Step Brothers'],
 'drama': ['The Shawshank Redemption', 'Forrest Gump', 'The Godfather'],
 'horror': ['Get Out', 'A Quiet Place', 'The Conjuring'],
 'romance': ['The Notebook', 'Pride and Prejudice', 'La La Land']}

In [35]:
from sklearn.metrics.pairwise import cosine_similarity


def search(title):
    title=clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]

    return results

In [37]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='We',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='We', description='Movie Title:')

Output()

In [13]:
import pandas as pd
movies=pd.read_csv('/content/drive/MyDrive/movies.csv')
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [38]:
ratings = pd.read_csv("/content/drive/MyDrive/ratings.csv")
ratings.dtypes

Unnamed: 0,0
userId,int64
movieId,int64
rating,float64
timestamp,int64


In [39]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


In [45]:
import pandas as pd
movies = pd.read_csv('/content/drive/MyDrive/movies.csv')

rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX


In [41]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [47]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

Unnamed: 0,score,title,genres
14813,28.697644,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
18747,11.772592,Wreck-It Ralph (2012),Animation|Comedy
21068,10.528393,The Lego Movie (2014),Action|Adventure|Animation|Children|Comedy|Fan...
15540,9.973482,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...
20638,9.409064,Frozen (2013),Adventure|Animation|Comedy|Fantasy|Musical|Rom...
14929,8.760672,Despicable Me (2010),Animation|Children|Comedy|Crime
14477,8.092905,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX
29850,7.875094,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy
13362,7.83062,Up (2009),Adventure|Animation|Children|Drama
12377,7.808506,Kung Fu Panda (2008),Action|Animation|Children|Comedy|IMAX
