In [None]:
# %pip install pandas
# %pip install scikit-learn
# %pip install ipywidgets

In [2]:
# import necessary libraries

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# read in file

movies = pd.read_csv("ml-25m/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Remove special characters from title

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9]", "", title)
    return title


movies['cleaned_title'] = movies['title'].apply(clean_title)
#movies.head()

In [5]:
# convert the cleaned titles into numerical format suitable for machine learning algorithms

vectorizer = TfidfVectorizer(ngram_range=(1,2))
formatted = vectorizer.fit_transform(movies["cleaned_title"])

In [6]:
# find the most similar movie titles to a given input title based on TF-IDF similarity scores

def search(title):
    title = clean_title(title)
    query = vectorizer.transform([title])
    similarity = cosine_similarity(query, formatted).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    result = movies.iloc[indices][::-1]
    return result

In [None]:
# create an interactive text widget that allows users to input a movie title
# that displas a list of similar movie titles as the user types

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

In [8]:
# read in ratings file to be used for similar movie recommendations

ratings = pd.read_csv("ml-25m/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [10]:
movie_id = 657

In [11]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [12]:
similar_users_recs = similar_users_recs.value_counts()/len(similar_users)
similar_users_recs = similar_users_recs[similar_users_recs>.10]
similar_users_recs

movieId
657       1.000000
38        0.333333
66097     0.333333
70286     0.333333
69844     0.333333
            ...   
2355      0.333333
2321      0.333333
2294      0.333333
2174      0.333333
196417    0.333333
Name: count, Length: 312, dtype: float64

In [13]:
all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings["rating"] > 4)]
all_users_recs = all_users["movieId"].value_counts()/len(all_users["userId"].unique())

In [14]:
rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [15]:
rec_percentages["score"] = rec_percentages["similar"]/rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [16]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,cleaned_title
646,1.0,2.2e-05,45133.0,657,Yankee Zulu (1994),Comedy|Drama,YankeeZulu1994
670,0.333333,2.2e-05,15044.333333,683,"Eye of Vichy, The (Oeil de Vichy, L') (1993)",Documentary,EyeofVichyTheOeildeVichyL1993
395,0.333333,2.2e-05,15044.333333,400,Homage (1995),Drama,Homage1995
723,0.333333,3e-05,11283.25,738,"Garçu, Le (1995)",Drama,GaruLe1995
733,0.333333,3e-05,11283.25,749,"Man from Down Under, The (1943)",Drama,ManfromDownUnderThe1943
736,0.333333,3e-05,11283.25,752,Vermont Is For Lovers (1992),Comedy|Documentary|Romance,VermontIsForLovers1992
190,0.333333,3.7e-05,9026.6,192,The Show (1995),Documentary,TheShow1995
641,0.333333,4.4e-05,7522.166667,652,"301, 302 (301/302) (1995)",Horror|Mystery|Thriller,3013023013021995
712,0.333333,4.4e-05,7522.166667,727,War Stories (1995),Documentary,WarStories1995
36449,0.333333,9.6e-05,3471.769231,150262,Megamind: The Button Of Doom (2011),Action|Animation|Children|Comedy,MegamindTheButtonOfDoom2011


In [17]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

    similar_users_recs = similar_users_recs.value_counts()/len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs>.10]

    all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts()/len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"]/rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]