In [None]:
import pandas as pd


movies = pd.read_csv("movies.csv")

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [None]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [None]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [None]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [None]:
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
819469,5516,55908,5.0,1426894957
819470,5516,68157,4.0,1426894879
819471,5516,68237,5.0,1426894972
819472,5516,68791,2.5,1426895408


In [None]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [None]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [None]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [None]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [None]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [None]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [None]:
rec_percentages

Unnamed: 0,similar,all
1,0.236181,0.125050
47,0.185930,0.143888
50,0.190955,0.211423
110,0.115578,0.155711
260,0.351759,0.216433
...,...,...
134853,0.206030,0.034469
152081,0.120603,0.019238
164179,0.130653,0.024649
166528,0.100503,0.012024


In [None]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [None]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [None]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.03988,25.075377,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
21348,0.276382,0.016232,17.02649,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
19678,0.190955,0.011222,17.015434,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16725,0.155779,0.009218,16.898624,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
25058,0.20603,0.012224,16.853942,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
25061,0.120603,0.007214,16.716918,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
16312,0.135678,0.008216,16.513053,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
25071,0.221106,0.014629,15.113926,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
21606,0.241206,0.016633,14.501423,111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi,XMen Days of Future Past 2014
14628,0.180905,0.013226,13.677478,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010


In [None]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).reset_index().rename(columns={"index": "movieId"}).merge(movies, left_on="movieId", right_on="movieId")[["score", "title", "genres"]]


In [None]:
find_similar_movies(89745)

Unnamed: 0,score,title,genres
0,25.075377,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
1,17.02649,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
2,17.015434,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
3,16.898624,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
4,16.853942,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
5,16.716918,Ant-Man (2015),Action|Adventure|Sci-Fi
6,16.513053,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
7,15.113926,Captain America: Civil War (2016),Action|Sci-Fi|Thriller
8,14.501423,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi
9,13.677478,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX


In [None]:
similar_movies = find_similar_movies(3)
similar_movies["title"]
similar_movies_with_ids = similar_movies.merge(movies, on="title")[[ "movieId"]]

In [None]:
similar_movies_with_ids.values

array([[   3],
       [3450],
       [   5],
       [1367],
       [ 494],
       [ 317],
       [ 736],
       [ 104],
       [  11],
       [ 788]])

In [None]:
def predict_rating(user_id, movie_id):
    similar_movies = find_similar_movies(movie_id)
    similar_movies_with_ids = similar_movies.merge(movies, on="title")[["score", "movieId"]]
    similar_movies_ids = similar_movies_with_ids["movieId"].tolist()
    user_ratings = ratings[(ratings["userId"] == user_id) & (ratings["movieId"].isin(similar_movies_ids))]
    avg_rating = user_ratings["rating"].mean()
    if np.isnan(avg_rating):
        avg_rating = ratings["rating"].mean()

    return avg_rating


In [None]:
predict_rating(12,1)

4.0

In [None]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
from sklearn.metrics import mean_squared_error, precision_score
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(ratings, test_size=1000)
train_size = X_train.shape[0]
test_size = X_test.shape[0]
print("Test size:", test_size)
error = 0
for k in range(test_size): 
    u = X_test.iloc[k,0]
    i = X_test.iloc[k,1]
    r = X_test.iloc[k,2]
    error += np.abs(r - predict_rating(u,i))
print(error/test_size)

Test size: 1000
0.3335590909127884


In [None]:
#RMSE
np.sqrt(error/test_size)

0.5775457478960333

In [None]:
def top_N_pred_sort(N, u):
    preds = pd.Series([], dtype='float')
    # find the movies not rated by u
    movies_not_rated = ratings.query("userId != @u").movieId.unique()
    for m in movies_not_rated:
        preds[m] = predict_rating(u, m)
    return preds.sort_values(ascending=False)[:N]  

In [None]:
top_N_pred_sort(10, 1)

1575      5.0
73290     5.0
59814     5.0
101962    5.0
665       5.0
5673      5.0
4848      5.0
74428     5.0
27005     5.0
3569      5.0
dtype: float64

In [38]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()