In [74]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


# Nearest Neighbor Item Based Collaborative Filtering Model 

This model is adapated from:
- https://github.com/krishnaik06/Recommendation_complete_tutorial/tree/master/KNN%20Movie%20Recommendation
- https://github.com/topspinj/tmls-2020-recommender-workshop/tree/master

In [75]:
# Import only first dataset 

ratings = pd.read_csv("../data/interim/ratings_TI1.csv")
ratings.drop(columns=["time_interval", "timestamp"], inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,3,1,4.0
1,3,24,3.0
2,3,32,4.0
3,3,50,5.0
4,3,160,3.0


In [76]:
ratings.shape

(3977249, 3)

In [77]:
movies = pd.read_csv("../data/interim/movies_TI1.csv")
movies.shape

(3221, 4)

In [78]:
ratings = ratings.merge(movies[["title", "movieId"]], on="movieId")

In [79]:
ratings_count = ratings.groupby("movieId")["rating"].count().reset_index().rename(columns={"rating": "rating_count"})
ratings_count.head()

Unnamed: 0,movieId,rating_count
0,1,18014
1,2,8926
2,3,7731
3,4,1812
4,5,7786


In [80]:
# Bayesian Average Rating


In [81]:
ratings = pd.merge(ratings, ratings_count, on="movieId", how="left")
ratings.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,3,1,4.0,Toy Story (1995),18014
1,3,24,3.0,Powder (1995),3036
2,3,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),19435
3,3,50,5.0,"Usual Suspects, The (1995)",15349
4,3,160,3.0,Congo (1995),9362


In [82]:
# Check if there are films with less than 50 ratings
ratings[ratings["rating_count"] < 50].head()

Unnamed: 0,userId,movieId,rating,title,rating_count
743,53,1486,4.0,"Quiet Room, The (1996)",46
763,53,1695,3.0,Artemisia (1997),49
776,53,1871,4.0,"Friend of the Deceased, A (Priyatel pokonika) ...",24
788,53,2489,4.0,Spanish Fly (1998),13
1259,104,1121,1.0,Glory Daze (1995),30


In [83]:
# Drop movies with less than 50 ratings
ratings = ratings[ratings["rating_count"] >= 50]

In [84]:
# Create Pivot Table

ratings_pivot = ratings.pivot_table(
        index="movieId", columns="userId", values="rating"
    ).fillna(0)


In [85]:
ratings_pivot.tail()

userId,3,4,5,6,8,10,12,13,15,17,...,138448,138450,138452,138453,138463,138466,138480,138483,138484,138488
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
3178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Save to csv
# ratings_pivot.to_csv("../data/processed/movie_rating_pivot_TI1.csv")

## Part 2: Fit Model

In [87]:
movie_rating_matrix = pd.read_csv("../data/processed/movie_rating_pivot_TI1.csv")

In [88]:
movies = pd.read_csv("../data/raw/movies.csv")

In [89]:
movie_rating_matrix.head()

Unnamed: 0,movieId,3,4,5,6,8,10,12,13,15,...,138448,138450,138452,138453,138463,138466,138480,138483,138484,138488
0,1,4.0,0.0,0.0,5.0,4.0,4.0,4.0,4.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0
1,2,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0
2,3,0.0,0.0,0.0,3.0,5.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# Create sparse matrix
ratings_pivot_sparse = csr_matrix(movie_rating_matrix.values)

In [91]:

model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=5)
model_knn.fit(ratings_pivot_sparse)

In [92]:
# Select example movie (which could be the last movie someone watched)
test_movie_id = 1

In [93]:
test_movie = movie_rating_matrix.loc[2].values
test_movie

array([3., 0., 0., ..., 0., 4., 0.])

In [94]:
# Define helper functions to get movie title

def get_movie_title_by_index(index):
    id = movie_rating_matrix.iloc[index]["movieId"]
    return movies[movies["movieId"] == id]["title"].values[0]

def get_movie_title_by_id(movie_id):
    return movies[movies["movieId"] == movie_id]["title"].values[0]

get_movie_title_by_id(2)


'Jumanji (1995)'

In [98]:
# Get similar movies

distances, indices = model_knn.kneighbors(test_movie.reshape(1, -1), n_neighbors=6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f"Because you watched {get_movie_title(test_movie_id)}:")
    else:
        print(f"{i}: {get_movie_title_by_index(indices.flatten()[i])} with distance {distances.flatten()[i]}, movieId: {movie_rating_matrix.iloc[indices.flatten()[i]]['movieId']}")

Because you watched Toy Story (1995):
1: Father of the Bride Part II (1995) with distance 0.47839000342596816, movieId: 5.0
2: Mr. Holland's Opus (1995) with distance 0.5405354459132056, movieId: 62.0
3: Sabrina (1995) with distance 0.5504178490966711, movieId: 7.0
4: Happy Gilmore (1996) with distance 0.5659064002422969, movieId: 104.0
5: Broken Arrow (1996) with distance 0.581172477143125, movieId: 95.0


In [None]:
# Evaluate 

# Get real ratings of user
ratings = pd.read_csv("../data/interim/ratings_TI1.csv")


In [119]:
# Select random user

user_id = 3
latest_timestamp = ratings[ratings["userId"] == user_id]["timestamp"].max()
latest_best_rating = ratings[(ratings["userId"] == user_id) & (ratings["timestamp"] == latest_timestamp)].sort_values("rating", ascending=False).head(1)
latest_best_rating

Unnamed: 0,userId,movieId,rating,timestamp,time_interval
115,3,2034,4.0,1999-12-14 12:54:59,TI1


In [121]:
test_movie_id = latest_best_rating["movieId"].values[0]
test_movie_id

np.int64(2034)

In [124]:
test_movie = movie_rating_matrix[movie_rating_matrix["movieId"] == test_movie_id].values[0]

In [126]:
# Get recommendation for user

distances, indices = model_knn.kneighbors(test_movie.reshape(1, -1), n_neighbors=6)
movie_ids = []
for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f"Because you watched {get_movie_title(test_movie_id)}:")
    else:
        movie_ids.append(movie_rating_matrix.iloc[indices.flatten()[i]]["movieId"])
        print(f"{i}: {get_movie_title_by_index(indices.flatten()[i])} with distance {distances.flatten()[i]}, movieId: {movie_rating_matrix.iloc[indices.flatten()[i]]['movieId']}")

Because you watched Black Hole, The (1979):
1: Saturn 3 (1980) with distance 0.0005183098555023857, movieId: 2851.0
2: Meteor (1979) with distance 0.000537674118260667, movieId: 2526.0
3: Swamp Thing (1982) with distance 0.0005506909410584182, movieId: 2668.0
4: Piranha (1978) with distance 0.0005518540882636369, movieId: 3024.0
5: Superman IV: The Quest for Peace (1987) with distance 0.0005593536207237904, movieId: 2643.0


In [136]:
def get_movie_rating(user_id, movie_id):
    result = ratings[(ratings["userId"] == user_id) & (ratings["movieId"] == movie_id)]["rating"]
    if len(result) > 0:
        result.values[0]
    else:
        return "not rated"

In [137]:
# Check, how the user actually rated the recommended movies

for movie in movie_ids:
    print(f"Movie {get_movie_title_by_id(movie)} was rated {get_movie_rating(user_id, movie)} by user {user_id}")

Movie Saturn 3 (1980) was rated not rated by user 3
Movie Meteor (1979) was rated not rated by user 3
Movie Swamp Thing (1982) was rated None by user 3
Movie Piranha (1978) was rated not rated by user 3
Movie Superman IV: The Quest for Peace (1987) was rated None by user 3


In [139]:
ratings[(ratings["userId"] == user_id)]["movieId"].unique()

array([   1,   24,   32,   50,  160,  173,  175,  196,  223,  260,  316,
        318,  329,  337,  440,  442,  457,  480,  490,  512,  541,  589,
        593,  610,  718,  780,  788,  858,  904,  905,  919,  924,  953,
        968, 1037, 1060, 1073, 1077, 1079, 1084, 1089, 1094, 1097, 1103,
       1125, 1127, 1129, 1179, 1188, 1193, 1196, 1197, 1198, 1199, 1200,
       1206, 1208, 1210, 1213, 1214, 1215, 1219, 1220, 1221, 1222, 1225,
       1228, 1230, 1240, 1242, 1247, 1257, 1258, 1259, 1261, 1266, 1270,
       1272, 1276, 1278, 1288, 1304, 1307, 1321, 1330, 1333, 1345, 1356,
       1372, 1373, 1374, 1375, 1376, 1396, 1544, 1584, 1603, 1653, 1674,
       1676, 1721, 1762, 1779, 1810, 1831, 1876, 1882, 1909, 1917, 1921,
       2009, 2011, 2012, 2018, 2028, 2034, 2046, 2053, 2054, 2076, 2093,
       2105, 2117, 2118, 2140, 2150, 2236, 2288, 2311, 2329, 2366, 2371,
       2391, 2407, 2428, 2448, 2455, 2505, 2528, 2529, 2530, 2531, 2532,
       2533, 2541, 2551, 2567, 2571, 2574, 2613, 26