In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


# Nearest Neighbor Item Based Collaborative Filtering Model 

This model is adapated from:
- https://github.com/krishnaik06/Recommendation_complete_tutorial/tree/master/KNN%20Movie%20Recommendation
- https://github.com/topspinj/tmls-2020-recommender-workshop/tree/master

In [2]:
# Import only first dataset 

ratings = pd.read_csv("../data/interim/ratings_TI1.csv")
ratings.drop(columns=["time_interval", "timestamp"], inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,3,1,4.0
1,3,24,3.0
2,3,32,4.0
3,3,50,5.0
4,3,160,3.0


In [3]:
ratings.shape

(3977249, 3)

In [4]:
movies = pd.read_csv("../data/interim/movies_TI1.csv")
movies.shape

(3221, 4)

In [5]:
ratings = ratings.merge(movies[["title", "movieId"]], on="movieId")

In [6]:
ratings_count = ratings.groupby("movieId")["rating"].count().reset_index().rename(columns={"rating": "rating_count"})
ratings_count.head()

Unnamed: 0,movieId,rating_count
0,1,18014
1,2,8926
2,3,7731
3,4,1812
4,5,7786


In [7]:
# Bayesian Average Rating


In [8]:
ratings = pd.merge(ratings, ratings_count, on="movieId", how="left")
ratings.head()

Unnamed: 0,userId,movieId,rating,title,rating_count
0,3,1,4.0,Toy Story (1995),18014
1,3,24,3.0,Powder (1995),3036
2,3,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),19435
3,3,50,5.0,"Usual Suspects, The (1995)",15349
4,3,160,3.0,Congo (1995),9362


In [9]:
# Check if there are films with less than 50 ratings
ratings[ratings["rating_count"] < 50].head()

Unnamed: 0,userId,movieId,rating,title,rating_count
743,53,1486,4.0,"Quiet Room, The (1996)",46
763,53,1695,3.0,Artemisia (1997),49
776,53,1871,4.0,"Friend of the Deceased, A (Priyatel pokonika) ...",24
788,53,2489,4.0,Spanish Fly (1998),13
1259,104,1121,1.0,Glory Daze (1995),30


In [10]:
# Drop movies with less than 50 ratings
ratings = ratings[ratings["rating_count"] >= 50]

In [11]:
# Create Pivot Table

ratings_pivot = ratings.pivot_table(
        index="movieId", columns="userId", values="rating"
    ).fillna(0)


In [49]:
ratings_pivot.tail()

userId,3,4,5,6,8,10,12,13,15,17,...,138448,138450,138452,138453,138463,138466,138480,138483,138484,138488
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
3178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Save to csv
# ratings_pivot.to_csv("../data/processed/movie_rating_pivot_TI1.csv")

## Part 2: Fit Model

In [50]:
movie_rating_matrix = pd.read_csv("../data/processed/movie_rating_pivot_TI1.csv")

In [52]:
movie_rating_matrix.tail()

Unnamed: 0,movieId,3,4,5,6,8,10,12,13,15,...,138448,138450,138452,138453,138463,138466,138480,138483,138484,138488
2531,4970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2532,5060,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2533,5649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2534,6425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2535,6918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
movie_rating_matrix.iloc[405]["movieId"]

np.float64(441.0)

In [38]:
# Create sparse matrix
ratings_pivot_sparse = csr_matrix(movie_rating_matrix.values)

In [39]:

model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=5)
model_knn.fit(ratings_pivot_sparse)

In [45]:
# Select example movie (which could be the last movie someone watched)
test_movie_id = 2

In [46]:
test_movie = movie_rating_matrix.loc[2].values
test_movie

array([0., 0., 3., ..., 3., 3., 3.])

In [63]:
# Define helper functions to get movie title

def get_movie_title_by_index(index):
    id = movie_rating_matrix.iloc[index]["movieId"]
    return movies[movies["movieId"] == id]["title"].values[0]

def get_movie_title_by_id(movie_id):
    return movies[movies["movieId"] == movie_id]["title"].values[0]

get_movie_title_by_id(2)


'Jumanji (1995)'

In [56]:
movies[movies["movieId"] == 

405

In [64]:
get_movie_title_by_index(444)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [62]:
movie_rating_matrix.iloc[1]

movieId    2.0
3          0.0
4          0.0
5          3.0
6          0.0
          ... 
138466     0.0
138480     0.0
138483     3.0
138484     3.0
138488     3.0
Name: 1, Length: 47742, dtype: float64

In [60]:
# Get similar movies

distances, indices = model_knn.kneighbors(test_movie.reshape(1, -1), n_neighbors=6)

for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f"Because you watched {get_movie_title(test_movie_id)}:")
    else:
        print(indices.flatten()[i])
        print(f"{i}: {get_movie_title_by_index(indices.flatten()[i])} with distance {distances.flatten()[i]}")

[[  1 444 464 341 344 295]]
Because you watched Jumanji (1995):
444


IndexError: index 0 is out of bounds for axis 0 with size 0