# Building a Movie Recommendation System

#### Get the data

In [1]:
# Relative paths to the ml-100k ratings file (u.data) and movie file (u.item).
ratings_data, movies_data = "../data/ml-100k/u.data", "../data/ml-100k/u.item"

In [2]:
from collections import defaultdict

# Two views of the same ratings, both keyed by the ID strings as read from the file:
#   user_ratings[user][movie]  -> rating
#   movie_ratings[movie][user] -> rating
user_ratings = defaultdict(dict)
movie_ratings = defaultdict(dict)

with open(ratings_data, 'r') as ratings_file:
    for record in ratings_file:
        # Each record has four tab-separated fields; the fourth one is not used.
        user_id, movie_id, raw_rating, _ = record.split('\t')
        rating = float(raw_rating)
        user_ratings[user_id][movie_id] = rating
        movie_ratings[movie_id][user_id] = rating

In [3]:
len(user_ratings)

943

In [4]:
len(movie_ratings)

1682

In [5]:
user_ratings["1"]  # userID = 1

{'61': 4.0,
 '189': 3.0,
 '33': 4.0,
 '160': 4.0,
 '20': 4.0,
 '202': 5.0,
 '171': 5.0,
 '265': 4.0,
 '155': 2.0,
 '117': 3.0,
 '47': 4.0,
 '222': 4.0,
 '253': 5.0,
 '113': 5.0,
 '227': 4.0,
 '17': 3.0,
 '90': 4.0,
 '64': 5.0,
 '92': 3.0,
 '228': 5.0,
 '266': 1.0,
 '121': 4.0,
 '114': 5.0,
 '132': 4.0,
 '74': 1.0,
 '134': 4.0,
 '98': 4.0,
 '186': 4.0,
 '221': 5.0,
 '84': 4.0,
 '31': 3.0,
 '70': 3.0,
 '60': 5.0,
 '177': 5.0,
 '27': 2.0,
 '260': 1.0,
 '145': 2.0,
 '174': 5.0,
 '159': 3.0,
 '82': 5.0,
 '56': 4.0,
 '272': 3.0,
 '80': 4.0,
 '229': 4.0,
 '140': 1.0,
 '225': 2.0,
 '235': 5.0,
 '120': 1.0,
 '125': 3.0,
 '215': 3.0,
 '6': 5.0,
 '104': 1.0,
 '49': 3.0,
 '206': 4.0,
 '76': 4.0,
 '72': 4.0,
 '185': 4.0,
 '96': 5.0,
 '213': 2.0,
 '233': 2.0,
 '258': 5.0,
 '81': 5.0,
 '78': 1.0,
 '212': 4.0,
 '143': 1.0,
 '151': 4.0,
 '51': 4.0,
 '175': 5.0,
 '107': 4.0,
 '218': 3.0,
 '209': 4.0,
 '259': 1.0,
 '108': 5.0,
 '262': 3.0,
 '12': 5.0,
 '14': 5.0,
 '97': 3.0,
 '44': 5.0,
 '53': 3.0,
 '163

In [6]:
# Map movie ID -> title. Records are '|'-delimited and only the first two
# fields are kept; the file is decoded as latin-1.
movies = {}
with open(movies_data, 'r', encoding="latin-1") as movie_file:
    for record in movie_file:
        # Splitting at most twice is enough to isolate the first two fields.
        movie_id, title = record.split('|', 2)[:2]
        movies[movie_id] = title

len(movies)

1682

In [7]:
movies["127"], movies["187"], movies["29"]  # movie ID = 127, 187, 29

('Godfather, The (1972)',
 'Godfather: Part II, The (1974)',
 'Batman Forever (1995)')

In [8]:
movie_ratings["127"]

{'269': 4.0,
 '109': 2.0,
 '256': 4.0,
 '175': 5.0,
 '280': 5.0,
 '103': 4.0,
 '294': 5.0,
 '159': 5.0,
 '211': 4.0,
 '7': 5.0,
 '115': 5.0,
 '15': 2.0,
 '272': 5.0,
 '288': 5.0,
 '248': 5.0,
 '185': 5.0,
 '323': 5.0,
 '231': 3.0,
 '250': 4.0,
 '73': 5.0,
 '59': 5.0,
 '25': 3.0,
 '45': 5.0,
 '150': 5.0,
 '157': 5.0,
 '89': 5.0,
 '82': 2.0,
 '46': 5.0,
 '192': 4.0,
 '123': 5.0,
 '299': 5.0,
 '153': 3.0,
 '161': 3.0,
 '253': 5.0,
 '263': 4.0,
 '318': 5.0,
 '54': 4.0,
 '382': 3.0,
 '327': 4.0,
 '243': 4.0,
 '139': 5.0,
 '169': 4.0,
 '315': 5.0,
 '72': 5.0,
 '197': 5.0,
 '373': 2.0,
 '374': 4.0,
 '276': 5.0,
 '308': 4.0,
 '324': 4.0,
 '405': 5.0,
 '85': 5.0,
 '399': 2.0,
 '188': 4.0,
 '158': 5.0,
 '193': 5.0,
 '402': 5.0,
 '194': 5.0,
 '8': 5.0,
 '360': 5.0,
 '286': 4.0,
 '325': 5.0,
 '26': 5.0,
 '292': 5.0,
 '10': 5.0,
 '222': 5.0,
 '332': 5.0,
 '189': 4.0,
 '417': 4.0,
 '210': 5.0,
 '347': 5.0,
 '391': 5.0,
 '429': 4.0,
 '90': 4.0,
 '406': 4.0,
 '379': 5.0,
 '24': 5.0,
 '95': 4.0,
 '37':

In [9]:
# Mean rating of movie ID "127" across every user who rated it.
ratings_for_127 = movie_ratings["127"]
sum(ratings_for_127.values()) / len(ratings_for_127)

4.283292978208232

In [10]:
import pandas as pd
import numpy as np

# The ratings file is tab-separated and is read without a header row,
# so the column names are supplied explicitly.
rating_columns = ['user', 'movie', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_data, sep='\t', names=rating_columns)

ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [11]:
ratings.shape

(100000, 4)

In [13]:
# Shape of the array of distinct movie IDs. Note that this is a 1-tuple
# such as (count,), not a plain integer.
distinct_movies = ratings["movie"].unique()
n_movies = distinct_movies.shape
n_movies

(1682,)

In [14]:
# Shape of the array of distinct user IDs. Note that this is a 1-tuple
# such as (count,), not a plain integer.
distinct_users = ratings["user"].unique()
n_users = distinct_users.shape
n_users

(943,)

In [22]:
# All-zero matrix with one row per user ID and one column per movie ID,
# sized by the largest ID present in each column of `ratings`.
n_rows, n_cols = ratings["user"].max(), ratings["movie"].max()
data_matrix = np.zeros((n_rows, n_cols))

In [23]:
# Write every rating into its (user, movie) cell with a single vectorised
# assignment instead of a Python-level loop over each row of `ratings`.
# IDs are shifted down by one to become zero-based row/column indices.
# If a (user, movie) pair appeared more than once, the last row would win,
# exactly as it would in a row-by-row loop.
user_idx = ratings["user"].values - 1
movie_idx = ratings["movie"].values - 1
data_matrix[user_idx, movie_idx] = ratings["rating"].values

In [24]:
data_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [18]:
data_matrix.shape
# users, movies

(943, 1682)

#### Distance / Similarity

https://en.wikipedia.org/wiki/Euclidean_distance

$\mbox{euclidean}(x, y) = \lVert x - y \rVert_{2} = \sqrt{\sum_{i=1}^{n} (x_{i} - y_{i})^{2}}$

https://en.wikipedia.org/wiki/Cosine_similarity

$\mbox{cosine}(x, y) = 1 - \frac{x \cdot y}{|| x ||_{2} || y ||_{2}}$, i.e. the cosine *distance*: one minus the cosine similarity, so smaller values mean more similar vectors

In [25]:
from scipy.spatial.distance import cosine

# Godfather vs Godfather II
# Columns are indexed by movie ID - 1, so 126 and 186 are movie IDs 127 and 187.
godfather, godfather_ii = data_matrix[:, 126], data_matrix[:, 186]
cosine(godfather, godfather_ii)

0.33459864313383969

In [26]:
# Godfather vs Batman Forever
# Columns are indexed by movie ID - 1, so 126 and 28 are movie IDs 127 and 29.
godfather, batman_forever = data_matrix[:, 126], data_matrix[:, 28]
cosine(godfather, batman_forever)

0.70517598811748849

In [27]:
# user 1 vs user 3 (rows are indexed by user ID - 1)
user_1, user_3 = data_matrix[0, :], data_matrix[2, :]
cosine(user_1, user_3)

0.95254045717446822

In [28]:
# user 1 vs user 916 (rows are indexed by user ID - 1)
user_1, user_916 = data_matrix[0, :], data_matrix[915, :]
cosine(user_1, user_916)

0.43093426847201199

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (users) of the rating matrix. The fixed
# random_state makes the shuffle, and therefore every number derived from
# train_data / test_data, repeatable across kernel restarts.
train_data, test_data = train_test_split(data_matrix, test_size=0.2, random_state=42)

In [30]:
train_data.shape, test_data.shape

((754, 1682), (189, 1682))

In [33]:
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine distances between rows: train_data as-is compares users,
# its transpose compares items.
user_distance, item_distance = (
    pairwise_distances(matrix, metric='cosine')
    for matrix in (train_data, train_data.T)
)

In [34]:
user_distance

array([[ 0.        ,  0.84355605,  0.63142252, ...,  0.77133182,
         0.82256341,  0.7948404 ],
       [ 0.84355605,  0.        ,  0.79922665, ...,  0.92657996,
         0.91025902,  0.97252586],
       [ 0.63142252,  0.79922665,  0.        , ...,  0.85006307,
         0.85144104,  0.77452894],
       ..., 
       [ 0.77133182,  0.92657996,  0.85006307, ...,  0.        ,
         0.67966581,  0.51869668],
       [ 0.82256341,  0.91025902,  0.85144104, ...,  0.67966581,
         0.        ,  0.42529225],
       [ 0.7948404 ,  0.97252586,  0.77452894, ...,  0.51869668,
         0.42529225,  0.        ]])

In [35]:
# Turn each distance matrix into a similarity matrix: similarity = 1 - distance.
user_similarity, item_similarity = 1 - user_distance, 1 - item_distance

In [36]:
user_similarity.shape, item_similarity.shape

((754, 754), (1682, 1682))

In [37]:
train_data.shape

(754, 1682)

#### Prediction

$r_{u,i}$ = rating user u gave to item i

$\hat{r}_{u,i}$ = rating prediction for user u and item i

$\mbox{sim}(u, v)$ = similarity between user u and user v

$\hat{r}_{u,i} = \frac{\sum_{v} \mbox{sim}(u, v)r_{v,i}}{\sum_{v} \big{|}\mbox{sim}(u, v)\big{|}}$

In [39]:
def make_user_prediction(data, u_similarity):
    """Predict ratings as a similarity-weighted average over users.

    Parameters
    ----------
    data : array-like, shape (n_users, n_items)
        Rating matrix; row ``v`` holds the ratings given by user ``v``.
    u_similarity : array-like, shape (n_users, n_users)
        ``u_similarity[u, v]`` is the similarity between users ``u`` and ``v``.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        ``sum_v sim(u, v) * r[v, i] / sum_v |sim(u, v)|`` for every ``(u, i)``.
        Rows whose absolute similarities sum to zero are returned as 0
        instead of NaN/inf.
    """
    ratings_matrix = np.asarray(data, dtype=float)
    similarity = np.asarray(u_similarity, dtype=float)

    weighted_sum = similarity.dot(ratings_matrix)
    # keepdims yields an (n_users, 1) column that broadcasts across the items.
    weight_total = np.abs(similarity).sum(axis=1, keepdims=True)

    prediction = np.zeros_like(weighted_sum)
    # Divide only where there is similarity mass; elsewhere keep the 0 fill.
    np.divide(weighted_sum, weight_total, out=prediction, where=weight_total != 0)
    return prediction

def make_item_prediction(data, i_similarity):
    """Predict ratings as a similarity-weighted average over items.

    Parameters
    ----------
    data : array-like, shape (n_users, n_items)
        Rating matrix; column ``j`` holds the ratings given to item ``j``.
    i_similarity : array-like, shape (n_items, n_items)
        ``i_similarity[j, i]`` is the similarity between items ``j`` and ``i``.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        ``sum_j r[u, j] * sim(j, i) / sum_j |sim(j, i)|`` for every ``(u, i)``.
        Columns whose absolute similarities sum to zero are returned as 0
        instead of NaN/inf.
    """
    ratings_matrix = np.asarray(data, dtype=float)
    similarity = np.asarray(i_similarity, dtype=float)

    weighted_sum = ratings_matrix.dot(similarity)
    # Column i of the numerator is weighted by similarity[:, i], so it is
    # normalised by that column's absolute sum. For a symmetric similarity
    # matrix this is the same as the row sum.
    weight_total = np.abs(similarity).sum(axis=0, keepdims=True)

    prediction = np.zeros_like(weighted_sum)
    # Divide only where there is similarity mass; elsewhere keep the 0 fill.
    np.divide(weighted_sum, weight_total, out=prediction, where=weight_total != 0)
    return prediction

# Score every (user, item) cell of the training matrix with both predictors.
user_pred = make_user_prediction(data=train_data, u_similarity=user_similarity)
item_pred = make_item_prediction(data=train_data, i_similarity=item_similarity)

In [40]:
user_pred.shape

(754, 1682)

In [41]:
item_pred.shape

(754, 1682)

In [42]:
from sklearn.metrics import mean_squared_error

def matrix_mse(prediction, actual):
    """Mean squared error over the entries of ``actual`` that carry a rating.

    Zeros in ``actual`` are treated as "no rating" and are left out of the
    average, together with the predictions at the same positions.

    Parameters
    ----------
    prediction : array-like
        Predicted ratings, same shape as ``actual``.
    actual : array-like
        Observed ratings, with 0 marking a missing rating.

    Returns
    -------
    float
        Mean of ``(prediction - actual) ** 2`` over the non-zero entries of
        ``actual``.

    Raises
    ------
    ValueError
        If the two shapes differ, or if ``actual`` has no non-zero entry.
    """
    prediction = np.asarray(prediction, dtype=float)
    actual = np.asarray(actual, dtype=float)
    # A larger prediction matrix would otherwise be indexed silently and
    # score the wrong rows/columns.
    if prediction.shape != actual.shape:
        raise ValueError(
            "prediction and actual must have the same shape, got %s and %s"
            % (prediction.shape, actual.shape)
        )

    rated = actual != 0  # computed once and reused for both arrays
    if not rated.any():
        raise ValueError("actual has no non-zero entries to score")

    residual = prediction[rated] - actual[rated]
    return float(np.mean(residual ** 2))

# NOTE(review): user_pred was itself computed from train_data, so this is the
# error on the training ratings, not on the held-out test_data.
matrix_mse(user_pred, train_data)

7.7622006216106483

In [None]:
# NOTE(review): item_pred was itself computed from train_data, so this is the
# error on the training ratings, not on the held-out test_data.
matrix_mse(item_pred, train_data)