# Exercise: Build a Movie Recommendation System
Based on the Colab notebook [Recommendation Systems Colab](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems)     



Uses [MovieLens 100K movie ratings](https://grouplens.org/datasets/movielens/). 100,000 ratings from 1000 users on 1700 movies. Released 4/1998.   


# Imports

In [60]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

# Load Data

In [4]:
import zipfile
zip_ref = zipfile.ZipFile('ml-100k.zip', 'r')
zip_ref.extractall()
print(str(zip_ref.read('ml-100k/u.info')))

b'943 users\n1682 items\n100000 ratings\n'


In [49]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols

movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Make ids start at 0
users["user_id"] = users["user_id"].apply(lambda x: int(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: int(x-1))
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: int(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: int(x-1))
# Add year
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

# Compute the number of movies to which a genre is assigned
genre_occurences = movies[genre_cols].sum().to_dict()

# Add two cols to movies
# all_genres: all the active genres of the movie
# genre: randomly sampled genre from the active genres
def mark_genres(moveis, genres):
    def get_random_genre(gs):
        active = [genre for genre, g in zip(genres, gs) if g==1]
        if len(active) == 0:
            return 'Other'
        return np.random.choice(active)
    def get_all_genres(gs):
        active = [genre for genre, g in zip(genres, gs) if g==1]
        if len(active) == 0:
            return 'Other'
        return '-'.join(active)
    
    movies['genre'] = [get_random_genre(gs) for gs in movies.loc[:, genres].to_numpy()]
    
mark_genres(movies, genre_cols)

# Create one merged DataFrame containing all data (rating, movie, user)
movielens = ratings.merge(movies, on='movie_id').merge(users, on='user_id')
movielens.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Sci-Fi,Thriller,War,Western,year,genre,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,1997,Comedy,49,M,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,1,0,0,0,1997,Adventure,49,M,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,0,0,0,1996,Comedy,49,M,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,1996,Comedy,49,M,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,1994,Drama,49,M,writer,55105


# Preliminaries
Our goal is to factorize the ratings matrix $A$ into the product of a user embedding matrix $U$ and movie embedding matrix $V$, such that $A \approx UV^\top$ with
$U = \begin{bmatrix} u_{1} \\ \hline \vdots \\ \hline u_{N} \end{bmatrix}$ and
$V = \begin{bmatrix} v_{1} \\ \hline \vdots \\ \hline v_{M} \end{bmatrix}$.

Here
- $N$ is the number of users,
- $M$ is the number of movies,
- $A_{ij}$ is the rating of the $j$th movies by the $i$th user,
- each row $U_i$ is a $d$-dimensional vector (embedding) representing user $i$,
- each row $V_j$ is a $d$-dimensional vector (embedding) representing movie $j$,
- the prediction of the model for the $(i, j)$ pair is the dot product $\langle U_i, V_j \rangle$.

## Sparse Representation of the Rating Matrix

The rating matrix could be very large and, in general, most of the entries are unobserved, since a given user will only rate a small subset of movies. For effcient representation, we will use a [torch.sparse](https://pytorch.org/docs/stable/sparse.html)


In [119]:
def build_sparse_ratings(ratings_df):
    indices = ratings_df[['user_id', 'movie_id']].values
    values = ratings_df['rating'].values.astype('float32')
    return torch.sparse_coo_tensor(
        torch.tensor(indices).t(), 
        values, 
        (users.shape[0], movies.shape[0])
    ).coalesce()

## MSE Loss function

In [120]:
def spares_mean_square_eror(sparse_ratings, user_embeddings, movie_embeddings):
    # sparse_ratings : torch.sparse_coo_tensor
    # user_embeddings, movie_embeddings : torch.tensor
    indices = sparse_ratings.indices()
    pred = user_embeddings[indices[:,0]] * movie_embeddings[indices[:,1]].t()
    loss = nn.functional.mse_loss(pred.sum(dim=1), sparse_ratings.values())

# Training a Matrix Factorization model
Train a simple Matrix Factorization, Collaborative Filtering Model using Gradient Descent.     

In [121]:
class CFModel(nn.Module):
    def __init__(self, user_n, movie_n, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(user_n, embedding_dim)
        self.movie_embedding = nn.Embedding(movie_n, embedding_dim)
    
    def forward(self, sparse_ratings):
        indices = sparse_ratings.indices()
        pred = self.user_embedding.weight[indices[0]] * self.movie_embedding.weight[indices[1]]
        return pred.sum(dim=1)


In [122]:
embedding_dim = 30
epoch = 1000

In [126]:
train_ratings, val_ratings = train_test_split(ratings, random_state=300)
A_train = build_sparse_ratings(train_ratings)
A_val = build_sparse_ratings(val_ratings)

model = CFModel(users.shape[0], movies.shape[0], embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

for e in range(epoch):
    # train
    pred = model(A_train)
    loss = criterion(pred, A_train.values())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # valid
    with torch.no_grad():
        pred = model(A_val)
        val_loss = criterion(pred, A_val.values())
    
    if e % 100 == 99:
        print('---------------')
        print(f'Epoch {e+1}')
        print(f'Train loss: {loss.item()}\nValidation loss: {val_loss.item()}')

---------------
Epoch 100
Train loss: 33.570106506347656
Validation loss: 37.0751838684082
---------------
Epoch 200
Train loss: 26.825450897216797
Validation loss: 32.55030822753906
---------------
Epoch 300
Train loss: 21.586639404296875
Validation loss: 28.70364761352539
---------------
Epoch 400
Train loss: 16.78782844543457
Validation loss: 24.53650665283203
---------------
Epoch 500
Train loss: 12.372841835021973
Validation loss: 20.0651912689209
---------------
Epoch 600
Train loss: 8.95942497253418
Validation loss: 16.226425170898438
---------------
Epoch 700
Train loss: 6.65017032623291
Validation loss: 13.4307861328125
---------------
Epoch 800
Train loss: 5.116293430328369
Validation loss: 11.462285041809082
---------------
Epoch 900
Train loss: 4.058783054351807
Validation loss: 10.030671119689941
---------------
Epoch 1000
Train loss: 3.2969810962677
Validation loss: 8.943120002746582


In [116]:
A_train.indices()[:, 0]

tensor([0, 0])