**Imports**

In [67]:
# fastai imports
from fastai.collab import *
from fastai.tabular.all import *
# Fetch and extract the MovieLens 100k archive via fastai; `path` is the local
# folder that the later cells read `u.data` and `u.item` from
path = untar_data(URLs.ML_100k)

In [68]:
# pytorch imports
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

**Prepare the dataset**

In [69]:
# Load the raw ratings with pandas: u.data is tab-separated, has no header row,
# and its columns are user, movie, rating, timestamp (in that order)
rating_columns = ['user', 'movie', 'rating', 'timestamp']
ratings = pd.read_csv(path/'u.data', sep='\t', header=None, names=rating_columns)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [70]:
# u.item is pipe-delimited and latin-1 encoded; only its first two fields
# (movie id and title) are loaded, so ratings can be shown with movie titles
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie', 'title'],
)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [71]:
# Attach the movie title to every rating. No join key is given, so pandas joins
# on the column(s) the two frames have in common (here: 'movie').
ratings = pd.merge(ratings, movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


Pick a user ID that is not present in the dataset (99999), rate some movies as that user, and add those ratings to the main dataframe.

In [72]:
# Hand-written ratings for the new user id
user = 99999
titles = ["Toy Story (1995)", "GoldenEye (1995)", "Four Rooms (1995)", "Get Shorty (1995)", "Copycat (1995)"]
movie_ratings = [5, 3, 2, 3, 4]

# One record per (title, rating) pair, all attributed to the same user
rows = [
    {'title': movie_title, 'user': user, 'rating': stars}
    for movie_title, stars in zip(titles, movie_ratings)
]

user_data = pd.DataFrame(rows)
user_data.head()

Unnamed: 0,title,user,rating
0,Toy Story (1995),99999,5
1,GoldenEye (1995),99999,3
2,Four Rooms (1995),99999,2
3,Get Shorty (1995),99999,3
4,Copycat (1995),99999,4


In [73]:
# Stack the new user's ratings underneath the existing ones. user_data has no
# 'movie' or 'timestamp' column, so those cells are NaN for the appended rows
# (which is why both columns show up as floats afterwards).
frames = [ratings, user_data]
data_reduced = pd.concat(frames, axis=0)
data_reduced.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242.0,3,881250949.0,Kolya (1996)
1,63,242.0,3,875747190.0,Kolya (1996)
2,226,242.0,5,883888671.0,Kolya (1996)
3,154,242.0,3,879138235.0,Kolya (1996)
4,306,242.0,5,876503793.0,Kolya (1996)


In [74]:
# Inspect the last rows, i.e. the ratings that were just appended for the new user
data_reduced.tail()

Unnamed: 0,user,movie,rating,timestamp,title
0,99999,,5,,Toy Story (1995)
1,99999,,3,,GoldenEye (1995)
2,99999,,2,,Four Rooms (1995)
3,99999,,3,,Get Shorty (1995)
4,99999,,4,,Copycat (1995)


In [75]:
# Keep only the three columns the collaborative-filtering loader is given
# (user, rating, title); movie id and timestamp are dropped
keep_columns = ['user', 'rating', 'title']
data_reduced = data_reduced[keep_columns]
data_reduced.tail()

Unnamed: 0,user,rating,title
0,99999,5,Toy Story (1995)
1,99999,3,GoldenEye (1995)
2,99999,2,Four Rooms (1995)
3,99999,3,Get Shorty (1995)
4,99999,4,Copycat (1995)


**Create the data loader and the model**

In [76]:
# Build the collaborative-filtering DataLoaders from the (user, title, rating)
# frame, holding out 20% of the rows for validation with a fixed seed
data_collab = CollabDataLoaders.from_df(
    data_reduced,
    user_name='user',
    item_name='title',
    rating_name='rating',
    valid_pct=0.2,
    seed=42,
)
# Learner with 50 latent factors per user/item and outputs limited to y_range
learn = collab_learner(data_collab, n_factors=50, y_range=(0, 5.5))

**See how well the model does in training**

In [77]:
# Train for 5 epochs with the one-cycle schedule (peak LR 5e-3, weight decay 0.1)
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.889584,0.938454,00:14
1,0.688486,0.874166,00:15
2,0.541393,0.862629,00:19
3,0.455349,0.846186,00:15
4,0.456991,0.843138,00:15


Model layers

In [78]:
# Show the learner's underlying model (user/item embedding and bias tables)
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(945, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(945, 1)
  (i_bias): Embedding(1665, 1)
)

See the movies with the greatest bias. These are the movies that people rated highly overall, regardless of their individual latent preferences.

In [79]:
# Index -> title vocabulary built by the DataLoaders
title_vocab = data_collab.classes['title']
# One learned bias value per item; squeeze drops the trailing size-1 dimension
movie_bias = learn.model.i_bias.weight.squeeze()
# Positions of the five largest biases, highest first
idxs = movie_bias.argsort(descending=True)[:5]
[title_vocab[i] for i in idxs]

['Shawshank Redemption, The (1994)',
 'Good Will Hunting (1997)',
 'L.A. Confidential (1997)',
 'Titanic (1997)',
 'Close Shave, A (1995)']

Which movie is the most similar to The Silence of the Lambs?

In [80]:
# Learned item embedding matrix (one row of latent factors per title)
movie_factors = learn.model.i_weight.weight
title_vocab = data_collab.classes['title']
# Row index of the query movie in the title vocabulary
idx = title_vocab.o2i['Silence of the Lambs, The (1991)']
# Cosine similarity of every movie to the query movie; despite the variable
# name these are similarities, so larger means closer
cosine = nn.CosineSimilarity(dim=1)
distances = cosine(movie_factors, movie_factors[idx][None])
# Take the second-best match: position [0] is expected to be the query movie itself.
# Note that idx is rebound here from the query's index to the match's index.
idx = distances.argsort(descending=True)[1]
title_vocab[idx]

'Usual Suspects, The (1995)'

In [81]:
# Persist the trained learner; the cell output shows the file written as /content/models.pth
# NOTE(review): hard-coded absolute path that looks Colab-specific — confirm it is
# valid (or make it configurable) before running in another environment
learn.save("/content/models")

Path('/content/models.pth')

**Movie recommendations for user 99999**

In [84]:
# Reuse the id of the hand-rated user defined earlier instead of repeating the literal
user_id = user

# Every distinct title in the training frame, in order of first appearance.
# This is kept under its own name: the earlier cells use `movies` for the
# u.item DataFrame (and merge with it), so rebinding `movies` to a list here
# would break those cells if they were re-run afterwards.
all_titles = data_reduced['title'].unique()

# Titles the user has not rated yet, so the predictions cover unseen movies only
already_rated = set(titles)
movies_to_predict_for = [title for title in all_titles if title not in already_rated]

# One (user, title) row per candidate movie; the column names match the
# user_name / item_name passed to CollabDataLoaders.from_df
df = pd.DataFrame({
    'user': [user_id] * len(movies_to_predict_for),
    'title': movies_to_predict_for
})

# convert it to a test DataLoader and make predictions
dl = learn.dls.test_dl(df)
preds = learn.get_preds(dl=dl)

# pair each prediction with its movie title (preds[0] holds the predictions),
# best predicted rating first
preds_df = pd.DataFrame({
    'item': movies_to_predict_for,
    'prediction': preds[0].numpy().flatten()
}).sort_values('prediction', ascending=False)

# show the 10 highest predicted ratings
display(preds_df.head(10))

Unnamed: 0,item,prediction
351,Star Wars (1977),4.381354
402,Braveheart (1995),4.18726
98,Raiders of the Lost Ark (1981),4.115075
233,"Shawshank Redemption, The (1994)",4.110715
169,Casablanca (1942),4.105816
341,Indiana Jones and the Last Crusade (1989),4.100353
50,Return of the Jedi (1983),4.093261
315,"Princess Bride, The (1987)",4.063164
606,Wallace & Gromit: The Best of Aardman Animation (1996),4.053527
195,Schindler's List (1993),4.051422


This is really interesting. It makes sense to recommend all of the classic movies to this user... but Wallace & Gromit: The Best of Aardman Animation (1996)?

This was likely recommended because the user rated Toy Story a 5 and Wallace & Gromit is a similar movie.