In [1]:
# Importing necessary dependecies
from fastai.collab import *
from fastai.tabular.all import *

In [2]:
# Moving files to the working directory
import shutil
source_path = "/kaggle/input/movielens-1m"
dest_path = "/kaggle/working/"
try:
    shutil.move(source_path, dest_path)
except Exception as e:
    print(e)

[Errno 30] Read-only file system: 'movies.csv'


In [3]:
path = Path("/kaggle/working/movielens-1m")
path.ls()

(#3) [Path('/kaggle/working/movielens-1m/users.csv'),Path('/kaggle/working/movielens-1m/ratings.csv'),Path('/kaggle/working/movielens-1m/movies.csv')]

In [4]:
# Reading the ratings csv file
ratings = pd.read_csv(path/'ratings.csv', delimiter=',')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Reading the movies csv file
movies = pd.read_csv(path/'movies.csv',  delimiter=',')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Merging movies and ratings into one table
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [7]:
# Turning it into a dataframe
ratings = pd.DataFrame(ratings)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [8]:
god = ratings[ratings['title'] == 'GodFather, The(1972)']
god

Unnamed: 0,userId,movieId,rating,timestamp,title,genres


In [9]:
# Creating dataloaders
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,userId,title,rating
0,4508,Of Mice and Men (1992),4
1,5791,Fargo (1996),5
2,5430,Galaxy Quest (1999),1
3,1741,Rebecca (1940),5
4,4823,"Contender, The (2000)",3
5,4451,Amos & Andrew (1993),3
6,5333,Braveheart (1995),4
7,3965,Rushmore (1998),1
8,2250,X-Men (2000),1
9,2955,One Flew Over the Cuckoo's Nest (1975),3


In [10]:
# Finding the number of unique cateories in the columns
n_users  = len(dls.classes['userId'])
n_movies = len(dls.classes['title'])
print("n_movies: ", n_movies)
print("n_users: ", n_users)

n_movies:  3707
n_users:  6041


In [11]:
# This class creates learnable embeddings for both users and movies with biases
class DotProductBias(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        res = (users * movies).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        return sigmoid_range(res, *self.y_range)

In [12]:
model = DotProductBias(n_users, n_movies, 110)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(10, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.84184,0.879097,01:07
1,0.816643,0.902955,01:06
2,0.839137,0.90714,01:06
3,0.825101,0.894022,01:06
4,0.800441,0.873934,01:06
5,0.764666,0.842918,01:06
6,0.729922,0.807811,01:06
7,0.664316,0.772246,01:06
8,0.649007,0.750603,01:06
9,0.626068,0.745343,01:06


In [13]:
learn.model

DotProductBias(
  (user_factors): Embedding(6041, 110)
  (user_bias): Embedding(6041, 1)
  (movie_factors): Embedding(3707, 110)
  (movie_bias): Embedding(3707, 1)
)

In [14]:
# Finding movies with similar embedding distances
movie_name = "One Flew Over the Cuckoo's Nest (1975)"
movie_factors = learn.model.movie_factors.weight
idx = dls.classes['title'].o2i[movie_name]
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
idx = distances.argsort(descending=True)[1:6]
print(f"Users who Liked {movie_name} also liked:")
for t in dls.classes['title'][idx]:
    print(f"\n{t}")

Users who Liked One Flew Over the Cuckoo's Nest (1975) also liked:

Some Folks Call It a Sling Blade (1993)

Deer Hunter, The (1978)

Apocalypse Now (1979)

Tigerland (2000)

Stalag 17 (1953)


In [15]:
learn.export("PMF_collab_model_1.pkl")