In [1]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   469k      0  0:00:02  0:00:02 --:--:--  469k


In [2]:
import zipfile
# Unpack every member of the downloaded archive under ./data.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [3]:
# import the dataset
import pandas as pd
# Read the movie catalogue and the user ratings into two DataFrames.
movies_df, ratings_df = map(
    pd.read_csv,
    ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv'),
)


In [4]:
# Report (rows, columns) for both frames; the leading '\n' in the second label
# is what puts the ratings line on its own row.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)


The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
# Preview the first five rows of the movie catalogue
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Movie ID -> movie title lookup (used further down to label the clusters)
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Dimensions of the users x movies rating matrix. Only IDs that actually occur
# in ratings_df are counted, so n_items can be smaller than len(movies_df).
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has n_users * n_items cells, so its size grows with the
# product of the two counts (the earlier "n*2" wording was incorrect).
print("And... as you can imagine, as the number of users and products grow, the number of elements grows as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [8]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model: a rating is the dot product of a user vector and an item vector.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each latent vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: row i holds the latent vector of index i.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and column 1
                holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Element-wise product summed over the factor axis == row-wise dot product.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item pairs given as two separate index sequences.

        ``forward`` takes a single (batch, 2) tensor, so the two inputs are
        paired up column-wise before delegating to it. Scalars are accepted and
        treated as a batch of one.

        Args:
            user: user index or indices (int, sequence, or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor of predicted ratings.
        """
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        item = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((user, item), dim=1))

In [9]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: Reading the global ratings_df from inside the Dataset isn't good practice in an MLOps sense, but we'll roll with it since the data is already loaded in memory.
class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user and movie IDs are remapped to contiguous indices starting at 0,
    in order of first appearance, so they can index embedding tables directly.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the notebook-level ``ratings_df`` is used, which
            keeps ``Loader()`` working as before.

    Attributes set here: ``userid2idx`` / ``movieid2idx`` (raw ID -> index),
    ``idx2userid`` / ``idx2movieid`` (index -> raw ID), ``ratings`` (re-indexed
    copy of the input), ``x`` (N x 2 index tensor) and ``y`` (N ratings).
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df

        # Distinct raw IDs, in order of first appearance
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # Raw ID -> contiguous index
        self.userid2idx = {raw: idx for idx, raw in enumerate(users)}
        self.movieid2idx = {raw: idx for idx, raw in enumerate(movies)}

        # Contiguous index -> raw ID (inverse lookups)
        self.idx2userid = {idx: raw for raw, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw for raw, idx in self.movieid2idx.items()}

        # Work on a copy so the caller's frame keeps its raw IDs
        self.ratings = ratings.copy()
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns by name so column 0 is always the user
        # index and column 1 the movie index, whatever the CSV column order.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating samples."""
        return len(self.ratings)

In [10]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed PyTorch's global RNG before anything random happens, so the embedding
# initialisation below is the same on every Restart & Run All. The DataLoader's
# shuffling also draws its seed from this RNG.
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised (untrained) weights for later comparison
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# Move the model to the GPU first, so the optimizer below is built from the
# parameters that will actually be trained.
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0252, 0.0314, 0.0193,  ..., 0.0189, 0.0219, 0.0495],
        [0.0243, 0.0468, 0.0483,  ..., 0.0108, 0.0212, 0.0321],
        [0.0003, 0.0243, 0.0329,  ..., 0.0064, 0.0267, 0.0125],
        ...,
        [0.0006, 0.0093, 0.0081,  ..., 0.0210, 0.0226, 0.0092],
        [0.0458, 0.0370, 0.0269,  ..., 0.0024, 0.0366, 0.0182],
        [0.0407, 0.0496, 0.0107,  ..., 0.0310, 0.0156, 0.0002]])
item_factors.weight tensor([[1.4024e-02, 1.1081e-03, 1.7415e-02,  ..., 1.6053e-02, 3.0233e-03,
         1.9704e-02],
        [2.0690e-02, 1.0746e-02, 3.7660e-02,  ..., 3.4933e-02, 7.1094e-03,
         4.0114e-02],
        [3.4416e-05, 4.8098e-02, 3.6741e-03,  ..., 4.0333e-02, 2.0151e-02,
         4.4308e-02],
        ...,
        [1.8312e-02, 3.8347e-02, 2.0384e-03,  ..., 3.5859e-02, 4.1355e-02,
         1.7292e-02],
        [2.0646e-02, 2.5608e-02, 2.0481e

In [15]:
from tqdm.notebook import tqdm # Use the correct import for tqdm

for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()

        optimizer.zero_grad()
        outputs = model(x)
        # The model already returns a 1-D tensor (one score per row). Calling
        # squeeze() here would collapse a one-sample batch to a 0-d tensor that
        # no longer matches the target's shape, so it is deliberately omitted.
        # The ratings tensor is float64, hence the cast to the model's float32.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses; the guard covers an empty loader
    avg_loss = sum(losses) / len(losses) if losses else 0
    print("iter #{}".format(it), "Loss:", avg_loss)

  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.055084362852996
iter #1 Loss: 4.740504200991035
iter #2 Loss: 2.475697196679672
iter #3 Loss: 1.7219691222089195
iter #4 Loss: 1.3459493192621899
iter #5 Loss: 1.1287333922186478
iter #6 Loss: 0.9919401828558917
iter #7 Loss: 0.9005210076643125
iter #8 Loss: 0.8374800082239403
iter #9 Loss: 0.792209458873054
iter #10 Loss: 0.759171649624551
iter #11 Loss: 0.7348255922422191
iter #12 Loss: 0.7161922525179568
iter #13 Loss: 0.7014961804305841
iter #14 Loss: 0.6906294570432097
iter #15 Loss: 0.6818751914020117
iter #16 Loss: 0.6748214956694448
iter #17 Loss: 0.6699460244118259
iter #18 Loss: 0.6657629055798356
iter #19 Loss: 0.6629596966399154
iter #20 Loss: 0.6604896315236382
iter #21 Loss: 0.6592174809293698
iter #22 Loss: 0.6576118143606307
iter #23 Loss: 0.6570450862879076
iter #24 Loss: 0.6560587988135779
iter #25 Loss: 0.6556184581542378
iter #26 Loss: 0.6545598641945626
iter #27 Loss: 0.653898060889111
iter #28 Loss: 0.6530607779543411
iter #29 Loss: 0.652309833519

In [16]:
# By training the model, we will have tuned latent factors for movies and users.
# Print the trained weights of every learnable parameter
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Pick the learned tables by attribute name instead of relying on the order in
# which named_parameters() happens to yield them.
uw = model.user_factors.weight.data  # user latent factors
iw = model.item_factors.weight.data  # item (movie) latent factors

user_factors.weight tensor([[ 1.0879,  1.2872,  1.1833,  ...,  1.0340,  1.2881,  1.4378],
        [ 0.9986,  2.1311,  0.8596,  ...,  0.4033,  2.0873,  0.7171],
        [-0.3798,  0.6804,  1.2802,  ...,  0.9176,  0.2348,  1.8631],
        ...,
        [-0.4062, -0.2229,  1.8595,  ...,  1.8550,  0.6749,  2.0353],
        [ 0.5257,  0.9246,  0.9593,  ...,  1.4121,  1.3834,  0.4250],
        [ 1.2510,  1.7976,  1.1319,  ...,  1.6443,  0.6188,  0.3305]])
item_factors.weight tensor([[0.6229, 0.3583, 0.1471,  ..., 0.6356, 0.6155, 0.4417],
        [0.2075, 0.5100, 0.5297,  ..., 0.0972, 0.6461, 0.3642],
        [0.3367, 0.7084, 0.5748,  ..., 0.8142, 0.2761, 0.2250],
        ...,
        [0.3168, 0.3383, 0.3012,  ..., 0.3340, 0.3410, 0.3090],
        [0.4132, 0.4181, 0.4128,  ..., 0.3995, 0.4080, 0.4164],
        [0.4009, 0.3910, 0.3576,  ..., 0.3839, 0.4000, 0.3448]])


In [17]:
# Copy the learned item table to host memory as a NumPy array (one row per movie index)
trained_movie_embeddings = (
    model.item_factors.weight.data
    .cpu()
    .numpy()
)

In [18]:
trained_movie_embeddings.shape[0]  # number of rows = unique movie factor weights

9724

In [19]:
from sklearn.cluster import KMeans
# Cluster the movies by their learned embedding vectors (10 clusters, fixed seed)
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings);  # fits in place; ';' suppresses the estimator repr

In [20]:
# For each cluster, list its ten most-rated movies.
# Author's observation: movies in the same cluster tend to have similar genres,
# even though the model never sees titles or genres. It only sees the numbers
# describing how users rated each movie.

# Count ratings per raw movie ID once, instead of re-scanning ratings_df for every movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Iterate over however many clusters the model was fitted with
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Embedding-row indices assigned to the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous index back to the raw MovieLens ID
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Most-rated first; the sort is stable, so ties keep index order
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Titanic (1997)
	 Batman Forever (1995)
	 Net, The (1995)
	 Clear and Present Danger (1994)
	 Crimson Tide (1995)
	 Firm, The (1993)
	 While You Were Sleeping (1995)
	 Back to the Future Part II (1989)
	 Demolition Man (1993)
	 Mr. Holland's Opus (1995)
Cluster #1
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Toy Story (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Lord of the Rings: The Two Towers, The (2002)
	 Lord of the Rings: The Return of the King, The (2003)
	 Lion King, The (1994)
Cluster #2
	 Jurassic Park (1993)
	 Braveheart (1995)
	 Apollo 13 (1995)
	 Batman (1989)
	 Aladdin (1992)
	 True Lies (1994)
	 Gladiator (2000)
	 Men in Black (a.k.a. MIB) (1997)
	 Mission: Impossible (1996)
	 Beauty and the Beast (1991)
Cluster #3
	 Independence Day (a.k.a. ID4) (