In [175]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on 
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

# ! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

In [176]:
# import zipfile
# with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
#     zip_ref.extractall('data')

In [177]:
# Load the two input tables into pandas DataFrames.
import pandas as pd

# movie.csv uses the default comma separator; the ratings file is ';'-separated.
movies_df = pd.read_csv('movie.csv')
ratings_df = pd.read_csv('generated_ratings.csv', sep=';')

In [178]:
# Report (rows, columns) for both tables as a quick sanity check on the load.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (100, 2) 
The dimensions of ratings dataframe are: (18809, 3)


In [179]:
# Take a look at movies_df
# Optional normalisation, currently disabled: rename `id` to `movie_id` and
# strip the 'media-' prefix so the ids line up with a numeric key.
# movies_df.rename(columns={'id': 'movie_id'}, inplace=True)
# movies_df['movie_id'] = movies_df['movie_id'].str.replace('media-', '', regex=False)

# # If you want to convert it to integers
# movies_df['movie_id'] = movies_df['movie_id'].astype(int)
movies_df.head()

Unnamed: 0,id,genres
0,media-1,Action|Thriller
1,media-10,Adventure|Comedy|Fantasy|Horror
2,media-100,Comedy|Drama|Family
3,media-101,Action & Adventure|Crime|Drama
4,media-102,Action & Adventure|Animation|Comedy|Sci-Fi & F...


In [180]:
# Take a look at ratings_df
# Optional normalisation, currently disabled: strip the 'media-' prefix from
# `movie_id` so it could be stored as a plain number.
# ratings_df['movie_id'] = ratings_df['movie_id'].str.replace('media-', '', regex=False)

# # If you want to convert it to integers
# ratings_df['movie_id'] = ratings_df['movie_id'].astype(int)
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1000,media-59,4.0
1,1000,media-167,4.0
2,1000,media-354,4.0
3,1000,media-346,4.0
4,1000,media-366,4.0


In [181]:
# List the distinct movie ids that appear in the ratings table.
ratings_df['movie_id'].unique()

array(['media-59', 'media-167', 'media-354', 'media-346', 'media-366',
       'media-355', 'media-350', 'media-418', 'media-218', 'media-486',
       'media-365', 'media-293', 'media-33', 'media-174', 'media-448',
       'media-82', 'media-241', 'media-197', 'media-81', 'media-46',
       'media-188', 'media-412', 'media-245', 'media-166', 'media-401',
       'media-296', 'media-323', 'media-159', 'media-317', 'media-375',
       'media-210', 'media-313', 'media-509', 'media-332', 'media-503',
       'media-362', 'media-351', 'media-483', 'media-180', 'media-290',
       'media-6', 'media-13', 'media-363', 'media-150', 'media-70',
       'media-336', 'media-519', 'media-328', 'media-498', 'media-261',
       'media-357', 'media-206', 'media-480', 'media-55', 'media-331',
       'media-339', 'media-138', 'media-303', 'media-481', 'media-352',
       'media-67', 'media-252', 'media-455', 'media-257', 'media-286',
       'media-14', 'media-26', 'media-315', 'media-476', 'media-262',
     

In [182]:
# Lookup from movie id to its genres string. The variable keeps its original
# name `movie_names`, but movies_df only provides `id` and `genres`.
movie_names = movies_df.set_index('id')['genres'].to_dict()

# Size of the (users x movies) rating matrix, counted from the ids that
# actually occur in the ratings table.
n_users = len(ratings_df['user_id'].unique())
n_items = len(ratings_df['movie_id'].unique())

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users * n_items, 'elements.')
print('----------')
print("Number of ratings:", ratings_df.shape[0])
# Fill ratio = observed ratings / (n_users * n_items), shown as a percentage.
print("Therefore: ", ratings_df.shape[0] / (n_users * n_items) * 100, '% of the matrix is filled.')

# Closing commentary, emitted one message per line.
# NOTE(review): the "n*2" wording is loose; the matrix size printed above is n_users * n_items.
print(
    "We have an incredibly sparse matrix to work with here.",
    "And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2",
    "You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.",
    "One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data",
    sep="\n",
)

Number of unique users: 2907
Number of unique movies: 518
The full rating matrix will have: 1505826 elements.
----------
Number of ratings: 18809
Therefore:  1.2490818992366979 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [183]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm  # Forma correcta en tqdm >= 5.0.0

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization model.

    Every user and every item owns an ``n_factors``-dimensional embedding.
    The score of a (user, item) pair is the sum over the element-wise product
    of the two embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: size of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        print("n_users:", n_users)
        print("n_items:", n_items)
        # Embedding layers act as lookup tables: index -> latent factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def _score(self, users, items):
        """Return the per-pair dot product of user and item embeddings."""
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def forward(self, data):
        """Score a batch of pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            A 1-D tensor with one predicted score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return self._score(users, items)

    def predict(self, user, item):
        """Score pairs given as two separate index tensors.

        The previous implementation forwarded both arguments to ``forward``,
        which only accepts a single ``data`` tensor, so every call raised a
        ``TypeError``. The indices are now scored directly.

        Args:
            user: 1-D integer tensor of user indices.
            item: 1-D integer tensor of item indices, same length as ``user``.

        Returns:
            A 1-D tensor with one predicted score per (user, item) pair.
        """
        return self._score(user, item)

In [184]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: building the dataset from a DataFrame that is already in memory is not
# good MLOps practice, but it is fine here because the data is loaded above.
class Loader(Dataset):
    """Torch ``Dataset`` over a ratings table with contiguous integer ids.

    Raw ``user_id`` / ``movie_id`` values are remapped to 0-based indices in
    order of first appearance so they can be used as embedding rows.

    Args:
        ratings: DataFrame with ``user_id``, ``movie_id`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which keeps the original ``Loader()`` call working.

    Attributes set on the instance:
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        ratings: copy of the input frame with ids replaced by indices.
        x: tensor of (user index, movie index) pairs, one row per rating.
        y: tensor of rating values aligned with ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: fall back to the global frame.
            ratings = ratings_df
        # Work on a copy so the caller's frame keeps its raw ids.
        self.ratings = ratings.copy()

        # Distinct raw ids, in order of first appearance.
        users = ratings['user_id'].unique()
        movies = ratings['movie_id'].unique()

        # Raw id -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw id (used to translate results back).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the copied frame with their indices.
        self.ratings['user_id'] = self.ratings['user_id'].map(self.userid2idx)
        self.ratings['movie_id'] = self.ratings['movie_id'].map(self.movieid2idx)

        # Convert features and targets to tensors for the DataLoader.
        self.x = torch.tensor(self.ratings[['user_id', 'movie_id']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(index pair, rating)`` tensors for one sample."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of ratings in the dataset."""
        return len(self.ratings)

In [185]:
import os

# Number of full passes over the training data.
num_epochs = 128

# Detect whether a CUDA device can be used and report it.
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Factorization model with 8 latent factors per user / movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid; confirm it
# is still wanted, and that it takes effect when set after is_available().
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Move the parameters to the GPU when one is present.
if cuda:
    model = model.cuda()

# Mean squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
n_users: 2907
n_items: 518


In [186]:
# Mini-batch training: one pass over train_loader per epoch, then report the
# mean batch loss for that epoch.
for it in range(num_epochs):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below used to be nested under this `if`, so on a CPU-only machine no
        # batch was trained and the epoch average divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()

        optimizer.zero_grad()
        outputs = model(x)

        # Cast the targets to float32 before computing the loss.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

iter #0 Loss: 14.790775292584685
iter #1 Loss: 13.902810038352499
iter #2 Loss: 12.23547311380607
iter #3 Loss: 10.102597612912962
iter #4 Loss: 7.9440888450259255
iter #5 Loss: 6.110254320157629
iter #6 Loss: 4.750041204244912
iter #7 Loss: 3.819229078941605
iter #8 Loss: 3.1842520220750044
iter #9 Loss: 2.725449033334953
iter #10 Loss: 2.372621524901617
iter #11 Loss: 2.0881518133643535
iter #12 Loss: 1.8521503250615126
iter #13 Loss: 1.6554460557950597
iter #14 Loss: 1.4904512150757978
iter #15 Loss: 1.3525751241210366
iter #16 Loss: 1.2375991936443613
iter #17 Loss: 1.1418493815830775
iter #18 Loss: 1.063467680596981
iter #19 Loss: 0.9988051997561033
iter #20 Loss: 0.9453864393591069
iter #21 Loss: 0.9016910479182288
iter #22 Loss: 0.8660371777962665
iter #23 Loss: 0.8368113819433718
iter #24 Loss: 0.8123559392228419
iter #25 Loss: 0.7925153930171006
iter #26 Loss: 0.7763454901928805
iter #27 Loss: 0.7624316162803546
iter #28 Loss: 0.7512610060017125
iter #29 Loss: 0.74156965366026

In [187]:
# After training, the embedding tables hold the tuned latent factors.
# Print every trainable parameter and keep two of them:
#   uw -> data of the first trainable parameter encountered
#   iw -> data of the most recent trainable parameter after the first
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        # The first parameter was already captured; this one goes to iw.
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[0.5972, 0.3477, 0.3726,  ..., 0.6238, 0.3266, 0.5852],
        [0.5359, 0.4651, 0.3966,  ..., 0.4370, 0.4678, 0.5769],
        [0.3198, 0.5234, 0.4740,  ..., 0.3458, 0.5517, 0.3536],
        ...,
        [0.3859, 0.4440, 0.4003,  ..., 0.3656, 0.3545, 0.3268],
        [0.5450, 0.5222, 0.5570,  ..., 0.4422, 0.5217, 0.5036],
        [0.6266, 0.3852, 0.5718,  ..., 0.4631, 0.4725, 0.4763]],
       device='cuda:0')
item_factors.weight tensor([[1.0937, 2.0633, 0.0412,  ..., 0.1061, 1.1069, 0.4289],
        [0.9284, 1.7505, 1.0515,  ..., 0.7088, 1.3743, 1.0537],
        [1.7615, 0.1292, 0.7672,  ..., 0.9834, 0.6787, 1.4742],
        ...,
        [0.9681, 0.9672, 0.9728,  ..., 0.9306, 0.9604, 0.9212],
        [0.9679, 0.9814, 1.0028,  ..., 1.0000, 0.9634, 0.9764],
        [1.0463, 1.0869, 1.0374,  ..., 1.1072, 1.0807, 1.0674]],
       device='cuda:0')


In [188]:
# Copy the item embedding table to host memory as a NumPy array (one row per movie index).
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [189]:
len(trained_movie_embeddings) # number of embedding rows, i.e. one factor vector per unique movie

518

In [190]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights: group the learned movie
# embeddings into 10 clusters, with random_state=0 passed as a fixed seed.
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [191]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count the ratings of every movie once, instead of rescanning the whole
# ratings table for each movie inside the loop.
ratings_per_movie = ratings_df['movie_id'].value_counts()

# Read the number of clusters from the fitted model so this report cannot
# drift out of sync with the KMeans configuration.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Embedding-row indices assigned to the current cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous index back to the raw movie id.
        movid = train_set.idx2movieid[movidx]
        rat_count = ratings_per_movie.get(movid, 0)
        movs.append((movid, rat_count))
    # Show the 10 most-rated movies of the cluster; ties keep their original order.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])
        

Cluster #0
	 media-366
	 media-296
	 media-167
	 media-365
	 media-345
	 media-81
	 media-298
	 media-282
	 media-315
	 media-476
Cluster #1
	 media-339
	 media-503
	 media-354
	 media-318
	 media-352
	 media-6
	 media-406
	 media-138
	 media-293
	 media-50
Cluster #2
	 media-241
	 media-357
	 media-351
	 media-356
	 media-511
	 media-232
	 media-415
	 media-387
	 media-96
	 media-248
Cluster #3
	 media-346
	 media-480
	 media-323
	 media-350
	 media-519
	 media-362
	 media-471
	 media-40
	 media-130
	 media-509
Cluster #4
	 media-355
	 media-418
Cluster #5
	 media-238
	 media-173
	 media-266
	 media-463
	 media-458
	 media-249
	 media-334
	 media-453
	 media-154
	 media-407
Cluster #6
	 media-448
Cluster #7
	 media-197
Cluster #8
	 media-412
	 media-144
	 media-312
	 media-482
	 media-478
	 media-313
	 media-397
	 media-21
	 media-508
	 media-215
Cluster #9
	 media-59
	 media-82
