In [None]:
# Imports
import pandas as pd
import numpy as np

from torch import nn, tensor, float32, save
from torch.cuda import is_available
from torch.optim import Adam
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

from sklearn.cluster import KMeans


from tqdm import tqdm_notebook as tqdm

import pickle
import joblib


: 

In [None]:
# Read the movie metadata and the user ratings from their CSV files
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/movies.csv', '/content/ratings.csv')
)

In [None]:
# Report the (rows, columns) dimensions of both dataframes, one per line
print(
    f"Shape of movies dataframe: {movies_df.shape}",
    f"Shape of ratings dataframe: {ratings_df.shape}",
    sep="\n",
)

In [None]:
# Preview the first rows of the movies dataframe
movies_df.head()

In [None]:
# Preview the first rows of the ratings dataframe
ratings_df.head()

In [None]:
# movieId -> title lookup, used later to show movies by name
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Number of distinct users / movies that appear in the ratings
n_users = ratings_df['userId'].nunique(dropna=False)
n_items = ratings_df['movieId'].nunique(dropna=False)

# Report the size of the full user x movie matrix ...
print(
    f"Number of unique users: {n_users}",
    f"Number of unique movies: {n_items}",
    f"The full rating matrix will have: {n_users * n_items} elements",
    sep="\n",
)
# ... and the share of its cells that actually hold a rating
print(f"therefore {len(ratings_df) / (n_users * n_items) * 100}% of the matrix are filled.")

In [None]:
# Total number of rows in the ratings dataframe
print(len(ratings_df))

In [None]:
class MatrixFactorization(nn.Module):
  """Dot-product matrix factorization model.

  Every user and every item owns an ``n_factors``-dimensional embedding; the
  score of a (user, item) pair is the dot product of the two embeddings.

  Args:
    n_users: Number of rows in the user embedding table.
    n_items: Number of rows in the item embedding table.
    n_factors: Length of each latent-factor vector.
  """

  def __init__(self, n_users, n_items, n_factors=20):
    super().__init__()

    # One learnable latent vector per user and per item
    self.user_factors = nn.Embedding(n_users, n_factors)
    self.item_factors = nn.Embedding(n_items, n_factors)

    # Start from small values drawn uniformly between 0 and 0.05
    nn.init.uniform_(self.user_factors.weight, 0, 0.05)
    nn.init.uniform_(self.item_factors.weight, 0, 0.05)

  def forward(self, data):
    """Score a batch of (user index, item index) pairs.

    Args:
      data: 2-D integer tensor; column 0 holds user indices and column 1
        holds item indices.

    Returns:
      1-D tensor with one score per row of ``data``.
    """
    users, items = data[:, 0], data[:, 1]
    return (self.user_factors(users) * self.item_factors(items)).sum(1)

  def predict(self, user, item):
    """Score explicit user/item indices.

    The previous implementation forwarded two positional arguments to
    ``forward``, which accepts a single tensor, so every call raised
    ``TypeError``. The score is now computed directly from the embeddings.

    Args:
      user: User index or indices (int, sequence of ints, or integer tensor).
      item: Item index or indices, matching ``user`` in shape.

    Returns:
      Tensor of scores with the same shape as the index inputs (0-d for a
      single pair).
    """
    # Local import: the notebook only imports selected names from torch
    from torch import as_tensor, long

    # Put the indices on the same device as the embedding tables
    device = self.user_factors.weight.device
    users = as_tensor(user, dtype=long, device=device)
    items = as_tensor(item, dtype=long, device=device)
    # Reduce over the factor axis, which is last for scalar and batched input alike
    return (self.user_factors(users) * self.item_factors(items)).sum(-1)

In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
  """Torch dataset over a ratings dataframe.

  Raw ``userId`` / ``movieId`` values are re-indexed to contiguous 0-based
  indices (in order of first appearance) so they can address embedding rows.

  Args:
    ratings_df: Dataframe with at least the columns ``userId``, ``movieId``
      and ``rating``. Other columns (e.g. ``timestamp``) are ignored and the
      column order does not matter.

  Attributes:
    userIdToIdx / movieIdToIdx: raw id -> contiguous index.
    idxToUserId / idxToMovieId: contiguous index -> raw id.
    x: integer tensor of shape (n_ratings, 2) holding [user index, movie index].
    y: tensor of the corresponding ratings.
  """

  def __init__(self, ratings_df):
    # Work on a copy so the caller's dataframe keeps its raw ids
    self.ratings_df = ratings_df.copy()

    users = self.ratings_df['userId'].unique()
    movies = self.ratings_df['movieId'].unique()

    # Raw id -> contiguous index
    self.userIdToIdx = {o: i for i, o in enumerate(users)}
    self.movieIdToIdx = {o: i for i, o in enumerate(movies)}

    # Contiguous index -> raw id
    self.idxToUserId = {i: o for o, i in self.userIdToIdx.items()}
    self.idxToMovieId = {i: o for o, i in self.movieIdToIdx.items()}

    self.ratings_df['movieId'] = self.ratings_df['movieId'].map(self.movieIdToIdx)
    self.ratings_df['userId'] = self.ratings_df['userId'].map(self.userIdToIdx)

    # Select the feature columns by name: the model reads column 0 as the user
    # and column 1 as the item, so the order must not depend on the CSV layout
    # (and a missing 'timestamp' column must not raise).
    features = self.ratings_df[['userId', 'movieId']].to_numpy(dtype='int64')
    targets = self.ratings_df['rating'].values
    self.x, self.y = tensor(features), tensor(targets)

  def __getitem__(self, index):
    """Return the ([user index, movie index], rating) pair at ``index``."""
    return (self.x[index], self.y[index])

  def __len__(self):
    """Number of ratings in the dataset."""
    return self.ratings_df.shape[0]


In [None]:
# Training hyperparameters
NUM_EPOCHS = 128
LEARNING_RATE = 1e-3
BATCH_SIZE = 128
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = "cuda" if is_available() else "cpu"

model = MatrixFactorization(n_users=n_users, n_items=n_items, n_factors=8)

print(f"Device: {DEVICE}")
print(f"Model: {model}")

# Show the freshly initialised (still untrained) trainable parameters
for name, param in model.named_parameters():
  if not param.requires_grad:
    continue
  print(name, param.data)

# Place the parameters on the selected device before the optimizer is built
model.to(DEVICE)

# Mean squared error between predicted and observed ratings
loss_fn = nn.MSELoss()

# Adam over every model parameter
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

# Dataset of re-indexed ratings, served in shuffled mini-batches
train_set = Loader(ratings_df=ratings_df)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Mini-batch training. The loop variable is named `epoch` because the previous
# name, `iter`, shadowed the builtin for the rest of the kernel session.
for epoch in tqdm(range(NUM_EPOCHS)):
  total_loss = []  # per-batch losses of the current epoch
  for x, y in train_loader:
    # Move the batch to wherever the model lives
    x, y = x.to(DEVICE), y.to(DEVICE)
    optimizer.zero_grad()
    outputs = model(x)
    # The model already returns one score per row, so the output is not
    # squeezed: squeezing would collapse a final batch of size 1 to a 0-d
    # tensor whose shape no longer matches the target.
    loss = loss_fn(outputs, y.type(float32))
    total_loss.append(loss.item())
    loss.backward()
    optimizer.step()
  # Log the mean batch loss every 10th epoch
  if epoch % 10 == 0:
    print(f"Iteration: {epoch}, Loss: {sum(total_loss)/len(total_loss)}")

In [None]:
# Mean of the per-batch losses left in total_loss by the training loop,
# i.e. those of the last epoch it ran (the list is reset at the start of each epoch)
print(f"Loss: {sum(total_loss)/len(total_loss)}")

In [None]:
# Print every trainable parameter and keep the learned factor matrices:
# the first trainable parameter encountered is stored in `uw`, any later one
# in `iw` (presumably user weights then item weights, following the order in
# which the model registers its embeddings).
uw = iw = 0
c = 0
for name, param in model.named_parameters():
  if not param.requires_grad:
    continue
  print(name, param.data)
  if c != 0:
    iw = param.data
    continue
  uw = param.data
  c += 1

In [None]:
# Copy the learned item embedding matrix off the device as a NumPy array
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [None]:
# Number of rows in the embedding matrix (one row per movie index)
len(trained_movie_embeddings)

In [None]:
# Fit KMeans with 10 clusters on the movie embeddings; random_state=0 fixes the seed
kMeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [None]:
# Number of ratings per raw movieId, computed once up front instead of
# re-scanning the whole ratings dataframe for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Iterate over the clusters the fitted model actually has rather than a
# hard-coded count that could drift from the KMeans configuration.
for cluster in range(kMeans.n_clusters):
  print(f"Cluster #{cluster}")
  # Embedding-row indices assigned to this cluster
  member_indices = np.where(kMeans.labels_ == cluster)[0]
  # (title, rating count) for every member, translated back to raw movie ids
  cluster_movies = [
    (movie_names[movieId], rating_counts.get(movieId, 0))
    for movieId in (train_set.idxToMovieId[movieIdx] for movieIdx in member_indices)
  ]
  # Show the 10 most-rated movies of the cluster
  for title, _ in sorted(cluster_movies, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", title)

In [None]:
# Display the model's repr as the cell output
model

In [None]:
# Save the model weights only (the state_dict), not the Module object itself
save(model.state_dict(), "matrix_factorization.pt")

# Save the raw-id <-> embedding-index lookup tables built by the training dataset
mappings = {
    "userIdToIdx": train_set.userIdToIdx,
    "movieIdToIdx": train_set.movieIdToIdx,
    "idxToUserId": train_set.idxToUserId,
    "idxToMovieId": train_set.idxToMovieId,
}

# The with-block closes the file on exit, so no explicit close() is needed.
# NOTE: pickle files can execute code when loaded; only load this file from a
# trusted location.
with open("mappings.pkl", "wb") as f:
    pickle.dump(mappings, f)

# Save the fitted KMeans model
joblib.dump(kMeans, "kmeans_model.pkl")

# Save the trained movie embeddings as a NumPy array
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()
np.save("movie_embeddings.npy", trained_movie_embeddings)
