<a href="https://colab.research.google.com/github/Adeeba2617/RecSystem-MSAI/blob/main/RecSystem_MSAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The Movielens Datasets: History and Context.  ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1-19:19 <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   483k      0  0:00:01  0:00:01 --:--:--  483k


In [2]:
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
  zip_ref.extractall('data')

In [3]:
# import the dataset
import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [4]:
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
# Take a look at movies_df
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Take a look at ratings_df
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have", n_users*n_items, "elements.")
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / (n_users*n_items)*100, "% of the matrix is filled.")
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and product grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix is memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicity, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have 5931640 elements.
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and product grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix is memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicity, thus we don't need all the data


In [11]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors=20): # Changed _init_ to __init__
    super().__init__()
    # create user embeddings
    self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input
    # create item embeddings
    self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input
    self.user_factors.weight.data.uniform_(0, 0.05)
    self.item_factors.weight.data.uniform_(0, 0.05)

  def forward(self, data):
    # matrix multiplication
    users, items = data[:,0], data[:,1]
    return (self.user_factors(users)*self.item_factors(items)).sum(1)
  # def forward(self, data):
  #   # matrix multiplication
  #   return (self.user_factors(users)*self.item_factors(items)).sum(1)

  def predict (self, user, item):
    return self.forward(torch.tensor([[user, item]]))

In [13]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
  def __init__(self): # Changed _init_ to __init__
    self.ratings = ratings_df.copy()

    # Exract all user IDs and movie IDs
    users = ratings_df.userId.unique()
    movies = ratings_df.movieId.unique()

    # Producing new continuous IDs for users and movies

    # Unique values : index
    self.userid2idx = {o:i for i,o in enumerate(users)}
    self.movieid2idx = {o:i for i,o in enumerate(movies)}

    # Obtained continous ID for users and movies
    self.idx2userid = {i:o for o,i in self.userid2idx.items()}
    self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

    # return the id from the indexed values as noted in the lambda function down below.
    self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
    self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])

    self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
    self.y = self.ratings['rating'].values
    self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) #Transforms the data to tensors (ready for torch models).

  def __getitem__(self, index): # Changed _getitem_ to __getitem__
    return (self.x[index], self.y[index])

  def __len__(self): # Changed _len_ to __len__
    return len(self.ratings)

In [14]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.data)
  # GPU enable if you have a GPU...
  if cuda:
    model.cuda()

  # MSE loss
  loss_fn = torch.nn.MSELoss()

  # ADAM optimizer
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

  # Train data
  train_set = Loader()
  train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0057, 0.0081, 0.0248,  ..., 0.0247, 0.0265, 0.0205],
        [0.0033, 0.0320, 0.0347,  ..., 0.0333, 0.0373, 0.0249],
        [0.0067, 0.0266, 0.0205,  ..., 0.0251, 0.0340, 0.0061],
        ...,
        [0.0371, 0.0060, 0.0012,  ..., 0.0369, 0.0484, 0.0414],
        [0.0331, 0.0467, 0.0396,  ..., 0.0032, 0.0058, 0.0416],
        [0.0020, 0.0209, 0.0386,  ..., 0.0327, 0.0311, 0.0446]])
item_factors.weight tensor([[0.0233, 0.0028, 0.0266,  ..., 0.0207, 0.0348, 0.0453],
        [0.0268, 0.0137, 0.0157,  ..., 0.0096, 0.0092, 0.0321],
        [0.0187, 0.0455, 0.0119,  ..., 0.0289, 0.0208, 0.0098],
        ...,
        [0.0211, 0.0352, 0.0126,  ..., 0.0079, 0.0400, 0.0178],
        [0.0428, 0.0345, 0.0395,  ..., 0.0352, 0.0349, 0.0464],
        [0.0226, 0.0493, 0.0037,  ..., 0.0176, 0.0033, 0.0268]])


In [15]:
for it in range(num_epochs):
  losses = []
  for x, y in train_loader:
    if cuda:
      x, y = x.cuda(), y.cuda
      optimizer.zero_grad()
    outputs = model(x)
    loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
  print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

iter #0 Loss: 6.032847167422929
iter #1 Loss: 4.700374476800715
iter #2 Loss: 4.393061212930583
iter #3 Loss: 5.545101681033972
iter #4 Loss: 6.220261722651835
iter #5 Loss: 5.33315110025067
iter #6 Loss: 5.077159167546307
iter #7 Loss: 5.301059835453324
iter #8 Loss: 5.039412483951162
iter #9 Loss: 5.226240569867459
iter #10 Loss: 5.13419372206412
iter #11 Loss: 5.261391096913875
iter #12 Loss: 5.367502750176464
iter #13 Loss: 5.764141298187566
iter #14 Loss: 5.974548351038531
iter #15 Loss: 6.331216989434915
iter #16 Loss: 6.606823988372299
iter #17 Loss: 6.906870833508254
iter #18 Loss: 7.16331525835289
iter #19 Loss: 7.376570278920498
iter #20 Loss: 7.5765036828626835
iter #21 Loss: 7.71986991802448
iter #22 Loss: 8.031523593791245
iter #23 Loss: 8.315778141094343
iter #24 Loss: 8.75840445883988
iter #25 Loss: 8.894953736799017
iter #26 Loss: 9.179784959342879
iter #27 Loss: 9.501769073723537
iter #28 Loss: 9.74965199298665
iter #29 Loss: 10.062927593434523
iter #30 Loss: 10.371516

In [16]:
# By training the model, we will have tuned latent factors for movies and users
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name, param.data)
    if c == 0:
      uw = param.data
      c += 1
    else:
      iw = param.data
    #print('param_data', param_data)


user_factors.weight tensor([[ 0.2233, -0.5671, -1.1821,  ...,  0.6400,  0.2572,  0.9713],
        [-1.0281, -0.9851, -0.3293,  ..., -1.5624,  0.0275, -0.5361],
        [-2.5994, -0.3543,  1.5534,  ...,  0.9777,  0.9651,  2.5837],
        ...,
        [-0.4542, -0.6082, -0.5053,  ..., -0.2446, -0.0309,  1.2216],
        [ 2.0293, -2.0947, -0.2131,  ..., -0.0467,  0.0034,  0.7085],
        [ 0.1566,  0.1833,  0.1040,  ...,  0.7560, -0.2078,  0.8645]])
item_factors.weight tensor([[  0.1226,  -0.6575,   0.1920,  ...,   0.8806,   0.9413,   3.2650],
        [  0.2361,  -2.5507,   0.1793,  ...,  -0.3703,   1.1414,   4.3609],
        [  0.1479,  -0.6107,   0.2404,  ...,   1.2788,   1.0650,   3.9879],
        ...,
        [ -6.1028, -26.1380,   0.5332,  ...,  15.4473, -30.6978, -14.4035],
        [ -6.8367, -19.1551, -10.6312,  ...,   6.4338,  -3.3741, -17.3452],
        [ -5.0023,   3.8503,  11.7136,  ...,  12.6292, -25.7488,  11.8998]])


In [23]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [24]:
len(trained_movie_embeddings) # unique movie factor weight

9724

In [25]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [29]:
import numpy as np
from collections import Counter

def get_movie_genres(movie_id):
    """
    This function should retrieve the genres for a given movie ID.
    Replace this with your actual logic to fetch genres based on your data structure.
    """
    # Example: Assuming movie_names is a dictionary with movie IDs as keys and a list of genres as values
    genres = movie_names.get(movie_id, [])  # Replace with your actual genre retrieval logic
    if isinstance(genres, str):
        return genres.split("|")  # Adjust the delimiter if necessary
    return genres


def get_movie_id(movie_name):
  """
  This function should retrieve the movie id for a given movie name.
  """
  for movie_id, name in movie_names.items():
    if name == movie_name:
      return movie_id
  return None # Return None if movie name not found


for cluster in range(10):
    print("Cluster #{}".format(cluster))
    movs = []
    genres_in_cluster = []

    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        mov_name = movie_names.get(movid, "Unknown Movie")
        movs.append(mov_name)

        # Extract and store genres for movies in the cluster (assuming you have a way to access movie genres)
        genres = get_movie_genres(movid)  # This now calls the defined get_movie_genres function
        genres_in_cluster.extend(genres)

    # Find most frequent genres in the cluster
    most_common_genres = [genre for genre, count in Counter(genres_in_cluster).most_common(3)]

    # Prioritize movies containing the most frequent genres
    sorted_movs = sorted(movs, key=lambda mov_name: sum(1 for genre in get_movie_genres(get_movie_id(mov_name)) if genre in most_common_genres), reverse=True)

    # Display top 10 movies based on genre similarity
    print("\t", ", ".join(sorted_movs[:10]))

Cluster #0
	 Snow Dogs (2002), Galaxy of Terror (Quest) (1981), Alien Contamination (1980), Troll 2 (1990), Nobody Loves Me (Keiner liebt mich) (1994), Getting Even with Dad (1994), Fear (1996), First Kid (1996), Land Before Time III: The Time of the Great Giving (1995), Nothing Personal (1995)
Cluster #1
	 Pajama Game, The (1957), Circle, The (Dayereh) (2000), Cure, The (1995), Stupids, The (1996), Return to the Blue Lagoon (1991), Priceless (Hors de prix) (2006), Private Lives of Pippa Lee, The (2009), Magic Mike (2012), Rust and Bone (De rouille et d'os) (2012), Magic Mike XXL (2015)
Cluster #2
	 Best Men (1997), I'm the One That I Want (2000), L.I.E. (2001), Reckless (1995), Swan Princess, The (1994), Spirits of the Dead (1968), Last Supper, The (1995), Lammbock (2001), Kill the Irishman (2011), Einstein and Eddington (2008)
Cluster #3
	 Newton Boys, The (1998), Little Voice (1998), Before and After (1996), Jury Duty (1995), Mad Love (1995), Losing Isaiah (1995), Geronimo: An Ameri