In [132]:
import torch
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from torch.utils.data.dataset import Dataset
import numpy as np
from torch.utils.data import DataLoader
from torch.nn import init

In [133]:
# Load the raw tables. The movies CSV carries a stray "Unnamed: 0" column
# that is discarded immediately after reading.
movies_df = pd.read_csv('data/movies_date.csv').drop(columns=["Unnamed: 0"])
ratings_df = pd.read_csv('data/ratings.csv')

In [134]:
# Turn the pipe-delimited genre string into a list of genre names.
movies_df["genres"] = movies_df["genres"].map(lambda genre_str: genre_str.split("|"))

# Parse the date column, then express every date as whole days elapsed since
# the earliest date in the table.
movies_df["date"] = pd.to_datetime(movies_df["date"])
min_date = movies_df["date"].min()
movies_df["date_score"] = movies_df["date"].sub(min_date).dt.days

# Rescale by the largest day count so the newest entry scores 1.0.
movies_df["date_score"] = movies_df["date_score"].div(movies_df["date_score"].max())

movies_df.head()

Unnamed: 0,movieId,title,genres,date,date_score
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",2024-04-14 05:05:00,0.643347
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",2024-02-20 04:40:09,0.569273
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",2023-06-17 11:26:20,0.229081
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",2023-05-13 11:27:31,0.18107
4,5,Father of the Bride Part II (1995),[Comedy],2023-04-25 16:26:53,0.156379


In [135]:
# Drop the timestamp column and min-max scale the ratings so that the lowest
# observed rating maps to 0 and the highest to 1.
ratings_df = (
    ratings_df
    .drop(columns=['timestamp'])
    .assign(
        rating=lambda frame: (frame["rating"] - frame["rating"].min())
        / (frame["rating"].max() - frame["rating"].min())
    )
)
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,0.777778
1,1,3,0.777778
2,1,6,0.777778
3,1,47,1.000000
4,1,50,1.000000
...,...,...,...
100831,610,166534,0.777778
100832,610,168248,1.000000
100833,610,168250,1.000000
100834,610,168252,1.000000


In [136]:
# Size of the user x movie interaction matrix and how much of it is observed.
# Users are counted from the ratings table, movies from the movies table.
n_users = len(pd.unique(ratings_df["userId"]))
n_items = len(pd.unique(movies_df["movieId"]))

print('unique user - ', n_users)
print('unique movies - ', n_items)
print('matrix size - ', n_items * n_users)
print('total ratings available - ', len(ratings_df))
# Share of (user, movie) cells that actually have a rating, as a percentage.
print('% of filled matrix - ', len(ratings_df) / (n_users * n_items) * 100, '%')

unique user -  610
unique movies -  9742
matrix size -  5942620
total ratings available -  100836
% of filled matrix -  1.6968273253211548 %


In [137]:
class MatrixFactorization(torch.nn.Module):
    """Matrix-factorisation scorer: one embedding per user and per item.

    The score of a (user, item) pair is the dot product of the two embedding
    vectors.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=30):
        super().__init__()

        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Replace the default embedding initialisation with Xavier-uniform.
        init.xavier_uniform_(self.user_factors.weight)
        init.xavier_uniform_(self.item_factors.weight)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one dot-product score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as two separate arguments.

        ``forward`` takes a single ``(n, 2)`` tensor, so the two inputs are
        flattened, stacked column-wise and moved to the device the embedding
        weights live on before being scored.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))


In [138]:

class Loader(Dataset):
    """Dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are remapped to contiguous 0-based
    indices so they can be used directly as embedding row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``ratings_df``.
        movies: DataFrame with a ``movieId`` column defining the full movie
            vocabulary. Defaults to the notebook-level ``movies_df``.

    Raises:
        KeyError: if ``ratings`` references a movieId absent from ``movies``.
    """

    def __init__(self, ratings=None, movies=None):
        # Fall back to the notebook globals so the zero-argument call keeps working.
        if ratings is None:
            ratings = ratings_df
        if movies is None:
            movies = movies_df

        self.ratings = ratings.copy()

        users = ratings["userId"].unique()
        movie_ids = movies["movieId"].unique()

        # Raw id -> contiguous index, in order of first appearance.
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movie_ids)}

        # Inverse lookups: contiguous index -> raw id.
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        movie_idx = ratings["movieId"].map(self.movieid2idx)
        unknown = movie_idx.isna()
        if unknown.any():
            # map() yields NaN for unmapped ids; fail loudly instead.
            missing_ids = sorted(set(ratings.loc[unknown, "movieId"].tolist()))
            raise KeyError(f"movieId values missing from movies table: {missing_ids}")
        self.ratings["movieId"] = movie_idx.astype("int64")
        self.ratings["userId"] = ratings["userId"].map(self.userid2idx)

        # Select the feature columns by name so unrelated extra columns cannot
        # shift the (user, movie) column positions.
        self.x = torch.tensor(self.ratings[["userId", "movieId"]].values)
        self.y = torch.tensor(self.ratings["rating"].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows in the dataset."""
        return len(self.ratings)


In [141]:
# Training configuration: number of epochs and whether a GPU is available.
num_epochs = 500
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the factorisation model with 8 latent factors per user / movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the freshly initialised trainable weights.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is present; otherwise keep it as is.
model = model.cuda() if cuda else model

# Mean-squared error against the scaled ratings, optimised with Adam.
loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Shuffled mini-batches of 1024 rating rows.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=1024, shuffle=True)


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9742, 8)
)
user_factors.weight tensor([[ 0.0297,  0.0154, -0.0447,  ..., -0.0353,  0.0497,  0.0693],
        [ 0.0248, -0.0889,  0.0613,  ...,  0.0458,  0.0526,  0.0405],
        [ 0.0055, -0.0681, -0.0898,  ..., -0.0300, -0.0774,  0.0518],
        ...,
        [ 0.0557, -0.0083,  0.0032,  ..., -0.0275, -0.0513,  0.0725],
        [ 0.0839,  0.0426,  0.0577,  ...,  0.0734,  0.0643,  0.0035],
        [-0.0875,  0.0420,  0.0892,  ..., -0.0145, -0.0444,  0.0630]])
item_factors.weight tensor([[-0.0107,  0.0073,  0.0239,  ...,  0.0233,  0.0033,  0.0165],
        [-0.0065,  0.0160, -0.0242,  ...,  0.0025, -0.0198, -0.0101],
        [ 0.0059, -0.0162, -0.0114,  ..., -0.0110, -0.0018, -0.0208],
        ...,
        [ 0.0067, -0.0081, -0.0087,  ...,  0.0233,  0.0105,  0.0088],
        [-0.0130, -0.0237,  0.0077,  ...,  0.0007, -0.0178,  0.0224],
        [-0.0247, -0.0051,  0.0094,  ..., -

In [142]:
# One pass over train_loader per epoch; the mean batch loss is printed at the
# end of every epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below must run on CPU as well, otherwise nothing is trained and
        # `losses` stays empty.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Cast the targets to float32 before computing the loss against the
        # model outputs.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print('iter #{}'.format(it), 'loss', sum(losses) / len(losses))
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/500 [00:00<?, ?it/s]

iter #0 loss 0.49800102909406024
iter #1 loss 0.4640200532446004
iter #2 loss 0.2996117739063321
iter #3 loss 0.14017124037549952
iter #4 loss 0.08435641168945968
iter #5 loss 0.06434118638586517
iter #6 loss 0.05391115059304719
iter #7 loss 0.04759088485981479
iter #8 loss 0.04345043831401401
iter #9 loss 0.040473843162710015
iter #10 loss 0.03832452823266839
iter #11 loss 0.036642106833181
iter #12 loss 0.035345055576827794
iter #13 loss 0.03427725285291672
iter #14 loss 0.033402090101982605
iter #15 loss 0.03264947192310685
iter #16 loss 0.0320027968799225
iter #17 loss 0.03143482860365902
iter #18 loss 0.030919661237434906
iter #19 loss 0.030478448917468388
iter #20 loss 0.030071662889436038
iter #21 loss 0.029668600608905155
iter #22 loss 0.029315547547256102
iter #23 loss 0.02895167161418934
iter #24 loss 0.02859592703029965
iter #25 loss 0.028214241632006386
iter #26 loss 0.02785661980283983
iter #27 loss 0.0274755030945696
iter #28 loss 0.027108110525090285
iter #29 loss 0.0267

In [143]:
# Print every trainable parameter and keep references to the weights:
# `uw` receives the first trainable parameter seen, `iw` whichever trainable
# parameter is seen last after that. `c` flips to 1 once `uw` has been set.
c, uw, iw = 0, 0, 0

for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1


user_factors.weight tensor([[-0.6015,  0.7170, -0.2746,  ...,  0.3628,  0.3460, -0.5552],
        [-0.4311, -0.3387, -0.3272,  ...,  0.9839,  0.5951, -0.5277],
        [-0.3149, -0.1269,  0.2866,  ...,  0.2711, -1.1271, -0.6729],
        ...,
        [-0.6420,  0.5939, -0.3205,  ...,  0.6617,  0.2119,  0.4383],
        [-0.4512,  0.4103, -0.1643,  ...,  0.2159,  0.4198, -0.1726],
        [-0.5438,  0.6093, -0.2693,  ...,  0.5533, -0.0213, -0.5004]],
       device='cuda:0')
item_factors.weight tensor([[-0.2793,  0.3691, -0.1514,  ...,  0.1498,  0.2397, -0.3764],
        [-0.2673,  0.1349, -0.2131,  ...,  0.1197,  0.2877, -0.3051],
        [-0.0619,  0.2056, -0.3665,  ..., -0.0287,  0.1925, -0.2870],
        ...,
        [-0.1811,  0.2057, -0.2267,  ...,  0.2312,  0.1424, -0.1642],
        [-0.1974,  0.1986, -0.2179,  ...,  0.2171,  0.1092, -0.1439],
        [-0.2307,  0.2801, -0.2322,  ...,  0.2751,  0.1273, -0.2415]],
       device='cuda:0')


In [144]:
# Copy the learned embedding tables to host memory as NumPy arrays and show
# their shapes (movies first, then users).
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()
print(trained_movie_embeddings.shape)

trained_user_embeddings = model.user_factors.weight.detach().cpu().numpy()
print(trained_user_embeddings.shape)

(9742, 8)
(610, 8)


In [145]:
# Pull both embedding tables out of the model as NumPy arrays.
trained_user_embeddings = model.user_factors.weight.detach().cpu().numpy()
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

print(f"Trained user embeddings shape: {trained_user_embeddings.shape}")
print(f"Trained movie embeddings shape: {trained_movie_embeddings.shape}")

# Predicted score for every (user, movie) pair: users along rows, movies
# along columns.
full_matrix = trained_user_embeddings @ trained_movie_embeddings.T
full_matrix.shape

Trained user embeddings shape: (610, 8)
Trained movie embeddings shape: (9742, 8)


(610, 9742)

In [151]:
# Rank every movie for one user by predicted score, best first.
target_user_id = 569
user_index = train_set.userid2idx[target_user_id]
user_ratings = full_matrix[user_index]
print(user_ratings[0:10])

# Embedding indices sorted by descending predicted score, then mapped back to
# raw movie ids.
top_movie_indices = user_ratings.argsort()[::-1]
top_movie_ids = [int(train_set.idx2movieid[idx]) for idx in top_movie_indices]

# Order a copy of the movie table by rank. `movies_df` itself is left
# untouched: the dataset derives its id -> index mapping from
# `movies_df.movieId.unique()`, so reordering or retyping the shared frame
# would change that mapping whenever the dataset is rebuilt.
rank_by_movie_id = {movie_id: rank for rank, movie_id in enumerate(top_movie_ids)}
recommended_movies_df = (
    movies_df
    .assign(_rank=movies_df["movieId"].astype("int64").map(rank_by_movie_id))
    .sort_values("_rank")
    .drop(columns="_rank")
)
recommended_movies_df

[0.6937063  0.71819746 1.0594624  0.2970363  0.862324   0.9349352
 0.30797344 0.90422916 0.78854334 0.7704701 ]


Unnamed: 0,movieId,title,genres,date,date_score
3462,4721,American Outlaws (2001),"[Action, Comedy, Western]",2023-01-04 00:10:19,0.002743
76,85,Angels and Insects (1995),"[Drama, Romance]",2024-07-22 12:08:21,0.779150
2575,3444,Bloodsport (1988),[Action],2023-11-24 14:48:21,0.448560
3542,4846,Iron Monkey (Siu nin Wong Fei-hung ji: Tit Ma ...,"[Action, Comedy]",2024-08-05 04:23:34,0.798354
4843,7235,Ichi the Killer (Koroshiya 1) (2001),"[Action, Comedy, Crime, Drama, Horror, Thriller]",2023-05-21 05:07:54,0.192044
...,...,...,...,...,...
5830,32291,Melinda and Melinda (2004),"[Comedy, Drama]",2023-05-27 23:06:57,0.200274
1474,1999,"Exorcist III, The (1990)",[Horror],2024-08-16 19:04:33,0.813443
400,459,"Getaway, The (1994)","[Action, Adventure, Crime, Drama, Romance, Thr...",2023-06-21 08:36:44,0.234568
1172,1554,"Pillow Book, The (1996)","[Drama, Romance]",2024-07-16 13:31:11,0.770919


In [150]:
# Re-rank one user's predictions by blending the predicted score with each
# movie's date_score.
target_user_id = 569
user_index = train_set.userid2idx[target_user_id]
user_ratings = full_matrix[user_index]
print(user_ratings[0:10])

# Blend weights: contribution of the predicted score vs. the date score.
SCORE_WEIGHT = 0.3
RECENCY_WEIGHT = 0.7

# date_score keyed by raw movie id (first row wins if an id repeats), built
# once instead of scanning the whole table for every movie.
date_score_by_movie_id = (
    movies_df
    .assign(movieId=movies_df["movieId"].astype("int64"))
    .drop_duplicates(subset="movieId")
    .set_index("movieId")["date_score"]
)

# Align the date scores with the embedding index order used by `user_ratings`.
# `.loc` raises KeyError if an id has no row in the movie table.
movie_ids_by_index = [int(train_set.idx2movieid[i]) for i in range(len(user_ratings))]
time_weights = date_score_by_movie_id.loc[movie_ids_by_index].to_numpy()

updated_scores = (
    SCORE_WEIGHT * user_ratings.astype(np.float64) + RECENCY_WEIGHT * time_weights
)
print(updated_scores[0:10])

# Embedding indices sorted by descending blended score, mapped to raw ids.
top_movie_indices = updated_scores.argsort()[::-1]
top_movie_ids = [int(train_set.idx2movieid[idx]) for idx in top_movie_indices]

# Order a copy of the movie table by rank; the shared `movies_df` is not
# modified, so its row order and movieId dtype stay stable for other cells.
rank_by_movie_id = {movie_id: rank for rank, movie_id in enumerate(top_movie_ids)}
reranked_movies_df = (
    movies_df
    .assign(_rank=movies_df["movieId"].astype("int64").map(rank_by_movie_id))
    .sort_values("_rank")
    .drop(columns="_rank")
)
reranked_movies_df

[0.6937063  0.71819746 1.0594624  0.2970363  0.862324   0.9349352
 0.30797344 0.90422916 0.78854334 0.7704701 ]
[0.65845483 0.61395033 0.47819538 0.21585986 0.36816223 0.96319661
 0.70981316 0.38841553 0.68594573 0.77750591]


Unnamed: 0,movieId,title,genres,date,date_score
76,85,Angels and Insects (1995),"[Drama, Romance]",2024-07-22 12:08:21,0.779150
1672,2249,My Blue Heaven (1990),[Comedy],2024-10-23 15:05:13,0.906722
3542,4846,Iron Monkey (Siu nin Wong Fei-hung ji: Tit Ma ...,"[Action, Comedy]",2024-08-05 04:23:34,0.798354
6060,40723,Wolf Creek (2005),"[Crime, Horror, Thriller]",2024-11-09 14:55:35,0.930041
3058,4102,Eddie Murphy Raw (1987),"[Comedy, Documentary]",2024-10-31 15:50:44,0.917695
...,...,...,...,...,...
5830,32291,Melinda and Melinda (2004),"[Comedy, Drama]",2023-05-27 23:06:57,0.200274
712,931,Spellbound (1945),"[Mystery, Romance, Thriller]",2023-03-20 14:13:48,0.106996
3062,4109,Flowers in the Attic (1987),"[Drama, Thriller]",2023-02-27 17:24:37,0.078189
400,459,"Getaway, The (1994)","[Action, Adventure, Crime, Drama, Romance, Thr...",2023-06-21 08:36:44,0.234568
