In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing 
import torch
import torch.nn as nn
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader

In [None]:
# Device-agnostic setup: use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the ratings table from the attached Kaggle dataset.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/recemmonder/ml-latest-small/ratings.csv"
)

In [None]:
# Preview the first 100 rows to get a feel for the columns and their values.
df.head(n=100)

In [None]:
# Number of distinct users in the ratings table.
df["userId"].nunique()

In [None]:
# Number of distinct movies in the ratings table.
df["movieId"].nunique()

In [None]:
# How often each rating value occurs (checks the value distribution).
df["rating"].value_counts()

In [None]:
# (number of rows, number of columns) of the ratings table.
df.shape

In [None]:
class MovieDataset(Dataset):
    """Map-style dataset yielding one (user, movie, rating) sample per index.

    Args:
        users: Indexable sequence of encoded user ids.
        movies: Indexable sequence of encoded movie ids, aligned with ``users``.
        ratings: Indexable sequence of rating values, aligned with ``users``.

    Each item is a dict with the keys ``"users"``, ``"movies"`` and
    ``"ratings"``, holding scalar tensors.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of samples (one per entry in ``users``)."""
        return len(self.users)

    def __getitem__(self, item):
        """Return the sample at position ``item`` as a dict of tensors."""
        return {
            # Ids are used as embedding indices, so they must be integer tensors.
            "users": torch.tensor(self.users[item], dtype=torch.long),
            "movies": torch.tensor(self.movies[item], dtype=torch.long),
            # The rating is a regression target: keep it floating point.
            # Converting to an integer dtype would truncate any fractional
            # rating (e.g. 3.5 -> 3) before the loss ever sees it.
            "ratings": torch.tensor(self.ratings[item], dtype=torch.float32),
        }

In [None]:
class RecSysModel(nn.Module):
    """Embedding-based rating predictor.

    Looks up one embedding vector per user and per movie, concatenates the
    two and maps the result to a single score with a linear layer.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        embed_dim: Width of each embedding vector (defaults to 32).
    """

    def __init__(self, n_users, n_movies, embed_dim=32):
        super().__init__()
        # Trainable lookup matrices for the shallow embedding vectors.
        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.movie_embed = nn.Embedding(n_movies, embed_dim)
        # The head consumes the concatenated user + movie embedding.
        self.out = nn.Linear(2 * embed_dim, 1)

    def forward(self, users, movies, ratings=None):
        """Predict one score per (user, movie) pair.

        Args:
            users: 1-D integer tensor of user indices.
            movies: 1-D integer tensor of movie indices, same length as ``users``.
            ratings: Unused; accepted only for call-site compatibility.

        Returns:
            Tensor of shape ``(len(users), 1)`` with the predicted scores.
        """
        user_vecs = self.user_embed(users)
        movie_vecs = self.movie_embed(movies)
        # Concatenate along the feature axis: (batch, 2 * embed_dim).
        features = torch.cat([user_vecs, movie_vecs], dim=1)
        return self.out(features)

In [None]:
# nn.Embedding indexes rows 0..n-1, so remap the raw user and movie ids to
# zero-based codes to avoid index-out-of-bound lookups.
lb1_user = preprocessing.LabelEncoder()
df["userId"] = lb1_user.fit_transform(df["userId"].values)

lb1_movie = preprocessing.LabelEncoder()
df["movieId"] = lb1_movie.fit_transform(df["movieId"].values)

# Hold out 10% of the rows for validation, stratified on the rating value.
df_train, df_valid = model_selection.train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["rating"].values,
)

train_dataset = MovieDataset(
    users=df_train["userId"].values,
    movies=df_train["movieId"].values,
    ratings=df_train["rating"].values,
)

valid_dataset = MovieDataset(
    users=df_valid["userId"].values,
    movies=df_valid["movieId"].values,
    ratings=df_valid["rating"].values,
)

In [None]:
# Mini-batch loaders for the training and validation splits
# (batches of 4, reshuffled on every pass, loaded by 2 worker processes).
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)

validation_loader = DataLoader(valid_dataset, batch_size=4, shuffle=True, num_workers=2)

# To inspect a single batch while debugging:
# dataiter = iter(train_loader)
# dataloader_data = next(dataiter)
# print(dataloader_data)

In [None]:
# Size the embedding tables from the number of distinct encoded ids,
# then move the model to the selected device.
model = RecSysModel(
    n_users=len(lb1_user.classes_),
    n_movies=len(lb1_movie.classes_),
)
model = model.to(device)

# Adam with its default settings; the scheduler scales the learning rate
# by 0.7 every 3 scheduler steps.
optimizer = torch.optim.Adam(params=model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=3, gamma=0.7)

# Mean squared error between predicted and actual ratings.
loss_func = nn.MSELoss()

In [None]:
# Sanity check: encoder class counts, largest encoded movie id, training-set size.
print(
    len(lb1_user.classes_),
    len(lb1_movie.classes_),
    df["movieId"].max(),
    len(train_dataset),
    sep="\n",
)

In [None]:
# Training loop: one optimizer step per mini-batch, reporting the running
# mean loss roughly every `plot_steps` training samples.
epochs = 1
plot_steps, print_steps = 5000, 5000  # samples between reports (print_steps is not used below)
step_cnt = 0               # total number of training samples seen so far
total_loss = 0.0           # sum of per-sample losses since the last report
samples_since_report = 0   # number of samples folded into total_loss
all_losses_list = []       # one averaged loss per report, consumed by the plot cell

model.train()
for epoch_i in range(epochs):
    for train_data in train_loader:
        # The model lives on `device`; the batch must be moved there too,
        # otherwise the forward pass fails with a device mismatch on CUDA.
        users = train_data["users"].to(device)
        movies = train_data["movies"].to(device)
        # Reshape the target to (batch, 1) to match the model output.
        # view(-1, 1) also handles a final batch smaller than batch_size.
        rating = train_data["ratings"].to(device=device, dtype=torch.float32).view(-1, 1)

        output = model(users, movies)
        loss = loss_func(output, rating)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is the batch mean; weight it by the batch size so the running
        # average below is a true per-sample mean.
        batch_len = users.size(0)
        total_loss += loss.item() * batch_len
        samples_since_report += batch_len
        step_cnt += batch_len

        # Use >= rather than an exact modulo hit so a partial batch (or a
        # batch size that does not divide plot_steps) cannot skip reports.
        if samples_since_report >= plot_steps:
            avg_loss = total_loss / samples_since_report
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0.0  # reset the running totals
            samples_since_report = 0

    # Advance the learning-rate schedule once per epoch, after the optimizer steps.
    sch.step()

In [None]:
# Plot the averaged training loss recorded at each report during training.
plt.figure()
plt.plot(all_losses_list)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title("Training loss")
plt.xlabel("Report index")
plt.ylabel("Average training loss")
plt.show()