In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection , metrics , preprocessing
import torch
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset , DataLoader

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
# Load the ratings table from ratings.csv (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df.info() # schema summary: column names, non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
df.userId.nunique() # number of unique user ids in the DataFrame

610

In [6]:
df.movieId.nunique() # number of unique movie ids in the DataFrame

9724

In [7]:
df.rating.value_counts() # number of rows for each distinct rating value
# Result: the rating values are imbalanced (some occur far more often than others).

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [8]:
df.shape  # (number of rows, number of columns)

(100836, 4)

## Dataset wrapper class

In [9]:
class MovieDataset(Dataset):
    """Map-style dataset of (user, movie, rating) samples for a DataLoader.

    Args:
        users: indexable sequence of integer user indices.
        movies: indexable sequence of integer movie indices, aligned with
            ``users``.
        ratings: indexable sequence of numeric ratings, aligned with ``users``.
    """

    def __init__(self,users,movies,ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of samples (the length of ``users``)."""
        return len(self.users)

    def __getitem__(self,item):
        """Return the sample at position ``item`` as a dict of scalar tensors.

        Keys:
            "users":   long tensor, index into the user embedding table.
            "movies":  long tensor, index into the movie embedding table.
            "ratings": float tensor, the regression target.
        """
        return {
            "users" : torch.tensor(self.users[item],dtype=torch.long),
            "movies" : torch.tensor(self.movies[item],dtype=torch.long),
            # Ratings are real-valued targets: an integer dtype would truncate
            # fractional ratings (3.5 -> 3) and is rejected by nn.MSELoss when
            # compared against float predictions.
            "ratings" : torch.tensor(self.ratings[item],dtype=torch.float),
        }

In [10]:
class recsys(nn.Module):
    """Shallow embedding recommender that regresses a rating.

    A user embedding and a movie embedding are looked up, concatenated, and
    passed through one linear layer that outputs a single value per pair.

    Args:
        n_users: number of rows in the user embedding table; user indices
            must lie in ``[0, n_users)``.
        n_movies: number of rows in the movie embedding table; movie indices
            must lie in ``[0, n_movies)``.
        embedding_dim: width of each embedding vector (default 32, the value
            that was previously hard-coded).
    """

    def __init__(self,n_users,n_movies,embedding_dim=32):
        super().__init__()

        # Trainable lookup tables holding one vector per user / per movie.
        self.user_embed = nn.Embedding(num_embeddings=n_users,embedding_dim=embedding_dim)
        self.movie_embed = nn.Embedding(num_embeddings=n_movies,embedding_dim=embedding_dim)

        # The linear head consumes the concatenated [user | movie] vector.
        self.out = nn.Linear(2 * embedding_dim,1)

    def forward(self,users,movies,ratings=None):
        """Predict one value per (user, movie) pair.

        Args:
            users: long tensor of user indices.
            movies: long tensor of movie indices, same shape as ``users``.
            ratings: unused; accepted only so existing callers keep working.

        Returns:
            Float tensor of shape ``users.shape + (1,)``.
        """
        user_embeds = self.user_embed(users)
        movies_embeds = self.movie_embed(movies)
        # Concatenate along the feature (last) axis so that index tensors of
        # any rank work, not only 1-D batches.
        output = torch.cat([user_embeds,movies_embeds],dim=-1)
        return self.out(output)

In [11]:
# Re-index user and movie ids as contiguous integers starting at 0 so that
# every id is a valid row index into the embedding tables (avoids
# index-out-of-bounds errors in nn.Embedding).
lbl_user = preprocessing.LabelEncoder()
df["userId"] = lbl_user.fit_transform(df["userId"].to_numpy())

lbl_movie = preprocessing.LabelEncoder()
df["movieId"] = lbl_movie.fit_transform(df["movieId"].to_numpy())

In [12]:
# Hold out 10% of the rows for testing. The split is stratified on the rating
# value and uses a fixed seed so it is repeatable.
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.1, stratify=df["rating"].to_numpy(), random_state=42
)

In [13]:
# Wrap the id and rating columns of each split (as NumPy arrays) in a
# MovieDataset: first the training split, then the test split.
train_dataset, test_dataset = (
    MovieDataset(
        users=split["userId"].to_numpy(),
        movies=split["movieId"].to_numpy(),
        ratings=split["rating"].to_numpy(),
    )
    for split in (df_train, df_test)
)

In [14]:
# Wrap the datasets in DataLoaders that yield mini-batches of 4 samples using
# 2 worker processes.
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(dataset=train_dataset,
                        batch_size=4,
                        shuffle=True,
                        num_workers=2
                        )

# Evaluation does not benefit from shuffling; keeping a fixed order makes the
# evaluation pass deterministic.
test_loader = DataLoader(dataset=test_dataset,
                        batch_size=4,
                        shuffle=False,
                        num_workers=2,
                        )

In [15]:
# Optional sanity check: uncomment to fetch and print one batch from the training loader.
# dataiter = next(iter(train_loader))
# print(dataiter)

In [16]:
# Size each embedding table by the number of distinct encoded ids, then move
# the model to the selected device.
model = recsys(len(lbl_user.classes_), len(lbl_movie.classes_))
model = model.to(device)

# Adam with its default hyperparameters; StepLR multiplies the learning rate
# by 0.7 after every 3 calls to sch.step().
optimizer = torch.optim.Adam(model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

# Mean squared error between predictions and targets.
loss_func = nn.MSELoss()

In [17]:
# Sanity check, one value per line: number of users, number of movies,
# largest encoded movie id, and number of training samples.
print(
    len(lbl_user.classes_),
    len(lbl_movie.classes_),
    df.movieId.max(),
    len(train_dataset),
    sep="\n",
)

610
9724
9723
90752


In [18]:
# Training loop: each epoch makes one pass over train_loader, minimising the
# MSE between predicted and observed ratings, then steps the LR scheduler.
epochs = 10  # You can adjust the number of epochs

for epoch in range(epochs):
    model.train()  # Set the model in training mode
    total_loss = 0.0

    for batch in train_loader:
        users = batch['users'].to(device)
        movies = batch['movies'].to(device)
        # MSELoss requires a floating-point target with the same dtype as the
        # predictions, so cast explicitly instead of assuming the dataset
        # already yields floats (the evaluation cell does the same).
        ratings = batch['ratings'].float().to(device)

        optimizer.zero_grad()  # Zero the gradients

        # Forward pass
        predictions = model(users, movies)

        # Compute the loss. Drop only the trailing size-1 feature axis: a bare
        # squeeze() would also remove the batch axis for a one-sample batch
        # and leave predictions and targets with mismatched shapes.
        loss = loss_func(predictions.squeeze(-1), ratings)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    # Print the average per-batch loss for the epoch
    average_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {average_loss:.4f}')

    # Adjust learning rate
    sch.step()



In [None]:
# Evaluate the model on the test set: report the mean squared error over all
# test samples, with gradients disabled.
model.eval()  # Set the model in evaluation mode
total_test_loss = 0.0  # running sum of squared errors over all samples

with torch.no_grad():
    for batch in test_loader:
        users = batch['users'].to(device)
        movies = batch['movies'].to(device)
        ratings = batch['ratings'].float().to(device)

        predictions = model(users, movies)
        # Drop only the trailing size-1 feature axis so a one-sample batch
        # keeps its batch dimension and matches the target shape.
        loss = loss_func(predictions.squeeze(-1), ratings)

        # loss is the mean over the batch; weight it by the batch size so a
        # smaller final batch does not skew the overall average.
        total_test_loss += loss.item() * ratings.size(0)

average_test_loss = total_test_loss / len(test_loader.dataset)
print(f'Test Loss: {average_test_loss:.4f}')
