# First, let's handle all the imports

In [1]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import pytorch_lightning as pl
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from collections import defaultdict
from itertools import islice
from plotly.subplots import make_subplots

# Pre-process the data

In [2]:
# Seed NumPy's global RNG so the user subsample below (and every later
# np.random draw in this notebook) is reproducible under Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the raw MovieLens ratings; timestamps are parsed so they can be ranked.
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv', 
                      parse_dates=['timestamp'])

# Keep every rating of a random 30% of the users.
unique_user_ids = ratings['userId'].unique()
rand_userIds = np.random.choice(unique_user_ids, 
                                size=int(len(unique_user_ids) * 0.3), 
                                replace=False)

# .copy() makes the filtered frame independent of the full table, so adding
# the 'rank_latest' column below does not write into a slice
# (this is what avoids SettingWithCopyWarning).
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

# Rank each user's ratings from newest (1) to oldest; method='first' breaks
# timestamp ties so exactly one row per user gets rank 1.
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

# Leave-one-out split: the newest rating of each user is the test row, the
# rest is training data. Only the columns needed downstream are kept, and the
# frames are copied so they can be modified safely.
kept_columns = ['userId', 'movieId', 'rating']
train_ratings = ratings.loc[ratings['rank_latest'] != 1, kept_columns].copy()
test_ratings = ratings.loc[ratings['rank_latest'] == 1, kept_columns].copy()

# Implicit feedback: every observed training interaction counts as a positive.
train_ratings.loc[:, 'rating'] = 1
all_movieIds = ratings['movieId'].unique()

train_ratings.sample(5)

There are 6043155 rows of data from 41547 users


Unnamed: 0,userId,movieId,rating
100881,707,5377,1.0
19890970,137716,6707,1.0
12662180,87502,49220,1.0
8081486,55659,3178,1.0
15072808,104067,3527,1.0


## Let's work with only 30% of the users in the ratings dataset to stay within the available CPU capacity

In [3]:
class RecommendationDataset(Dataset):
    """Implicit-feedback dataset of (user, item, label) triples with sampled negatives.

    Every observed (userId, movieId) pair becomes a positive example (label 1).
    For each positive, ``num_negative_samples`` items the user has not
    interacted with are drawn uniformly from ``all_item_ids`` and added as
    negatives (label 0).

    Args:
        interaction_data (pd.DataFrame): User-item interactions; only the
            'userId' and 'movieId' columns are read.
        all_item_ids (list): All unique item IDs negatives may be drawn from.
        processing_device (torch.device): Device the resulting tensors are
            moved to (default: cuda if available, else cpu).
        num_negative_samples (int): Negatives generated per positive (default: 4).

    Raises:
        ValueError: If ``num_negative_samples`` is negative, or if negatives
            are requested but some user has no un-interacted item left in
            ``all_item_ids`` (rejection sampling could never finish).
    """

    def __init__(self, interaction_data, all_item_ids, processing_device=None, num_negative_samples=4):
        if num_negative_samples < 0:
            raise ValueError('num_negative_samples must be non-negative.')
        if processing_device is None:
            processing_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.processing_device = processing_device
        self.num_negative_samples = num_negative_samples
        print('Dataset initialization with device configuration.')
        self.user_list, self.item_list, self.rating_list = self._prepare_dataset(interaction_data, all_item_ids)
        print('Dataset prepared with users, items, and labels.')

    def __len__(self):
        """Return the number of examples (positives plus sampled negatives)."""
        return len(self.user_list)
  
    def __getitem__(self, index):
        """Return the (user, item, label) tensors stored at ``index``."""
        return self.user_list[index], self.item_list[index], self.rating_list[index]

    @staticmethod
    def _validate_negative_pool(positive_pairs, all_item_ids):
        """Raise ValueError if some user has no item left to sample as a negative."""
        candidate_items = set(all_item_ids)
        if positive_pairs and not candidate_items:
            raise ValueError('all_item_ids is empty; cannot draw negative samples.')

        # Count, per user, how many candidate items are already positives.
        positives_per_user = {}
        for user_id, item_id in positive_pairs:
            if item_id in candidate_items:
                positives_per_user[user_id] = positives_per_user.get(user_id, 0) + 1

        saturated_users = [
            user_id for user_id, count in positives_per_user.items()
            if count >= len(candidate_items)
        ]
        if saturated_users:
            raise ValueError(
                'Cannot draw negatives: {} user(s) have interacted with every item '
                'in all_item_ids (e.g. user {}).'.format(len(saturated_users), saturated_users[0])
            )

    def _prepare_dataset(self, interaction_data, all_item_ids):
        """Build the user, item and label tensors, sampling negatives per positive."""
        print('Generating negative samples...')
        users, items, labels = [], [], []
        positive_pairs = set(zip(interaction_data['userId'], interaction_data['movieId']))

        if self.num_negative_samples > 0:
            # Without this check the rejection loop below would spin forever
            # for a user who has interacted with every candidate item.
            self._validate_negative_pool(positive_pairs, all_item_ids)

        for user_id, item_id in tqdm(positive_pairs, leave=False, dynamic_ncols=True, desc="Progress"):
            users.append(user_id)
            items.append(item_id)
            labels.append(1)
            for _ in range(self.num_negative_samples):
                # Rejection sampling: redraw until the item is not a known positive.
                negative_item = np.random.choice(all_item_ids)
                while (user_id, negative_item) in positive_pairs:
                    negative_item = np.random.choice(all_item_ids)
                users.append(user_id)
                items.append(negative_item)
                labels.append(0)
        
        print('Negative samples generated successfully.')

        # Explicit integer dtype keeps the tensors usable as embedding indices
        # even when the lists are empty (torch.tensor([]) would default to float).
        return (
            torch.tensor(users, dtype=torch.long).to(self.processing_device),
            torch.tensor(items, dtype=torch.long).to(self.processing_device),
            torch.tensor(labels, dtype=torch.long).to(self.processing_device)
        )


class CollaborativeFiltering(pl.LightningModule):
    """Neural collaborative filtering model: user/item embeddings fed to an MLP.

    The forward pass concatenates an 8-dimensional user embedding with an
    8-dimensional item embedding, passes the result through two ReLU dense
    layers (16 -> 64 -> 32) and returns a sigmoid interaction probability.
    
    Args:
        total_users (int): Number of rows in the user embedding table
        total_items (int): Number of rows in the item embedding table
        training_data (pd.DataFrame): User-item interaction data for training
        item_ids (list): List of all unique item IDs
    """
    
    def __init__(self, total_users, total_items, training_data, item_ids):
        super().__init__()
        print('Model initialization started.')
        self.user_embedding_layer = nn.Embedding(num_embeddings=total_users, embedding_dim=8)
        self.item_embedding_layer = nn.Embedding(num_embeddings=total_items, embedding_dim=8)
        self.dense_layer_1 = nn.Linear(in_features=16, out_features=64)
        self.dense_layer_2 = nn.Linear(in_features=64, out_features=32)
        self.final_output = nn.Linear(in_features=32, out_features=1)
        # NOTE: the batch-norm layers are registered but not used by forward()
        # (a batch-norm + dropout variant was tried and disabled). They are kept
        # so the state_dict keys of this class do not change.
        self.batch_norm_1 = nn.BatchNorm1d(64)
        self.batch_norm_2 = nn.BatchNorm1d(32)
        self.training_data = training_data
        self.item_ids = item_ids
        print('Model layers initialized successfully.')

    def forward(self, user_inputs, item_inputs):
        """Return the predicted interaction probability for each (user, item) pair.

        Args:
            user_inputs (torch.Tensor): Integer user indices.
            item_inputs (torch.Tensor): Integer item indices, same shape as ``user_inputs``.

        Returns:
            torch.Tensor: Sigmoid outputs with a trailing dimension of size 1.
        """
        user_vector = self.user_embedding_layer(user_inputs)
        item_vector = self.item_embedding_layer(item_inputs)
        combined_vector = torch.cat([user_vector, item_vector], dim=-1)
        hidden_output = torch.relu(self.dense_layer_1(combined_vector))
        hidden_output = torch.relu(self.dense_layer_2(hidden_output))
        prediction = torch.sigmoid(self.final_output(hidden_output))
        return prediction
    
    def training_step(self, batch_data, batch_idx):
        """Compute the binary cross-entropy loss for one (users, items, labels) batch."""
        user_inputs, item_inputs, labels = batch_data
        predictions = self(user_inputs, item_inputs)
        # Labels are reshaped to (batch, 1) floats to match the sigmoid output.
        loss_value = nn.BCELoss()(predictions, labels.view(-1, 1).float())
        return loss_value

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training DataLoader with freshly sampled negatives.

        The dataset is materialised on the CPU: returning CUDA tensors from a
        multi-process DataLoader (num_workers > 0) is not supported reliably,
        and the Lightning trainer is expected to move each batch to the
        module's device.
        """
        # Fixed: the original referenced an undefined name (`data.DataLoader`).
        return DataLoader(
            RecommendationDataset(self.training_data, self.item_ids, torch.device('cpu')),
            batch_size=4096,
            shuffle=True,
            num_workers=4
        )


def execute_training(interaction_data, training_subset, max_epochs=5, logging_interval=50):
    """Train a CollaborativeFiltering model with a plain PyTorch loop.

    Args:
        interaction_data (pd.DataFrame): All interactions; used to size the
            embedding tables (max ID + 1) and to list the candidate items
            for negative sampling.
        training_subset (pd.DataFrame): Interactions used as training positives.
        max_epochs (int): Number of passes over the training data.
        logging_interval (int): Print the batch loss every this many batches;
            0 or None disables per-batch logging.

    Returns:
        CollaborativeFiltering: The trained model, on cuda if available, else cpu.
    """
    processing_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Embedding tables are indexed by raw ID, so they need max_id + 1 rows.
    # int(...) converts the numpy scalar returned by pandas to a plain Python int.
    total_users = int(interaction_data['userId'].max()) + 1
    total_items = int(interaction_data['movieId'].max()) + 1
    unique_item_ids = interaction_data['movieId'].unique()

    model = CollaborativeFiltering(total_users, total_items, training_subset, unique_item_ids)
    model = model.to(processing_device)

    optimizer = torch.optim.Adam(model.parameters())
    loss_function = nn.BCELoss()

    print('Training process initialized.')

    for epoch in range(max_epochs):
        model.train()
        epoch_loss = 0
        # The dataset is rebuilt every epoch, so each epoch trains on a fresh
        # draw of negatives. It is placed on the same device as the model so
        # batches can be fed to the model without a further transfer.
        data_loader = DataLoader(
            RecommendationDataset(training_subset, unique_item_ids, processing_device), 
            batch_size=4096, 
            shuffle=True
        )

        print(f"Starting Epoch {epoch + 1}/{max_epochs}")
        for batch_idx, (users, items, labels) in enumerate(data_loader):
            optimizer.zero_grad()
            batch_predictions = model(users, items)
            # Labels are reshaped to (batch, 1) floats to match the model output.
            loss_value = loss_function(batch_predictions, labels.view(-1, 1).float())
            loss_value.backward()
            optimizer.step()
            epoch_loss += loss_value.item()

            # A falsy interval disables logging instead of raising on modulo by zero.
            if logging_interval and (batch_idx + 1) % logging_interval == 0:
                print(f"  Batch {batch_idx + 1}, Loss: {loss_value.item():.4f}")

        # Guard against an empty loader (no training rows) instead of dividing by zero.
        num_batches = len(data_loader)
        average_loss = epoch_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch + 1} Average Loss: {average_loss:.4f}\n")

    return model

In [4]:
# Train for five epochs on the leave-one-out training split, logging the
# batch loss every 100 batches.
model = execute_training(
    interaction_data=ratings,
    training_subset=train_ratings,
    max_epochs=5,
    logging_interval=100,
)

Model initialization started.
Model layers initialized successfully.
Training process initialized.
Dataset initialization with device configuration.
Generating negative samples...


                                                                      

Negative samples generated successfully.
Dataset prepared with users, items, and labels.
Starting Epoch 1/5
  Batch 100, Loss: 0.5054
  Batch 200, Loss: 0.4781
  Batch 300, Loss: 0.4658
  Batch 400, Loss: 0.4225
  Batch 500, Loss: 0.3677
  Batch 600, Loss: 0.3106
  Batch 700, Loss: 0.2903
  Batch 800, Loss: 0.2777
  Batch 900, Loss: 0.2638
  Batch 1000, Loss: 0.2607
  Batch 1100, Loss: 0.2491
  Batch 1200, Loss: 0.2510
  Batch 1300, Loss: 0.2461
  Batch 1400, Loss: 0.2555
  Batch 1500, Loss: 0.2626
  Batch 1600, Loss: 0.2314
  Batch 1700, Loss: 0.2297
  Batch 1800, Loss: 0.2549
  Batch 1900, Loss: 0.2441
  Batch 2000, Loss: 0.2473
  Batch 2100, Loss: 0.2360
  Batch 2200, Loss: 0.2327
  Batch 2300, Loss: 0.2346
  Batch 2400, Loss: 0.2298
  Batch 2500, Loss: 0.2246
  Batch 2600, Loss: 0.2227
  Batch 2700, Loss: 0.2267
  Batch 2800, Loss: 0.2342
  Batch 2900, Loss: 0.2273
  Batch 3000, Loss: 0.2112
  Batch 3100, Loss: 0.2346
  Batch 3200, Loss: 0.2268
  Batch 3300, Loss: 0.2329
  Batch 34

                                                                      

Negative samples generated successfully.
Dataset prepared with users, items, and labels.
Starting Epoch 2/5
  Batch 100, Loss: 0.2213
  Batch 200, Loss: 0.2085
  Batch 300, Loss: 0.2158
  Batch 400, Loss: 0.2105
  Batch 500, Loss: 0.2260
  Batch 600, Loss: 0.2250
  Batch 700, Loss: 0.2132
  Batch 800, Loss: 0.2184
  Batch 900, Loss: 0.2283
  Batch 1000, Loss: 0.2135
  Batch 1100, Loss: 0.2141
  Batch 1200, Loss: 0.2147
  Batch 1300, Loss: 0.2207
  Batch 1400, Loss: 0.2307
  Batch 1500, Loss: 0.2182
  Batch 1600, Loss: 0.2269
  Batch 1700, Loss: 0.2083
  Batch 1800, Loss: 0.2334
  Batch 1900, Loss: 0.2145
  Batch 2000, Loss: 0.2172
  Batch 2100, Loss: 0.2047
  Batch 2200, Loss: 0.2055
  Batch 2300, Loss: 0.2150
  Batch 2400, Loss: 0.2077
  Batch 2500, Loss: 0.2235
  Batch 2600, Loss: 0.2243
  Batch 2700, Loss: 0.2220
  Batch 2800, Loss: 0.2111
  Batch 2900, Loss: 0.2158
  Batch 3000, Loss: 0.2044
  Batch 3100, Loss: 0.2153
  Batch 3200, Loss: 0.2164
  Batch 3300, Loss: 0.2047
  Batch 34

                                                                      

Negative samples generated successfully.
Dataset prepared with users, items, and labels.
Starting Epoch 3/5
  Batch 100, Loss: 0.2126
  Batch 200, Loss: 0.2224
  Batch 300, Loss: 0.2110
  Batch 400, Loss: 0.2091
  Batch 500, Loss: 0.2091
  Batch 600, Loss: 0.2151
  Batch 700, Loss: 0.2037
  Batch 800, Loss: 0.2224
  Batch 900, Loss: 0.2137
  Batch 1000, Loss: 0.2196
  Batch 1100, Loss: 0.2118
  Batch 1200, Loss: 0.2114
  Batch 1300, Loss: 0.2143
  Batch 1400, Loss: 0.2089
  Batch 1500, Loss: 0.2160
  Batch 1600, Loss: 0.2058
  Batch 1700, Loss: 0.2074
  Batch 1800, Loss: 0.2200
  Batch 1900, Loss: 0.2175
  Batch 2000, Loss: 0.2235
  Batch 2100, Loss: 0.2265
  Batch 2200, Loss: 0.2156
  Batch 2300, Loss: 0.2174
  Batch 2400, Loss: 0.2238
  Batch 2500, Loss: 0.2117
  Batch 2600, Loss: 0.2017
  Batch 2700, Loss: 0.2080
  Batch 2800, Loss: 0.2211
  Batch 2900, Loss: 0.2064
  Batch 3000, Loss: 0.2004
  Batch 3100, Loss: 0.2108
  Batch 3200, Loss: 0.2138
  Batch 3300, Loss: 0.2109
  Batch 34

                                                                      

Negative samples generated successfully.
Dataset prepared with users, items, and labels.
Starting Epoch 4/5
  Batch 100, Loss: 0.2141
  Batch 200, Loss: 0.2196
  Batch 300, Loss: 0.2013
  Batch 400, Loss: 0.2164
  Batch 500, Loss: 0.2050
  Batch 600, Loss: 0.2096
  Batch 700, Loss: 0.2147
  Batch 800, Loss: 0.2132
  Batch 900, Loss: 0.2112
  Batch 1000, Loss: 0.2162
  Batch 1100, Loss: 0.2148
  Batch 1200, Loss: 0.2145
  Batch 1300, Loss: 0.1976
  Batch 1400, Loss: 0.2066
  Batch 1500, Loss: 0.2106
  Batch 1600, Loss: 0.2134
  Batch 1700, Loss: 0.2013
  Batch 1800, Loss: 0.2093
  Batch 1900, Loss: 0.2105
  Batch 2000, Loss: 0.2053
  Batch 2100, Loss: 0.2200
  Batch 2200, Loss: 0.2106
  Batch 2300, Loss: 0.2167
  Batch 2400, Loss: 0.2056
  Batch 2500, Loss: 0.1947
  Batch 2600, Loss: 0.2080
  Batch 2700, Loss: 0.1956
  Batch 2800, Loss: 0.2038
  Batch 2900, Loss: 0.2116
  Batch 3000, Loss: 0.2079
  Batch 3100, Loss: 0.2288
  Batch 3200, Loss: 0.2064
  Batch 3300, Loss: 0.2104
  Batch 34

                                                                      

Negative samples generated successfully.
Dataset prepared with users, items, and labels.
Starting Epoch 5/5
  Batch 100, Loss: 0.2102
  Batch 200, Loss: 0.2216
  Batch 300, Loss: 0.2104
  Batch 400, Loss: 0.2021
  Batch 500, Loss: 0.2082
  Batch 600, Loss: 0.2002
  Batch 700, Loss: 0.2098
  Batch 800, Loss: 0.1946
  Batch 900, Loss: 0.2059
  Batch 1000, Loss: 0.1905
  Batch 1100, Loss: 0.2080
  Batch 1200, Loss: 0.2101
  Batch 1300, Loss: 0.2127
  Batch 1400, Loss: 0.2030
  Batch 1500, Loss: 0.2074
  Batch 1600, Loss: 0.2091
  Batch 1700, Loss: 0.2093
  Batch 1800, Loss: 0.2107
  Batch 1900, Loss: 0.2117
  Batch 2000, Loss: 0.1924
  Batch 2100, Loss: 0.2069
  Batch 2200, Loss: 0.2074
  Batch 2300, Loss: 0.2053
  Batch 2400, Loss: 0.2017
  Batch 2500, Loss: 0.2171
  Batch 2600, Loss: 0.2059
  Batch 2700, Loss: 0.1953
  Batch 2800, Loss: 0.2072
  Batch 2900, Loss: 0.1971
  Batch 3000, Loss: 0.2175
  Batch 3100, Loss: 0.1977
  Batch 3200, Loss: 0.2067
  Batch 3300, Loss: 0.2059
  Batch 34

In [5]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    processing_device = torch.device('cuda')
else:
    processing_device = torch.device('cpu')

In [6]:
# Get a list of all movie IDs
# all_movieIds = ratings['movieId'].unique()

# # Placeholders that will hold the training data
# users, items, labels = [], [], []

# # This is the set of items that each user has interaction with
# user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# # 4:1 ratio of negative to positive samples
# num_negatives = 4

# for (u, i) in tqdm(user_item_set):
#     users.append(u)
#     items.append(i)
#     labels.append(1) # items that the user has interacted with are positive
#     for _ in range(num_negatives):
#         # randomly select an item
#         negative_item = np.random.choice(all_movieIds) 
#         # check that the user has not interacted with this item
#         while (u, negative_item) in user_item_set:
#             negative_item = np.random.choice(all_movieIds)
#         users.append(u)
#         items.append(negative_item)
#         labels.append(0) # items not interacted with are negative

In [7]:
# Evaluation protocol: for every held-out (user, item) pair, rank the held-out
# item against up to 99 distinct items the user never interacted with and
# count a hit when it lands in the top 10.
NUM_EVAL_NEGATIVES = 99
TOP_K = 10
REPORT_EVERY = 4000

# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

# Built once instead of on every loop iteration.
all_movie_id_set = set(all_movieIds)

# Score on whatever device the model lives on, so this also runs without a GPU.
eval_device = next(model.parameters()).device

model.eval()  # inference mode for the layers
hits = []
with torch.no_grad():  # no gradients are needed for scoring
    for step, (u, i) in enumerate(tqdm(test_user_item_set), start=1):
        not_interacted_items = list(all_movie_id_set - set(user_interacted_items[u]))
        # replace=False guarantees distinct negatives; the sample size is capped
        # so users with very few un-interacted items do not raise.
        num_negatives = min(NUM_EVAL_NEGATIVES, len(not_interacted_items))
        selected_not_interacted = list(
            np.random.choice(not_interacted_items, num_negatives, replace=False)
        )
        test_items = selected_not_interacted + [i]

        predicted_labels = model(
            torch.tensor([u] * len(test_items), dtype=torch.long).to(eval_device),
            torch.tensor(test_items, dtype=torch.long).to(eval_device)
        ).reshape(-1).cpu().numpy()

        # Items with the TOP_K highest predicted scores.
        top10_items = [test_items[j] for j in np.argsort(predicted_labels)[::-1][:TOP_K]]

        hits.append(1 if i in top10_items else 0)

        # Print the running hit ratio every REPORT_EVERY steps
        if step % REPORT_EVERY == 0:
            current_hit_ratio = np.mean(hits)
            print(f"Step {step}: Current Hit Ratio @10 = {current_hit_ratio:.4f}")

# Final hit ratio
print("The Hit Ratio @ 10 is {:.4f}".format(np.mean(hits)))


 10%|▉         | 4025/41547 [00:20<03:13, 194.10it/s]

Step 4000: Current Hit Ratio @10 = 0.8203


 19%|█▉        | 8022/41547 [00:41<02:52, 193.98it/s]

Step 8000: Current Hit Ratio @10 = 0.8253


 29%|██▉       | 12029/41547 [01:01<02:31, 194.97it/s]

Step 12000: Current Hit Ratio @10 = 0.8213


 39%|███▊      | 16030/41547 [01:22<02:09, 196.36it/s]

Step 16000: Current Hit Ratio @10 = 0.8230


 48%|████▊     | 20022/41547 [01:43<01:52, 192.15it/s]

Step 20000: Current Hit Ratio @10 = 0.8229


 58%|█████▊    | 24038/41547 [02:04<01:30, 192.93it/s]

Step 24000: Current Hit Ratio @10 = 0.8218


 67%|██████▋   | 28024/41547 [02:25<01:10, 191.43it/s]

Step 28000: Current Hit Ratio @10 = 0.8205


 77%|███████▋  | 32026/41547 [02:46<00:48, 195.44it/s]

Step 32000: Current Hit Ratio @10 = 0.8214


 87%|████████▋ | 36033/41547 [03:07<00:28, 193.00it/s]

Step 36000: Current Hit Ratio @10 = 0.8213


 96%|█████████▋| 40031/41547 [03:28<00:08, 181.47it/s]

Step 40000: Current Hit Ratio @10 = 0.8210


100%|██████████| 41547/41547 [03:35<00:00, 192.36it/s]

The Hit Ratio @ 10 is 0.8211





In [8]:
# Persist only the learned parameters (the state_dict), not the module object.
torch.save(
    obj=model.state_dict(),
    f='/kaggle/working/model_v3_2.pth',
)