In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the interim interaction table; the timestamp column is dropped right
# away because nothing downstream reads it.
dataset = pd.read_csv('../data/interim/dataset.csv').drop(columns=['timestamp'])

# Replace raw user ids with integer codes produced by a LabelEncoder. The
# fitted encoder is kept so codes can be mapped back to the raw ids later.
user_encoder = LabelEncoder()
dataset['user_id'] = user_encoder.fit_transform(dataset['user_id'])

# Same treatment for item ids, with an independent encoder.
item_encoder = LabelEncoder()
dataset['item_id'] = item_encoder.fit_transform(dataset['item_id'])

In [3]:
class RecommenderDataset(Dataset):
    """Expose the rows of a DataFrame as individual float32 tensors.

    Each item is one full row of the frame, in the frame's column order.
    """

    def __init__(self, dataframe):
        """Keep the frame's column labels and its values as a NumPy array."""
        self.columns = dataframe.columns
        self.data = dataframe.values

    def __len__(self):
        """Return the number of rows."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return row ``index`` as a new 1-D ``torch.float32`` tensor."""
        # Every column, ids included, is cast to float32; integer values are
        # only represented exactly up to 2**24 in that dtype.
        row = self.data[index]
        return torch.tensor(row, dtype=torch.float32)

class EmbeddingRecommenderModel(nn.Module):
    """Score a (user, item) pair with learned embeddings and a small MLP.

    The user and item embeddings are concatenated and passed through three
    hidden linear layers (128, 64 and 32 units, the first two followed by
    batch normalisation) before a final single-unit linear output.
    """

    def __init__(self, user_size, item_size, embedding_size=64):
        """Create the embedding tables and the MLP layers.

        Args:
            user_size: Number of rows in the user embedding table.
            item_size: Number of rows in the item embedding table.
            embedding_size: Width of each embedding vector.
        """
        super().__init__()

        # Lookup tables, one vector per user index / item index.
        self.user_embedding = nn.Embedding(user_size, embedding_size)
        self.item_embedding = nn.Embedding(item_size, embedding_size)

        # First hidden stage consumes the concatenated user+item vector.
        self.fc1 = nn.Linear(2 * embedding_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu1 = nn.ReLU()

        # Second hidden stage.
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()

        # Third hidden stage (no batch norm) and the scalar output head.
        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(32, 1)

    def forward(self, user_id, item_id):
        """Return one score per (user, item) pair.

        Args:
            user_id: Integer tensor of user indices.
            item_id: Integer tensor of item indices, paired with ``user_id``.

        Returns:
            Output of the final linear layer, with a trailing dimension of 1.
        """
        pair_features = torch.cat(
            [self.user_embedding(user_id), self.item_embedding(item_id)],
            dim=1,
        )

        hidden = self.fc1(pair_features)
        hidden = self.bn1(hidden)
        hidden = self.relu1(hidden)

        hidden = self.fc2(hidden)
        hidden = self.bn2(hidden)
        hidden = self.relu2(hidden)

        hidden = self.fc3(hidden)
        hidden = self.relu3(hidden)

        return self.fc4(hidden)

class RecommenderSystem:
    """Train, evaluate and query an ``EmbeddingRecommenderModel``.

    ``train`` and ``evaluate`` read each row positionally: column 0 is used as
    the user index, column 1 as the item index and the last column as the
    regression target. The frame passed to ``__init__`` must therefore be laid
    out in that order.
    """

    def __init__(self, dataframe, batch_size=64, lr=0.001, epochs=10):
        """Build the model, loss, optimiser and the train/test datasets.

        Args:
            dataframe: pandas DataFrame containing ``user_id`` and ``item_id``
                columns (see the class docstring for the positional layout).
            batch_size: Mini-batch size used for training and evaluation.
            lr: Learning rate of the Adam optimiser.
            epochs: Number of passes over the training split.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The embedding tables are sized by the number of distinct ids, so the
        # ids must already be contiguous codes in [0, nunique).
        user_size = dataframe['user_id'].nunique()
        item_size = dataframe['item_id'].nunique()
        self.model = EmbeddingRecommenderModel(user_size, item_size).to(self.device)
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.batch_size = batch_size
        self.epochs = epochs
        # The full frame is kept so predict_top_k can look up what a user has
        # already interacted with.
        self.dataset = dataframe
        # Fixed seed keeps the 80/20 split reproducible between runs.
        train_data, test_data = train_test_split(dataframe, test_size=0.2, random_state=42)
        self.train_dataset = RecommenderDataset(train_data)
        self.test_dataset = RecommenderDataset(test_data)

    def train(self):
        """Fit the model on the training split for ``self.epochs`` epochs.

        The progress bar shows the epoch number and the mean per-batch loss of
        the epoch that just finished.
        """
        n_train = len(self.train_dataset)
        # BatchNorm1d cannot normalise a single-row batch in training mode and
        # raises. Drop the trailing batch only when it would hold exactly one
        # row; every other configuration is left untouched.
        drop_last = n_train > self.batch_size and n_train % self.batch_size == 1
        train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=drop_last,
        )

        loop = tqdm(range(self.epochs))
        for epoch in loop:
            self.model.train()
            running_loss = 0.0

            for batch in train_loader:
                batch = batch.to(self.device)
                # Rows arrive as float32; the id columns are cast back to
                # integers for the embedding lookups.
                user_ids = batch[:, 0].long()
                item_ids = batch[:, 1].long()
                targets = batch[:, -1]

                self.optimizer.zero_grad()
                outputs = self.model(user_ids, item_ids)
                # squeeze(-1) removes only the trailing unit dimension, so the
                # prediction keeps the same 1-D shape as the targets for any
                # batch size.
                loss = self.criterion(outputs.squeeze(-1), targets)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

            loop.set_postfix({'Epoch': epoch + 1, 'loss': running_loss / len(train_loader)})

    def evaluate(self):
        """Print the mean per-batch MSE of the model on the test split."""
        test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)
        self.model.eval()
        total_loss = 0.0

        with torch.no_grad():
            for batch in test_loader:
                batch = batch.to(self.device)
                user_ids = batch[:, 0].long()
                item_ids = batch[:, 1].long()
                targets = batch[:, -1]
                outputs = self.model(user_ids, item_ids)
                # Same shape handling as in train().
                loss = self.criterion(outputs.squeeze(-1), targets)
                total_loss += loss.item()

        print(f"Test Loss: {total_loss / len(test_loader)}")

    def predict_top_k(self, user_id, top_k=5):
        """Return the highest-scoring items the user has no recorded interaction with.

        Args:
            user_id: User index, in the same encoding as ``self.dataset['user_id']``.
            top_k: Maximum number of items to return.

        Returns:
            A list of item indices ordered from highest to lowest score. It
            holds fewer than ``top_k`` entries when fewer unseen items exist,
            and is empty when the user has interacted with every item.
        """
        self.model.eval()

        num_items = self.model.item_embedding.num_embeddings
        all_item_ids = torch.arange(num_items, device=self.device)
        user_ids = torch.full((num_items,), user_id, dtype=torch.long, device=self.device)

        # Score the user against every item in one batch.
        with torch.no_grad():
            scores = self.model(user_ids, all_item_ids).reshape(-1)

        watched = self.dataset.loc[self.dataset['user_id'] == user_id, 'item_id'].tolist()
        # Sorted so the candidate order is deterministic.
        unwatched = sorted(set(range(num_items)) - set(watched))
        if not unwatched:
            return []

        # Explicit long dtype: these values are used as indices into `scores`.
        unwatched_item_ids = torch.tensor(unwatched, dtype=torch.long, device=self.device)
        # torch.topk raises if k exceeds the number of candidates.
        k = min(top_k, len(unwatched))
        _, indices = torch.topk(scores[unwatched_item_ids], k)

        return unwatched_item_ids[indices].cpu().tolist()


# Build the pipeline on the encoded interaction table, fit the model on the
# training split, then print the loss on the held-out test split.
recommender_system = RecommenderSystem(dataset)
recommender_system.train()
recommender_system.evaluate()

100%|██████████| 10/10 [01:07<00:00,  6.76s/it, Epoch=10, loss=3.66e-5]


Test Loss: 7.42944810311309e-05


In [19]:
# Recommendations for the user whose encoded id is 125, using the default
# top_k of 5. The returned values are encoded item ids; presumably
# item_encoder.inverse_transform maps them back to raw ids - confirm before use.
recommender_system.predict_top_k(125)

[890, 1018, 230, 1290, 1166]