# Collaborative Filtering mit NCF

In [None]:
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from utils.ncf import NCF
from utils.interactions_dataset import InteractionsDataset, get_user_item_maps

## 1. Datenvorbereitung

#### 1.1 Datenaufbereitung

In [None]:

# Load the raw interaction log and discard every row that has a null value.
data = pl.read_csv("data/ncf_data_v1.csv").drop_nulls()

# Summarise the cleaned data: distinct users, distinct articles, overall shape.
unique_user_count = data["user_pseudo_id"].n_unique()
unique_item_count = data["article_id"].n_unique()
print(unique_user_count, "unique users")
print(unique_item_count, "unique items")
print(data.shape, "rows in the dataset")

#### 1.2 Trainings- & Testdaten

In [None]:
# scikit-learn performs the 80/20 split on a pandas copy of the data; the fixed
# random_state keeps the split reproducible across runs.
pandas_train, pandas_test = train_test_split(
    data.to_pandas(),
    test_size=0.2,
    random_state=42,
)

# Convert both partitions back to polars for the rest of the notebook.
train_df, test_df = pl.from_pandas(pandas_train), pl.from_pandas(pandas_test)

# Report the size of each partition.
for title, frame in (("Training DataFrame:", train_df), ("\nTest DataFrame:", test_df)):
    print(title)
    print(frame.shape)

#### 1.3 Konvertierung zu PyTorch Tensoren

In [None]:
# Wrap both partitions as datasets that the DataLoaders can batch.
# NOTE(review): each InteractionsDataset is built from its own frame. If it
# derives its ID maps from that frame, test indices will not line up with the
# training indices (get_user_item_maps is imported but unused) - TODO confirm.
train_dataset, test_dataset = InteractionsDataset(train_df), InteractionsDataset(test_df)

# Only the training loader shuffles; the test loader keeps a fixed order.
batch_size = 1024
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# The model's user/item counts are taken from the training split's ID maps.
num_users, num_items = len(train_dataset.user_id_map), len(train_dataset.item_id_map)
print(f"Number of users: {num_users}\nNumber of items: {num_items}")

## 2. Modelltraining

In [None]:
# Train the NCF model on implicit feedback.
#
# The dataset only contains observed (positive) interactions. Training BCE on
# positives alone is degenerate: the loss is minimised by predicting 1 for every
# pair, so the model never learns to rank. For every observed interaction we
# therefore draw `num_negatives` random items and label those pairs 0.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_items, embedding_dim=32).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

num_epochs = 10
num_negatives = 4  # sampled negative items per observed interaction
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for user_idxs, item_idxs in train_loader:
        user_idxs, item_idxs = user_idxs.to(device), item_idxs.to(device)

        # Uniformly sampled items stand in for "not interacted". A sample may
        # occasionally collide with a real interaction; that noise is accepted.
        # NOTE(review): assumes item indices are contiguous in [0, num_items) -
        # confirm against InteractionsDataset.
        neg_user_idxs = user_idxs.repeat_interleave(num_negatives, dim=0)
        neg_item_idxs = torch.randint(
            0, num_items, neg_user_idxs.shape, dtype=item_idxs.dtype, device=device
        )
        batch_users = torch.cat([user_idxs, neg_user_idxs], dim=0)
        batch_items = torch.cat([item_idxs, neg_item_idxs], dim=0)

        optimizer.zero_grad()
        outputs = model(batch_users, batch_items).view(-1)

        # Positives occupy the first len(user_idxs) slots; the rest are negatives.
        # zeros_like creates the labels directly on the right device and dtype.
        labels = torch.zeros_like(outputs)
        labels[: user_idxs.shape[0]] = 1.0

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

## 3. Evaluierung auf Basis des nDCG@10

In [None]:
def dcg_at_k(scores, k=10):
    """Calculate Discounted Cumulative Gain (DCG) at k.

    Args:
        scores: Relevance values in ranked order (best-ranked first). Any
            array-like is accepted, including plain Python lists.
        k: Number of top positions that contribute to the sum.

    Returns:
        The DCG of the first ``k`` entries, using gain ``2**rel - 1`` and a
        ``log2(position + 1)`` discount. Returns 0.0 for empty input.
    """
    # Coerce first: list inputs support neither `2**scores` nor `.size`.
    gains = np.asarray(scores, dtype=float).ravel()[:k]
    if gains.size == 0:
        return 0.0
    # Positions are 1-based, so the discounts are log2(2), log2(3), ...
    discounts = np.log2(np.arange(2, gains.size + 2))
    return float(np.sum((2.0**gains - 1.0) / discounts))

def ndcg_at_k(truth, scores, k=10):
    """Calculate Normalized Discounted Cumulative Gain (nDCG) at k.

    Args:
        truth: Ground-truth relevance per candidate (array-like).
        scores: Predicted score per candidate, aligned with ``truth``. Inputs
            that are not 1-D (e.g. shape ``(N, 1)``) are flattened.
        k: Cut-off rank.

    Returns:
        DCG of the predicted ordering divided by the DCG of the ideal ordering,
        or 0.0 when the ideal DCG is zero (no relevant candidates).
    """
    # Flatten both inputs so argsort yields one index per candidate and the
    # arrays handed to dcg_at_k are real ndarrays rather than Python lists.
    truth = np.asarray(truth, dtype=float).ravel()
    scores = np.asarray(scores, dtype=float).ravel()

    ideal_order = np.sort(truth)[::-1]
    predicted_order = truth[np.argsort(scores)[::-1]]

    best_dcg = dcg_at_k(ideal_order, k)
    actual_dcg = dcg_at_k(predicted_order, k)
    return actual_dcg / best_dcg if best_dcg > 0 else 0.0

# Evaluate ranking quality with nDCG@10.
#
# Scoring only positive pairs and labelling every one of them relevant makes
# nDCG equal to 1.0 for any model. Instead, each held-out interaction is ranked
# against `num_eval_negatives` randomly sampled items, and nDCG is computed per
# interaction with exactly one relevant candidate.
# NOTE(review): assumes test user/item indices share the training index space
# and that item indices lie in [0, num_items) - confirm against
# InteractionsDataset.
num_eval_negatives = 99
eval_rng = torch.Generator().manual_seed(42)  # reproducible negative samples

# Column 0 holds the held-out positive; every other column is a sampled negative.
true_scores = np.zeros(num_eval_negatives + 1)
true_scores[0] = 1.0

model.eval()
ndcg_scores = []
with torch.no_grad():
    for user_idxs, item_idxs in test_loader:
        user_idxs = user_idxs.to(device).view(-1, 1)
        item_idxs = item_idxs.to(device).view(-1, 1)

        # Sample on the CPU so the seeded generator works on any device.
        negatives = torch.randint(
            0,
            num_items,
            (item_idxs.shape[0], num_eval_negatives),
            generator=eval_rng,
            dtype=item_idxs.dtype,
        ).to(device)
        candidates = torch.cat([item_idxs, negatives], dim=1)
        users = user_idxs.expand_as(candidates)

        # Score all (user, candidate) pairs in one pass, then restore the
        # (batch, 1 + num_eval_negatives) layout.
        scores = model(users.reshape(-1), candidates.reshape(-1))
        scores = scores.reshape(candidates.shape).cpu().numpy()

        ndcg_scores.extend(ndcg_at_k(true_scores, row) for row in scores)

average_ndcg = np.mean(ndcg_scores)
print(f"Mean nDCG@10: {average_ndcg:.4f}")