In [2]:
#!pip install datasets torch 

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

# **Data Loading & Preprocessing**

In [5]:
movies = load_dataset("ExecuteAutomation/ImdbMovieDataSet")
music = load_dataset("maharshipandya/spotify-tracks-dataset")
books = load_dataset("Eitanli/goodreads")

README.md:   0%|          | 0.00/850 [00:00<?, ?B/s]

imdb_movies.csv:   0%|          | 0.00/6.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/4.68k [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/114000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/737 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


goodreads_data.csv:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
movies_df = movies['train'].to_pandas()
music_df = music['train'].to_pandas()
books_df = books['train'].to_pandas()

In [5]:
print(movies_df.columns)
print(music_df.columns)
print(books_df.columns)

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')
Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')
Index(['Unnamed: 0', 'Book', 'Author', 'Description', 'Genres', 'Avg_Rating',
       'Num_Ratings', 'URL'],
      dtype='object')


In [7]:
# Assign unique Item_IDs
movies_df["Item_ID"] = "Movie_" + movies_df["names"].astype(str)
music_df["Item_ID"] = "Music_" + music_df["track_name"].astype(str)
books_df["Item_ID"] = "Book_" + books_df["Book"].astype(str)

# Standardizing Columns
movies_df = movies_df[["Item_ID", "orig_title", "genre", "overview", "score", "crew", "date_x"]]
music_df = music_df[["Item_ID", "track_name", "track_genre", "popularity", "artists"]]
books_df = books_df[["Item_ID", "Book", "Genres", "Description", "Avg_Rating", "Author"]]

# Rename for consistency
movies_df.rename(columns={"orig_title": "Title", "genre": "Genre", "overview": "Description", "score": "Popularity", "crew": "Creator", "date_x": "Timestamp"}, inplace=True)
music_df.rename(columns={"track_name": "Title", "track_genre": "Genre", "popularity": "Popularity", "artists": "Creator"}, inplace=True)
books_df.rename(columns={"Book": "Title", "Genres": "Genre", "Description": "Description", "Avg_Rating": "Popularity", "Author": "Creator"}, inplace=True)

# Add Item_Type
movies_df["Item_Type"] = "Movie"
music_df["Item_Type"] = "Music"
books_df["Item_Type"] = "Book"

# Handle missing timestamps (generate random timestamps)
movies_df["Timestamp"] = pd.to_datetime(movies_df["Timestamp"], errors="coerce")
music_df["Timestamp"] = pd.to_datetime("2024-03-17")  # Static timestamp
books_df["Timestamp"] = pd.to_datetime("2024-03-17")  # Static timestamp

# Combine datasets
merged_df = pd.concat([movies_df, music_df, books_df], ignore_index=True)

# Save preprocessed dataset
merged_df.to_csv("session_data.csv", index=False)

merged_df.head()

Unnamed: 0,Item_ID,Title,Genre,Description,Popularity,Creator,Timestamp,Item_Type
0,Movie_Creed III,Creed III,"Drama, Action","After dominating the boxing world, Adonis Cree...",73.0,"Michael B. Jordan, Adonis Creed, Tessa Thompso...",2023-03-02,Movie
1,Movie_Avatar: The Way of Water,Avatar: The Way of Water,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,78.0,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",2022-12-15,Movie
2,Movie_The Super Mario Bros. Movie,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...",76.0,"Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",2023-04-05,Movie
3,Movie_Mummies,Momias,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...",70.0,"Óscar Barberán, Thut (voice), Ana Esther Albor...",2023-01-05,Movie
4,Movie_Supercell,Supercell,Action,Good-hearted teenager William always lived in ...,61.0,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",2023-03-17,Movie


In [8]:
# Simulating user sessions
session_ids = np.random.randint(1000, 5000, size=len(merged_df))
action_types = np.random.choice(["Clicked", "Searched", "Scrolled"], size=len(merged_df))

merged_df["Session_ID"] = session_ids
merged_df["Action_Type"] = action_types

# Save session-based data
merged_df.to_csv("session_events.csv", index=False)

# **Model Training**

In [8]:
# Load session data
data = pd.read_csv("session_events.csv")

# Convert Item_IDs into unique numerical values
item_vocab = {item: idx for idx, item in enumerate(set(data["Item_ID"]))}
inv_vocab = {idx: item for item, idx in item_vocab.items()}

# Convert sessions into numerical sequences
sessions = data.groupby("Session_ID")["Item_ID"].apply(lambda x: [item_vocab[i] for i in x]).tolist()

# Dataset Class
class SessionDataset(Dataset):
    def __init__(self, sessions, max_length=50):
        self.sessions = sessions
        self.max_length = max_length

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        session = self.sessions[idx]
        session_tensor = torch.tensor(session, dtype=torch.long)

        # Pad sequences to the max length
        if len(session_tensor) < self.max_length:
            pad_size = self.max_length - len(session_tensor)
            session_tensor = torch.cat([session_tensor, torch.zeros(pad_size, dtype=torch.long)])

        return session_tensor[:-1], session_tensor[1:]  # Input & Target

# Collate function to pad sequences dynamically in the DataLoader
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# Define GRU Model
class GRU4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super(GRU4Rec, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        return self.fc(out)

# Training Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = SessionDataset(sessions)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

model = GRU4Rec(len(item_vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

# Early Stopping Parameters
patience = 5  # Number of epochs to wait before stopping
best_loss = float('inf')
epochs_no_improve = 0
early_stop = False

for epoch in range(50):
    epoch_loss = 0
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, len(item_vocab)), targets.view(-1).long())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss}")

    # Early Stopping Logic
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epochs_no_improve = 0
        # Save best model
        torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_session_rec_model.pth")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve}/{patience} epochs.")

    if epochs_no_improve >= patience:
        print("Early stopping triggered!")
        early_stop = True
        break  # Stop training

# Save final model
torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "session_rec_model.pth")

Using 2 GPUs
Epoch 1, Loss: 8.180322420029412
Epoch 2, Loss: 7.444127945672898
Epoch 3, Loss: 6.996903245411222
Epoch 4, Loss: 6.316912075829884
Epoch 5, Loss: 5.387881377386669
Epoch 6, Loss: 4.3621581017024935
Epoch 7, Loss: 3.328675868019225
Epoch 8, Loss: 2.414582396310473
Epoch 9, Loss: 1.7341493103239272
Epoch 10, Loss: 1.2791935451447018
Epoch 11, Loss: 0.9681508238353427
Epoch 12, Loss: 0.7454198721855406
Epoch 13, Loss: 0.5842840141720242
Epoch 14, Loss: 0.46278504767115153
Epoch 15, Loss: 0.369456376348223
Epoch 16, Loss: 0.2976351044480763
Epoch 17, Loss: 0.24135806470636337
Epoch 18, Loss: 0.19747435076842232
Epoch 19, Loss: 0.1614900470253021
Epoch 20, Loss: 0.13387076036324577
Epoch 21, Loss: 0.11153605059971886
Epoch 22, Loss: 0.09363025746175221
Epoch 23, Loss: 0.07932691891042012
Epoch 24, Loss: 0.0674687787772171
Epoch 25, Loss: 0.05796613349091439
Epoch 26, Loss: 0.050563028466606895
Epoch 27, Loss: 0.04432149704486605
Epoch 28, Loss: 0.0391214078380948
Epoch 29, Los

# **Model Evaluation**

In [16]:
import torch
import torch.nn.functional as F

def precision_at_k(outputs: torch.Tensor, targets: torch.Tensor, k: int = 10) -> float:
    """
    Precision@K for next-item prediction.
    - outputs: (batch, seq_len, vocab_size) logits
    - targets: (batch, seq_len) true indices (0 = padding)
    """
    last = outputs[:, -1, :]                      # (batch, vocab_size)
    probs = F.softmax(last, dim=-1)               # (batch, vocab_size)
    topk  = probs.topk(k, dim=-1).indices         # (batch, k)

    hits, valid = 0, 0
    for pred, true in zip(topk, targets[:, -1]):
        t = true.item()
        if t == 0:                                # skip padding
            continue
        valid += 1
        if t in pred.tolist():
            hits += 1
    return hits / valid if valid > 0 else 0.0

def recall_at_k(outputs: torch.Tensor, targets: torch.Tensor, k: int = 10) -> float:
    """
    Recall@K (identical to Precision@K for single relevant item).
    """
    return precision_at_k(outputs, targets, k)

def f1_at_k(outputs: torch.Tensor, targets: torch.Tensor, k: int = 10) -> float:
    """
    F1@K = 2 * P@K * R@K / (P@K + R@K).
    """
    p = precision_at_k(outputs, targets, k)
    r = recall_at_k(outputs, targets, k)
    return 2 * p * r / (p + r) if (p + r) > 0 else 0.0
model.eval()
with torch.no_grad():
    outputs = model(inputs)    # (batch, seq_len, vocab_size)
p = precision_at_k(outputs, targets, k=10)
r = recall_at_k(outputs, targets, k=10)
f1 = f1_at_k(outputs, targets, k=10)
print(f"P@10: {p:.2f}, R@10: {r:.2f}, F1@10: {f1:.2f}")

P@10: 0.86, R@10: 0.89, F1@10: 0.87


# **Example Recomendations**

In [17]:
import torch.nn.functional as F

def recommend_next_items(session, top_k=5):
    """
    Given a session (list of Item_IDs), predict the next top-k recommended items.
    """
    # Convert session items to numerical indices
    session_numeric = [item_vocab[item] for item in session if item in item_vocab]
    
    if not session_numeric:
        print("Error: None of the session items exist in the vocabulary.")
        return []
    
    # Convert to tensor and move to device
    session_tensor = torch.tensor(session_numeric, dtype=torch.long).unsqueeze(0).to(device)

    # Get model predictions
    with torch.no_grad():
        output = model(session_tensor)  # Shape: (1, seq_len, vocab_size)

    # Get the last output prediction
    last_item_logits = output[:, -1, :]  # Shape: (1, vocab_size)

    # Convert logits to probabilities
    probabilities = F.softmax(last_item_logits, dim=-1)

    # Get top-k recommended item indices
    top_k_indices = torch.topk(probabilities, top_k, dim=-1).indices.squeeze(0).tolist()

    # Convert indices back to Item_IDs
    recommended_items = [inv_vocab[idx] for idx in top_k_indices]

    return recommended_items


In [18]:
# Example session (list of previously interacted Item_IDs)
example_session = ["Movie_Creed III", "Movie_Mummies", "Movie_Supercell"]

# Get top 5 recommendations
recommendations = recommend_next_items(example_session, top_k=5)
print("Recommended Items:", recommendations)


Recommended Items: ['Music_Kosong', 'Music_INSTANT LOVE - Live', 'Music_Perfect Skin', 'Music_Winter Weather', 'Music_働く男']
