<a href="https://colab.research.google.com/github/Eatosin/NextClick-RecSys-Engine/blob/main/NextClick_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- COLAB SETUP BLOCK ---
!pip install wget pandas numpy

import pandas as pd
import numpy as np
import wget
import zipfile
import os
import logging

# --- CONFIGURATION ---
DATA_URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
RAW_DIR = "/content/data/raw"
PROCESSED_DIR = "/content/data/processed"
MIN_SESSION_LENGTH = 5

# Configure Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

class DataPipeline:
    def __init__(self):
        os.makedirs(RAW_DIR, exist_ok=True)
        os.makedirs(PROCESSED_DIR, exist_ok=True)

    def download_data(self):
        zip_path = os.path.join(RAW_DIR, "ml-1m.zip")
        if not os.path.exists(zip_path):
            logger.info("ðŸ“¡ Downloading MovieLens 1M dataset...")
            wget.download(DATA_URL, zip_path)
            logger.info("\nâœ… Download Complete.")

        logger.info("ðŸ“¦ Extracting data...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(RAW_DIR)

        return os.path.join(RAW_DIR, "ml-1m/ratings.dat")

    def preprocess(self, file_path):
        logger.info("ðŸ§¹ Starting Preprocessing...")

        # Load Data
        df = pd.read_csv(file_path, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'], engine='python')

        # Filter & Sort
        df = df.sort_values(by=['uid', 'timestamp'])

        # Mapping
        unique_movies = df['mid'].unique()
        movie_map = {mid: i + 1 for i, mid in enumerate(unique_movies)}
        df['mid'] = df['mid'].map(movie_map)

        logger.info(f"ðŸ“Š Unique Movies: {len(unique_movies)}")
        logger.info(f"ðŸ“Š Total Interactions: {len(df)}")

        # Grouping
        logger.info("ðŸ”— Grouping by User Session...")
        user_group = df.groupby('uid')['mid'].apply(list)

        # Filter
        user_group = user_group[user_group.apply(len) >= MIN_SESSION_LENGTH]

        # Splitting
        train_seqs, val_seqs, test_seqs = [], [], []

        for seq in user_group:
            train_seqs.append(seq[:-2])
            val_seqs.append(seq[:-1])
            test_seqs.append(seq)

        logger.info(f"âœ… Data Processed. Users: {len(user_group)}")

        # Save
        np.save(os.path.join(PROCESSED_DIR, "train.npy"), np.array(train_seqs, dtype=object))
        np.save(os.path.join(PROCESSED_DIR, "val.npy"), np.array(val_seqs, dtype=object))
        np.save(os.path.join(PROCESSED_DIR, "test.npy"), np.array(test_seqs, dtype=object))

        # Save Metadata
        with open(os.path.join(PROCESSED_DIR, "meta.txt"), "w") as f:
            f.write(str(len(unique_movies) + 1))

        logger.info("ðŸ’¾ Artifacts Saved to /content/data/processed/")

# Execution
pipeline = DataPipeline()
raw_path = pipeline.download_data()
pipeline.preprocess(raw_path)

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=38dafe19f373fc480aa6d9e94af650161cc4d9e697fd2e0d5c38e98492ba02bb
  Stored in directory: /root/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


INFO:root:ðŸ“¡ Downloading MovieLens 1M dataset...
INFO:root:
âœ… Download Complete.
INFO:root:ðŸ“¦ Extracting data...
INFO:root:ðŸ§¹ Starting Preprocessing...
INFO:root:ðŸ“Š Unique Movies: 3706
INFO:root:ðŸ“Š Total Interactions: 1000209
INFO:root:ðŸ”— Grouping by User Session...
INFO:root:âœ… Data Processed. Users: 6040
INFO:root:ðŸ’¾ Artifacts Saved to /content/data/processed/


In [2]:
import torch
import torch.nn as nn
import numpy as np
import os

# --- MODEL CONFIGURATION ---
MAX_LEN = 50       # Max sequence length to look back
HIDDEN_UNITS = 64  # Embedding size
NUM_HEADS = 2      # Attention heads (Multi-head attention)
NUM_LAYERS = 2     # Transformer blocks
DROPOUT = 0.2
BATCH_SIZE = 128
EPOCHS = 10        # For demo speed (In prod, use 50+)
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Item Count
with open("/content/data/processed/meta.txt", "r") as f:
    ITEM_COUNT = int(f.read())

print(f"ðŸš€ Initializing SASRec on {DEVICE} for {ITEM_COUNT} items...")

# --- THE SASREC ARCHITECTURE ---
class SASRec(nn.Module):
    def __init__(self, item_num, hidden_units, max_len, num_heads, num_layers, dropout_rate):
        super(SASRec, self).__init__()
        self.item_emb = nn.Embedding(item_num + 1, hidden_units, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, hidden_units)
        self.emb_dropout = nn.Dropout(dropout_rate)

        # Transformer Blocks
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_units,
            nhead=num_heads,
            dropout=dropout_rate,
            dim_feedforward=hidden_units * 4
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.last_layernorm = nn.LayerNorm(hidden_units)

    def forward(self, log_seqs):
        # Create Sequence Embeddings
        seqs = self.item_emb(log_seqs)
        positions = np.tile(np.array(range(log_seqs.shape[1])), [log_seqs.shape[0], 1])
        seqs += self.pos_emb(torch.LongTensor(positions).to(DEVICE))

        # Masking (Ignore Padding 0)
        timeline_mask = (log_seqs == 0)

        # Pass through Transformer
        # PyTorch Transformer expects (Seq_Len, Batch, Hidden), so we transpose
        seqs = seqs.transpose(0, 1)
        output = self.transformer_encoder(seqs, src_key_padding_mask=timeline_mask)
        output = output.transpose(0, 1)

        output = self.last_layernorm(output)
        return output # Returns vectors for all steps

    def predict(self, log_seqs, item_indices):
        # Used for Inference (Predict next item)
        log_feats = self.forward(log_seqs)
        final_feat = log_feats[:, -1, :] # Take the last step only
        item_embs = self.item_emb(item_indices)
        logits = (final_feat * item_embs).sum(dim=-1)
        return logits

# --- DATA LOADER ---
class RecDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, max_len):
        self.data = np.load(data_path, allow_pickle=True)
        self.max_len = max_len

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        seq = self.data[idx]
        # Pad sequence to fixed length (Left Padding is standard for RNN/Transf)
        seq = seq[-self.max_len:]
        padding_len = self.max_len - len(seq)
        seq = [0] * padding_len + seq

        target = seq[-1]      # The item we want to predict
        input_seq = seq[:-1]  # The history
        input_seq = [0] + input_seq # Shift padding back

        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target, dtype=torch.long)

# --- TRAINING LOOP ---
def train():
    train_dataset = RecDataset("/content/data/processed/train.npy", MAX_LEN)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = SASRec(ITEM_COUNT, HIDDEN_UNITS, MAX_LEN, NUM_HEADS, NUM_LAYERS, DROPOUT).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss(ignore_index=0) # Ignore padding 0

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for step, (seqs, labels) in enumerate(train_loader):
            seqs, labels = seqs.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()

            # Forward Pass: Get vectors
            log_feats = model(seqs)
            final_feat = log_feats[:, -1, :] # Only predict based on the last step

            # Calculate Scores against ALL items (Expensive but accurate)
            # In production, we use Sampled Softmax, but for 3000 items, full softmax is fine
            logits = torch.matmul(final_feat, model.item_emb.weight.transpose(0, 1))

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if step % 100 == 0:
                print(f"Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f}")

    # Save Model Artifact
    torch.save(model.state_dict(), "sasrec_model.pth")
    print("ðŸ’¾ Model Saved: sasrec_model.pth")
    return model

# Run Training
trained_model = train()

ðŸš€ Initializing SASRec on cuda for 3707 items...




Epoch 1 | Step 0 | Loss: 36.0646
Epoch 2 | Step 0 | Loss: 17.8119
Epoch 3 | Step 0 | Loss: 14.3562
Epoch 4 | Step 0 | Loss: 13.0311
Epoch 5 | Step 0 | Loss: 11.4550
Epoch 6 | Step 0 | Loss: 11.0445
Epoch 7 | Step 0 | Loss: 9.3715
Epoch 8 | Step 0 | Loss: 9.0682
Epoch 9 | Step 0 | Loss: 9.0044
Epoch 10 | Step 0 | Loss: 8.2011
ðŸ’¾ Model Saved: sasrec_model.pth


In [3]:
def recommend_next_items(model, history_seq, top_k=5):
    model.eval()

    # 1. Preprocess Input
    # Pad to Max Len
    seq = history_seq[-MAX_LEN:]
    padding_len = MAX_LEN - len(seq)
    seq = [0] * padding_len + seq

    # Convert to Tensor
    seq_tensor = torch.tensor([seq], dtype=torch.long).to(DEVICE)

    # 2. Model Inference
    with torch.no_grad():
        # Get the embedding for the sequence
        log_feats = model(seq_tensor)
        final_feat = log_feats[:, -1, :] # The user's current "state"

        # Calculate Scores (Dot Product with Item Embeddings)
        item_embs = model.item_emb.weight
        logits = torch.matmul(final_feat, item_embs.transpose(0, 1))

        # Remove padding (0) and history items (don't recommend what they just watched)
        logits[0, 0] = -float('inf')
        for i in history_seq:
            if i < ITEM_COUNT:
                logits[0, i] = -float('inf')

        # 3. Rank Top K
        scores, indices = torch.topk(logits, top_k)

    return indices[0].cpu().numpy().tolist()

# --- LIVE TEST ---
# Let's verify with a real movie sequence
# IDs from MovieLens:
# 1=Toy Story, 260=Star Wars IV, 1196=Star Wars V, 1210=Star Wars VI
print("ðŸ§ª Testing Recommendation Engine...")

sci_fi_fan = [260, 1196, 1210] # Watched Original Star Wars Trilogy
recs = recommend_next_items(trained_model, sci_fi_fan)

print(f"\nUser Watched: {sci_fi_fan} (Star Wars Trilogy)")
print(f"Recommended:  {recs}")

# 4. Decode (Optional: In a real app, we would map IDs back to Titles)
# Since we don't have the titles loaded in RAM, we just trust the IDs for now.
# Typically, 2628 (Star Wars I) or 1198 (Raiders of Lost Ark) should appear.

ðŸ§ª Testing Recommendation Engine...

User Watched: [260, 1196, 1210] (Star Wars Trilogy)
Recommended:  [186, 437, 887, 96, 272]
