<a href="https://www.kaggle.com/code/bineetbairagi/ecomrec?scriptVersionId=259155805" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
import scipy.sparse as sp

# Optional KaggleHub handling: the script must still run where the package is
# missing or fails to import, so any import failure only disables the
# download path instead of aborting.
try:
    import kagglehub
except Exception:
    kagglehub = None

# --------------------------
# 0) Load Kaggle dataset robustly
# --------------------------

DATASET_HANDLE = "bhadramohit/customer-shopping-latest-trends-dataset"


def generate_toy_data(n_users=50, n_events_per_user=20, seed=42):
    """Build a small synthetic event log used when the real dataset is unavailable.

    Args:
        n_users: Number of synthetic users; ids are ``'u0'``, ``'u1'``, ...
        n_events_per_user: Number of events generated for each user.
        seed: Seed for ``numpy.random.default_rng``; the same seed yields the
            same queries and categories.

    Returns:
        A DataFrame with columns ``user_id``, ``timestamp``, ``queries`` (a
        list of two query strings) and ``target_category``. The columns are
        present even when no rows are generated.
    """
    rng = np.random.default_rng(seed)
    categories = ['electronics', 'clothing', 'home', 'sports', 'books']
    queries_pool = [
        'blue jeans', 'running shoes', 'laptop', 'coffee maker', 'smartphone',
        'wireless headphones', 'winter coat', 'gaming mouse', 'kitchen blender', 'yoga mat'
    ]
    # Loop-invariant start of the synthetic timeline.
    base_ts = datetime(2024, 1, 1).timestamp()
    rows = []
    for user_id in range(n_users):
        for t in range(n_events_per_user):
            # One event per hour plus a sub-hour jitter, so each user's
            # timestamps are strictly increasing.
            ts = base_ts + t*3600 + rng.integers(0, 3600)
            q1 = rng.choice(queries_pool)
            q2 = rng.choice(queries_pool)
            target = rng.choice(categories)
            rows.append({
                'user_id': f'u{user_id}',
                'timestamp': ts,
                'queries': [q1, q2],
                'target_category': target
            })
    return pd.DataFrame(rows, columns=['user_id', 'timestamp', 'queries', 'target_category'])


def _find_dataset_csv(download_result):
    """Return the path of the CSV to load from a kagglehub download result.

    Accepts a directory path (``str`` or path-like) or a non-empty list/tuple
    whose first element is one.

    Raises:
        FileNotFoundError: If no existing directory can be derived from
            *download_result* or the directory holds no ``.csv`` file.
    """
    if isinstance(download_result, (list, tuple)) and len(download_result) > 0:
        download_result = download_result[0]
    if not isinstance(download_result, (str, os.PathLike)):
        raise FileNotFoundError("Dataset directory not found.")
    dataset_dir = os.fspath(download_result)
    if not os.path.isdir(dataset_dir):
        raise FileNotFoundError("Dataset directory not found.")

    # os.listdir returns entries in arbitrary order; sort so the same file is
    # chosen on every run when the directory holds several CSVs.
    csv_files = sorted(f for f in os.listdir(dataset_dir) if f.lower().endswith(".csv"))
    if not csv_files:
        raise FileNotFoundError("No CSV in dataset.")
    return os.path.join(dataset_dir, csv_files[0])


df = None

try:
    if kagglehub is None:
        raise ImportError("kagglehub is not available.")

    file_path = _find_dataset_csv(kagglehub.dataset_download(DATASET_HANDLE))
    df = pd.read_csv(file_path)
    print(f"Loaded dataset from KaggleHub: {file_path}")

except Exception as e:
    # Deliberate catch-all: whatever goes wrong with the download or the CSV,
    # report it and continue with synthetic data.
    print(f"Failed to load dataset from KaggleHub. Error: {e}")
    print("Using fallback toy data.")
    df = generate_toy_data()

# --------------------------
# 1) Normalize text + targets
# --------------------------

# Canonicalise every column label in place: stringify, trim surrounding
# whitespace, lower-case. Inner spaces are kept as they are.
df.columns = df.columns.map(lambda label: str(label).strip().lower())
print("Columns:", list(df.columns))

def to_text(val):
    """Convert a cell value into a single text string.

    Args:
        val: A string, an iterable of tokens, or any other value.

    Returns:
        *val* unchanged if it is already a string; the space-joined ``str()``
        of each element if it is iterable; otherwise ``str(val)``.
    """
    if isinstance(val, str):
        return val
    if isinstance(val, (bytes, bytearray)):
        # Iterating bytes yields integers; treat them as one scalar value.
        return str(val)
    try:
        # Stringify each element so a sequence with non-string items is still
        # joined token by token instead of collapsing to the container's repr.
        return ' '.join(map(str, val))
    except TypeError:
        # Not iterable (numbers, None, NaN, ...).
        return str(val)

def _first_present(columns, candidates):
    """Return the first name in *candidates* found in *columns*, else None."""
    return next((name for name in candidates if name in columns), None)


if 'queries' in df.columns:
    # Fail early with a clear message instead of a bare KeyError further down.
    if 'target_category' not in df.columns:
        raise KeyError("Column 'target_category' is required when 'queries' is present.")
    df['text'] = df['queries'].apply(to_text)
    target_col = 'target_category'
elif 'category' in df.columns:
    df['text'] = df['category'].astype(str)
    target_col = 'category'
else:
    raise KeyError("No suitable text column found.")

# --------------------------
# 1.5) Add timestamp for ordering
# --------------------------

# Number of preceding events concatenated into each sample's history text.
history_len = 3

if 'timestamp' not in df.columns:
    # Column names were stripped and lower-cased but inner spaces survive, so
    # accept both the underscore and the space spelling.
    date_col = _first_present(df.columns, ('invoice_date', 'invoice date'))
    if date_col is not None:
        parsed = pd.to_datetime(df[date_col], errors='coerce', utc=True)
        # Epoch seconds via timedelta arithmetic: independent of the datetime
        # resolution, and unparseable dates become NaN rather than breaking an
        # integer cast.
        df['timestamp'] = (parsed - pd.Timestamp("1970-01-01", tz="UTC")) / pd.Timedelta(seconds=1)
    else:
        # No date information: fall back to the original row order.
        df['timestamp'] = np.arange(len(df))

if 'user_id' not in df.columns:
    id_col = _first_present(df.columns, ('customer_id', 'customer id'))
    user_ids = df[id_col].astype(str) if id_col is not None else None
    # Per-customer histories are only usable if at least one customer has more
    # than history_len rows; otherwise no sample could be built downstream.
    if user_ids is not None and user_ids.value_counts().max() > history_len:
        df['user_id'] = user_ids
    else:
        if user_ids is not None:
            print(f"No value of '{id_col}' has more than {history_len} rows; "
                  "treating all rows as one global user.")
        df['user_id'] = "global_user"

# --------------------------
# 2) Build history-based samples
# --------------------------

# Rows with a missing timestamp sort last within their user (pandas default).
df = df.sort_values(['user_id', 'timestamp']).reset_index(drop=True)

def build_history_features(user_id, group, window=None, target_column=None):
    """Turn one user's ordered events into (history text, next target) samples.

    For every position ``i >= window`` the texts of the ``window`` preceding
    rows are joined with spaces and paired with the target of row ``i``.

    Args:
        user_id: Identifier copied into every produced sample.
        group: DataFrame of that user's events with a ``'text'`` column and
            the target column, already in the desired order.
        window: Number of preceding rows per history. Defaults to the
            module-level ``history_len``.
        target_column: Column holding the label. Defaults to the module-level
            ``target_col``.

    Returns:
        A DataFrame with columns ``user_id``, ``history_text`` and
        ``target_category``; it has those columns even when *group* is too
        short to produce any sample.

    Raises:
        ValueError: If *window* is negative.
    """
    if window is None:
        window = history_len
    if target_column is None:
        target_column = target_col
    if window < 0:
        raise ValueError("window must be non-negative.")

    texts = group['text'].tolist()
    targets = group[target_column].astype(str).tolist()
    samples = [
        {
            'user_id': user_id,
            'history_text': ' '.join(texts[i - window:i]),
            'target_category': targets[i],
        }
        for i in range(window, len(texts))
    ]
    # Explicit columns keep the schema stable for an empty sample list.
    return pd.DataFrame(samples, columns=['user_id', 'history_text', 'target_category'])

# One frame of samples per user that has more events than the history window.
user_frames = [
    build_history_features(uid, g)
    for uid, g in df.groupby('user_id')
    if len(g) > history_len
]
if not user_frames:
    # pd.concat on an empty list raises an opaque "No objects to concatenate".
    raise ValueError(
        f"No user has more than {history_len} events; cannot build history samples."
    )
samples_df = pd.concat(user_frames, ignore_index=True)

print("Generated samples:", samples_df.shape)

X_text = samples_df['history_text'].values
y_targets = samples_df['target_category'].values

# Map category strings to integer class ids; class_names[i] is the label of id i.
le = LabelEncoder()
y = le.fit_transform(y_targets)
class_names = le.classes_

# Stateless hashing of the history text into a fixed-width sparse matrix.
vectorizer = HashingVectorizer(n_features=2**20, alternate_sign=False, lowercase=True)
X_hashed = vectorizer.transform(X_text)

# A stratified split needs at least two samples of every class; with a rarer
# class, fall back to a plain random split instead of failing.
can_stratify = np.bincount(y).min() >= 2
if not can_stratify:
    print("Some class has fewer than 2 samples; splitting without stratification.")
X_train, X_val, y_train, y_val = train_test_split(
    X_hashed, y, test_size=0.2, random_state=42,
    stratify=y if can_stratify else None,
)

# --------------------------
# 3) PyTorch Dataset + Model
# --------------------------

def sparse_collate_fn(batch):
    """Collate ``(sparse_row, label)`` pairs into dense tensors.

    Args:
        batch: Sequence of ``(features, label)`` items whose features can be
            stacked with ``scipy.sparse.vstack``.

    Returns:
        A ``(features, targets)`` tuple: a float32 tensor of the stacked,
        densified rows and an int64 tensor of the labels.
    """
    rows, labels = zip(*batch)
    # Densify only the stacked batch, never the whole feature matrix.
    dense = sp.vstack(list(rows)).toarray()
    features = torch.tensor(dense, dtype=torch.float32)
    targets = torch.tensor(list(labels), dtype=torch.long)
    return features, targets

class TextDataset(Dataset):
    """Dataset pairing a row-indexable feature container with aligned labels."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of rows in the feature container."""
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position *idx*."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Both loaders share the batch size and the sparse-to-dense collate step;
# only the training loader shuffles.
train_loader, val_loader = (
    DataLoader(
        TextDataset(features, labels),
        batch_size=64,
        shuffle=reshuffle,
        collate_fn=sparse_collate_fn,
    )
    for features, labels, reshuffle in (
        (X_train, y_train, True),
        (X_val, y_val, False),
    )
)

class SimpleNet(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        input_dim: Width of each input row.
        hidden_dim: Width of the hidden layer.
        n_classes: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, n_classes):
        super().__init__()
        # Attribute names double as state-dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        """Return the raw class logits for the batch *x*."""
        hidden = self.fc1(x)
        hidden = self.relu(hidden)
        hidden = self.drop(hidden)
        return self.fc2(hidden)

# Size the input layer from the feature matrix itself so it cannot drift from
# the vectorizer's n_features if that setting is ever changed.
model = SimpleNet(X_train.shape[1], 256, len(class_names))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Cross-entropy over raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --------------------------
# 4) Train & Evaluate
# --------------------------

def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``, and leaves the model in training mode.

    Returns:
        The sum of per-batch losses weighted by batch size, divided by the
        number of items in the training dataset.
    """
    model.train()
    running = 0
    for features, labels in train_loader:
        features = features.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch counts proportionally.
        running += batch_loss.item() * features.size(0)
    n_samples = len(train_loader.dataset)
    return running / n_samples

def evaluate(loader):
    """Return the accuracy of the module-level ``model`` on *loader*.

    Switches the model to evaluation mode and runs without gradient tracking.

    Args:
        loader: Iterable yielding ``(features, labels)`` tensor batches.

    Returns:
        Number of correct argmax predictions divided by the number of labels.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for features, labels in loader:
            features = features.to(device)
            labels = labels.to(device)
            predicted = model(features).argmax(1)
            hits += (predicted == labels).sum().item()
            seen += labels.size(0)
    return hits / seen

# Five epochs; after each training pass, score the model on the validation set.
for epoch in range(1, 6):
    # Tuple elements evaluate left to right: train first, then validate.
    loss, acc = train_one_epoch(), evaluate(val_loader)
    print(f"Epoch {epoch}: Loss {loss:.4f}, Val Acc {acc:.4f}")

# --------------------------
# 5) Inference Helper
# --------------------------

def predict_next_category(history_text):
    """Predict the category label for one history string.

    The text is hashed with the module-level ``vectorizer``, scored by
    ``model`` in evaluation mode without gradients, and the arg-max class is
    looked up in ``class_names``.

    Args:
        history_text: The history text to score.

    Returns:
        The entry of ``class_names`` with the highest logit.
    """
    hashed = vectorizer.transform([history_text])
    dense = hashed.toarray().astype(np.float32)
    inputs = torch.tensor(dense, dtype=torch.float32).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(inputs)
        best = logits.argmax(1).item()
    return class_names[best]

# Smoke-test the inference helper on a hand-written history string.
example = "blue jeans running shoes laptop"
print("Example history:", example)
predicted_category = predict_next_category(example)
print("Predicted next category:", predicted_category)


Loaded dataset from KaggleHub: /kaggle/input/customer-shopping-latest-trends-dataset/shopping_trends.csv
Columns: ['customer id', 'age', 'gender', 'item purchased', 'category', 'purchase amount (usd)', 'location', 'size', 'color', 'season', 'review rating', 'subscription status', 'payment method', 'shipping type', 'discount applied', 'promo code used', 'previous purchases', 'preferred payment method', 'frequency of purchases']
Generated samples: (3897, 3)
Epoch 1: Loss 1.3027, Val Acc 0.4449
Epoch 2: Loss 1.2212, Val Acc 0.4449
Epoch 3: Loss 1.2201, Val Acc 0.4449
Epoch 4: Loss 1.2209, Val Acc 0.4449
Epoch 5: Loss 1.2204, Val Acc 0.4449
Example history: blue jeans running shoes laptop
Predicted next category: Clothing
