<a href="https://www.kaggle.com/code/bineetbairagi/shop-pred?scriptVersionId=259151318" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
import scipy.sparse as sp
import glob

# Optional: KaggleHub helper (import may vary by environment)
try:
    import kagglehub
except Exception:
    kagglehub = None

# --------------------------
# 0) Load Kaggle dataset robustly
# --------------------------

# User-configurable
DATASET_HANDLE = "bhadramohit/customer-shopping-latest-trends-dataset"
# EXPECTED_FILE is a hint; we will actually discover CSVs in the dataset
EXPECTED_FILE = "customer_shopping_data.csv"

def find_csv_in_dataset(dataset_path):
    """
    Recursively find candidate CSV files under dataset_path.
    Returns a list of full file paths.
    """
    csv_files = []
    for root, dirs, files in os.walk(dataset_path):
        for f in files:
            if f.lower().endswith('.csv'):
                csv_files.append(os.path.join(root, f))
    return csv_files

def load_dataframe_from_csv(file_path):
    print(f"Loading dataset CSV: {file_path}")
    return pd.read_csv(file_path)

df = None

try:
    if kagglehub is None:
        raise ImportError("kagglehub is not available in this environment.")

    # Attempt to download the dataset
    # Note: Some environments require different call signatures.
    # Here, we try a flexible approach that works with the provided API.
    path = None
    try:
        # Some versions support dataset_download(handle, filename=None)
        path = kagglehub.dataset_download(DATASET_HANDLE)  # no specific file
    except TypeError:
        # Fallback signature
        path = kagglehub.dataset_download(DATASET_HANDLE, None)

    if isinstance(path, str) and os.path.isdir(path):
        dataset_dir = path
    else:
        # If the API returns a different structure, try to coerce to a directory
        dataset_dir = None
        if isinstance(path, (list, tuple)) and len(path) > 0:
            candidate = path[0]
            if isinstance(candidate, str) and os.path.isdir(candidate):
                dataset_dir = candidate

    if dataset_dir is None:
        raise FileNotFoundError("Dataset directory not found after download.")

    # Discover CSV files inside the dataset directory
    csv_candidates = find_csv_in_dataset(dataset_dir)

    if len(csv_candidates) == 0:
        raise FileNotFoundError("No CSV files found in the downloaded dataset directory.")

    if len(csv_candidates) == 1:
        file_path = csv_candidates[0]
    else:
        # Heuristic: prefer files with 'shopping' or 'data' in the filename
        candidates_sorted = sorted(
            csv_candidates,
            key=lambda p: (
                ('shopping' in os.path.basename(p).lower()) * 1 +
                ('data' in os.path.basename(p).lower()) * 1
            ),
            reverse=True
        )
        file_path = candidates_sorted[0]

    df = load_dataframe_from_csv(file_path)
    print(f"Loaded dataset from KaggleHub: {file_path}")

except Exception as e:
    print(f"Failed to load dataset from KaggleHub. Error: {e}")
    print("Using fallback toy data.")

    # Fallback toy data to keep the workflow executable
    def generate_toy_data(n_users=50, n_events_per_user=20, seed=42):
        rng = np.random.default_rng(seed)
        categories = ['electronics', 'clothing', 'home', 'sports', 'books']
        queries_pool = [
            'blue jeans', 'running shoes', 'laptop', 'coffee maker', 'smartphone',
            'wireless headphones', 'winter coat', 'gaming mouse', 'kitchen blender', 'yoga mat'
        ]
        rows = []
        for user_id in range(n_users):
            for t in range(n_events_per_user):
                ts = datetime(2024, 1, 1).timestamp() + t*3600 + rng.integers(0, 3600)
                q1 = rng.choice(queries_pool)
                q2 = rng.choice(queries_pool)
                queries = [q1, q2]
                target = rng.choice(categories)
                rows.append({
                    'user_id': f'u{user_id}',
                    'timestamp': ts,
                    'queries': queries,
                    'target_category': target
                })
        return pd.DataFrame(rows)

    df = generate_toy_data()

# --------------------------
# 1) Normalize to a single text field
# --------------------------

# Normalize column names: strip spaces and lowercase for robust handling
df.columns = df.columns.astype(str)
df.columns = df.columns.str.strip().str.lower()

print("Columns after normalization:", df.columns.tolist())

# Helper to convert a list of queries to a single text field
def to_text(val):
    if isinstance(val, str):
        return val
    try:
        return ' '.join(val)
    except TypeError:
        try:
            return ' '.join(str(x) for x in val)
        except Exception:
            return str(val)

# Use 'category' for the real data, and 'queries' for the fallback toy data
if 'category' in df.columns:
    df['text'] = df['category'].astype(str)
    target_col = 'category'
elif 'queries' in df.columns:
    df['text'] = df['queries'].apply(to_text)
    target_col = 'target_category'
else:
    raise KeyError("No suitable text column found in the dataframe. Expected 'queries' or 'category'.")

print("Text field created. Sample:")
print(df['text'].head())
print("Target column present:", target_col in df.columns)

# --------------------------
# 2) Prepare data for modeling
# --------------------------

# Encode targets
le = LabelEncoder()
y = le.fit_transform(df[target_col].astype(str))
class_names = list(le.classes_)
n_classes = len(class_names)
print("Classes:", class_names, "Number of classes:", n_classes)

# Features: text -> hashing vectorization, keeping it sparse
vectorizer = HashingVectorizer(n_features=2**20, alternate_sign=False, lowercase=True)
X_hashed = vectorizer.transform(df['text'].values)

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X_hashed, y, test_size=0.2, random_state=42, stratify=y)

# --------------------------
# 3) PyTorch dataset/dataloader
# --------------------------

def sparse_collate_fn(batch):
    data = sp.vstack([item[0] for item in batch])
    targets = torch.tensor([item[1] for item in batch], dtype=torch.long)
    return torch.tensor(data.toarray(), dtype=torch.float32), targets

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = TextDataset(X_train, y_train)
val_ds = TextDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=sparse_collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=sparse_collate_fn)

# --------------------------
# 4) Simple model: feed-forward over hashed features
# --------------------------

class SimpleNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Note: input_dim must match the hashed feature dimension (2**20)
input_dim = 2**20
hidden_dim = 256
model = SimpleNet(input_dim, hidden_dim, n_classes)

# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --------------------------
# 5) Training loop
# --------------------------

def train_one_epoch():
    model.train()
    total_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_X)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch_X.size(0)
    return total_loss / len(train_loader.dataset)

def evaluate(loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_X)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)
    return correct / total

n_epochs = 5
for epoch in range(1, n_epochs + 1):
    train_loss = train_one_epoch()
    val_acc = evaluate(val_loader)
    print(f"Epoch {epoch}/{n_epochs} - Loss: {train_loss:.4f} - Val Acc: {val_acc:.4f}")

# --------------------------
# 6) Inference helper
# --------------------------

def predict_text(text_sample):
    vec = vectorizer.transform([text_sample]).toarray().astype(np.float32)
    vec_tensor = torch.tensor(vec, dtype=torch.float32).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(vec_tensor)
        pred_idx = int(torch.argmax(logits, dim=1).item())
    return class_names[pred_idx]

# Example usage after training
sample_text = df['text'].iloc[0]
print("Sample text:", sample_text)
print("Predicted category:", predict_text(sample_text))

Loading dataset CSV: /kaggle/input/customer-shopping-latest-trends-dataset/shopping_trends.csv
Loaded dataset from KaggleHub: /kaggle/input/customer-shopping-latest-trends-dataset/shopping_trends.csv
Columns after normalization: ['customer id', 'age', 'gender', 'item purchased', 'category', 'purchase amount (usd)', 'location', 'size', 'color', 'season', 'review rating', 'subscription status', 'payment method', 'shipping type', 'discount applied', 'promo code used', 'previous purchases', 'preferred payment method', 'frequency of purchases']
Text field created. Sample:
0    Clothing
1    Clothing
2    Clothing
3    Footwear
4    Clothing
Name: text, dtype: object
Target column present: True
Classes: ['Accessories', 'Clothing', 'Footwear', 'Outerwear'] Number of classes: 4
Epoch 1/5 - Loss: 1.2076 - Val Acc: 1.0000
Epoch 2/5 - Loss: 0.5463 - Val Acc: 1.0000
Epoch 3/5 - Loss: 0.1511 - Val Acc: 1.0000
Epoch 4/5 - Loss: 0.0547 - Val Acc: 1.0000
Epoch 5/5 - Loss: 0.0295 - Val Acc: 1.0000
Samp