In [None]:

# 📦 Install and import all libraries
!pip install lightfm --quiet

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from google.colab import files


In [None]:

# Colab-only step: `files` comes from google.colab, so this cell cannot run
# in a plain Jupyter kernel. The prompt is printed first because the upload
# call that follows is interactive.
# NOTE(review): presumably the uploaded files land in the working directory,
# which is where load_and_preprocess_data later looks for
# '<prefix>.train.csv.gz' etc. by relative path — confirm.
print("Please upload all your dataset files (e.g., *.train.csv.gz, *.valid.csv.gz, *.test.csv.gz)")
uploaded = files.upload()


In [None]:

def load_and_preprocess_data(file_prefix, min_year=2020):
    """Load the train/valid/test splits for one prefix and drop old rows.

    Reads three gzip-compressed CSV files named ``<file_prefix>.train.csv.gz``,
    ``<file_prefix>.valid.csv.gz`` and ``<file_prefix>.test.csv.gz``, parses
    each ``timestamp`` column as epoch milliseconds, and keeps only the rows
    whose timestamp year is at least ``min_year``.

    Args:
        file_prefix: Path prefix shared by the three split files.
        min_year: Earliest calendar year (inclusive) a row may have.

    Returns:
        Tuple ``(train, valid, test)`` of filtered DataFrames. Row labels are
        the ones assigned when the files were read; they are not reset.
    """
    split_paths = (
        f'{file_prefix}.train.csv.gz',
        f'{file_prefix}.valid.csv.gz',
        f'{file_prefix}.test.csv.gz',
    )
    splits = [pd.read_csv(path, compression='gzip') for path in split_paths]

    # errors='coerce' turns unparseable timestamps into NaT. Their year is NaN,
    # which fails the >= comparison, so such rows are dropped by the filter.
    for split in splits:
        split['timestamp'] = pd.to_datetime(split['timestamp'], unit='ms', errors='coerce')

    def keep_recent(frame):
        is_recent = frame['timestamp'].dt.year >= min_year
        return frame[is_recent]

    train_recent, valid_recent, test_recent = (keep_recent(split) for split in splits)
    return train_recent, valid_recent, test_recent


In [None]:

def train_neural_network(user_ids, item_ids, ratings, num_users, num_items,
                         embedding_dim=4, hidden_units=16, dropout_rate=0.4,
                         epochs=15, batch_size=128, validation_split=0.2,
                         patience=3):
    """Build and fit a small embedding-based rating regressor.

    The network embeds the user index and the item index separately,
    concatenates the two embeddings, and passes them through one ReLU hidden
    layer with dropout to a single linear output trained with MSE loss.

    Args:
        user_ids: Integer user indices, one per interaction. Expected to lie
            in ``[0, num_users)`` because they index the user embedding table.
        item_ids: Integer item indices, one per interaction. Expected to lie
            in ``[0, num_items)``.
        ratings: Target ratings aligned with ``user_ids`` / ``item_ids``.
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_units: Units in the hidden dense layer.
        dropout_rate: Dropout rate applied after the hidden layer.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.
        validation_split: Fraction of the supplied samples that ``model.fit``
            holds out for the validation loss.
        patience: Epochs without improvement tolerated by early stopping.

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the History
        object returned by ``model.fit``.
    """
    user_input = keras.Input(shape=(1,))
    item_input = keras.Input(shape=(1,))
    user_embed = layers.Embedding(num_users, embedding_dim)(user_input)
    item_embed = layers.Embedding(num_items, embedding_dim)(item_input)

    # Each embedding carries a length-1 sequence axis from the (1,) input;
    # Flatten removes it after the two are joined on the feature axis.
    x = layers.Concatenate()([user_embed, item_embed])
    x = layers.Flatten()(x)
    x = layers.Dense(hidden_units, activation='relu')(x)
    x = layers.Dropout(dropout_rate)(x)
    output = layers.Dense(1)(x)

    model = keras.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer='adam', loss='mse')

    # restore_best_weights rolls the model back to its best epoch when
    # training is stopped early.
    early_stopping = keras.callbacks.EarlyStopping(
        patience=patience, restore_best_weights=True)

    history = model.fit([user_ids, item_ids], ratings,
                        validation_split=validation_split,
                        epochs=epochs, batch_size=batch_size,
                        callbacks=[early_stopping],
                        verbose=1)
    return model, history


In [None]:

def train_lightfm_model(df):
    """Fit a WARP-loss LightFM model on the interactions in ``df``.

    Args:
        df: DataFrame with ``user_id``, ``parent_asin`` and ``rating`` columns,
            one row per interaction.

    Returns:
        Tuple ``(model, dataset)``: the fitted LightFM model and the LightFM
        ``Dataset`` that was fitted on the user and item ids of ``df``.
    """
    users = df['user_id']
    items = df['parent_asin']

    dataset = Dataset()
    dataset.fit(users, items)

    triples = list(zip(users, items, df['rating']))
    # Only the first matrix returned by build_interactions is passed to
    # model.fit; the second return value is discarded.
    # NOTE(review): this presumably means the rating values do not weight
    # the training — confirm that is intended.
    interactions, _ = dataset.build_interactions(triples)

    model = LightFM(loss='warp')
    model.fit(interactions, epochs=5, num_threads=2)
    return model, dataset


In [None]:

def run_recommendation_pipeline(category):
    """Train the neural network and LightFM models for one category.

    Loads the category's splits, trains both models on train + valid rows,
    reports an RMSE for the neural network, and prints the top-5 items each
    model scores highest for one sample user.

    All user/item indices produced here are pandas category codes, i.e.
    positions in the sorted unique ids of the combined train + valid data.
    LightFM keeps its own internal indices, so they are translated through
    ``dataset.mapping()`` before calling it.

    Args:
        category: File prefix of the category, forwarded to
            ``load_and_preprocess_data``.

    Returns:
        ``{'results': {...}}`` with keys ``summary``, ``nn_history``,
        ``nn_rmse``, ``nn_top_items`` and ``lfm_top_items``. Both top-item
        arrays hold item category codes.

    Raises:
        ValueError: If no train/valid interactions remain after filtering.
    """
    # The test split is loaded by the helper but not used by this pipeline.
    train_df, valid_df, _ = load_and_preprocess_data(category)

    # ignore_index gives unique 0..n-1 row labels, so the labels of the
    # sampled rows below double as positions into the encoded arrays.
    all_data = pd.concat([train_df, valid_df], ignore_index=True)
    if all_data.empty:
        raise ValueError(
            f"No interactions left for category {category!r} after filtering")

    user_cat = all_data['user_id'].astype('category')
    item_cat = all_data['parent_asin'].astype('category')

    user_ids = user_cat.cat.codes.to_numpy()
    item_ids = item_cat.cat.codes.to_numpy()
    ratings = all_data['rating'].astype(float).to_numpy()

    num_users = len(user_cat.cat.categories)
    num_items = len(item_cat.cat.categories)

    model_nn, history = train_neural_network(user_ids, item_ids, ratings, num_users, num_items)

    # Score a 20% sample using the SAME codes the model was trained with.
    # Re-encoding the sample on its own would assign different codes and
    # evaluate the wrong embeddings.
    # The sample is drawn from the rows that were passed to training, so
    # this RMSE is not an out-of-sample estimate.
    val_data = all_data.sample(frac=0.2, random_state=42)
    val_rows = val_data.index.to_numpy()
    val_user_ids = user_ids[val_rows]
    val_item_ids = item_ids[val_rows]
    val_ratings = val_data['rating'].to_numpy()

    nn_preds = model_nn.predict([val_user_ids, val_item_ids], verbose=0).flatten()
    rmse = np.sqrt(mean_squared_error(val_ratings, nn_preds))

    model_lfm, dataset = train_lightfm_model(all_data)

    sample_user = 0
    sample_items = np.arange(num_items)
    nn_preds_sample = model_nn.predict([np.full(num_items, sample_user), sample_items], verbose=0).flatten()

    # Translate the sample user and every item from category code to
    # LightFM's internal index. Position k of lfm_preds_sample is then the
    # score of the item whose category code is k, matching nn_preds_sample.
    user_id_map, _, item_id_map, _ = dataset.mapping()
    lfm_user = user_id_map[user_cat.cat.categories[sample_user]]
    lfm_items = np.array(
        [item_id_map[raw_item] for raw_item in item_cat.cat.categories],
        dtype=np.int32)
    lfm_preds_sample = model_lfm.predict(lfm_user, lfm_items)

    top_nn_items = np.argsort(-nn_preds_sample)[:5]
    top_lfm_items = np.argsort(-lfm_preds_sample)[:5]

    print(f"Top 5 NN recommendations (item indices): {top_nn_items}")
    print(f"Top 5 LightFM recommendations (item indices): {top_lfm_items}")
    print(f"Validation RMSE: {rmse:.4f}")

    return {
        'results': {
            'summary': f"Processed {category} | NN RMSE: {rmse:.4f}",
            'nn_history': history.history,
            'nn_rmse': rmse,
            'nn_top_items': top_nn_items,
            'lfm_top_items': top_lfm_items
        }
    }


In [None]:

def plot_all_losses(category_results):
    """Draw one validation-loss curve per category on a shared figure.

    Args:
        category_results: Mapping from category name to a history dict.
            Entries with a missing or empty ``'val_loss'`` list are skipped.
            Underscores in category names are shown as spaces in the legend.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for name, hist in category_results.items():
        losses = hist.get('val_loss', [])
        if not losses:
            continue
        ax.plot(losses, label=name.replace("_", " "), linewidth=2)

    ax.set_title("Validation Loss Across Categories", fontsize=16)
    ax.set_xlabel("Epoch", fontsize=14)
    ax.set_ylabel("Validation Loss", fontsize=14)

    # Resize the tick labels on both axes.
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontsize(12)

    ax.legend(fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)
    fig.tight_layout()
    plt.show()


In [None]:

# Driver cell: run the full pipeline once per category, then compare the
# neural network's validation-loss curves on one figure.
#
# Each name is passed unchanged to run_recommendation_pipeline, which uses it
# as the file prefix, so '<name>.train.csv.gz', '<name>.valid.csv.gz' and
# '<name>.test.csv.gz' must be readable from the working directory.
categories = [
    "Office_Products",
    "Arts_Crafts_and_Sewing",
    "Baby_Products",
    "Software",
    "Video_Games"
]

# Maps category name -> the 'nn_history' dict returned by the pipeline.
category_results = {}

# There is no error handling here: if any category raises, the loop stops
# and the plot below is never drawn.
for category in categories:
    print(f"\n📦 Processing category: {category}")
    result = run_recommendation_pipeline(category)
    category_results[category] = result['results']['nn_history']

plot_all_losses(category_results)
