In [1]:
from itertools import product

import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.model_selection import StratifiedKFold




In [2]:
# Read the cleaned ratings file, keeping only the three columns used below,
# and derive the binary relevance flag in the same expression.
# A rating of 6 or higher is treated as a positive interaction (1), anything
# lower as negative (0); change the cut-off here to redefine "positive".
data = pd.read_csv(
    '../data/all_cleaned.csv',
    usecols=['user_id', 'isbn', 'book_rating'],
).assign(relevant=lambda ratings: ratings['book_rating'].ge(6).astype(int))

#### Creating user-item interaction matrix ####

In [3]:
# Create the LightFM dataset object that maps raw user/book ids to matrix indices
dataset = Dataset()

# Register every user and item (book) id. Dataset.fit accepts any iterable,
# so the columns are passed directly instead of through generator wrappers.
dataset.fit(data['user_id'], data['isbn'])

# Build the interaction matrices (required for LightFM).
# The columns are zipped rather than walked with DataFrame.iterrows():
# iterrows builds a Series per row, which is slow on large frames, and it does
# not preserve dtypes across a row.
# NOTE(review): per the LightFM docs the returned `interactions` matrix marks
# every supplied (user, item) pair; the 0/1 `relevant` value is carried only
# by `weights`. Confirm that is what any code using `interactions` expects.
(interactions, weights) = dataset.build_interactions(
    zip(data['user_id'], data['isbn'], data['relevant'])
)


#### Stratification: ensures that each fold has a similar overall proportion of positive (relevant) interactions; the split is stratified on the relevance label only, not per user ####

In [4]:
# Prepare for k-fold cross-validation
# Every row of `data` is one (user, book) interaction; `y` is its binary
# relevance label and is the only thing the folds are stratified on.
X = data[['user_id', 'isbn']].values  # Features are user-item pairs
y = data['relevant'].values  # Labels (1: relevant, 0: non-relevant)

# Hyperparameter search space
param_grid = {
    'epochs': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_components': [10, 20, 30],
    'loss': ['warp', 'bpr']  # LightFM losses
}

# Stratified K-Folds cross-validator
n_splits = 5  # You can choose the number of folds
CV_SEED = 42  # Fixed seed so every run (and every hyperparameter set) sees the same folds
# shuffle=True: without shuffling each fold is a contiguous slice of the file
# per class, so the evaluation depends on the row order of the CSV.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CV_SEED)

# Store results for each hyperparameter combination
results = []

def _positive_interactions(frame):
    """Return the LightFM interaction matrix for the relevant rows of ``frame``.

    Only rows with ``relevant == 1`` are passed on. LightFM's
    ``Dataset.build_interactions`` records every supplied (user, item) pair as
    an interaction and puts an optional third tuple element into a separate
    weights matrix, so filtering here is what makes the relevance label count.
    """
    positives = frame[frame['relevant'] == 1]
    # Zip the columns instead of using iterrows(): same pairs, far less overhead.
    matrix, _ = dataset.build_interactions(zip(positives['user_id'], positives['isbn']))
    return matrix


def evaluate_model(train_data, test_data, model, epochs=None):
    """Fit ``model`` on one fold and score it.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Fold slices containing the ``user_id``, ``isbn`` and ``relevant`` columns.
    model : LightFM
        Unfitted model; it is fitted in place.
    epochs : int, optional
        Number of training epochs. When omitted, the notebook-level ``epochs``
        variable is used, which is how earlier versions of this function
        obtained the value.

    Returns
    -------
    tuple
        ``(train_precision, test_precision, test_auc)``: mean Precision@5 on the
        train fold, mean Precision@5 on the test fold and mean AUC on the test fold.
    """
    if epochs is None:
        # Backward compatibility with callers that relied on the global loop variable.
        epochs = globals()['epochs']

    # Build train/test interactions from relevant rows only
    train_interactions = _positive_interactions(train_data)
    test_interactions = _positive_interactions(test_data)

    # Train the model
    model.fit(train_interactions, epochs=epochs, num_threads=2)

    # Evaluate the model.
    # Passing train_interactions for the test metrics removes known train
    # positives from the ranking. LightFM raises a ValueError if the two
    # matrices share a (user, item) pair, which would indicate duplicated rows
    # leaking across folds.
    train_precision = precision_at_k(model, train_interactions, k=5).mean()
    test_precision = precision_at_k(
        model, test_interactions, train_interactions=train_interactions, k=5
    ).mean()
    test_auc = auc_score(
        model, test_interactions, train_interactions=train_interactions
    ).mean()

    return train_precision, test_precision, test_auc


# Seed for LightFM's parameter initialisation and data shuffling. Multi-threaded
# fitting may still introduce small run-to-run differences.
MODEL_SEED = 42

# Grid search over hyperparameters (same iteration order as nesting
# loss -> epochs -> learning_rate -> num_components)
for loss, epochs, learning_rate, num_components in product(
    param_grid['loss'],
    param_grid['epochs'],
    param_grid['learning_rate'],
    param_grid['num_components'],
):
    avg_test_precision = []
    avg_test_auc = []

    print(f"Testing: Loss={loss}, Epochs={epochs}, Learning Rate={learning_rate}, Num Components={num_components}")

    for train_index, test_index in skf.split(X, y):
        train_data, test_data = data.iloc[train_index], data.iloc[test_index]

        # Initialize a fresh model with the current hyperparameters
        model = LightFM(
            loss=loss,
            learning_rate=learning_rate,
            no_components=num_components,
            random_state=MODEL_SEED,
        )

        # Evaluate model on this fold; epochs is passed explicitly
        _, test_precision, test_auc = evaluate_model(train_data, test_data, model, epochs=epochs)

        # Store results for this fold
        avg_test_precision.append(test_precision)
        avg_test_auc.append(test_auc)

    # Calculate the mean scores across all folds
    mean_test_precision = np.mean(avg_test_precision)
    mean_test_auc = np.mean(avg_test_auc)

    # Store results for this hyperparameter set
    results.append({
        'loss': loss,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'num_components': num_components,
        'mean_test_precision': mean_test_precision,
        'mean_test_auc': mean_test_auc
    })

    print(f"Mean Test Precision@5: {mean_test_precision}, Mean Test AUC: {mean_test_auc}")

# One row per hyperparameter combination, for easier inspection
results_df = pd.DataFrame(results)

# Select the combination with the highest cross-validated Precision@5
# (rank on 'mean_test_auc' instead to choose by AUC)
best_params = results_df.loc[results_df.mean_test_precision.idxmax(), :]
print("Best Hyperparameters based on Precision@5:", best_params, sep="\n")

Testing: Loss=warp, Epochs=10, Learning Rate=0.01, Num Components=10
