# Imports

In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import ParameterGrid, train_test_split as sklearn_split
from surprise import SVD, SVDpp, Dataset, Reader, KNNBasic, CoClustering, NMF, Prediction
from surprise.model_selection import GridSearchCV, train_test_split
from joblib import dump, load
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix




# Data Loading & Preprocessing

In [14]:
# Declaring train_df as None to avoid undefined variable error
train_df = None

## Load MovieLens 1M

In [None]:
ratings_1m_df = pd.read_csv("../../preprocessing/ratings_1m.csv")
ratings_1m_df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [4]:
ratings_1m_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   UserID   1000209 non-null  int64
 1   MovieID  1000209 non-null  int64
 2   Rating   1000209 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


In [5]:
ratings_1m_df['UserID'] = ratings_1m_df['UserID'].astype('int32')
ratings_1m_df['MovieID'] = ratings_1m_df['MovieID'].astype('int32')
ratings_1m_df['Rating'] = ratings_1m_df['Rating'].astype('float16')

ratings_1m_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   UserID   1000209 non-null  int32  
 1   MovieID  1000209 non-null  int32  
 2   Rating   1000209 non-null  float16
dtypes: float16(1), int32(2)
memory usage: 9.5 MB


In [6]:
ratings_1m_df["Rating"].value_counts()

Rating
4.0    348971
3.0    261197
5.0    226310
2.0    107557
1.0     56174
Name: count, dtype: int64

In [7]:
ratings_per_user = ratings_1m_df.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

      user_id  num_ratings
0           1           53
1           2          129
2           3           51
3           4           21
4           5          198
...       ...          ...
6035     6036          888
6036     6037          202
6037     6038           20
6038     6039          123
6039     6040          341

[6040 rows x 2 columns]


In [8]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
10%        27.000000
50%        96.000000
90%       400.000000
max      2314.000000
Name: num_ratings, dtype: float64


In [9]:
ratings_1m_df["MovieID"].describe()

count    1.000209e+06
mean     1.865540e+03
std      1.096041e+03
min      1.000000e+00
25%      1.030000e+03
50%      1.835000e+03
75%      2.770000e+03
max      3.952000e+03
Name: MovieID, dtype: float64

In [10]:
ratings_1m_df["UserID"].describe()

count    1.000209e+06
mean     3.024512e+03
std      1.728413e+03
min      1.000000e+00
25%      1.506000e+03
50%      3.070000e+03
75%      4.476000e+03
max      6.040000e+03
Name: UserID, dtype: float64

In [11]:
reader = Reader(rating_scale=(ratings_1m_df['Rating'].min(), ratings_1m_df['Rating'].max()))
data = Dataset.load_from_df(ratings_1m_df[['UserID', 'MovieID', 'Rating']], reader)

In [12]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Load 1M for LightFM

In [13]:
train_df, test_df = sklearn_split(ratings_1m_df, test_size=0.2, random_state=42)

## Load MovieLens 10M

In [None]:
ratings_10m_df = pd.read_csv("../../preprocessing/ratings_10m.csv")
ratings_10m_df = ratings_10m_df.drop('Timestamp', axis=1)

ratings_10m_df.head()

In [None]:
ratings_10m_df.info()

In [None]:
ratings_10m_df['UserID'] = ratings_10m_df['UserID'].astype('int32')
ratings_10m_df['MovieID'] = ratings_10m_df['MovieID'].astype('int32')
ratings_10m_df['Rating'] = ratings_10m_df['Rating'].astype('float16')

ratings_10m_df.info()

In [None]:
ratings_10m_df["Rating"].value_counts()

In [None]:
ratings_per_user = ratings_10m_df.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
ratings_10m_df["MovieID"].describe()

In [None]:
ratings_10m_df["UserID"].describe()

In [None]:
reader = Reader(rating_scale=(ratings_10m_df['Rating'].min(), ratings_10m_df['Rating'].max()))
data = Dataset.load_from_df(ratings_10m_df[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Load 10M for LightFM

In [None]:
train_df, test_df = sklearn_split(ratings_10m_df, test_size=0.2, random_state=42)

## Load MovieLens 32M

In [None]:
ratings_32m_df = pd.read_csv("../../preprocessing/ratings_32m.csv")
ratings_32m_df = ratings_32m_df.drop('Timestamp', axis=1)

ratings_32m_df.head()

In [None]:
ratings_32m_df.info()

In [None]:
ratings_32m_df['UserID'] = ratings_32m_df['UserID'].astype('int32')
ratings_32m_df['MovieID'] = ratings_32m_df['MovieID'].astype('int32')
ratings_32m_df['Rating'] = ratings_32m_df['Rating'].astype('float16')

ratings_32m_df.info()

In [None]:
ratings_32m_df["Rating"].value_counts()

In [None]:
ratings_per_user = ratings_32m_df.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
ratings_32m_df["MovieID"].describe()

In [None]:
ratings_32m_df["UserID"].describe()

In [None]:
reader = Reader(rating_scale=(ratings_32m_df['Rating'].min(), ratings_32m_df['Rating'].max()))
data = Dataset.load_from_df(ratings_32m_df[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### Load 32M for LightFM

In [None]:
train_df, test_df = sklearn_split(ratings_32m_df, test_size=0.2, random_state=42)

# Models List

In [18]:
# Define models and parameter grids
models = [
    {
        'name': 'LightFM-WARP',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['warp'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
    {
        'name': 'LightFM-BPR',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['bpr'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
    {
        'name': 'SVD',
        'algo': SVD,
        'params': {
            'n_factors': [50, 100, 150],
            'n_epochs': [20, 30],
            'lr_all': [0.005, 0.01],
            'reg_all': [0.02, 0.1]
        }
    },
    {
        'name': 'KNNBasic',
        'algo': KNNBasic,
        'params': {
            'k': [20, 40],
            'sim_options': {
                'name': ['msd', 'pearson'],
                'user_based': [False]
            }
        }
    },
    {
        'name': 'NMF',
        'algo': NMF,
        'params': {
            'n_factors': [10, 15],
            'n_epochs': [50, 100]
        }
    },
    {
        'name': 'CoClustering',
        'algo': CoClustering,
        'params': {
            'n_cltr_u': [3, 5],
            'n_cltr_i': [3, 5],
            'n_epochs': [20, 30]
        }
    }
]

# Function Definitions

In [19]:
# Compute RMSE and accuracy metrics
def compute_metrics(predictions, tolerance=1):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    rmse_val = np.sqrt(np.mean((preds - actuals) ** 2))
    accuracy = np.mean(np.abs(preds - actuals) <= tolerance) * 100
    return {'RMSE': rmse_val, f'Acc (±{tolerance})': accuracy}

# Compute precision@k for top-N recommendations
def compute_precision_at_k(predictions, k=5, threshold=3):
    user_est_true = {}
    for pred in predictions:
        uid, iid, true_r, est, _ = pred
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))
    
    precisions = []
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)  # Sort by predicted rating
        top_k = [r[1] >= threshold for r in ratings[:k]]  # True ratings >= threshold
        if top_k:
            precisions.append(sum(top_k) / len(top_k))
    
    return np.mean(precisions) if precisions else 0

# Convert DataFrame to COO matrix for LightFM
def df_to_lightfm(train_df):
    # Map UserID and MovieID to consecutive indices
    user_ids = train_df['UserID'].unique()
    item_ids = train_df['MovieID'].unique()
    user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    item_map = {iid: idx for idx, iid in enumerate(item_ids)}
    
    # Prepare data for coo_matrix
    rows = [user_map[uid] for uid in train_df['UserID']]
    cols = [item_map[iid] for iid in train_df['MovieID']]
    data = [1] * len(train_df)  # Binary implicit feedback
    
    # Create sparse matrix
    interactions = coo_matrix((data, (rows, cols)), shape=(len(user_ids), len(item_ids)))
    return interactions, user_map, item_map

In [20]:
# Prepare interactions matrix for LightFM
if train_df is not None:
    interactions, user_map, item_map = df_to_lightfm(train_df)
else:
    raise ValueError("train_df is not loaded. Ensure the `Load *M for LightFM` is run correctly.")

ValueError: train_df is not loaded. Ensure the `Load *M for LightFM` is run correctly.

# Hyperparameter Tuning

In [None]:
# Hyperparameter tuning
best_params_dict = {}
for model in models:
    if model['algo'] == LightFM:
        if train_df is not None:
            # LightFM handling
            best_score = -np.inf
            best_params = {}
            for params in ParameterGrid(model['params']):
                print(f"Tuning {model['name']} with params: {params}")
                model = LightFM(**params)
                model.fit(interactions, epochs=10, verbose=False)
                score = precision_at_k(model, interactions, k=5).mean()
                if score > best_score:
                    best_score = score
                    best_params = params

            print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
            print(f"Best params: {best_params}")
            best_params_dict[model['name']] = best_params
        else:
            print(f"Skipping {model['name']} tuning due to missing train_df")
    else:
        # Original Surprise handling
        print(f"Tuning {model['name']} with params: {model['params']}")
        gs = GridSearchCV(
            model['algo'],
            model['params'],
            measures=['rmse'],
            cv=5,
            n_jobs=-1,
            pre_dispatch='2*n_jobs',
        )
        gs.fit(data)
        best_params = gs.best_params['rmse']

        print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
        print(f"Best params: {best_params}")
        best_params_dict[model['name']] = best_params

In [None]:
# Save parameters to disk
dump(best_params_dict, '../output_models/best_params.pkl')

# Load Model Paramters from File

In [15]:
# Load desired parameters from disk
best_params_dict = load('../output_models/best_params.pkl')

In [16]:
print(best_params_dict)

{'SVD': {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}, 'KNNBasic': {'k': 40, 'sim_options': {'name': 'msd', 'user_based': False}}, 'NMF': {'n_factors': 10, 'n_epochs': 100}, 'CoClustering': {'n_cltr_u': 5, 'n_cltr_i': 3, 'n_epochs': 30}}


# Training and Evaluation

In [29]:
# Training and evaluation
results = {}

for model in models:
    if model['name'] not in best_params_dict:
        print(f"Skipping {model['name']} due to missing parameters")
        continue
    else:
        if model['algo'] == LightFM:
            if train_df is not None:
                # LightFM handling
                print(f"\n=== Training {model['name']} ===\n")
                model_name = model['name']
                results[model_name] = {}
                model = model['algo'](**best_params_dict[model_name])
                model.fit(interactions, epochs=30)

                # Generate valid test indices (positional)
                valid_indices = []
                test_user_ids = []
                test_item_ids = []
                for pos_idx, (idx, row) in enumerate(test_df.iterrows()):
                    uid, iid, rating = row['UserID'], row['MovieID'], row['Rating']
                    if uid in user_map and iid in item_map:  # Ensure user and item were in training
                        valid_indices.append(pos_idx)  # Store positional index
                        test_user_ids.append(user_map[uid])
                        test_item_ids.append(item_map[iid])

                # Predict and scale to rating range
                preds = model.predict(test_user_ids, test_item_ids)
                min_rating, max_rating = 1, 5
                min_pred, max_pred = np.min(preds), np.max(preds)
                if max_pred != min_pred:  # Avoid division by zero
                    scaled_preds = min_rating + (preds - min_pred) * (max_rating - min_rating) / (max_pred - min_pred)
                else:
                    scaled_preds = np.full_like(preds, min_rating)  # Fallback if all predictions are the same

                # Create Prediction objects using positional indices
                predictions = [
                    Prediction(
                        uid=test_df.iloc[pos_idx]['UserID'],
                        iid=test_df.iloc[pos_idx]['MovieID'],
                        r_ui=test_df.iloc[pos_idx]['Rating'],
                        est=float(scaled_preds[j]),
                        details=None,
                    )
                    for j, pos_idx in enumerate(valid_indices)
                ]

                results[model_name].update({
                    'params': best_params_dict[model_name],
                    'metrics': compute_metrics(predictions),
                    'precision_at_k': compute_precision_at_k(predictions, k=10, threshold=3)
                })
            else:
                print(f"Skipping {model['name']} training due to missing train_df")

        else:
            # Original Surprise handling
            print(f"\n=== Training {model['name']} ===")
            model_name = model['name']
            results[model_name] = {}
            model = model['algo'](**best_params_dict[model_name])
            model.fit(trainset)
            
            # Generate predictions
            predictions = model.test(testset)
            
            results[model_name].update({
                'params': best_params_dict[model_name],
                'metrics': compute_metrics(predictions),
                'precision_at_k': compute_precision_at_k(predictions, k=10, threshold=3)
            })

Skipping LightFM-WARP due to missing parameters
Skipping LightFM-BPR due to missing parameters

=== Training SVD ===

=== Training KNNBasic ===
Computing the msd similarity matrix...
Done computing similarity matrix.

=== Training NMF ===

=== Training CoClustering ===


In [30]:
# Format results for readability
formatted_results = []
for model_name, data in results.items():
    formatted_results.append({
        'Model': model_name,
        'RMSE': data['metrics']['RMSE'],
        'Acc (±1)': data['metrics']['Acc (±1)'],
        'Precision@10': data['precision_at_k'],
        'Best Params': str(data['params'])  # Convert dict to string for simplicity
    })

In [31]:
training_report = pd.DataFrame(formatted_results)
training_report.set_index('Model', inplace=True)
print("\n=== Final Results ===\n")
print(training_report.to_string())


=== Final Results ===

                  RMSE   Acc (±1)  Precision@10                                                         Best Params
Model                                                                                                              
SVD           0.872063  75.718099      0.925831  {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
KNNBasic      0.914429  73.982964      0.923130      {'k': 40, 'sim_options': {'name': 'msd', 'user_based': False}}
NMF           0.898074  74.059947      0.919301                                  {'n_factors': 10, 'n_epochs': 100}
CoClustering  0.910022  74.452365      0.919234                      {'n_cltr_u': 5, 'n_cltr_i': 3, 'n_epochs': 30}


# Save Outputs

In [38]:
# Save results, models and data mappings
current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
training_report.to_csv(f'../training_results/training_report_{current_date}.csv')

if train_df is not None:
    dump(user_map, '../output_models/user_map.pkl')
    dump(item_map, '../output_models/item_map.pkl')

for model in models:
    if train_df is None and model['algo'] == LightFM:
        continue
    else:
        # Fixed f-string by using different quote types
        file_path = f"../output_models/{model['name'].lower().replace('-', '_')}_model.pkl"
        dump(model, file_path)
        print(f"Model {model['name']} saved successfully to {file_path}!")

Model SVD saved successfully to ../output_models/svd_model.pkl!
Model KNNBasic saved successfully to ../output_models/knnbasic_model.pkl!
Model NMF saved successfully to ../output_models/nmf_model.pkl!
Model CoClustering saved successfully to ../output_models/coclustering_model.pkl!
