# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split as sklearn_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score, roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from surprise import SVD, SVDpp, Dataset, Reader, accuracy, KNNBasic, SlopeOne, CoClustering, NMF, Prediction
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from surprise.accuracy import rmse
from joblib import Memory, parallel_backend, dump
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix

# Data Preprocessing

In [4]:
df = pd.read_csv("../../preprocessing/merged_data.csv")
df.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Title,Genres,Year
0,1,1193,5,0,1,One Flew Over the Cuckoo's Nest,[8],1975
1,1,661,3,0,1,James and the Giant Peach,"[3, 4, 12]",1996
2,1,914,3,0,1,My Fair Lady,"[12, 14]",1964
3,1,3408,4,0,1,Erin Brockovich,[8],2000
4,1,2355,5,0,1,"Bug's Life, A","[3, 4, 5]",1998


In [None]:
df.info()

In [5]:
filtered_df = df.drop(columns=["Gender","Age","Title","Year", "Genres"])
filtered_df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [11]:
filtered_df["Rating"].value_counts()

Rating
4    348971
3    261197
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [None]:
ratings_per_user = filtered_df.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
filtered_df["MovieID"].describe()

In [None]:
filtered_df["UserID"].describe()

In [None]:
subsampled_df, _ = sklearn_split(filtered_df, test_size=0.5, random_state=42, stratify=filtered_df['UserID'])

In [3]:
# Dataset Upgrade
beegar_data = pd.read_csv(r"~/Downloads/ratings.csv")

# Drop the Timestamp column
beegar_data = beegar_data.drop('timestamp', axis=1)
beegar_data.columns = ['UserID', 'MovieID', 'Rating']
beegar_data.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [None]:
beegar_data.info()

In [4]:
beegar_data['UserID'] = beegar_data['UserID'].astype('int32')
beegar_data['MovieID'] = beegar_data['MovieID'].astype('int32')
beegar_data['Rating'] = beegar_data['Rating'].astype('float16')

beegar_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   UserID   int32  
 1   MovieID  int32  
 2   Rating   float16
dtypes: float16(1), int32(2)
memory usage: 305.2 MB


In [None]:
reader = Reader(rating_scale=(beegar_data['Rating'].min(), beegar_data['Rating'].max()))
data = Dataset.load_from_df(beegar_data[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
trainset, testset = sklearn_split(beegar_data, test_size=0.2, random_state=42)

# Test Section 1

In [None]:
# Convert Surprise Trainset to COO matrix for LightFM
def surprise_to_lightfm(trainset):
    rows, cols, data = [], [], []
    for uid in trainset.all_users():
        user_ratings = trainset.ur[uid]
        for iid, rating in user_ratings:
            rows.append(uid)
            cols.append(iid)
            data.append(1)  # Implicit feedback
    return coo_matrix((data, (rows, cols)))

# Compute RMSE and accuracy metrics
def compute_metrics(predictions, tolerance=0.5):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    rmse = np.sqrt(np.mean((preds - actuals) ** 2))
    accuracy = np.mean(np.abs(preds - actuals) <= tolerance) * 100
    return {'RMSE': rmse, f'Acc (±{tolerance})': accuracy}

In [5]:
# Convert DataFrame to COO matrix for LightFM
def df_to_lightfm(train_df):
    # Map UserID and MovieID to consecutive indices
    user_ids = train_df['UserID'].unique()
    item_ids = train_df['MovieID'].unique()
    user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    item_map = {iid: idx for idx, iid in enumerate(item_ids)}
    
    # Prepare data for coo_matrix
    rows = [user_map[uid] for uid in train_df['UserID']]
    cols = [item_map[iid] for iid in train_df['MovieID']]
    data = [1] * len(train_df)  # Binary implicit feedback
    
    # Create sparse matrix
    interactions = coo_matrix((data, (rows, cols)), shape=(len(user_ids), len(item_ids)))
    return interactions, user_map, item_map

# Compute RMSE and accuracy metrics
def compute_metrics(predictions, tolerance=0.5):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    rmse = np.sqrt(np.mean((preds - actuals) ** 2))
    accuracy = np.mean(np.abs(preds - actuals) <= tolerance) * 100
    return {'RMSE': rmse, f'Acc (±{tolerance})': accuracy}

In [6]:
# Split data using sklearn
train_df, test_df = sklearn_split(beegar_data, test_size=0.2, random_state=42)

In [7]:
# Prepare interactions matrix
interactions, user_map, item_map = df_to_lightfm(train_df)

In [None]:
# Prepare interactions matrix
interactions = surprise_to_lightfm(trainset)

In [None]:
# Define models and parameter grids
models = [
    {
        'name': 'LightFM-WARP',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['warp'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
    {
        'name': 'LightFM-BPR',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['bpr'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
]

In [None]:
# Hyperparameter tuning
best_params_dict = {}
for config in models:
    print(f"\n=== Tuning {config['name']} ===\n")
    best_score = -np.inf
    best_params = {}
    for params in ParameterGrid(config['params']):
        print(f"Training {config['name']} with params: {params}")
        model = LightFM(**params)
        model.fit(interactions, epochs=10, verbose=False)
        score = precision_at_k(model, interactions, k=5).mean()
        if score > best_score:
            best_score = score
            best_params = params
    best_params_dict[config['name']] = best_params

In [8]:
lmodels = {
    'LightFM-WARP': LightFM(loss='warp', no_components=20, learning_rate=0.005, item_alpha=0.1, user_alpha=0.02, random_state=42),
    'LightFM-BPR': LightFM(loss='bpr', no_components=20, learning_rate=0.005, item_alpha=0.02, user_alpha=0.02, random_state=42),
}

In [None]:
# Training and evaluation
results = []
for model_name, model_instance in lmodels.items():
    print(f"\n=== Training {model_name} ===\n")
    model_result = {'Model': model_name}
    model_instance.fit(interactions, epochs=30)

    # Generate valid test indices
    valid_indices = []
    test_user_ids = []
    test_item_ids = []
    for idx, (uid, iid, rating) in enumerate(testset):
        try:
            u_inner = trainset.to_inner_uid(uid)
            i_inner = trainset.to_inner_iid(iid)
            valid_indices.append(idx)
            test_user_ids.append(u_inner)
            test_item_ids.append(i_inner)
        except ValueError:
            continue  # Skip cold-start users/items

    # Predict and scale to rating range
    preds = model_instance.predict(test_user_ids, test_item_ids)
    min_rating, max_rating = beegar_data['Rating'].min(), beegar_data['Rating'].max()
    min_pred, max_pred = np.min(preds), np.max(preds)
    if max_pred != min_pred:  # Avoid division by zero
        scaled_preds = min_rating + (preds - min_pred) * (max_rating - min_rating) / (max_pred - min_pred)
    else:
        scaled_preds = preds  # Fallback if all predictions are the same

    # Create Prediction objects
    predictions = [
        Prediction(
            uid=testset[idx][0],
            iid=testset[idx][1],
            r_ui=testset[idx][2],
            est=float(scaled_preds[j]),
            details=None,
        )
        for j, idx in enumerate(valid_indices)
    ]

    # Compute metrics
    metrics = compute_metrics(predictions)
    precision = precision_at_k(model_instance, interactions, k=5).mean()
    model_result.update(metrics)
    model_result.update({'Precision@5': precision})
    results.append(model_result)

In [None]:
# Display results
results_df = pd.DataFrame(results)
print("\n=== Final Results ===\n")
styled_df = results_df.style.format({
    'RMSE': '{:.3f}',
    'Acc (±0.5)': '{:.1f}%',
    'Precision@5': '{:.3f}',
    'Best Params': lambda x: x.replace(', ', ',\n')
}).set_properties(**{'text-align': 'left'})
display(styled_df)

In [9]:
# Training and evaluation
results = []
for model_name, model_instance in lmodels.items():
    print(f"\n=== Training {model_name} ===\n")
    model_result = {'Model': model_name}
    model_instance.fit(interactions, epochs=30)

    # Generate valid test indices (positional)
    valid_indices = []
    test_user_ids = []
    test_item_ids = []
    for pos_idx, (idx, row) in enumerate(test_df.iterrows()):
        uid, iid, rating = row['UserID'], row['MovieID'], row['Rating']
        if uid in user_map and iid in item_map:  # Ensure user and item were in training
            valid_indices.append(pos_idx)  # Store positional index
            test_user_ids.append(user_map[uid])
            test_item_ids.append(item_map[iid])

    # Predict and scale to rating range
    preds = model_instance.predict(test_user_ids, test_item_ids)
    min_rating, max_rating = 1, 5
    min_pred, max_pred = np.min(preds), np.max(preds)
    if max_pred != min_pred:  # Avoid division by zero
        scaled_preds = min_rating + (preds - min_pred) * (max_rating - min_rating) / (max_pred - min_pred)
    else:
        scaled_preds = np.full_like(preds, min_rating)  # Fallback if all predictions are the same

    # Create Prediction objects using positional indices
    predictions = [
        Prediction(
            uid=test_df.iloc[pos_idx]['UserID'],
            iid=test_df.iloc[pos_idx]['MovieID'],
            r_ui=test_df.iloc[pos_idx]['Rating'],
            est=float(scaled_preds[j]),
            details=None,
        )
        for j, pos_idx in enumerate(valid_indices)
    ]

    # Compute metrics
    metrics = compute_metrics(predictions)
    precision = precision_at_k(model_instance, interactions, k=5).mean()
    model_result.update(metrics)
    model_result.update({'Precision@5': precision})
    results.append(model_result)


=== Training LightFM-WARP ===


=== Training LightFM-BPR ===



In [11]:
# Display results
results_df = pd.DataFrame(results)
print("\n=== Final Results ===\n")
styled_df = results_df.style.format({
    'RMSE': '{:.3f}',
    'Acc (±0.5)': '{:.1f}%',
    'Precision@5': '{:.3f}',
    'Best Params': lambda x: x.replace(', ', ',\n')
}).set_properties(**{'text-align': 'left'})
display(styled_df)


=== Final Results ===



Unnamed: 0,Model,RMSE,Acc (±0.5),Precision@5
0,LightFM-WARP,1.536,26.8%,0.386
1,LightFM-BPR,1.077,34.0%,0.386


In [None]:
from joblib import dump

# Save each LightFM model
for model_name, model_instance in lmodels.items():
    file_path = f'../models/{model_name.lower().replace("-", "_")}_model.pkl'
    dump(model_instance, file_path)
    print(f"Model {model_name} saved successfully to {file_path}!")


Model LightFM-WARP saved successfully to ../models/lightfm_warp_model.pkl!
Model LightFM-BPR saved successfully to ../models/lightfm_bpr_model.pkl!


In [15]:
# Save the mappings
dump(user_map, '../models/user_map.pkl')
dump(item_map, '../models/item_map.pkl')

['../models/item_map.pkl']

# Test Section 2

In [None]:
model = LightFM(no_components=20, loss='bpr', learning_rate=0.005, item_alpha=0.02, user_alpha=0.02, random_state=42)
model.fit(interactions, epochs=30, num_threads=4)

In [None]:
predictions = model.predict(test_user_ids, test_item_ids, num_threads=4)

In [None]:
score = precision_at_k(model, interactions, k=5, num_threads=4)
print(f"Precision at k=5: {score.mean():.2f}")

In [None]:
preds = np.array([pred.est for pred in predictions])
actuals = np.array([pred.r_ui for pred in predictions])

# Test Section 3

In [None]:
# Compute RMSE and accuracy metrics
def compute_metrics(predictions, tolerance=0.5):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    rmse_val = np.sqrt(np.mean((preds - actuals) ** 2))
    accuracy = np.mean(np.abs(preds - actuals) <= tolerance) * 100
    return {'RMSE': rmse_val, f'Acc (±{tolerance})': accuracy}

# Compute precision@k for top-N recommendations
def compute_precision_at_k(predictions, k=5, threshold=3.5):
    user_est_true = {}
    for pred in predictions:
        uid, iid, true_r, est, _ = pred
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))
    
    precisions = []
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)  # Sort by predicted rating
        top_k = [r[1] >= threshold for r in ratings[:k]]  # True ratings >= threshold
        if top_k:
            precisions.append(sum(top_k) / len(top_k))
    
    return np.mean(precisions) if precisions else 0

In [None]:
# Generate predictions
predictions = model.test(testset)

In [None]:
# Compute metrics
metrics = compute_metrics(predictions)
precision_k = compute_precision_at_k(predictions, k=5, threshold=3.5)
metrics.update({'Precision@5': precision_k, 'Best Params': str(best_params)})

In [None]:
# Display results
results = [metrics]
results_df = pd.DataFrame(results)
print("\n=== CF Module Results (SVD) ===\n")
styled_df = results_df.style.format({
    'RMSE': '{:.3f}',
    'Acc (±0.5)': '{:.1f}%',
    'Precision@5': '{:.3f}',
    'Best Params': lambda x: x.replace(', ', ',\n')
}).set_properties(**{'text-align': 'left'})
display(styled_df)

# Test Section 4

In [None]:
# Compute RMSE and accuracy metrics
def compute_metrics(predictions, tolerance=0.5):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    rmse_val = np.sqrt(np.mean((preds - actuals) ** 2))
    accuracy = np.mean(np.abs(preds - actuals) <= tolerance) * 100
    return {'RMSE': rmse_val, f'Acc (±{tolerance})': accuracy}

# Compute precision@k
def compute_precision_at_k(predictions, k=5, threshold=3.5):
    user_est_true = {}
    for pred in predictions:
        uid, iid, true_r, est, _ = pred
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))
    
    precisions = []
    for uid, ratings in user_est_true.items():
        ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = [r[1] >= threshold for r in ratings[:k]]
        if top_k:
            precisions.append(sum(top_k) / len(top_k))
    
    return np.mean(precisions) if precisions else 0

In [None]:
# Hyperparameter tuning
param_grid = {
    'n_factors': [50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
gs.fit(data)

# Best parameters
best_params = gs.best_params['rmse']
print(f"Best RMSE: {gs.best_score['rmse']:.4f}")
print(f"Best params: {best_params}")

In [None]:
# Train final model
model = SVDpp(**best_params, random_state=42)
model.fit(trainset)

In [None]:
# Generate predictions
predictions = model.test(testset)

In [None]:
# Compute metrics
metrics = compute_metrics(predictions)
precision_k = compute_precision_at_k(predictions, k=5, threshold=3.5)
metrics.update({'Precision@5': precision_k, 'Best Params': str(best_params)})

# Display results
results = [metrics]
results_df = pd.DataFrame(results)
print("\n=== CF Module Results (SVDpp) ===\n")
styled_df = results_df.style.format({
    'RMSE': '{:.3f}',
    'Acc (±0.5)': '{:.1f}%',
    'Precision@5': '{:.3f}',
    'Best Params': lambda x: x.replace(', ', ',\n')
}).set_properties(**{'text-align': 'left'})
display(styled_df)

# Hyperparameter Tuning

In [None]:
# Define models and parameter grids
models = [
    {
        'name': 'LightFM-WARP',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['warp'],
            'learning_rate': [0.01, 0.03],
            'item_alpha': [0.02, 0.1],
            'user_alpha': [0.02, 0.1],
            'random_state': [42]
        }
    },
    {
        'name': 'LightFM-BPR',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['bpr'],
            'learning_rate': [0.01, 0.03],
            'item_alpha': [0.02, 0.1],
            'user_alpha': [0.02, 0.1],
            'random_state': [42]
        }
    },
    {
        'name': 'SVD',
        'algo': SVD,
        'params': {
            'n_factors': [50, 100, 150],
            'n_epochs': [20, 30],
            'lr_all': [0.005, 0.01],
            'reg_all': [0.02, 0.1]
        }
    },
    {
        'name': 'KNNBasic',
        'algo': KNNBasic,
        'params': {
            'k': [20, 40],
            'sim_options': {
                'name': ['msd', 'pearson'],
                'user_based': [False]
            }
        }
    },
    {
        'name': 'NMF',
        'algo': NMF,
        'params': {
            'n_factors': [10, 15],
            'n_epochs': [50, 100]
        }
    },
    {
        'name': 'CoClustering',
        'algo': CoClustering,
        'params': {
            'n_cltr_u': [3, 5],
            'n_cltr_i': [3, 5],
            'n_epochs': [20, 30]
        }
    }
]


In [None]:
def surprise_to_lightfm(trainset):
    """Convert Surprise Trainset to COO matrix for LightFM"""
    rows, cols, data = [], [], []
    for uid in trainset.all_users():
        user_ratings = trainset.ur[uid]
        for iid, rating in user_ratings:
            rows.append(uid)
            cols.append(iid)
            data.append(1)  # Use 1 for implicit feedback
    return coo_matrix((data, (rows, cols))), trainset.n_users, trainset.n_items

In [None]:
interactions, _, _ = surprise_to_lightfm(trainset)

In [None]:
# In your training loop:
for model_config in models:
    if 'LightFM' in model_config['name']:
        # LightFM handling
        #interactions, _, _ = surprise_to_lightfm(trainset)
        
        # Hyperparameter tuning
        best_score = -np.inf
        best_params = {}
        for params in ParameterGrid(model_config['params']):
            print(f"Training {model_config['name']} with params: {params}")
            model = LightFM(**params)
            model.fit(interactions, epochs=10, verbose=False)
            score = precision_at_k(model, interactions, k=5).mean()
            if score > best_score:
                best_score = score
                best_params = params
                
        # Final training
        print(f"[Final] Training {model_config['name']} with params: {params}")
        final_model = LightFM(**best_params)
        final_model.fit(interactions, epochs=20)
        
        # Generate predictions (example for LightFM)
        user_ids = np.arange(interactions.shape[0])

    else:
    # Original Surprise handling
        print(f"Training {model_config['name']} with params: {params}")
        gs = GridSearchCV(
            model_config['algo'],
            model_config['params'],
            measures=['rmse'],
            cv=5
        )
        gs.fit(data)
        best_model = gs.best_estimator['rmse']
        best_model.fit(trainset)

In [None]:
# Convert testset to LightFM-compatible format
test_user_ids = [trainset.to_inner_uid(uid) for (uid, _, _) in testset]
test_item_ids = [trainset.to_inner_iid(iid) for (_, iid, _) in testset]

# Generate predictions only for test pairs
test_preds = final_model.predict(test_user_ids, test_item_ids)

In [None]:
preds = final_model.predict(user_ids, np.arange(interactions.shape[1]))
#print("Train precision: %.2f" % precision_at_k(model, interactions, k=5).mean())

best_params = {}

for model_config in models:
    print(f"\n=== Tuning {model_config['name']} ===")
    gs = GridSearchCV(
        model_config['algo'],
        model_config['params'],
        measures=['rmse'],
        cv=5,
        n_jobs=-1,
        pre_dispatch='2*n_jobs'
    )
    gs.fit(data)
    best_params[model_config['name']] = gs.best_params['rmse']

# Model Training

In [None]:
results = []

for model_config in models:
    if 'LightFM' in model_config['name']:
        continue  # Skip LightFM for now
    print(f"\n=== Training {model_config['name']} ===")
    
    # Initialize with best params
    model = model_config['algo'](**best_params[model_config['name']])
    model.fit(trainset)
    
    # Generate predictions
    predictions = model.test(testset)
    preds = np.array([pred.est for pred in predictions])
    actuals = np.array([pred.r_ui for pred in predictions])
    
    # Calculate metrics
    rmse = np.sqrt(np.mean((preds - actuals) ** 2))
    tol_1 = np.mean(np.abs(preds - actuals) <= 1) * 100
    tol_05 = np.mean(np.abs(preds - actuals) <= 0.5) * 100
    
    results.append({
        'Model': model_config['name'],
        'Best Params': best_params[model_config['name']],
        'RMSE': rmse,
        'Acc (±1)': tol_1,
        'Acc (±0.5)': tol_05
    })

In [None]:
# For each model's predictions:
tolerance = 1
stricter_tolerance = 0.5

for model_result in results:
    model_name = model_result['Model']
    print(f"\n{model_name} Accuracy:")
    print(f"Within ±{tolerance} Stars: {model_result['Acc (±1)']:.2f}%")
    print(f"Within ±{stricter_tolerance} Stars: {model_result['Acc (±0.5)']:.2f}%")

In [None]:
#preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
#actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Print predictions
#for pred in test_preds:
#    print(f"Predicted={pred.est:.2f}, Actual={pred.r_ui}")

In [None]:
# Display results in DataFrame
results_df = pd.DataFrame(results)
print("\n=== Model Comparison ===")
print(results_df.to_string(index=False))

# Optional: Formatting for better display
results_df.style.format({
    'RMSE': '{:.4f}',
    'Acc (±1)': '{:.2f}%',
    'Acc (±0.5)': '{:.2f}%'
})

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model.pkl')  # Or use .joblib extension
print("Model saved successfully!")

# Model Training with 10M

In [None]:
# Dataset Upgrade
beeg_data = pd.read_csv(r"K:\MachineProject\Data\ml-32m\ratings.dat", sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

# Drop the Timestamp column
beeg_data = beeg_data.drop('Timestamp', axis=1)
beeg_data.columns = ['UserID', 'MovieID', 'Rating']
beeg_data.head()

In [None]:
ratings_per_user = beeg_data.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
beeg_data["Rating"].value_counts()

In [None]:
beeg_data["MovieID"].describe()

In [None]:
beeg_data["UserID"].describe()

In [None]:
beeg_data['UserID'] = beeg_data['UserID'].astype('int32')
beeg_data['MovieID'] = beeg_data['MovieID'].astype('int32')
beeg_data['Rating'] = beeg_data['Rating'].astype('float16')

beeg_data.info(memory_usage='deep')

In [None]:
# Stratify by user_id (ensure all users are represented)
subsampled_df, _ = sklearn_split(
    beeg_data,
    test_size=0.5,
    stratify=beeg_data['UserID'],  # Preserve user distribution
    random_state=42
)

In [None]:
reader = Reader(rating_scale=(beeg_data['Rating'].min(), beeg_data['Rating'].max()))
data = Dataset.load_from_df(beeg_data[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
memory = Memory(location='./cache', verbose=0)

In [None]:
param_grid = {
    'n_factors': [50, 100],  # Test latent dimensions
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}


gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=1,
)
gs.fit(data)

# Best RMSE score and params
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best params: {gs.best_params['rmse']}")

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

In [None]:
test_preds = model.test(testset)
accuracy.rmse(test_preds)

In [None]:
preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model_2.pkl')  # Or use .joblib extension
print("Model saved successfully!")

# Model Training with 32M

In [None]:
# Dataset Upgrade
beegar_data = pd.read_csv(r"K:\MachineProject\Data\ml-32m\ratings.csv")

# Drop the Timestamp column
beegar_data = beegar_data.drop('timestamp', axis=1)
beegar_data.columns = ['UserID', 'MovieID', 'Rating']
beegar_data.head()

In [None]:
ratings_per_user = beegar_data.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
beegar_data["Rating"].value_counts()

In [None]:
beegar_data["MovieID"].describe()

In [None]:
beegar_data["UserID"].describe()

In [None]:
beegar_data.info(memory_usage='deep')

In [None]:
beegar_data['UserID'] = beegar_data['UserID'].astype('int32')
beegar_data['MovieID'] = beegar_data['MovieID'].astype('int32')
beegar_data['Rating'] = beegar_data['Rating'].astype('float16')

beegar_data.info(memory_usage='deep')

In [None]:
# Stratify by user_id (ensure all users are represented)
subsampled_df, _ = sklearn_split(
    beegar_data,
    test_size=0.5,
    stratify=beegar_data['UserID'],  # Preserve user distribution
    random_state=42
)

In [None]:
reader = Reader(rating_scale=(beegar_data['Rating'].min(), beegar_data['Rating'].max()))
data = Dataset.load_from_df(beegar_data[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
memory = Memory(location='./cache', verbose=0)

In [None]:
param_grid = {
    'n_factors': [50, 100],  # Test latent dimensions
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}


gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=1,
)
gs.fit(data)

# Best RMSE score and params
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best params: {gs.best_params['rmse']}")

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

In [None]:
test_preds = model.test(testset)
accuracy.rmse(test_preds)

In [None]:
preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model_2.pkl')  # Or use .joblib extension
print("Model saved successfully!")