# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split as sklearn_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score, roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from surprise import SVD, Dataset, Reader, accuracy, KNNBasic, SlopeOne, CoClustering, NMF, Prediction
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from surprise.accuracy import rmse
from joblib import Memory, parallel_backend, dump
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix

# Data Preprocessing

In [2]:
df = pd.read_csv("../../preprocessing/merged_data.csv")
df.head()

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Title,Genres,Year
0,1,1193,5,0,1,One Flew Over the Cuckoo's Nest,[8],1975
1,1,661,3,0,1,James and the Giant Peach,"[3, 4, 12]",1996
2,1,914,3,0,1,My Fair Lady,"[12, 14]",1964
3,1,3408,4,0,1,Erin Brockovich,[8],2000
4,1,2355,5,0,1,"Bug's Life, A","[3, 4, 5]",1998


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   UserID   1000209 non-null  int64 
 1   MovieID  1000209 non-null  int64 
 2   Rating   1000209 non-null  int64 
 3   Gender   1000209 non-null  int64 
 4   Age      1000209 non-null  int64 
 5   Title    1000209 non-null  object
 6   Genres   1000209 non-null  object
 7   Year     1000209 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 61.0+ MB


In [4]:
filtered_df = df.drop(columns=["Gender","Age","Title","Year", "Genres"])
filtered_df.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [5]:
filtered_df["Rating"].value_counts()

Rating
4    348971
3    261197
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [6]:
ratings_per_user = filtered_df.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

      user_id  num_ratings
0           1           53
1           2          129
2           3           51
3           4           21
4           5          198
...       ...          ...
6035     6036          888
6036     6037          202
6037     6038           20
6038     6039          123
6039     6040          341

[6040 rows x 2 columns]


In [7]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
10%        27.000000
50%        96.000000
90%       400.000000
max      2314.000000
Name: num_ratings, dtype: float64


In [8]:
filtered_df["MovieID"].describe()

count    1.000209e+06
mean     1.865540e+03
std      1.096041e+03
min      1.000000e+00
25%      1.030000e+03
50%      1.835000e+03
75%      2.770000e+03
max      3.952000e+03
Name: MovieID, dtype: float64

In [9]:
filtered_df["UserID"].describe()

count    1.000209e+06
mean     3.024512e+03
std      1.728413e+03
min      1.000000e+00
25%      1.506000e+03
50%      3.070000e+03
75%      4.476000e+03
max      6.040000e+03
Name: UserID, dtype: float64

In [10]:
subsampled_df, _ = sklearn_split(filtered_df, test_size=0.5, random_state=42, stratify=filtered_df['UserID'])

In [11]:
reader = Reader(rating_scale=(filtered_df['Rating'].min(), filtered_df['Rating'].max()))
data = Dataset.load_from_df(filtered_df[['UserID', 'MovieID', 'Rating']], reader)

In [12]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Test Playground

In [13]:
def surprise_to_lightfm(trainset):
    """Convert Surprise Trainset to COO matrix for LightFM"""
    rows, cols, data = [], [], []
    for uid in trainset.all_users():
        user_ratings = trainset.ur[uid]
        for iid, rating in user_ratings:
            rows.append(uid)
            cols.append(iid)
            data.append(1)  # Use 1 for implicit feedback
    return coo_matrix((data, (rows, cols))), trainset.n_users, trainset.n_items



In [45]:
interactions, _, _ = surprise_to_lightfm(trainset)

In [30]:
# Check for NaNs
assert not np.isnan(interactions.data).any()


In [31]:
# Remove zero-interaction users/items
interactions = interactions.tocsr()
interactions = interactions[interactions.getnnz(1) > 0]  # Remove empty rows
interactions = interactions[:, interactions.getnnz(0) > 0]  # Remove empty cols

In [19]:
# Define models and parameter grids
models = [
    {
        'name': 'LightFM-WARP',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['warp'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
    {
        'name': 'LightFM-BPR',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['bpr'],
            'learning_rate': [0.005, 0.01],
            'item_alpha': [0.02, 0.1, 0.3],
            'user_alpha': [0.02, 0.1, 0.3],
            'random_state': [42]
        }
    },
]

In [20]:
# Tuning results storage
best_params = {}

# Tuning loop
for config in models:
    print(f"\n=== Tuning {config['name']} ===")
    
    if config['algo'] == LightFM:
        # LightFM tuning
        best_score = -np.inf
        best_params = {}
        for params in ParameterGrid(config['params']):
            print(f"Training {config['name']} with params: {params}")
            model = LightFM(**params)
            model.fit(interactions, epochs=10, verbose=False)
            score = precision_at_k(model, interactions, k=5).mean()
            if score > best_score:
                best_score = score
                best_params = params
                
    else:
        continue


=== Tuning LightFM-WARP ===
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 20, 'random_state': 42, 'user_alpha': 0.02}
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 20, 'random_state': 42, 'user_alpha': 0.1}
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 20, 'random_state': 42, 'user_alpha': 0.3}
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 50, 'random_state': 42, 'user_alpha': 0.02}
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 50, 'random_state': 42, 'user_alpha': 0.1}
Training LightFM-WARP with params: {'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'warp', 'no_components': 50, 'random_state': 42, 'user_alpha': 0.3}
Training LightFM-WARP with params

In [60]:
def compute_metrics(predictions, tolerance=0.5):
    actuals = np.array([pred.r_ui for pred in predictions])
    preds = np.array([pred.est for pred in predictions])
    
    return {
        'RMSE': np.sqrt(np.mean((preds - actuals) ** 2)),
        f'Acc (±{tolerance})': np.mean(np.abs(preds - actuals) <= tolerance) * 100
    }

In [48]:
# Get all item IDs present in training
train_iids = set(trainset.all_items())  # Surprise's inner item IDs

In [49]:
filtered_testset = []
for (uid, iid, rating) in testset:
    try:
        # Check if item exists in training set
        trainset.to_inner_iid(iid)  # Will throw ValueError if not found
        filtered_testset.append((uid, iid, rating))
    except ValueError:
        print(f"Skipping unknown item: {iid}")
        continue

Skipping unknown item: 3904
Skipping unknown item: 791
Skipping unknown item: 624
Skipping unknown item: 789
Skipping unknown item: 1724
Skipping unknown item: 2254
Skipping unknown item: 3065
Skipping unknown item: 1165
Skipping unknown item: 3229
Skipping unknown item: 701
Skipping unknown item: 3323
Skipping unknown item: 791
Skipping unknown item: 730
Skipping unknown item: 557
Skipping unknown item: 3647
Skipping unknown item: 655
Skipping unknown item: 584
Skipping unknown item: 557
Skipping unknown item: 774
Skipping unknown item: 3881
Skipping unknown item: 672
Skipping unknown item: 712
Skipping unknown item: 2909
Skipping unknown item: 789
Skipping unknown item: 672
Skipping unknown item: 1630
Skipping unknown item: 139
Skipping unknown item: 1832
Skipping unknown item: 712
Skipping unknown item: 398
Skipping unknown item: 3376
Skipping unknown item: 396
Skipping unknown item: 868
Skipping unknown item: 2226
Skipping unknown item: 1118
Skipping unknown item: 2563


In [50]:
model = LightFM(**best_params)
model.fit(interactions, epochs=30)

# Generate predictions
test_user_ids = [trainset.to_inner_uid(uid) for (uid, _, _) in testset]
test_item_ids = [trainset.to_inner_iid(iid) for (_, iid, _) in filtered_testset]



In [53]:
# Generate valid indices (original testset positions)
valid_indices = []
test_user_ids = []
test_item_ids = []

for idx, (uid, iid, rating) in enumerate(testset):
    try:
        u_inner = trainset.to_inner_uid(uid)
        i_inner = trainset.to_inner_iid(iid)
        valid_indices.append(idx)  # Store original testset index
        test_user_ids.append(u_inner)
        test_item_ids.append(i_inner)
    except ValueError:
        continue  # Skip cold-start users/items

# Predict for valid pairs
preds = model.predict(test_user_ids, test_item_ids).flatten()

# Create Prediction objects using original testset indices
predictions = [
    Prediction(
        uid=testset[idx][0], 
        iid=testset[idx][1], 
        r_ui=testset[idx][2], 
        est=float(preds[j]),
        details=None,
    )
    for j, idx in enumerate(valid_indices)
]

In [54]:
print("Predictions shape:", preds.shape)  # Should be (n_predictions,)

Predictions shape: (200006,)


In [61]:
results = []
model_result = {'Model': config['name']}

# Compute metrics
metrics = compute_metrics(predictions)
model_result.update(metrics)
model_result.update({'Best Params': str(best_params)})
results.append(model_result)

In [44]:
results = []

for config in models:
    print(f"\n=== Training {config['name']} ===")
    model_result = {'Model': config['name']}
    
    if config['algo'] == LightFM:
        # LightFM training
        interactions = surprise_to_lightfm(trainset)
        model = LightFM(**best_params)
        model.fit(interactions, epochs=30)
        
        # Generate predictions
        test_user_ids = [trainset.to_inner_uid(uid) for (uid, _, _) in testset]
        test_item_ids = [trainset.to_inner_iid(iid) for (_, iid, _) in testset]
        
        # Generate valid indices (original testset positions)
        valid_indices = []
        test_user_ids = []
        test_item_ids = []

        for idx, (uid, iid, rating) in enumerate(testset):
            try:
                u_inner = trainset.to_inner_uid(uid)
                i_inner = trainset.to_inner_iid(iid)
                valid_indices.append(idx)  # Store original testset index
                test_user_ids.append(u_inner)
                test_item_ids.append(i_inner)
            except ValueError:
                continue  # Skip cold-start users/items

        # Predict for valid pairs
        preds = model.predict(test_user_ids, test_item_ids)

        # Create Prediction objects using original testset indices
        predictions = [
            Prediction(
                uid=testset[idx][0], 
                iid=testset[idx][1], 
                r_ui=testset[idx][2], 
                est=preds[j],
                details=None,
            )
            for j, idx in enumerate(valid_indices)
        ]
    else:
        continue
    
    # Compute metrics
    metrics = compute_metrics(predictions)
    model_result.update(metrics)
    model_result.update({'Best Params': str(best_params[config['name']])})
    results.append(model_result)


=== Training LightFM-WARP ===


AttributeError: 'tuple' object has no attribute 'tocoo'

In [62]:
results_df = pd.DataFrame(results)
print("\n=== Final Results ===")

# Formatting
styled_df = results_df.style.format({
    'RMSE': '{:.3f}',
    'Acc (±1)': '{:.1f}%',
    'Best Params': lambda x: x.replace(', ', ',\n')
}).set_properties(**{'text-align': 'left'})

styled_df


=== Final Results ===


Unnamed: 0,Model,RMSE,Acc (±0.5),Best Params
0,LightFM-WARP,3.767,0.0,"{'item_alpha': 0.02, 'learning_rate': 0.005, 'loss': 'bpr', 'no_components': 20, 'random_state': 42, 'user_alpha': 0.02}"


# TEST 2

In [63]:
model = LightFM(no_components=20, loss='bpr', learning_rate=0.005, item_alpha=0.02, user_alpha=0.02, random_state=42)
model.fit(interactions, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x72b2b34feb30>

In [64]:
predictions = model.predict(test_user_ids, test_item_ids, num_threads=4)

In [67]:
score = precision_at_k(model, interactions, k=5, num_threads=4)
print(f"Precision at k=5: {score.mean():.2f}")

Precision at k=5: 0.40


In [65]:
preds = np.array([pred.est for pred in predictions])
actuals = np.array([pred.r_ui for pred in predictions])

AttributeError: 'numpy.float32' object has no attribute 'est'

# Hyperparameter Tuning

In [None]:
# Define models and parameter grids
models = [
    {
        'name': 'LightFM-WARP',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['warp'],
            'learning_rate': [0.01, 0.03],
            'item_alpha': [0.02, 0.1],
            'user_alpha': [0.02, 0.1],
            'random_state': [42]
        }
    },
    {
        'name': 'LightFM-BPR',
        'algo': LightFM,
        'params': {
            'no_components': [20, 50],
            'loss': ['bpr'],
            'learning_rate': [0.01, 0.03],
            'item_alpha': [0.02, 0.1],
            'user_alpha': [0.02, 0.1],
            'random_state': [42]
        }
    },
    {
        'name': 'SVD',
        'algo': SVD,
        'params': {
            'n_factors': [50, 100, 150],
            'n_epochs': [20, 30],
            'lr_all': [0.005, 0.01],
            'reg_all': [0.02, 0.1]
        }
    },
    {
        'name': 'KNNBasic',
        'algo': KNNBasic,
        'params': {
            'k': [20, 40],
            'sim_options': {
                'name': ['msd', 'pearson'],
                'user_based': [False]
            }
        }
    },
    {
        'name': 'NMF',
        'algo': NMF,
        'params': {
            'n_factors': [10, 15],
            'n_epochs': [50, 100]
        }
    },
    {
        'name': 'CoClustering',
        'algo': CoClustering,
        'params': {
            'n_cltr_u': [3, 5],
            'n_cltr_i': [3, 5],
            'n_epochs': [20, 30]
        }
    }
]


In [None]:
def surprise_to_lightfm(trainset):
    """Convert Surprise Trainset to COO matrix for LightFM"""
    rows, cols, data = [], [], []
    for uid in trainset.all_users():
        user_ratings = trainset.ur[uid]
        for iid, rating in user_ratings:
            rows.append(uid)
            cols.append(iid)
            data.append(1)  # Use 1 for implicit feedback
    return coo_matrix((data, (rows, cols))), trainset.n_users, trainset.n_items

In [None]:
interactions, _, _ = surprise_to_lightfm(trainset)

In [None]:
# In your training loop:
for model_config in models:
    if 'LightFM' in model_config['name']:
        # LightFM handling
        #interactions, _, _ = surprise_to_lightfm(trainset)
        
        # Hyperparameter tuning
        best_score = -np.inf
        best_params = {}
        for params in ParameterGrid(model_config['params']):
            print(f"Training {model_config['name']} with params: {params}")
            model = LightFM(**params)
            model.fit(interactions, epochs=10, verbose=False)
            score = precision_at_k(model, interactions, k=5).mean()
            if score > best_score:
                best_score = score
                best_params = params
                
        # Final training
        print(f"[Final] Training {model_config['name']} with params: {params}")
        final_model = LightFM(**best_params)
        final_model.fit(interactions, epochs=20)
        
        # Generate predictions (example for LightFM)
        user_ids = np.arange(interactions.shape[0])

    else:
    # Original Surprise handling
        print(f"Training {model_config['name']} with params: {params}")
        gs = GridSearchCV(
            model_config['algo'],
            model_config['params'],
            measures=['rmse'],
            cv=5
        )
        gs.fit(data)
        best_model = gs.best_estimator['rmse']
        best_model.fit(trainset)

In [None]:
# Convert testset to LightFM-compatible format
test_user_ids = [trainset.to_inner_uid(uid) for (uid, _, _) in testset]
test_item_ids = [trainset.to_inner_iid(iid) for (_, iid, _) in testset]

# Generate predictions only for test pairs
test_preds = final_model.predict(test_user_ids, test_item_ids)

In [None]:
preds = final_model.predict(user_ids, np.arange(interactions.shape[1]))
#print("Train precision: %.2f" % precision_at_k(model, interactions, k=5).mean())

best_params = {}

for model_config in models:
    print(f"\n=== Tuning {model_config['name']} ===")
    gs = GridSearchCV(
        model_config['algo'],
        model_config['params'],
        measures=['rmse'],
        cv=5,
        n_jobs=-1,
        pre_dispatch='2*n_jobs'
    )
    gs.fit(data)
    best_params[model_config['name']] = gs.best_params['rmse']

# Model Training

In [None]:
results = []

for model_config in models:
    if 'LightFM' in model_config['name']:
        continue  # Skip LightFM for now
    print(f"\n=== Training {model_config['name']} ===")
    
    # Initialize with best params
    model = model_config['algo'](**best_params[model_config['name']])
    model.fit(trainset)
    
    # Generate predictions
    predictions = model.test(testset)
    preds = np.array([pred.est for pred in predictions])
    actuals = np.array([pred.r_ui for pred in predictions])
    
    # Calculate metrics
    rmse = np.sqrt(np.mean((preds - actuals) ** 2))
    tol_1 = np.mean(np.abs(preds - actuals) <= 1) * 100
    tol_05 = np.mean(np.abs(preds - actuals) <= 0.5) * 100
    
    results.append({
        'Model': model_config['name'],
        'Best Params': best_params[model_config['name']],
        'RMSE': rmse,
        'Acc (±1)': tol_1,
        'Acc (±0.5)': tol_05
    })

In [None]:
# For each model's predictions:
tolerance = 1
stricter_tolerance = 0.5

for model_result in results:
    model_name = model_result['Model']
    print(f"\n{model_name} Accuracy:")
    print(f"Within ±{tolerance} Stars: {model_result['Acc (±1)']:.2f}%")
    print(f"Within ±{stricter_tolerance} Stars: {model_result['Acc (±0.5)']:.2f}%")

In [None]:
#preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
#actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Print predictions
#for pred in test_preds:
#    print(f"Predicted={pred.est:.2f}, Actual={pred.r_ui}")

In [None]:
# Display results in DataFrame
results_df = pd.DataFrame(results)
print("\n=== Model Comparison ===")
print(results_df.to_string(index=False))

# Optional: Formatting for better display
results_df.style.format({
    'RMSE': '{:.4f}',
    'Acc (±1)': '{:.2f}%',
    'Acc (±0.5)': '{:.2f}%'
})

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model.pkl')  # Or use .joblib extension
print("Model saved successfully!")

# Model Training with 10M

In [None]:
# Dataset Upgrade
beeg_data = pd.read_csv(r"K:\MachineProject\Data\ml-32m\ratings.dat", sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

# Drop the Timestamp column
beeg_data = beeg_data.drop('Timestamp', axis=1)
beeg_data.columns = ['UserID', 'MovieID', 'Rating']
beeg_data.head()

In [None]:
ratings_per_user = beeg_data.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
beeg_data["Rating"].value_counts()

In [None]:
beeg_data["MovieID"].describe()

In [None]:
beeg_data["UserID"].describe()

In [None]:
beeg_data['UserID'] = beeg_data['UserID'].astype('int32')
beeg_data['MovieID'] = beeg_data['MovieID'].astype('int32')
beeg_data['Rating'] = beeg_data['Rating'].astype('float16')

beeg_data.info(memory_usage='deep')

In [None]:
# Stratify by user_id (ensure all users are represented)
subsampled_df, _ = sklearn_split(
    beeg_data,
    test_size=0.5,
    stratify=beeg_data['UserID'],  # Preserve user distribution
    random_state=42
)

In [None]:
reader = Reader(rating_scale=(beeg_data['Rating'].min(), beeg_data['Rating'].max()))
data = Dataset.load_from_df(beeg_data[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
memory = Memory(location='./cache', verbose=0)

In [None]:
param_grid = {
    'n_factors': [50, 100],  # Test latent dimensions
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}


gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=1,
)
gs.fit(data)

# Best RMSE score and params
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best params: {gs.best_params['rmse']}")

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

In [None]:
test_preds = model.test(testset)
accuracy.rmse(test_preds)

In [None]:
preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model_2.pkl')  # Or use .joblib extension
print("Model saved successfully!")

# Model Training with 32M

In [None]:
# Dataset Upgrade
beegar_data = pd.read_csv(r"K:\MachineProject\Data\ml-32m\ratings.csv")

# Drop the Timestamp column
beegar_data = beegar_data.drop('timestamp', axis=1)
beegar_data.columns = ['UserID', 'MovieID', 'Rating']
beegar_data.head()

In [None]:
ratings_per_user = beegar_data.groupby('UserID')['Rating'].count().reset_index()
ratings_per_user.columns = ['user_id', 'num_ratings']
print(ratings_per_user)

In [None]:
stats = ratings_per_user['num_ratings'].describe(percentiles=[0.1, 0.5, 0.9])
print(stats)

In [None]:
beegar_data["Rating"].value_counts()

In [None]:
beegar_data["MovieID"].describe()

In [None]:
beegar_data["UserID"].describe()

In [None]:
beegar_data.info(memory_usage='deep')

In [None]:
beegar_data['UserID'] = beegar_data['UserID'].astype('int32')
beegar_data['MovieID'] = beegar_data['MovieID'].astype('int32')
beegar_data['Rating'] = beegar_data['Rating'].astype('float16')

beegar_data.info(memory_usage='deep')

In [None]:
# Stratify by user_id (ensure all users are represented)
subsampled_df, _ = sklearn_split(
    beegar_data,
    test_size=0.5,
    stratify=beegar_data['UserID'],  # Preserve user distribution
    random_state=42
)

In [None]:
reader = Reader(rating_scale=(beegar_data['Rating'].min(), beegar_data['Rating'].max()))
data = Dataset.load_from_df(beegar_data[['UserID', 'MovieID', 'Rating']], reader)

In [None]:
memory = Memory(location='./cache', verbose=0)

In [None]:
param_grid = {
    'n_factors': [50, 100],  # Test latent dimensions
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.02, 0.1]
}


gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=5,
    n_jobs=1,
)
gs.fit(data)

# Best RMSE score and params
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best params: {gs.best_params['rmse']}")

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
model.fit(trainset)

In [None]:
test_preds = model.test(testset)
accuracy.rmse(test_preds)

In [None]:
preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = np.mean(correct) * 100

# Define stricter tolerance (e.g., predictions within ±0.5 stars are "correct")
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = np.mean(s_correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%")
print(f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%")

In [None]:
# Save the model to disk
dump(model, '../models/cf_model_2.pkl')  # Or use .joblib extension
print("Model saved successfully!")