# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split as sklearn_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score, roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from surprise import SVD, Dataset, Reader, accuracy, KNNBasic, SlopeOne, CoClustering, NMF
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from joblib import Memory, parallel_backend, dump

# Data Preprocessing

In [2]:
# Read the merged ratings table from the preprocessing folder.
df = pd.read_csv(filepath_or_buffer="../../preprocessing/merged_data.csv")
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Title,Genres,Year
0,1,1193,5,0,1,One Flew Over the Cuckoo's Nest,[8],1975
1,1,661,3,0,1,James and the Giant Peach,"[3, 4, 12]",1996
2,1,914,3,0,1,My Fair Lady,"[12, 14]",1964
3,1,3408,4,0,1,Erin Brockovich,[8],2000
4,1,2355,5,0,1,"Bug's Life, A","[3, 4, 5]",1998


In [3]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   UserID   1000209 non-null  int64 
 1   MovieID  1000209 non-null  int64 
 2   Rating   1000209 non-null  int64 
 3   Gender   1000209 non-null  int64 
 4   Age      1000209 non-null  int64 
 5   Title    1000209 non-null  object
 6   Genres   1000209 non-null  object
 7   Year     1000209 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 61.0+ MB


In [4]:
# Keep only the columns that are not listed here; the demographic and movie
# metadata columns are removed for the collaborative-filtering models below.
filtered_df = df.drop(["Gender", "Age", "Title", "Year", "Genres"], axis=1)
filtered_df.head(5)

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [5]:
# How many rows carry each rating value (most frequent first).
filtered_df["Rating"].value_counts()

Rating
4    348971
3    261197
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [6]:
# Count the ratings submitted by each user and expose them as a two-column
# frame (user_id, num_ratings).
ratings_per_user = (
    filtered_df
    .groupby('UserID')['Rating']
    .count()
    .reset_index()
    .rename(columns={'UserID': 'user_id', 'Rating': 'num_ratings'})
)
print(ratings_per_user)

      user_id  num_ratings
0           1           53
1           2          129
2           3           51
3           4           21
4           5          198
...       ...          ...
6035     6036          888
6036     6037          202
6037     6038           20
6038     6039          123
6039     6040          341

[6040 rows x 2 columns]


In [7]:
# Summarise the per-user rating counts, reporting the 10th, 50th and 90th
# percentiles instead of the default quartiles.
stats = ratings_per_user.num_ratings.describe(
    percentiles=[0.1, 0.5, 0.9],
)
print(stats)

count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
10%        27.000000
50%        96.000000
90%       400.000000
max      2314.000000
Name: num_ratings, dtype: float64


In [8]:
# Summary statistics of the MovieID column (shows the range of item ids).
filtered_df["MovieID"].describe()

count    1.000209e+06
mean     1.865540e+03
std      1.096041e+03
min      1.000000e+00
25%      1.030000e+03
50%      1.835000e+03
75%      2.770000e+03
max      3.952000e+03
Name: MovieID, dtype: float64

In [9]:
# Summary statistics of the UserID column (shows the range of user ids).
filtered_df["UserID"].describe()

count    1.000209e+06
mean     3.024512e+03
std      1.728413e+03
min      1.000000e+00
25%      1.506000e+03
50%      3.070000e+03
75%      4.476000e+03
max      6.040000e+03
Name: UserID, dtype: float64

In [10]:
# Per-user mean rating, later subtracted from each rating to centre every
# user's ratings around zero.
# NOTE(review): the means are computed over all ratings, including rows that
# are later held out as the test set -- confirm this leakage is acceptable.
user_means = filtered_df.groupby('UserID')['Rating'].mean().rename('user_mean')

# Drop any 'user_mean' column left behind by a previous run of this cell before
# joining; otherwise re-running the cell raises a "columns overlap" ValueError.
filtered_df = (
    filtered_df
    .drop(columns='user_mean', errors='ignore')
    .join(user_means, on='UserID')
)

In [11]:
# Centre each rating on its user's mean so the models learn deviations from
# the user's typical score.
filtered_df['normalized_rating'] = filtered_df['Rating'].sub(filtered_df['user_mean'])
print(filtered_df)

         UserID  MovieID  Rating  user_mean  normalized_rating
0             1     1193       5   4.188679           0.811321
1             1      661       3   4.188679          -1.188679
2             1      914       3   4.188679          -1.188679
3             1     3408       4   4.188679          -0.188679
4             1     2355       5   4.188679           0.811321
...         ...      ...     ...        ...                ...
1000204    6040     1091       1   3.577713          -2.577713
1000205    6040     1094       5   3.577713           1.422287
1000206    6040      562       5   3.577713           1.422287
1000207    6040     1096       4   3.577713           0.422287
1000208    6040     1097       4   3.577713           0.422287

[1000209 rows x 5 columns]


# Model Training

In [12]:
# The rating scale is taken from the observed range of the mean-centred
# ratings rather than the raw 1-5 star scale.
reader = Reader(
    rating_scale=(
        filtered_df['normalized_rating'].min(),
        filtered_df['normalized_rating'].max(),
    )
)
# Surprise expects exactly three columns in (user, item, rating) order.
data = Dataset.load_from_df(
    filtered_df[['UserID', 'MovieID', 'normalized_rating']],
    reader,
)

In [13]:
# Hold out 20% of the ratings as a test set; random_state fixes the split so
# the evaluation is repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [16]:
# Candidate algorithms and the hyperparameter grid searched for each one.
# Every entry carries a display name, the Surprise algorithm class, and the
# parameter grid handed to GridSearchCV.
models = [
    dict(
        name='SVD',
        algo=SVD,
        params=dict(
            n_factors=[50, 100, 150],
            n_epochs=[20, 30],
            lr_all=[0.005, 0.01],
            reg_all=[0.02, 0.1],
        ),
    ),
    dict(
        name='KNNBasic',
        algo=KNNBasic,
        params=dict(
            k=[20, 40],
            # Item-based similarities only (user_based is fixed to False).
            sim_options=dict(
                name=['msd', 'pearson'],
                user_based=[False],
            ),
        ),
    ),
    dict(
        name='NMF',
        algo=NMF,
        params=dict(
            n_factors=[10, 15],
            n_epochs=[50, 100],
        ),
    ),
    dict(
        name='CoClustering',
        algo=CoClustering,
        params=dict(
            n_cltr_u=[3, 5],
            n_cltr_i=[3, 5],
            n_epochs=[20, 30],
        ),
    ),
]

# One summary dict per model is appended here by the tuning loop.
results = list()

In [17]:
# Tune on the training split only. Running the grid search on `data` (which
# still contains the held-out test ratings) would let the test set influence
# hyperparameter selection and make the reported test metrics optimistic.
# build_testset() returns the training ratings as (raw user id, raw item id,
# rating) tuples, which are re-wrapped as a Dataset for GridSearchCV.
tuning_df = pd.DataFrame(
    trainset.build_testset(),
    columns=['UserID', 'MovieID', 'normalized_rating'],
)
tuning_data = Dataset.load_from_df(tuning_df, reader)

# Start from an empty list so re-running this cell does not append duplicate
# rows to the comparison table.
results = []

for model_config in models:
    print(f"\n=== Tuning {model_config['name']} ===")

    # Hyperparameter tuning with 5-fold cross-validation on the training data.
    gs = GridSearchCV(
        model_config['algo'],
        model_config['params'],
        measures=['rmse'],
        cv=5,
        n_jobs=-1,
        pre_dispatch='2*n_jobs'
    )
    gs.fit(tuning_data)

    # best_estimator holds an unfitted algorithm configured with the best
    # parameters, so it is fitted on the training split here.
    best_model = gs.best_estimator['rmse']
    best_model.fit(trainset)

    # Generate predictions on the held-out ratings.
    predictions = best_model.test(testset)

    # The models predict mean-centred ratings; add each user's mean back to
    # return to the original star scale (one vectorised lookup per cell run).
    user_offsets = user_means.loc[[pred.uid for pred in predictions]].to_numpy()
    preds = np.array([pred.est for pred in predictions]) + user_offsets
    actuals = np.array([pred.r_ui for pred in predictions]) + user_offsets

    # Star ratings are bounded, so clamp predictions to the valid 1-5 range.
    preds = np.clip(preds, 1, 5)

    # Error metrics on the star scale.
    abs_errors = np.abs(preds - actuals)
    rmse = np.sqrt(np.mean(abs_errors ** 2))

    # Share of predictions within 1 star and within half a star of the truth.
    tol_1 = np.mean(abs_errors <= 1) * 100
    tol_05 = np.mean(abs_errors <= 0.5) * 100

    results.append({
        'Model': model_config['name'],
        'Best Params': gs.best_params['rmse'],
        'RMSE': rmse,
        'Acc (±1)': tol_1,
        'Acc (±0.5)': tol_05
    })


=== Tuning SVD ===

=== Tuning KNNBasic ===
Computing the msd similarity matrix...
Done computing similarity matrix.

=== Tuning NMF ===

=== Tuning CoClustering ===


In [None]:
#preds = np.array([pred.est for pred in test_preds]).reshape(-1, 1)
#actuals = np.array([pred.r_ui for pred in test_preds]).reshape(-1, 1)

In [None]:
# Print predictions
#for pred in test_preds:
#    print(f"Predicted={pred.est:.2f}, Actual={pred.r_ui}")

In [18]:
# Gather the per-model summaries into one comparison table and print it as
# plain text (without the index column).
results_df = pd.DataFrame(data=results)
print("\n=== Model Comparison ===", results_df.to_string(index=False), sep="\n")

# Styled view as the cell's last expression: RMSE to four decimals and the
# two accuracy columns as percentages.
results_df.style.format({
    'RMSE': '{:.4f}',
    **{accuracy_col: '{:.2f}%' for accuracy_col in ('Acc (±1)', 'Acc (±0.5)')},
})


=== Model Comparison ===
       Model                                                         Best Params     RMSE  Acc (±1)  Acc (±0.5)
         SVD {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02} 0.871230 76.323972   44.740105
    KNNBasic      {'k': 40, 'sim_options': {'name': 'msd', 'user_based': False}} 0.914352 73.984463   42.632547
         NMF                                  {'n_factors': 10, 'n_epochs': 100} 1.069651 65.615721   36.613811
CoClustering                      {'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 20} 0.943528 71.055078   39.887624


Unnamed: 0,Model,Best Params,RMSE,Acc (±1),Acc (±0.5)
0,SVD,"{'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}",0.8712,76.32%,44.74%
1,KNNBasic,"{'k': 40, 'sim_options': {'name': 'msd', 'user_based': False}}",0.9144,73.98%,42.63%
2,NMF,"{'n_factors': 10, 'n_epochs': 100}",1.0697,65.62%,36.61%
3,CoClustering,"{'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 20}",0.9435,71.06%,39.89%


In [None]:
# NOTE(review): `preds` and `actuals` are whatever the tuning loop left behind,
# i.e. the values of the last model it evaluated -- confirm that is intended.

# A prediction counts as "correct" when it is within ±1 star of the truth.
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = 100 * correct.mean()

# Same check with a stricter ±0.5 star window.
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = 100 * s_correct.mean()

print(
    f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%",
    f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%",
    sep="\n",
)

In [None]:
# Save the best model to disk.
# No `model` variable exists before this cell in a top-to-bottom run, so the
# winning configuration is rebuilt from the comparison results instead:
# pick the entry with the lowest test RMSE, look up its algorithm class, and
# refit it on the training split with the tuned parameters.
if not results:
    raise RuntimeError("No tuning results available; run the tuning loop first.")

best_row = min(results, key=lambda row: row['RMSE'])
best_config = next(cfg for cfg in models if cfg['name'] == best_row['Model'])

model = best_config['algo'](**best_row['Best Params'])
model.fit(trainset)

dump(model, '../models/cf_model.pkl')  # Or use .joblib extension
print("Model saved successfully!")

# Model Training with 10M

In [3]:
# Dataset upgrade: load the larger ratings file ('::'-separated, no header
# row, hence the explicit column names and the python parsing engine).
# NOTE(review): the path points at an "ml-32m" folder although this section is
# titled 10M, and it is an absolute local path -- confirm the location.
beeg_data = pd.read_csv(
    r"K:\MachineProject\Data\ml-32m\ratings.dat",
    sep='::',
    engine='python',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
).drop(columns='Timestamp')  # the timestamp is not used by the models

beeg_data.columns = ['UserID', 'MovieID', 'Rating']
beeg_data.head(5)

Unnamed: 0,UserID,MovieID,Rating
0,1,122,5.0
1,1,185,5.0
2,1,231,5.0
3,1,292,5.0
4,1,316,5.0


In [None]:
# Count the ratings submitted by each user and expose them as a two-column
# frame (user_id, num_ratings).
ratings_per_user = (
    beeg_data
    .groupby('UserID')['Rating']
    .count()
    .reset_index()
    .rename(columns={'UserID': 'user_id', 'Rating': 'num_ratings'})
)
print(ratings_per_user)

       user_id  num_ratings
0            1           22
1            2           20
2            3           33
3            4           38
4            5           87
...        ...          ...
69873    71563           43
69874    71564          105
69875    71565          145
69876    71566           49
69877    71567           53

[69878 rows x 2 columns]


In [None]:
# Summarise the per-user rating counts, reporting the 10th, 50th and 90th
# percentiles instead of the default quartiles.
stats = ratings_per_user.num_ratings.describe(
    percentiles=[0.1, 0.5, 0.9],
)
print(stats)

count    69878.00000
mean       143.10733
std        216.71258
min         20.00000
10%         24.00000
50%         69.00000
90%        335.00000
max       7359.00000
Name: num_ratings, dtype: float64


In [None]:
# How many rows carry each rating value (most frequent first).
beeg_data["Rating"].value_counts()

Rating
4.0    2875850
3.0    2356676
5.0    1544812
3.5     879764
2.0     790306
4.5     585022
1.0     384180
2.5     370178
1.5     118278
0.5      94988
Name: count, dtype: int64

In [None]:
# Summary statistics of the MovieID column (shows the range of item ids).
beeg_data["MovieID"].describe()

count    1.000005e+07
mean     4.120291e+03
std      8.938402e+03
min      1.000000e+00
25%      6.480000e+02
50%      1.834000e+03
75%      3.624000e+03
max      6.513300e+04
Name: MovieID, dtype: float64

In [None]:
# Summary statistics of the UserID column (shows the range of user ids).
beeg_data["UserID"].describe()

count    1.000005e+07
mean     3.586986e+04
std      2.058534e+04
min      1.000000e+00
25%      1.812300e+04
50%      3.574050e+04
75%      5.360800e+04
max      7.156700e+04
Name: UserID, dtype: float64

In [None]:
# Downcast to narrower dtypes to shrink the frame's memory footprint: 32-bit
# ids and half-precision ratings.
beeg_data = beeg_data.astype({
    'UserID': 'int32',
    'MovieID': 'int32',
    'Rating': 'float16',
})

# Report the footprint after the conversion.
beeg_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000054 entries, 0 to 10000053
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   UserID   int32  
 1   MovieID  int32  
 2   Rating   float16
dtypes: float16(1), int32(2)
memory usage: 95.4 MB


In [None]:
# Keep a 50% sample of the ratings, stratified on UserID so each user's share
# of rows is preserved; the other half is discarded.
# NOTE(review): `subsampled_df` is not referenced by the later cells visible
# here (the Dataset is built from the full frame) -- confirm whether the
# subsample is still meant to be used.
subsampled_df, _ = sklearn_split(
    beeg_data,
    stratify=beeg_data['UserID'],
    test_size=0.5,
    random_state=42,
)

In [4]:
# Rating scale taken from the observed minimum and maximum rating.
reader = Reader(
    rating_scale=(
        beeg_data['Rating'].min(),
        beeg_data['Rating'].max(),
    )
)
# Surprise expects exactly three columns in (user, item, rating) order.
data = Dataset.load_from_df(
    beeg_data[['UserID', 'MovieID', 'Rating']],
    reader,
)

In [5]:
# joblib cache rooted at ./cache, with logging silenced.
# NOTE(review): `memory` is not used by any later cell visible here -- confirm
# it is still needed.
memory = Memory(location='./cache', verbose=0)

In [None]:
# SVD hyperparameter grid: latent dimensions, epochs, learning rate and
# regularisation strength (2 values each, 16 combinations).
param_grid = dict(
    n_factors=[50, 100],
    n_epochs=[20, 30],
    lr_all=[0.005, 0.01],
    reg_all=[0.02, 0.1],
)

# 5-fold cross-validated search scored by RMSE, run in a single process.
# NOTE(review): the search runs on the full `data`, before any train/test
# split, so ratings later used as the test set take part in tuning -- confirm
# whether tuning should be restricted to the training split.
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=1)
gs.fit(data)

# Best RMSE score and params
print(
    f"Best RMSE: {gs.best_score['rmse']}",
    f"Best params: {gs.best_params['rmse']}",
    sep="\n",
)

Best RMSE: 0.8375733996579366
Best params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [6]:
# Hold out 20% of the ratings as a test set; random_state fixes the split so
# the evaluation is repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# SVD with fixed hyperparameters and a fixed seed for reproducible training.
# NOTE(review): these values are hard-coded rather than read from
# gs.best_params -- keep them in sync with the grid-search result.
model = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fcb9f060c0>

In [8]:
# Predict every held-out rating, then print and return the test RMSE.
test_preds = model.test(testset)
accuracy.rmse(test_preds, verbose=True)

RMSE: 0.7975


0.7975374101851651

In [9]:
# Pull the estimated and true ratings out of the prediction objects as
# column vectors of shape (n, 1).
preds = np.fromiter((p.est for p in test_preds), dtype=float).reshape(-1, 1)
actuals = np.fromiter((p.r_ui for p in test_preds), dtype=float).reshape(-1, 1)

In [10]:
# A prediction counts as "correct" when it is within ±1 star of the truth.
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = 100 * correct.mean()

# Same check with a stricter ±0.5 star window.
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = 100 * s_correct.mean()

print(
    f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%",
    f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%",
    sep="\n",
)

Accuracy (Within ±1 Stars): 81.29%
Accuracy (Within ±0.5 Stars): 50.96%


In [None]:
# Persist the fitted model with joblib so it can be reloaded later
# (a .joblib extension would work equally well).
dump(value=model, filename='../models/cf_model_2.pkl')
print("Model saved successfully!")

Model saved successfully!


# Model Training with 32M

In [None]:
# Dataset upgrade: load the ratings CSV and discard the timestamp column,
# which the models do not use.
# NOTE(review): absolute local path -- consider a configurable data directory.
beegar_data = pd.read_csv(
    r"K:\MachineProject\Data\ml-32m\ratings.csv"
).drop(columns='timestamp')

# Rename the remaining columns to the names used throughout this notebook.
beegar_data.columns = ['UserID', 'MovieID', 'Rating']
beegar_data.head(5)

Unnamed: 0,UserID,MovieID,Rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [None]:
# Count the ratings submitted by each user and expose them as a two-column
# frame (user_id, num_ratings).
ratings_per_user = (
    beegar_data
    .groupby('UserID')['Rating']
    .count()
    .reset_index()
    .rename(columns={'UserID': 'user_id', 'Rating': 'num_ratings'})
)
print(ratings_per_user)

        user_id  num_ratings
0             1          141
1             2           52
2             3          147
3             4           27
4             5           33
...         ...          ...
200943   200944          298
200944   200945          108
200945   200946           23
200946   200947           61
200947   200948          236

[200948 rows x 2 columns]


In [None]:
# Summarise the per-user rating counts, reporting the 10th, 50th and 90th
# percentiles instead of the default quartiles.
stats = ratings_per_user.num_ratings.describe(
    percentiles=[0.1, 0.5, 0.9],
)
print(stats)

count    200948.000000
mean        159.246193
std         282.025462
min          20.000000
10%          25.000000
50%          73.000000
90%         364.000000
max       33332.000000
Name: num_ratings, dtype: float64


In [None]:
# How many rows carry each rating value (most frequent first).
beegar_data["Rating"].value_counts()

Rating
4.0    8367654
3.0    6054990
5.0    4596577
3.5    4290105
4.5    2974000
2.0    2028622
2.5    1685386
1.0     946675
1.5     531063
0.5     525132
Name: count, dtype: int64

In [None]:
# Summary statistics of the MovieID column (shows the range of item ids).
beegar_data["MovieID"].describe()

count    3.200020e+07
mean     2.931861e+04
std      5.095816e+04
min      1.000000e+00
25%      1.233000e+03
50%      3.452000e+03
75%      4.419900e+04
max      2.927570e+05
Name: MovieID, dtype: float64

In [None]:
# Summary statistics of the UserID column (shows the range of user ids).
beegar_data["UserID"].describe()

count    3.200020e+07
mean     1.002785e+05
std      5.794905e+04
min      1.000000e+00
25%      5.005300e+04
50%      1.002970e+05
75%      1.504510e+05
max      2.009480e+05
Name: UserID, dtype: float64

In [None]:
# Memory footprint and dtypes before downcasting.
beegar_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   UserID   int64  
 1   MovieID  int64  
 2   Rating   float64
dtypes: float64(1), int64(2)
memory usage: 732.4 MB


In [None]:
# Downcast to narrower dtypes to shrink the frame's memory footprint: 32-bit
# ids and half-precision ratings.
beegar_data = beegar_data.astype({
    'UserID': 'int32',
    'MovieID': 'int32',
    'Rating': 'float16',
})

# Report the footprint after the conversion.
beegar_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   UserID   int32  
 1   MovieID  int32  
 2   Rating   float16
dtypes: float16(1), int32(2)
memory usage: 305.2 MB


In [None]:
# Keep a 50% sample of the ratings, stratified on UserID so each user's share
# of rows is preserved; the other half is discarded.
# NOTE(review): `subsampled_df` is not referenced by the later cells visible
# here (the Dataset is built from the full frame) -- confirm whether the
# subsample is still meant to be used.
subsampled_df, _ = sklearn_split(
    beegar_data,
    stratify=beegar_data['UserID'],
    test_size=0.5,
    random_state=42,
)

In [None]:
# Rating scale taken from the observed minimum and maximum rating.
reader = Reader(
    rating_scale=(
        beegar_data['Rating'].min(),
        beegar_data['Rating'].max(),
    )
)
# Surprise expects exactly three columns in (user, item, rating) order.
data = Dataset.load_from_df(
    beegar_data[['UserID', 'MovieID', 'Rating']],
    reader,
)

In [None]:
# joblib cache rooted at ./cache, with logging silenced.
# NOTE(review): `memory` is not used by any later cell visible here -- confirm
# it is still needed.
memory = Memory(location='./cache', verbose=0)

In [None]:
# SVD hyperparameter grid: latent dimensions, epochs, learning rate and
# regularisation strength (2 values each, 16 combinations).
param_grid = dict(
    n_factors=[50, 100],
    n_epochs=[20, 30],
    lr_all=[0.005, 0.01],
    reg_all=[0.02, 0.1],
)

# 5-fold cross-validated search scored by RMSE, run in a single process.
# NOTE(review): the search runs on the full `data`, before any train/test
# split, so ratings later used as the test set take part in tuning -- confirm
# whether tuning should be restricted to the training split.
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=1)
gs.fit(data)

# Best RMSE score and params
print(
    f"Best RMSE: {gs.best_score['rmse']}",
    f"Best params: {gs.best_params['rmse']}",
    sep="\n",
)

Best RMSE: 0.8375733996579366
Best params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.02}


In [None]:
# Hold out 20% of the ratings as a test set; random_state fixes the split so
# the evaluation is repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# SVD with fixed hyperparameters and a fixed seed for reproducible training.
# NOTE(review): these values are hard-coded rather than read from
# gs.best_params -- keep them in sync with the grid-search result.
model = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x275c6bba900>

In [None]:
# Predict every held-out rating, then print and return the test RMSE.
test_preds = model.test(testset)
accuracy.rmse(test_preds, verbose=True)

RMSE: 0.7731


0.7730811014923263

In [None]:
# Pull the estimated and true ratings out of the prediction objects as
# column vectors of shape (n, 1).
preds = np.fromiter((p.est for p in test_preds), dtype=float).reshape(-1, 1)
actuals = np.fromiter((p.r_ui for p in test_preds), dtype=float).reshape(-1, 1)

In [28]:
# A prediction counts as "correct" when it is within ±1 star of the truth.
tolerance = 1
correct = np.abs(preds - actuals) <= tolerance
test_accuracy = 100 * correct.mean()

# Same check with a stricter ±0.5 star window.
stricter_tolerance = 0.5
s_correct = np.abs(preds - actuals) <= stricter_tolerance
s_test_accuracy = 100 * s_correct.mean()

print(
    f"Accuracy (Within ±{tolerance} Stars): {test_accuracy:.2f}%",
    f"Accuracy (Within ±{stricter_tolerance} Stars): {s_test_accuracy:.2f}%",
    sep="\n",
)

Accuracy (Within ±1 Stars): 83.25%
Accuracy (Within ±0.5 Stars): 54.31%


In [None]:
# Save the model to disk.
# Written to its own file: 'cf_model_2.pkl' is already used for the model
# saved in the previous section, and reusing that path would silently
# overwrite it.
# NOTE(review): if downstream code expects this model under cf_model_2.pkl,
# update that loader accordingly.
dump(model, '../models/cf_model_3.pkl')  # Or use .joblib extension
print("Model saved successfully!")

Model saved successfully!
