In [6]:
import os
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Path to your CSV file
file_path = os.path.expanduser("./Magazine_Subscriptions.csv")

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Ensure your DataFrame has columns "userID", "itemID", "rating"
df.columns = ["userID", "itemID", "rating", "timestamp"]  # Map your columns appropriately
df = df[["userID", "itemID", "rating"]]  # Drop any unnecessary columns

# Define the Reader and load the data
reader = Reader(rating_scale=(1, 5))  # Adjust `rating_scale` if your ratings are outside this range
data = Dataset.load_from_df(df, reader)

# Train and cross-validate the SVD algorithm
algo = SVD()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4153  1.4185  1.4186  1.4021  1.4206  1.4150  0.0067  
MAE (testset)     1.1522  1.1585  1.1599  1.1421  1.1562  1.1538  0.0064  
Fit time          1.70    1.86    2.07    1.75    1.78    1.83    0.13    
Test time         0.12    0.12    0.19    0.11    0.11    0.13    0.03    


{'test_rmse': array([1.41531314, 1.41849731, 1.41858163, 1.40212913, 1.42059093]),
 'test_mae': array([1.15224301, 1.1585406 , 1.15994218, 1.14209325, 1.15622561]),
 'fit_time': (1.6995465755462646,
  1.8588895797729492,
  2.074091911315918,
  1.750744104385376,
  1.782212495803833),
 'test_time': (0.11803030967712402,
  0.11594295501708984,
  0.19258570671081543,
  0.106781005859375,
  0.10809516906738281)}

In [1]:
import os
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Path to your JSONL file
file_path = os.path.expanduser("./Magazine_Subscriptions.jsonl")

# Load the JSONL file into a DataFrame
df = pd.read_json(file_path, lines=True)

# Extract only the relevant columns for Surprise
df = df[["user_id", "parent_asin", "rating"]]

# Rename columns to match Surprise's expected schema
df.columns = ["userID", "itemID", "rating"]

# Define the Reader and load the data
reader = Reader(rating_scale=(1, 5))  # Adjust rating_scale if necessary
data = Dataset.load_from_df(df, reader)

# Train and cross-validate the SVD algorithm
algo = SVD()
cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4066  1.4120  1.4041  1.4168  1.4098  1.4099  0.0044  
MAE (testset)     1.1442  1.1521  1.1437  1.1515  1.1452  1.1473  0.0037  
Fit time          2.72    3.88    2.52    3.26    3.31    3.14    0.48    
Test time         0.40    0.22    0.40    0.30    0.20    0.31    0.09    


{'test_rmse': array([1.40663772, 1.41199773, 1.40411   , 1.41677049, 1.40984638]),
 'test_mae': array([1.1441722 , 1.15208238, 1.14370796, 1.15153476, 1.14515715]),
 'fit_time': (2.7166433334350586,
  3.8829565048217773,
  2.5226891040802,
  3.2641406059265137,
  3.3082997798919678),
 'test_time': (0.4030582904815674,
  0.21792101860046387,
  0.39958930015563965,
  0.30336904525756836,
  0.2028064727783203)}

In [2]:
from surprise import accuracy
from surprise.model_selection import KFold
kf = KFold(n_splits=3)



for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    print("-----------------------------------------")
    accuracy.mse(predictions, verbose=True)

RMSE: 1.4132
-----------------------------------------
MSE: 1.9972
RMSE: 1.4138
-----------------------------------------
MSE: 1.9988
RMSE: 1.4111
-----------------------------------------
MSE: 1.9911


In [3]:
from surprise.model_selection import GridSearchCV
param_grid = {"n_epochs": [5, 10,15,20], "lr_all": [0.002, 0.005,0.007,0.009], "reg_all": [0.4, 0.6,0.8,1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

1.4067455138438907
{'n_epochs': 20, 'lr_all': 0.009, 'reg_all': 0.4}


In [5]:
param_grid = {"n_epochs": [40,50], "lr_all": [0.09,0.1], "reg_all": [0.3,0.4]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

1.4000578515319388
{'n_epochs': 40, 'lr_all': 0.09, 'reg_all': 0.3}


In [None]:
best_permutation=[]
epochs=[60,65,67]
lr=[0.08,0.09,0.1]
reg=[0.03,0.03,0.03]

param_grid = {"n_epochs": epochs, "lr_all": lr, "reg_all": reg}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

1.3890014833157642
{'n_epochs': 60, 'lr_all': 0.08, 'reg_all': 0.03}


In [8]:
from surprise.model_selection import cross_validate
import itertools

best_permutation = []

# Parameter grid
epochs = [60,62,64,66]
lr = [0.09,0.095,0.097,0.099]
reg = [0.3,0.3,0.3]

param_grid = list(itertools.product(epochs, lr, reg))  # Generate all combinations

# Manually evaluate two combinations at a time
for i in range(0, len(param_grid), 2):  # Iterate through the grid in steps of 2
    # Get a batch of 2 parameter combinations
    batch = param_grid[i : i + 2]
    
    for n_epochs, lr_all, reg_all in batch:
        algo = SVD(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
        
        # Evaluate the algorithm
        results = cross_validate(algo, data, measures=["rmse"], cv=5, verbose=False)
        
        # Average RMSE over folds
        mean_rmse = results["test_rmse"].mean()
        
        # Append the results to best_permutation
        best_permutation.append((n_epochs, lr_all, reg_all, mean_rmse))

# Find the best combination
print(best_permutation)
print('----------------------')
best_combination = min(best_permutation, key=lambda x: x[3])  # Select by lowest RMSE
print('----------------------')
print("Best RMSE score:", best_combination[3])
print('----------------------')
print("Best parameters: n_epochs={}, lr_all={}, reg_all={}".format(
    best_combination[0], best_combination[1], best_combination[2]
))


[(60, 0.09, 0.3, 1.396124345141083), (60, 0.09, 0.3, 1.3950750853974097), (60, 0.09, 0.3, 1.395061873212433), (60, 0.095, 0.3, 1.3958940810824214), (60, 0.095, 0.3, 1.3963629859177609), (60, 0.095, 0.3, 1.3963750650320896), (60, 0.097, 0.3, 1.393574950283839), (60, 0.097, 0.3, 1.3946794643190792), (60, 0.097, 0.3, 1.3944628692872476), (60, 0.099, 0.3, 1.3968445210801477), (60, 0.099, 0.3, 1.3941968978441812), (60, 0.099, 0.3, 1.3958874124153964), (62, 0.09, 0.3, 1.3971066744605072), (62, 0.09, 0.3, 1.3955413514716402), (62, 0.09, 0.3, 1.3956865861069716), (62, 0.095, 0.3, 1.395057729921922), (62, 0.095, 0.3, 1.3962923059755674), (62, 0.095, 0.3, 1.3957095810566438), (62, 0.097, 0.3, 1.3934793069412605), (62, 0.097, 0.3, 1.3952679947975497), (62, 0.097, 0.3, 1.395081779244439), (62, 0.099, 0.3, 1.395395395023343), (62, 0.099, 0.3, 1.3959765361209073), (62, 0.099, 0.3, 1.3960949810297665), (64, 0.09, 0.3, 1.3934147464198094), (64, 0.09, 0.3, 1.3947320623472355), (64, 0.09, 0.3, 1.3944838