#### User-based recommendation system

Install necessary external libraries

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
!pip install joblib



Import libraries to be used inside the project

In [None]:
import pandas as pd
import pandas_gbq
import numpy as np
import joblib
import pickle
from scipy import sparse
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from joblib import parallel_backend

Set the BigQuery project ID and SQL dialect used by pandas_gbq

In [None]:
# Google Cloud project that hosts the BigQuery tables read by this notebook
PROJECT_ID: str = "proyectofinal-389001"

# Session-wide pandas_gbq defaults: use standard SQL and PROJECT_ID as the default project
pandas_gbq.context.dialect = "standard"
pandas_gbq.context.project = PROJECT_ID

Perform query on Google Maps reviews table in BigQuery

In [None]:
# Fetch only the three columns surprise needs, already in (user, item, rating) order.
# NOTE(review): a prediction recorded further down shows a uid like 1.18e+20, which suggests
# user_id arrives as a float — confirm the column type, since distinct ids could collide.
query_users = f"""
SELECT
    user_id,
    business_id,
    rating
FROM `{PROJECT_ID}.Google.Reviews`
"""

# Call pandas_gbq directly: pd.read_gbq is only a wrapper around it and is deprecated in
# recent pandas releases. The project is passed explicitly so the cell does not depend on
# the global pandas_gbq context having been configured first.
users_df = pandas_gbq.read_gbq(
    query_users,
    project_id=PROJECT_ID,
    location="us",
)

Take a random sample of around 20% of the DataFrame

In [None]:
# Fixed seed so the 20% sample (and every model trained on it) is the same on each run.
# Re-running this cell on its own samples the already-sampled frame again; re-run the
# query cell first to start from the full table.
users_df = users_df.sample(frac=0.2, random_state=42)

Keep only the ratings greater than or equal to 3

In [None]:
# Row mask: True where the rating is at least 3
rating_mask = users_df["rating"] >= 3
res = users_df.loc[rating_mask]

Turn the DataFrame into a dataset object readable by surprise.
The rating scale values were taken from the unique records of the rating column.

In [None]:
# rating_scale tells surprise the lowest and highest possible rating
reader = Reader(line_format="user item rating", rating_scale=(1, 5))

# load_from_df interprets columns by position (user, item, rating), not by name, so the
# three columns are selected explicitly to keep that order even if the query changes.
data = Dataset.load_from_df(res[["user_id", "business_id", "rating"]], reader)

Create training and test sets

In [None]:
# 80/20 split; the fixed seed keeps the same ratings in the test set on every run,
# so metrics from different runs and models are comparable.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

Instantiate model

In [None]:
# Baseline SVD with default hyper-parameters; seeded so the random initialisation
# (and therefore the reported error) is reproducible.
user_model = SVD(random_state=42)

Train the model

In [None]:
# Fit the baseline model on the training split; as the cell's last expression,
# the value returned by fit() is what gets displayed below.
user_model.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd09dd834d0>

Predict based on test set

In [None]:
# Run the fitted baseline model over the held-out test split and keep the results
predictions = user_model.test(test_set)

Calculate MAE (Mean Absolute Error)

In [None]:
# Prints the MAE and also returns it, which is why the value shows up twice in the output
accuracy.mae(predictions)

MAE:  0.5696


0.5695724517017111

Show a single prediction from the test set

In [None]:
# Inspect one prediction (the index is arbitrary) to compare the true rating with the estimate
predictions[10]

Prediction(uid=1.182920086e+20, iid='0x80803550e8d2cbd9:0xbc89dc0b1315dae7', r_ui=5.0, est=4.548114093127432, details={'was_impossible': False})

At this point, we should look for the best hyper-parameters to train the model with.
<br>
To accomplish this, we use cross validation:

In [None]:
# Compare SVD models that differ only in the number of latent factors, using k-fold
# cross-validation. cross_validate is already imported in the imports cell at the top.

# Two independent knobs. They used to share one variable, so changing the number of
# folds silently changed how many factor values were tried (and vice versa).
num_partitions: int = 5  # number of cross-validation folds
num_factor_powers: int = 5  # how many powers of two to try; 9 would give [1, 2, 4, ..., 256]
factors = [2 ** n for n in range(num_factor_powers)]  # latent-factor counts to evaluate

rmse_test_means = []  # mean test RMSE across folds, one entry per value in `factors`

for factor in factors:
    print("==================================================")
    print(f"Evaluting with {factor} factors")
    # Fresh SVD model with `factor` latent factors
    eval_model = SVD(n_factors=factor)
    # Per-fold RMSE and MAE (plus timings) are printed because verbose=True
    cv_results = cross_validate(eval_model, data, measures=["RMSE", "MAE"], cv=num_partitions, verbose=True)
    # Keep only the fold-averaged test RMSE for this factor count
    rmse_test_means.append(np.mean(cv_results["test_rmse"]))

Evaluando con  1  factores
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6679  0.6707  0.6723  0.6699  0.6695  0.6701  0.0015  
MAE (testset)     0.5689  0.5708  0.5715  0.5698  0.5697  0.5701  0.0009  
Fit time          8.48    8.62    13.58   12.76   9.12    10.51   2.19    
Test time         0.94    0.91    1.38    0.92    0.91    1.01    0.18    
Evaluando con  2  factores
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6716  0.6686  0.6715  0.6692  0.6708  0.6703  0.0012  
MAE (testset)     0.5713  0.5694  0.5710  0.5698  0.5707  0.5704  0.0007  
Fit time          9.01    13.40   13.15   9.32    9.21    10.82   2.01    
Test time         0.90    1.35    0.87    1.41    0.89    1.09    0.24    
Evaluando con  4  factores
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

       

Using the parameters found, search for the best possible model

In [None]:
# Candidate values for each SVD hyper-parameter: latent factors, training epochs,
# learning rate and regularization strength. Every combination is evaluated.
params = dict(
    n_factors=[5, 50, 100],
    n_epochs=[5, 10, 20],
    lr_all=[0.001, 0.002, 0.005],
    reg_all=[0.002, 0.02, 0.2],
)

# Exhaustive search scored with 3-fold cross-validation; n_jobs=-1 uses all CPU cores
gs = GridSearchCV(
    SVD,
    params,
    measures=["RMSE", "MAE"],
    cv=3,
    n_jobs=-1,
)
gs.fit(data)

# Best score and winning combination, reported separately for each metric
print(f"Best scores: rmse -> {gs.best_score['rmse']}, mae -> {gs.best_score['mae']}")
print(f"Best parameters: rmse -> {gs.best_params['rmse']}, mae -> {gs.best_params['mae']}")

mejores puntajes : rmse -> 0.6708501240485192  mae -> 0.5724308090415433
mejores parámetros : rmse -> {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}  mae -> {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.002}


Model with the found params

In [None]:
# Build the model from the grid-search winner rather than hand-copied numbers: the
# previously typed n_epochs=5 matched neither the best-RMSE nor the best-MAE combination
# recorded by the search (both use n_epochs=20), and would go stale on every re-run.
best_model = SVD(**gs.best_params["rmse"])

Train and test said model

In [None]:
# Take the algorithm configured with the best-RMSE parameters, train it on the
# training split and predict the held-out split.
best_model = gs.best_estimator["rmse"]
best_model.fit(train_set)
predictions = best_model.test(test_set)

# Report the error of the tuned model so it can be compared with the baseline MAE above;
# the predictions were previously computed and never scored.
# NOTE(review): the grid search was fitted on `data`, which also contains the test_set rows,
# so these numbers are likely somewhat optimistic — consider tuning on the training split only.
accuracy.rmse(predictions)
accuracy.mae(predictions);

Save model as a joblib object

In [None]:
# Persist the trained model to disk (path is relative to the notebook's working directory)
joblib.dump(best_model, "modelo.joblib")

['modelo.joblib']