#### User based recommendation system

Install necessary external libraries

In [3]:
!pip install scikit-surprise



Import libraries to be used inside the project

In [4]:
from typing import BinaryIO

import joblib
import numpy as np
import pandas as pd
import pandas_gbq
from surprise import (
    Reader,
    Dataset,
    accuracy,
    SVD
)
from surprise.model_selection import (
    train_test_split,
    cross_validate,
    GridSearchCV
)

Set up credentials and project_id

In [5]:
# Google Cloud project that holds the BigQuery table queried by this notebook.
PROJECT_ID: str = "proyectofinal-389001"
# Register the project and the SQL dialect on pandas_gbq's shared context object.
pandas_gbq.context.project = PROJECT_ID
pandas_gbq.context.dialect = "standard"

Perform query on Google Maps reviews table in BigQuery

In [6]:
# Pull the (user, item, rating) triples; the column order matters because the
# frame is later handed to surprise, which reads the columns positionally.
query_users: str = f"""--sql
SELECT
    user_id,
    business_id,
    rating
FROM `{PROJECT_ID}.Google.Reviews`;
"""

# pandas.read_gbq is deprecated (pandas >= 2.2) in favour of calling pandas_gbq
# directly; the project and dialect come from pandas_gbq.context.
users_df = pandas_gbq.read_gbq(
    query_users,
    location="us"
)

Take a random sample of around 20% of the DataFrame

In [7]:
# Seed the sampling so that the 20% subset (and every metric derived from it)
# is the same on each Restart & Run All.
# NOTE: this rebinds users_df, so re-running only this cell samples the sample again.
users_df = users_df.sample(frac=.2, random_state=42)

Keep only the ratings greater than or equal to 3

In [8]:
# Boolean mask of the rows whose rating is at least 3, then select those rows.
rating_at_least_3 = users_df["rating"].ge(3)
res = users_df.loc[rating_at_least_3]

Turn the DataFrame into a dataset object readable by surprise.
The rating scale values were taken from the unique records of the rating column.

In [9]:
reader = Reader(line_format="user item rating", rating_scale=(1, 5))
# surprise reads the frame positionally as (user, item, rating); select the
# columns explicitly so a change in the upstream query order cannot silently
# swap users, items and ratings.
data = Dataset.load_from_df(res[["user_id", "business_id", "rating"]], reader)

Create training and test sets

In [10]:
# Hold out 20% of the ratings for testing; the seed keeps the split reproducible.
train_set, test_set = train_test_split(data, test_size=.2, random_state=42)

Instantiate model

In [11]:
# Baseline SVD with default hyper-parameters; seeded so that the factor
# initialisation (and therefore the reported error) is reproducible.
user_model = SVD(random_state=42)

Train the model

In [12]:
# Fit the model on the training split only; the test split stays held out.
user_model.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3f6a62b310>

Predict based on test set

In [13]:
# Run the fitted model over the held-out test split.
predictions = user_model.test(test_set)

Calculate MAE (Mean Absolute Error)

In [14]:
# Mean absolute error of the test-split predictions.
accuracy.mae(predictions)

MAE:  0.5124


0.5123668463587575

Show a single prediction from the test set

In [15]:
# Inspect one arbitrary entry (index 10) of the test-split predictions.
predictions[10]

Prediction(uid='1.1788490110017312e+20', iid='0x89c6c1c9505b23d5:0xfab519270506a6a1', r_ui=5.0, est=4.613030962893049, details={'was_impossible': False})

At this point, we should look for the best hyper-parameters to train the model with.
<br>
To accomplish this, we use cross validation:

In [16]:
# Mean test RMSE across the CV folds, one entry per candidate number of latent factors.
rmse_test_means = []
# Number of cross-validation partitions (folds).
num_partitions: int = 5
# Candidate latent-factor counts are consecutive powers of 2, e.g. [1, 2, 4, 8, 16].
# The candidate count is kept separate from num_partitions so that changing the
# number of folds does not silently change which factor values are evaluated.
num_factor_candidates: int = 5
factors = [2 ** n for n in range(num_factor_candidates)]

for factor in factors:
    print("==================================================")
    print(f"Evaluting with {factor} factors")
    # Instantiate an SVD model with `factor` latent factors
    eval_model = SVD(n_factors=factor)
    # Cross-validate this model; verbose=True prints RMSE and MAE for every fold
    cv_results = cross_validate(eval_model, data, measures=["RMSE", "MAE"], cv=num_partitions, verbose=True)
    # Keep only the mean test RMSE over the folds for this factor count
    rmse_test_means.append(np.mean(cv_results["test_rmse"]))

Evaluting with 1 factors
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6208  0.6198  0.6200  0.6195  0.6220  0.6204  0.0009  
MAE (testset)     0.5100  0.5094  0.5096  0.5093  0.5107  0.5098  0.0005  
Fit time          40.45   41.13   41.21   41.27   41.51   41.11   0.36    
Test time         5.65    3.34    3.25    5.48    5.53    4.65    1.11    
Evaluting with 2 factors
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6208  0.6196  0.6201  0.6203  0.6217  0.6205  0.0007  
MAE (testset)     0.5101  0.5094  0.5098  0.5098  0.5105  0.5099  0.0004  
Fit time          41.68   42.37   43.00   42.79   41.98   42.36   0.49    
Test time         3.39    3.32    3.28    5.47    5.58    4.21    1.08    
Evaluting with 4 factors
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

             

Using the parameters found above, search for the best possible model

In [17]:
# Dict of params with different values for number of latent factors,
# number of epochs, learning rate and regularization
# Hyper-parameter grid: number of latent factors, number of epochs,
# learning rate and regularization term (three candidate values each).
params = dict(
    n_factors=[5, 50, 100],
    n_epochs=[5, 10, 20],
    lr_all=[0.001, 0.002, 0.005],
    reg_all=[0.002, 0.02, 0.2],
)

# Exhaustive search over the grid with 3-fold cross-validation, using all cores.
gs = GridSearchCV(SVD, params, measures=["RMSE", "MAE"], cv=3, n_jobs=-1)
gs.fit(data)

# Report the best score and the matching parameter combination per metric.
best_scores, best_params = gs.best_score, gs.best_params
print(f"Best scores: rmse -> {best_scores['rmse']}, mae -> {best_scores['mae']}")
print(f"Best parameters: rmse -> {best_params['rmse']}, mae -> {best_params['mae']}")



Best scores: rmse -> 0.6232414784531176, mae -> 0.5140059307111632
Best parameters: rmse -> {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}, mae -> {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.002}


Model with the found params

In [18]:
# Build the model from the grid-search result instead of hard-coding the values,
# so it stays in sync if the search is re-run on a different sample.
# For the recorded run this is n_factors=5, n_epochs=20, lr_all=0.005, reg_all=0.2.
best_model = SVD(**gs.best_params["rmse"])

Train and test said model

In [19]:
# Take the estimator that the grid search associates with the best RMSE.
# This rebinds best_model, replacing the instance created in the previous cell.
best_model = gs.best_estimator["rmse"]
# Train on the training split and predict on the held-out test split.
# NOTE(review): the grid search was fitted on the full `data`, which includes
# the rows of test_set — confirm that this is intended.
best_model.fit(train_set)
predictions = best_model.test(test_set)

Make a connection to Google Cloud Storage to store the trained model

In [20]:
from google.cloud import storage

def upload_bucket(bucket_name: str,
                  source_file_object: BinaryIO,
                  destination_blob_name: str
                  ) -> None:
    """Upload the contents of an open file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_object: File-like object opened for reading in binary
            mode; it is handed to ``Blob.upload_from_file`` as is.
        destination_blob_name: Name (path) of the blob to create in the bucket.

    Returns:
        None. The upload is the only effect.
    """
    # The client is created per call and bound to the notebook's project.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name=bucket_name)
    blob = bucket.blob(blob_name=destination_blob_name)

    blob.upload_from_file(file_obj=source_file_object)

Save model as a joblib object and upload to Google Cloud Storage Bucket

In [None]:
# Serialise the trained model to a local file, then stream that file to the bucket.
local_model_path = "model.joblib"

with open(local_model_path, "wb") as dump_handle:
    joblib.dump(best_model, dump_handle, protocol=4)

with open(local_model_path, "rb") as upload_handle:
    upload_bucket(
        "ml-models",
        upload_handle,
        "datawise-consulting/model.joblib",
    )