In [None]:
!pip install scikit-surprise
!pip install recmetrics

In [2]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import surprise
from surprise import accuracy
from surprise.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import recmetrics

data_path = Path("/content/drive/MyDrive/merged_data.csv")

In [3]:
main_df = pd.read_csv(data_path)
main_df = main_df[main_df.state.isin(['PA'])] # Top three states: PA, FL, LA

In [4]:
# create a label encoder for user_id and business_id
(user_encoder, business_encoder) = (LabelEncoder(), LabelEncoder())

# fit the encoders to the data
user_encoder.fit(main_df['user_id'])
business_encoder.fit(main_df['business_id'])

# encode the user_id and business_id columns
main_df['user_id_encoded'] = user_encoder.transform(main_df['user_id'])
main_df['business_id_encoded'] = business_encoder.transform(main_df['business_id'])

main_df.head()

Unnamed: 0,user_id,business_id,name,state,stars,text,user_id_encoded,business_id_encoded
4,s_9uD6zqVU-9cnKO9pDKtg,jMi5SL9vb6nLJGRjw0HK3Q,Civera's Deli,PA,5,Civera's Deli is the cleanest deli in Delco. E...,35748,16884
6,JOrDiXIgpb0sjtd7Cr3CdA,RUfWgnSSQKjRNBpK0wSxfg,Isabella Pizza,PA,1,Yuck. I ordered here this morning for breakfas...,13018,10173
7,PXywFGVuClrMdFcq5RjsEg,7WWLPXpOjrh_1EvjFuw3hQ,The Drake Tavern,PA,5,Probably the best place to eat in Jenkintown. ...,16808,3076
17,FyIoTnxVTbNar4KdG6cC1w,N1a7z4ID9K0Hqz-Zf3V3yw,US Inspect,PA,5,I recently had a home inspection with Nate Fel...,10781,8582
19,Nro6ABevZFu-8TFDKS-5bw,ctHjyadbDQAtUFfkcAFEHw,Zahav,PA,5,I had the amazing opportunity to eat at Zahab ...,15760,14510


In [5]:
# Load the DataFrame into a Surprise Dataset object
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(main_df[['user_id_encoded', 'business_id_encoded', 'stars']], reader)

data.df.head()

Unnamed: 0,user_id_encoded,business_id_encoded,stars
4,35748,16884,5
6,13018,10173,1
7,16808,3076,5
17,10781,8582,5
19,15760,14510,5


In [6]:
(len(data.df["user_id_encoded"].unique()), len(data.df["business_id_encoded"].unique()))

(40502, 22984)

In [7]:
# Split the data into training and testing sets
train, test = train_test_split(data, test_size=0.25)

In [8]:
(train.n_items, train.n_users)

(22849, 39541)

In [9]:
df_test = pd.DataFrame(test, columns=["user_id", "business_id", "stars"])

# Groupby user_id to get all the actual businesses
df_test = df_test.groupby("user_id", as_index=False)["business_id"].agg({'actual': lambda x: list(set(x))})
df_test = df_test.set_index("user_id")
df_test.head()

Unnamed: 0_level_0,actual
user_id,Unnamed: 1_level_1
0,"[7009, 17445, 1769, 7506, 6867]"
1,"[20164, 3627, 21489, 21690, 8026]"
3,"[6024, 19212, 1553, 17821, 3874, 2343, 16812, ..."
4,"[4571, 1876, 15741, 11022]"
5,"[2219, 16177, 10003, 9402, 21691, 6843]"


# Nearest Neighbors based Models

> [link](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#prediction-algorithms)

- KnnBase (Both USer-User and Item-Item)
- KnnWithMeans (Both USer-User and Item-Item)

In [10]:
eval_metrics = {
    "model": ["BaseKNN-U", "BaseKNN-I", "MeanRatingKnn-U", "MeanRatingKnn-I"],
    "MAE": [0.0] * 4,
    "RMSE": [0.0] * 4
    }
base_metrics_df = pd.DataFrame(eval_metrics)

## KNN (User-User)

In [11]:
knn_uu_fitted = surprise.KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True}).fit(train)
knn_uu_fitted

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f652fa03dc0>

In [12]:
preds = knn_uu_fitted.test(test)
df_preds = pd.DataFrame(
    preds, columns=['uid', 'iid', 'actual', 'preds', 'details']
    ).drop(columns=['details'])
df_preds.head()

Unnamed: 0,uid,iid,actual,preds
0,27348,11735,5.0,4.075
1,31027,19677,3.0,3.6
2,19671,18487,5.0,3.25
3,3259,17199,2.0,5.0
4,39251,11,4.0,3.975


In [13]:
base_metrics_df.loc[base_metrics_df.model == 'BaseKNN-U', "MAE"] = accuracy.mae(preds)
base_metrics_df.loc[base_metrics_df.model == 'BaseKNN-U', "RMSE"] = accuracy.rmse(preds)

MAE:  0.9456
RMSE: 1.2229


In [14]:
del knn_uu_fitted

In [15]:
knn_ii_fitted = surprise.KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': False}).fit(train)
knn_ii_fitted

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f651a29f310>

In [16]:
preds = knn_ii_fitted.test(test)
df_preds = pd.DataFrame(
    preds, columns=['uid', 'iid', 'actual', 'preds', 'details']
    ).drop(columns=['details'])
df_preds.head()

Unnamed: 0,uid,iid,actual,preds
0,27348,11735,5.0,4.399614
1,31027,19677,3.0,3.398725
2,19671,18487,5.0,2.665986
3,3259,17199,2.0,5.0
4,39251,11,4.0,3.153561


In [17]:
base_metrics_df.loc[base_metrics_df.model == 'BaseKNN-I', "MAE"] = accuracy.mae(preds)
base_metrics_df.loc[base_metrics_df.model == 'BaseKNN-I', "RMSE"] = accuracy.rmse(preds)

MAE:  0.9725
RMSE: 1.2706


In [18]:
del knn_ii_fitted

In [19]:
knn_mean_uu_fitted = surprise.KNNWithMeans(k=40, sim_options={'name': 'cosine', 'user_based': True}).fit(train)
knn_mean_uu_fitted

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f651a2a1850>

In [20]:
preds = knn_mean_uu_fitted.test(test)
df_preds = pd.DataFrame(
    preds, columns=['uid', 'iid', 'actual', 'preds', 'details']
    ).drop(columns=['details'])
df_preds.head()

Unnamed: 0,uid,iid,actual,preds
0,27348,11735,5.0,4.413273
1,31027,19677,3.0,3.070974
2,19671,18487,5.0,2.334813
3,3259,17199,2.0,4.061905
4,39251,11,4.0,3.075862


In [21]:
base_metrics_df.loc[base_metrics_df.model == 'MeanRatingKnn-U', "MAE"] = accuracy.mae(preds)
base_metrics_df.loc[base_metrics_df.model == 'MeanRatingKnn-U', "RMSE"] = accuracy.rmse(preds)

MAE:  0.8901
RMSE: 1.1752


In [22]:
del knn_mean_uu_fitted

In [23]:
knn_mean_ii_fitted = surprise.KNNWithMeans(k=40, sim_options={'name': 'cosine', 'user_based': False}).fit(train)
knn_mean_ii_fitted

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f651a2a85e0>

In [25]:
preds = knn_mean_ii_fitted.test(test)
df_preds = pd.DataFrame(
    preds, columns=['uid', 'iid', 'actual', 'preds', 'details']
    ).drop(columns=['details'])
df_preds.head()

Unnamed: 0,uid,iid,actual,preds
0,27348,11735,5.0,4.394067
1,31027,19677,3.0,3.962664
2,19671,18487,5.0,2.777675
3,3259,17199,2.0,5.0
4,39251,11,4.0,3.356263


In [26]:
base_metrics_df.loc[base_metrics_df.model == 'MeanRatingKnn-I', "MAE"] = accuracy.mae(preds)
base_metrics_df.loc[base_metrics_df.model == 'MeanRatingKnn-I', "RMSE"] = accuracy.rmse(preds)

MAE:  0.8754
RMSE: 1.1629


In [27]:
del knn_mean_ii_fitted

In [28]:
base_metrics_df

Unnamed: 0,model,MAE,RMSE
0,BaseKNN-U,0.945642,1.222901
1,BaseKNN-I,0.972544,1.270608
2,MeanRatingKnn-U,0.890081,1.175203
3,MeanRatingKnn-I,0.875376,1.162908
