# Collaborative filtering approach

In [None]:
from surprise.model_selection import cross_validate
from surprise import (Dataset, Reader,
                      accuracy, KNNBasic,
                      model_selection,  SVD)
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Loading and creation of datasets


* Downloaded the entire dataset from [McAuley-Lab/Amazon-Reviews-2023](https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023)
* Used only: *rating, user_id, parent_asin, verified_purchase, title and text*
* Creation of the dataframe Pandas




In [None]:
# Fetch the "raw_review_Video_Games" configuration of Amazon Reviews 2023.
dataset_review = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Video_Games",
    trust_remote_code=True,
)

# Turn the "full" split into a DataFrame, keep only the columns used below,
# then drop exact duplicate rows and rows missing a user, an item or a rating.
temp_data = pd.DataFrame(dataset_review["full"])
df = (
    temp_data[['rating', 'user_id', 'parent_asin', 'verified_purchase', 'title', 'text']]
    .drop_duplicates()
    .dropna(subset=['user_id', 'parent_asin', 'rating'])
)

## Explorative analysis of the dataset
### Descriptive statistics
* distribution of the variables
* measures of central tendency (mean, median and mode)
* dispersion measures (variance, standard deviation and interquartile range)

### Correlation analysis
* linear correlation (TO-DO?)

### Dataset filtering
  The dataset is filtered on:
* verified purchase = true
* minimum number of reviews

### Variables distribution

#### Rating distribution

In [None]:
# Number of reviews for each rating value (ordered by frequency).
rating_counts = df['rating'].value_counts()
print(rating_counts)

# Bar chart of the rating distribution.
# value_counts() orders by frequency, so re-sort by rating value to draw the
# bars in ascending rating order, and derive the tick positions from the data
# instead of assuming exactly five distinct rating values.
ratings_sorted = rating_counts.sort_index()
rating_labels = ratings_sorted.index.astype(str)

plt.figure(figsize=(10, 6))
plt.bar(rating_labels, ratings_sorted.values, color='blue')
plt.xlabel('Ratings')
plt.ylabel('Number of reviews')
plt.title('Rating distribution (1-5)')
plt.xticks(range(len(rating_labels)), labels=rating_labels)
plt.show()

### Central tendency measures

In [None]:
# Display pandas' default summary statistics for the review DataFrame.
df.describe()

### Dispersion measures

#### Variance and standard deviation of rating

In [None]:
# Range and spread of the rating column; variance and standard deviation are
# rounded to three decimals before being printed.
rating_column = df['rating']
rating_variance = round(rating_column.var(), 3)
rating_std = round(rating_column.std(), 3)

print(f"Valore massimo presente: {rating_column.max():>3}")
print(f"Valore minimo presente: {rating_column.min():>4}")
print(f"Varianza: {rating_variance:>20}")
print(f"Deviazione standard: {rating_std:>8}")

### Dataset filtering

In [None]:
# Minimum number of reviews a user / a product must have to be kept.
min_reviews_per_user = 15
min_reviews_per_product = 10

# Restrict to verified purchases up front, so that both thresholds below are
# evaluated on the same reviews that end up in filtered_df.
df_filtring = df.drop_duplicates()
df_filtring = df_filtring[df_filtring['verified_purchase'] == True]

# Users with enough verified reviews.
user_review_counts = df_filtring['user_id'].value_counts()
users_with_min_reviews = user_review_counts[user_review_counts >= min_reviews_per_user].index

# Start from the verified-only frame (not the raw df): otherwise unverified
# reviews would count towards the per-product threshold and be removed
# afterwards, leaving products below the minimum.
filtered_df = df_filtring[df_filtring['user_id'].isin(users_with_min_reviews)]

# Products reviewed by enough distinct retained users.
item_review_counts = filtered_df.groupby('parent_asin')['user_id'].nunique()
products_with_min_reviews = item_review_counts[item_review_counts >= min_reviews_per_product].index

filtered_df = filtered_df[filtered_df['parent_asin'].isin(products_with_min_reviews)]

# Size of the filtered dataset.
num_products = filtered_df['parent_asin'].nunique()
num_users = filtered_df['user_id'].nunique()
num_reviews = len(filtered_df)

print(f'Numero di prodotti: {num_products}')
print(f'Numero di utenti: {num_users}')
print(f'Numero di recensioni totali: {num_reviews}')


## K-NN

* Creation of the dataset with surprise
* Search for the best kNN configuration
* Study of the best RMSE and MSE

### KNN, SVD comparison


In [None]:
# Wrap the filtered reviews in a Surprise dataset: the columns are passed as
# (user, item, rating) and ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
reviews_filtered_surprise = Dataset.load_from_df(
    filtered_df[['user_id', 'parent_asin', 'rating']],
    reader,
)

#### Best conf. with KNN

In [None]:
# Hyper-parameter grid for KNNBasic: neighbourhood size, similarity measure,
# and user-based vs item-based similarity.
param_grid = {
    'k': [15, 20, 25, 30, 35, 40],
    'sim_options': {
        'name': ['cosine', 'msd'],
        'user_based': [True, False],
    },
}

# 5-fold cross-validated grid search, scored with RMSE and MSE.
gs = model_selection.GridSearchCV(
    KNNBasic,
    param_grid,
    measures=["rmse", "mse"],
    cv=5,
    n_jobs=-1,
)
gs.fit(reviews_filtered_surprise)

# Report the configuration with the lowest RMSE.
best_rmse = gs.best_score["rmse"]
best_conf = gs.best_params["rmse"]
print(f'Best RMSE = {best_rmse:.4f}')
print(f'Best configuration = {best_conf}')

#### Best conf with SVD

In [None]:
# Hyper-parameter grid for SVD: number of latent factors, number of training
# epochs, and whether bias terms are used.
param_grid = {
    'n_factors': [80, 100, 120, 140],
    'n_epochs': [10, 20, 30, 40],
    'biased': [True, False],
}

# 5-fold cross-validated grid search, scored with RMSE and MSE.
gs = model_selection.GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mse"],
    cv=5,
    n_jobs=-1,
)
gs.fit(reviews_filtered_surprise)

# Report the configuration with the lowest RMSE.
best_rmse = gs.best_score["rmse"]
best_conf = gs.best_params["rmse"]
print(f'Best RMSE = {best_rmse:.4f}')
print(f'Best configuration = {best_conf}')

In [None]:
# Display the filtered review DataFrame.
filtered_df

## Matrix filling with kNN

*   Creation of the test and training set
*   Matrix filling
* Recommended items



In [None]:
# Train an item-based cosine KNNBasic (k=40) on every rating in the dataset.
trainset = reviews_filtered_surprise.build_full_trainset()
algo = KNNBasic(
    k=40,
    sim_options=dict(name='cosine', user_based=False),
)
algo.fit(trainset)

In [None]:
# Build the user x item matrix of predicted ratings.
# Rows follow users_id, columns follow items_id. Pairs the user has already
# rated are set to 0 so they are never recommended again; every other cell
# holds the rating estimated by the fitted model.
users_id = filtered_df["user_id"].unique()
items_id = filtered_df["parent_asin"].unique()

# predict() only echoes back the r_ui argument it receives (None when it is
# omitted), so it cannot tell whether a pair was rated. Look the pair up in
# the observed reviews instead.
rated_pairs = set(zip(filtered_df["user_id"], filtered_df["parent_asin"]))

filled_rating_matrix = np.array([
    [
        0.0 if (uid, iid) in rated_pairs else algo.predict(uid=uid, iid=iid).est
        for iid in items_id
    ]
    for uid in users_id
])

In [None]:
# Display the filled user x item matrix.
filled_rating_matrix


### Recommended list

In [None]:
# Label the filled matrix: one row per user id, one column per item id.
res_df = pd.DataFrame(filled_rating_matrix, index=users_id, columns=items_id)
# Rank the labels of one row by their score.
def sort_columns(row):
    """Return the labels of ``row`` ordered by descending score.

    ``row`` is any mapping-like object exposing ``items()`` that yields
    ``(label, score)`` pairs. Labels with equal scores keep their original
    relative order.
    """
    scored_labels = list(row.items())
    scored_labels.sort(key=lambda pair: pair[1], reverse=True)
    return [label for label, _score in scored_labels]
# One ranked item list per user (best-scoring item first), indexed by user id.
ranked_rows = res_df.apply(sort_columns, axis=1)
rec_lists = pd.DataFrame(list(ranked_rows), index=res_df.index)

In [None]:
# Display the ranked lists of the first five users.
rec_lists[:5]

## User segmentation based on a clustering algorithm


#### Cluster number

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np

# Users are clustered on their pairwise cosine similarity, computed over the
# rows of the filled rating matrix.
user_similarity = cosine_similarity(filled_rating_matrix)
graph = True  # set to False to skip the diagnostic plots

max_clusters = 10

# How to read each criterion:
#   Elbow (WCSS):       the point where the curve stops changing quickly
#   Silhouette:         higher is better
#   Calinski-Harabasz:  higher is better
#   Davies-Bouldin:     lower is better

wcss = []
silhouette_scores = []
calinski_scores = []
davies_scores = []

# Evaluate every candidate cluster count from 2 to max_clusters.
for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    clusters = kmeans.fit_predict(user_similarity)

    # WCSS (Elbow Method)
    wcss.append(kmeans.inertia_)

    # Silhouette Score
    silhouette_scores.append(silhouette_score(user_similarity, clusters))

    # Calinski-Harabasz Index
    calinski_scores.append(calinski_harabasz_score(user_similarity, clusters))

    # Davies-Bouldin Index
    davies_scores.append(davies_bouldin_score(user_similarity, clusters))

# Standardize the scores (z-scores) so the four criteria are comparable.
wcss = np.array(wcss)
silhouette_scores = np.array(silhouette_scores)
calinski_scores = np.array(calinski_scores)
davies_scores = np.array(davies_scores)

wcss_std = (wcss - wcss.mean()) / wcss.std()
silhouette_std = (silhouette_scores - silhouette_scores.mean()) / silhouette_scores.std()
calinski_std = (calinski_scores - calinski_scores.mean()) / calinski_scores.std()
davies_std = (davies_scores - davies_scores.mean()) / davies_scores.std()

# Combine the standardized scores so that HIGHER means better: criteria to be
# minimized (WCSS, Davies-Bouldin) enter with a negative sign.
combined_scores = -wcss_std + silhouette_std + calinski_std - davies_std

# The best candidate maximizes the combined score (argmin would select the
# worst one). Position 0 corresponds to 2 clusters, hence the +2 offset.
optimal_clusters = int(np.argmax(combined_scores)) + 2

if graph:
    # One panel per criterion on a 2x2 grid; the red marker highlights the
    # value obtained at the selected number of clusters.
    cluster_range = range(2, max_clusters + 1)
    panels = [
        (wcss, 'WCSS', 'Elbow Method'),
        (silhouette_scores, 'Silhouette Score', 'Silhouette Score Method'),
        (calinski_scores, 'Calinski-Harabasz Index', 'Calinski-Harabasz Index Method'),
        (davies_scores, 'Davies-Bouldin Index', 'Davies-Bouldin Index Method'),
    ]

    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    for ax, (scores, y_label, panel_title) in zip(axs.flat, panels):
        ax.plot(cluster_range, scores, marker='o')
        ax.scatter(optimal_clusters, scores[optimal_clusters - 2], color='red', s=100, zorder=5)
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel(y_label)
        ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

print(f"Optimal number of clusters: {optimal_clusters}")


In [None]:
# Re-fit K-Means with the selected number of clusters. The random_state is the
# same one used while scoring the candidate cluster counts, so the final
# partition is reproducible and matches the one that was evaluated.
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

clusters = kmeans.fit_predict(user_similarity)

# Map every user id to the label of the cluster it was assigned to.
user_cluster_mapping = dict(zip(users_id, clusters))

## Top k ITEMS for USER

In [None]:
def select_top_k_recommendations(rec_lists, k):
    """Return the first ``k`` recommended items of every user.

    ``rec_lists`` is a DataFrame with one row per user, holding that user's
    items ordered from most to least recommended. The result maps each row
    label to a plain list with the first ``k`` entries of its row.
    """
    return {
        user_id: ranked_items[:k].tolist()
        for user_id, ranked_items in rec_lists.iterrows()
    }

# Ask how many items each list should contain, then print every user's list.
k = int(input('Insersici il numero di item per ogni lista: '))
top_k_recommendations = select_top_k_recommendations(rec_lists, k)
for user_id, recommendations in top_k_recommendations.items():
    print("User:", user_id)
    # end="\n\n" leaves one blank line after each user's entry.
    print("Top", len(recommendations), "Recommendations:", recommendations, end="\n\n")

## Matrix filling with SVD



In [None]:
# Train a biased SVD (80 factors, 20 epochs) on every rating in the dataset.
trainset = reviews_filtered_surprise.build_full_trainset()
algo = SVD(
    n_factors=80,
    n_epochs=20,
    biased=True,
)
algo.fit(trainset)

In [None]:
# Build the user x item matrix of predicted ratings.
# Rows follow users_id, columns follow items_id. Pairs the user has already
# rated are set to 0 so they are never recommended again; every other cell
# holds the rating estimated by the fitted model.
users_id = filtered_df["user_id"].unique()
items_id = filtered_df["parent_asin"].unique()

# predict() only echoes back the r_ui argument it receives (None when it is
# omitted), so it cannot tell whether a pair was rated. Look the pair up in
# the observed reviews instead.
rated_pairs = set(zip(filtered_df["user_id"], filtered_df["parent_asin"]))

filled_rating_matrix = np.array([
    [
        0.0 if (uid, iid) in rated_pairs else algo.predict(uid=uid, iid=iid).est
        for iid in items_id
    ]
    for uid in users_id
])

In [None]:
# Display the filled user x item matrix.
filled_rating_matrix

In [None]:
# Label the filled matrix: one row per user id, one column per item id.
res_df = pd.DataFrame(filled_rating_matrix, index=users_id, columns=items_id)
# Rank the labels of one row by their score.
def sort_columns(row):
    """Return the labels of ``row`` ordered by descending score.

    ``row`` is any mapping-like object exposing ``items()`` that yields
    ``(label, score)`` pairs. Labels with equal scores keep their original
    relative order.
    """
    scored_labels = list(row.items())
    scored_labels.sort(key=lambda pair: pair[1], reverse=True)
    return [label for label, _score in scored_labels]
# One ranked item list per user (best-scoring item first), indexed by user id.
ranked_rows = res_df.apply(sort_columns, axis=1)
rec_lists = pd.DataFrame(list(ranked_rows), index=res_df.index)

In [None]:
# Display the ranked lists of the first five users.
rec_lists[:5]

In [None]:
def select_top_k_recommendations(rec_lists, k):
    """Return the first ``k`` recommended items of every user.

    ``rec_lists`` is a DataFrame with one row per user, holding that user's
    items ordered from most to least recommended. The result maps each row
    label to a plain list with the first ``k`` entries of its row.
    """
    return {
        user_id: ranked_items[:k].tolist()
        for user_id, ranked_items in rec_lists.iterrows()
    }

# Example usage: ask how many items each list should contain, then print
# every user's list.
k = int(input('Insersici il numero di item per ogni lista: '))
top_k_recommendations = select_top_k_recommendations(rec_lists, k)
for user_id, recommendations in top_k_recommendations.items():
    print("User:", user_id)
    # end="\n\n" leaves one blank line after each user's entry.
    print("Top", len(recommendations), "Recommendations:", recommendations, end="\n\n")