# PROGETTO BASE

In [1]:
from surprise.model_selection import cross_validate
from surprise import (Dataset, Reader,
                      accuracy, KNNBasic,
                      model_selection,  SVD)
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Loading e creazione del dataset


* Download dell'intero dataset da [link sito](McAuley-LabAmazon-Reviews-2023)
* Salvataggio solo dei dati fondamentali: *rating, user_id, parent_aisin, helpful_vote e verified_purchase*
* Creazione del dataframe Pandas




In [2]:
dataset_review = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Video_Games", trust_remote_code=True)

temp_data = pd.DataFrame(dataset_review["full"])
df = temp_data[['rating', 'user_id', 'parent_asin','verified_purchase', 'title', 'text']]
df = df.drop_duplicates()
df = df.dropna(subset=['user_id', 'parent_asin', 'rating'])

## Analisi Esplorativa del Dataset
### Statistiche Descrittie
* Distribuzione delle variabili
* Misure di Tendenza Centrale ( media, mediana e moda )
* Misure di Dispersione ( varianza, deviazione standard e intervallo interquartile)

### Analisi di Corelazione
* Correlazione Lineare (TO-DO?)

### Filtraggio del Dataset e nuove statistiche
  Il filtraggio delle recensioni è fatto secondo:
* acquisto verificato o meno
* numero minimo di recensioni

Il filtraggio degli utenti è fatto secondo:
* numero minimo di recensioni

### Distribuzione delle Variabili

#### Distribuzione della variabile 'rating' all'interno del dataset

In [None]:
rating_counts = df['rating'].value_counts()
print(rating_counts)
# Creazione del grafico a barre
plt.figure(figsize=(10, 6))
plt.bar(rating_counts.index.astype(str), rating_counts.values, color='blue')
plt.xlabel('Ratings')
plt.ylabel('Numero di Recensioni')
plt.title('Distribuzione dei ratings (1-5)')
plt.xticks(range(5), labels=rating_counts.index.astype(str))
plt.show()

### Misure di Tendenza Centrale

In [None]:
df.describe()

### Misure di Dispersione

#### Varianza e Deviazione Standard della variabile 'rating'

In [None]:
print(f"Valore massimo presente: {df['rating'].max():>3}")
print(f"Valore minimo presente: {df['rating'].min():>4}")
print(f"Varianza: {round(df['rating'].var(), 3):>20}")
print(f"Deviazione standard: {round(df['rating'].std(), 3):>8}")

### Filtraggio del Dataset

In [3]:
min_reviews_per_user = 15
min_reviews_per_product = 10

df_filtring = df.drop_duplicates()

df_filtring = df_filtring[df_filtring['verified_purchase'] == True]

user_review_counts = df_filtring['user_id'].value_counts()
users_with_min_reviews = user_review_counts[user_review_counts >= min_reviews_per_user].index

filtered_df = df[df['user_id'].isin(users_with_min_reviews)]

item_review_counts = filtered_df.groupby('parent_asin')['user_id'].nunique()
products_with_min_reviews = item_review_counts[item_review_counts >= min_reviews_per_product].index

filtered_df = filtered_df[filtered_df['parent_asin'].isin(products_with_min_reviews)]
filtered_df = filtered_df[filtered_df['verified_purchase'] == True]
num_products = filtered_df['parent_asin'].nunique()
num_users = filtered_df['user_id'].nunique()
num_reviews = len(filtered_df)

print(f'Numero di prodotti: {num_products}')
print(f'Numero di utenti: {num_users}')
print(f'Numero di recensioni totali: {num_reviews}')


Numero di prodotti: 3672
Numero di utenti: 6513
Numero di recensioni totali: 91791


## K-NN

* Creazione del Dataset con Surprise
*   Individuazione della configurazione dell'algoritmo KNN
*   Valutazione attraverso MSE e RMSE

### Confronto di KNN con SVD


### Creazione del Dataset Surprise

In [None]:
reader = Reader(rating_scale=(1, 5))
reviews_filtered_surprise = Dataset.load_from_df(filtered_df[['user_id', 'parent_asin', 'rating']], reader)

### Identificazione della configurazione ottimale

#### Configurazione ottimale con KNN

In [None]:
param_grid = {
    'k': list(range(15, 45, 5)),
    'sim_options': {
        'name': ['cosine', 'msd'],
        'user_based': [True, False],
    },
}
# Initialize and train the Grid Search
gs = model_selection.GridSearchCV(KNNBasic, param_grid,
                                  measures=["rmse", "mse"],
                                  cv=5,
                                  n_jobs=-1)
gs.fit(reviews_filtered_surprise)

print(f'Best RMSE = {gs.best_score["rmse"]:.4f}')
print(f'Best configuration = {gs.best_params["rmse"]}')

#### Configurazione ottimale con SVD

In [None]:
param_grid = {
    'n_factors': list(range(80, 160, 20)),
    'n_epochs': list(range(10, 50, 10)),
    'biased': [True, False]
    }
gs = model_selection.GridSearchCV(SVD, param_grid,
                                  measures=["rmse", "mse"],
                                  cv=5,
                                  n_jobs=-1)
gs.fit(reviews_filtered_surprise)
print(f'Best RMSE = {gs.best_score["rmse"]:.4f}')
print(f'Best configuration = {gs.best_params["rmse"]}')

In [None]:
filtered_df

## Filling della Matrice di Rating KNN

*   Creazione del set di traning e di fit
*   Filling della matrice e visualizzazione della stessa
* Creazionee  visualizzazione degli items raccomandanti per ogni utente



In [None]:
trainset = reviews_filtered_surprise.build_full_trainset()
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': False})
algo.fit(trainset)

In [None]:
users_id = filtered_df["user_id"].unique()
items_id = filtered_df["parent_asin"].unique()
filled_rating_matrix = []
for uid in users_id:
  filled_rating_matrix.append([])
  for iid in items_id:
    res = algo.predict(uid=uid, iid=iid)
    if res.r_ui is not None:
      filled_rating_matrix[-1].append(0)
    else:
      filled_rating_matrix[-1].append(res.est)

filled_rating_matrix = np.array(filled_rating_matrix)

In [None]:
filled_rating_matrix


### creazione della lista di raccomandazione

In [None]:
res_df = pd.DataFrame(filled_rating_matrix)
res_df.columns = items_id
res_df = res_df.set_index(users_id)
# Sort each row by the score
def sort_columns(row):
  sorted_columns = sorted(row.items(), key=lambda x: x[1], reverse=True)
  return [col[0] for col in sorted_columns]
rec_lists = pd.DataFrame(list(res_df.apply(sort_columns, axis=1)),
                         index=res_df.index)

In [None]:
rec_lists[:5]

## Segmentazione degli utenti
* Individuazione del numero ottimale di cluster
* Suddivisione in cluster degli utenti

#### Individuazione del numero di cluster

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np

user_similarity = cosine_similarity(filled_rating_matrix)
graph = True

max_clusters = 10

'''
Elbow: punto dove smette di crescere/decrescere velocemente
Silhouette: più il valore è alto meglio è
Calinski-Harabasz: più il valore è alto meglio è
Davies-Bouldin: più il valore è basso meglio è
'''

wcss = []
silhouette_scores = []
calinski_scores = []
davies_scores = []

for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    clusters = kmeans.fit_predict(user_similarity)

    # WCSS (Elbow Method)
    wcss.append(kmeans.inertia_)

    # Silhouette Score
    silhouette_scores.append(silhouette_score(user_similarity, clusters))

    # Calinski-Harabasz Index
    calinski_scores.append(calinski_harabasz_score(user_similarity, clusters))

    # Davies-Bouldin Index
    davies_scores.append(davies_bouldin_score(user_similarity, clusters))

# Standardize the scores
wcss = np.array(wcss)
silhouette_scores = np.array(silhouette_scores)
calinski_scores = np.array(calinski_scores)
davies_scores = np.array(davies_scores)

wcss_std = (wcss - wcss.mean()) / wcss.std()
silhouette_std = (silhouette_scores - silhouette_scores.mean()) / silhouette_scores.std()
calinski_std = (calinski_scores - calinski_scores.mean()) / calinski_scores.std()
davies_std = (davies_scores - davies_scores.mean()) / davies_scores.std()

# Combine the standardized scores (Note: WCSS should be minimized, so we take its negative)
combined_scores = -wcss_std + silhouette_std + calinski_std - davies_std

# Find the number of clusters that minimizes the combined score
optimal_clusters = np.argmin(combined_scores) + 2

if graph:
  # Plotting the results
  fig, axs = plt.subplots(2, 2, figsize=(15, 10))

  # Elbow Method
  axs[0, 0].plot(range(2, max_clusters + 1), wcss, marker='o')
  axs[0, 0].scatter(optimal_clusters, wcss[optimal_clusters - 2], color='red', s=100, zorder=5)
  axs[0, 0].set_xlabel('Number of Clusters')
  axs[0, 0].set_ylabel('WCSS')
  axs[0, 0].set_title('Elbow Method')

  # Silhouette Score
  axs[0, 1].plot(range(2, max_clusters + 1), silhouette_scores, marker='o')
  axs[0, 1].scatter(optimal_clusters, silhouette_scores[optimal_clusters - 2], color='red', s=100, zorder=5)
  axs[0, 1].set_xlabel('Number of Clusters')
  axs[0, 1].set_ylabel('Silhouette Score')
  axs[0, 1].set_title('Silhouette Score Method')

  # Calinski-Harabasz Index
  axs[1, 0].plot(range(2, max_clusters + 1), calinski_scores, marker='o')
  axs[1, 0].scatter(optimal_clusters, calinski_scores[optimal_clusters - 2], color='red', s=100, zorder=5)
  axs[1, 0].set_xlabel('Number of Clusters')
  axs[1, 0].set_ylabel('Calinski-Harabasz Index')
  axs[1, 0].set_title('Calinski-Harabasz Index Method')

  # Davies-Bouldin Index
  axs[1, 1].plot(range(2, max_clusters + 1), davies_scores, marker='o')
  axs[1, 1].scatter(optimal_clusters, davies_scores[optimal_clusters - 2], color='red', s=100, zorder=5)
  axs[1, 1].set_xlabel('Number of Clusters')
  axs[1, 1].set_ylabel('Davies-Bouldin Index')
  axs[1, 1].set_title('Davies-Bouldin Index Method')

  plt.tight_layout()
  plt.show()

print(f"Optimal number of clusters: {optimal_clusters}")


#### Segmentazione degli utenti


In [None]:
kmeans = KMeans(n_clusters=optimal_clusters)

clusters = kmeans.fit_predict(user_similarity)
user_cluster_mapping = {uid: cluster for uid, cluster in zip(users_id, clusters)}

## Creazione della Top k Items List per ogni utente

In [None]:
def select_top_k_recommendations(rec_lists, k):
    top_k_recommendations = {}
    for user_id, row in rec_lists.iterrows():
        top_k_recommendations[user_id] = row[:k].tolist()
    return top_k_recommendations

k = int(input('Insersici il numero di item per ogni lista: '))
top_k_recommendations = select_top_k_recommendations(rec_lists, k)
for user_id, recommendations in top_k_recommendations.items():
    print("User:", user_id)
    print("Top", len(recommendations), "Recommendations:", recommendations)
    print()

## Filling della Matrice di Rating SVD

*   Creazione del set di traning e di fit
*   Filling della matrice e visualizzazione della stessa
* Creazionee  visualizzazione degli items raccomandanti per ogni utente



In [None]:
trainset = reviews_filtered_surprise.build_full_trainset()
algo = SVD(n_factors=80, n_epochs=20, biased=True)
algo.fit(trainset)

In [None]:
users_id = filtered_df["user_id"].unique()
items_id = filtered_df["parent_asin"].unique()
filled_rating_matrix = []
for uid in users_id:
  filled_rating_matrix.append([])
  for iid in items_id:
    res = algo.predict(uid=uid, iid=iid)
    if res.r_ui is not None:
      filled_rating_matrix[-1].append(0)
    else:
      filled_rating_matrix[-1].append(res.est)

filled_rating_matrix = np.array(filled_rating_matrix)

In [None]:
filled_rating_matrix


### creazione della lista di raccomandazione

In [None]:
res_df = pd.DataFrame(filled_rating_matrix)
res_df.columns = items_id
res_df = res_df.set_index(users_id)
# Sort each row by the score
def sort_columns(row):
  sorted_columns = sorted(row.items(), key=lambda x: x[1], reverse=True)
  return [col[0] for col in sorted_columns]
rec_lists = pd.DataFrame(list(res_df.apply(sort_columns, axis=1)),
                         index=res_df.index)

In [None]:
rec_lists[:5]

In [None]:
def select_top_k_recommendations(rec_lists, k):
    top_k_recommendations = {}
    for user_id, row in rec_lists.iterrows():
        top_k_recommendations[user_id] = row[:k].tolist()
    return top_k_recommendations

# Esempio di utilizzo
k = int(input('Insersici il numero di item per ogni lista: '))
top_k_recommendations = select_top_k_recommendations(rec_lists, k)
for user_id, recommendations in top_k_recommendations.items():
    print("User:", user_id)
    print("Top", len(recommendations), "Recommendations:", recommendations)
    print()

# PROGETTO INTERMEDIO

### Pre-Processing

In [4]:
from datasets import load_dataset
import numpy as np
import pandas as pd

In [5]:
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Video_Games", split="full", trust_remote_code=True)

In [6]:
temp_df_meta = pd.DataFrame(dataset_meta)
df_meta = temp_df_meta[['title','description','parent_asin', 'rating_number']]

In [7]:
df_meta_filtered = df_meta[df_meta['rating_number'] > 10]
df_meta_filtered = df_meta_filtered[df_meta_filtered['description'].apply(lambda x: len(x) >15)]
df_meta_filtered = df_meta_filtered.reset_index(drop=True)



print(f"Numero totale di prodotti prima dell'applicazione dei filtri: {len(df_meta):>10}")
print(f"Numero totale di prodotti dopo l'applicazione dei filtri: {len(df_meta_filtered):>11}")
df_meta_filtered

Numero totale di prodotti prima dell'applicazione dei filtri:     137269
Numero totale di prodotti dopo l'applicazione dei filtri:        6738


Unnamed: 0,title,description,parent_asin,rating_number
0,Turbo: Super Stunt Squad - Nintendo 3DS,"[Product Description, Turbo: Super Stunt Squad...",B00BJH85SW,26
1,"Warhammer 40,000 Dawn of War Game of the Year ...","[From the Manufacturer, This Game of The Year ...",B001EYUX4Y,68
2,MX vs ATV Reflex [Download],"[Product Description, BECOME THE RIDER. Dual c...",B004BLJPZU,25
3,CH PRODUCTS Eclipse Yoke Discover Bundle (2006...,"[Product Description, Making a great thing, ev...",B005ILK04M,28
4,Art of Murder: Cards of Destiny - PC,"[Product Description, Young FBI agent Nicole B...",B002KAKSJQ,29
...,...,...,...,...
6733,LittleBigPlanet Karting - Playstation 3,"[Product Description, The Fastest LittleBigPla...",B00EN8FKFA,773
6734,Skins Stickers for Xbox One Games Controller -...,"[GameXcel ®, Xbox One Skin Sticker is printed ...",B00OFLATM4,20
6735,Reiso 2 Packs NGC Controllers Classic Wired Co...,[Do you yearn to relive the NGC days with a cl...,B07MQFY2CV,638
6736,Star Wars Battlefront - Xbox,"[From the Manufacturer, Battlefront, is an act...",B000PDY2JW,458


In [8]:
min_reviews_per_user = 30

df_filtring_meta = df.drop_duplicates()

df_filtring_meta = df_filtring_meta[df_filtring_meta['verified_purchase'] == True]

user_review_counts = df_filtring_meta['user_id'].value_counts()
users_with_min_reviews = user_review_counts[user_review_counts >= min_reviews_per_user].index
filtered_df_meta_avan = df[df['user_id'].isin(users_with_min_reviews)]
item_review_counts = filtered_df_meta_avan.groupby('parent_asin')['user_id'].nunique()
filtered_df_meta_avan = filtered_df_meta_avan[filtered_df_meta_avan['verified_purchase'] == True]
num_products = filtered_df_meta_avan['parent_asin'].nunique()
num_users = filtered_df_meta_avan['user_id'].nunique()
num_reviews = len(filtered_df_meta_avan)

print(f'Numero di prodotti: {num_products}')
print(f'Numero di utenti: {num_users}')
print(f'Numero di recensioni totali: {num_reviews}')


Numero di prodotti: 19154
Numero di utenti: 1144
Numero di recensioni totali: 52147


In [9]:
filtered_df = filtered_df_meta_avan[filtered_df_meta_avan['parent_asin'].isin(df_meta_filtered['parent_asin'])]

In [None]:
filtered_df

In [None]:
df_meta_filtered

## Processamento degli attributi testuali dei diversi prodotti (almeno i campi title e description) con le tecniche di Natural Language Processing viste in laboratorio.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def list_to_str(lst):
    return str(lst)

df_meta_filtered['description'] = df_meta_filtered['description'].apply(list_to_str)

In [None]:
lemmatizer = WordNetLemmatizer() # meglio dello stemmer
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalnum()]
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    else:
        return ''

# ho tolto le colonne title e description rating_number, helpful_vote, verified_purchase e lasciato solo quelle processate
df_meta_filtered['text'] = (df_meta_filtered['title'] + ' ' + df_meta_filtered["description"]).apply(preprocess_text)
df_meta_filtered.drop_duplicates()
df_meta_filtered.sample(1)

## Text Embedding - BoW Model

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import string

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
punctuation = set(string.punctuation)

vocab = set()
bow_model = []
raw_text = df_meta_filtered["text"]
for text in (raw_text):
    word_counts = defaultdict(int)
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    
    vocab.update(tokens)
    for word in tokens:
        word_counts[word] += 1
    
    bow_model.append(word_counts)

vocab = list(vocab)
print(f"Numero di parole nel vocabolario: {len(vocab)}")
print(f"Le 10 parole più frequenti nel primo documento: {sorted(vocab, key=lambda x: bow_model[0].get(x, 0), reverse=True)[:10]}")


In [None]:
bow_data = pd.DataFrame(0, index=range(len(raw_text)), columns=list(vocab))
for i in range(len(df_meta_filtered['text'])):
  bow_data.loc[i, bow_model[i].keys()] = bow_model[i].values()
bow_data


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(df_meta_filtered['text'])
bow_dataset = pd.DataFrame(bow_model.toarray(), columns=vectorizer.get_feature_names_out())
bow_dataset["parent_asin"] = df_meta_filtered["parent_asin"]

In [None]:
bow_dataset

In [None]:
user_id = 'AHLK5V5OBWUPTZZMJ2XIKBR4LUHA'
print(f'User: {user_id}')
user_ratings = filtered_df[filtered_df['user_id'] == user_id]
rated_items = bow_dataset[bow_dataset['parent_asin'].isin(user_ratings['parent_asin'])]
print(f'# rated items: {len(rated_items)}')
dataset = pd.merge(rated_items, user_ratings, on="parent_asin")
dataset = dataset.drop(columns=["parent_asin", "user_id", "verified_purchase", "title_y", "text_y"])
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns="rating_y"),
                                                    dataset['rating_y'],
                                                    test_size=0.20,
                                                    random_state=0)
neigh_reg = KNeighborsRegressor(n_neighbors=10, metric="cosine")
neigh_reg.fit(X_train, y_train)
y_pred = neigh_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'MSE = {mse:.6f}')
print(f'RMSE = {rmse:.6f}')

In [None]:
mse_users = []
for user_id in filtered_df["user_id"].unique():
  user_ratings = filtered_df[filtered_df['user_id'] == user_id]
  rated_items = bow_dataset[bow_dataset['parent_asin'].isin(user_ratings['parent_asin'])]
  dataset = pd.merge(rated_items, user_ratings, on="parent_asin")
  dataset = dataset.drop(columns=["parent_asin", "user_id", "verified_purchase", "title_y", "text_y"])
  try:
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns="rating_y"),
                                                        dataset['rating_y'],
                                                        test_size=0.20,
                                                        random_state=0)
    neigh_reg = KNeighborsRegressor(n_neighbors=min(20, len(X_train)),
                                    metric="cosine")
    neigh_reg.fit(X_train, y_train)
    y_pred = neigh_reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_users.append(mse)
  except:
    continue

In [None]:
print(f"Average MSE over users: {np.mean(mse_users):.2f}")
print(f"Average RMSE over users: {np.sqrt(np.mean(mse_users)):.2f}")

## Text Embedding - Transformers Models

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('sentence-transformers/average_word_embeddings_komninos')

In [None]:
embeddings = model.encode(df_meta_filtered["text"])

In [None]:
df_meta_filtered

In [None]:
embeddings_dataset = pd.DataFrame(embeddings)
embeddings_dataset["parent_asin"] = df_meta_filtered["parent_asin"]
embeddings_dataset

In [None]:
mse_users = []
for user_id in filtered_df["user_id"].unique():
    user_ratings = filtered_df[filtered_df['user_id'] == user_id]
    rated_items = embeddings_dataset[embeddings_dataset['parent_asin'].isin(user_ratings['parent_asin'])]
    dataset_rec = pd.merge(rated_items, user_ratings, on="parent_asin")
    dataset_rec = dataset_rec.drop(columns=["parent_asin", "user_id"])
    dataset_rec = pd.get_dummies(dataset_rec, columns=dataset_rec.select_dtypes(include=['object']).columns)
    dataset_rec = dataset_rec.dropna()
    dataset_rec.columns = dataset_rec.columns.astype(str)
    if len(dataset_rec) == 0 or 'rating' not in dataset_rec.columns:
        continue
    try:
        X_train, X_test, y_train, y_test = train_test_split(dataset_rec.drop(columns="rating"),
                                                            dataset_rec['rating'],
                                                            test_size=0.20,
                                                            random_state=0)
        if len(X_train) < 2:
            continue
        neigh_reg = KNeighborsRegressor(n_neighbors=min(40, len(X_train)), metric="cosine")
        neigh_reg.fit(X_train, y_train)
        y_pred = neigh_reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_users.append(mse)
    except Exception as e:
        print(f'Error for user {user_id}: {e}')
        continue

if mse_users:
    average_mse = np.mean(mse_users)
    print(f'Average MSE: {average_mse:.6f}')
else:
    print('No MSE values calculated.')


In [None]:
print(f"Average MSE over users: {np.mean(mse_users):.2f}")
print(f"Average RMSE over users: {np.sqrt(np.mean(mse_users)):.2f}")

# PROGETTO AVANZATO IMPLEMENTARE CLASSIFICAZIONE CON KNN

## Processamento degli attributi testuali con tecniche di Natural Language Processing

### Merge dei DF 

In [10]:
min_reviews_per_user = 15

df_filtring = df.drop_duplicates()

df_filtring = df_filtring[df_filtring['verified_purchase'] == True]

user_review_counts = df_filtring['user_id'].value_counts()
users_with_min_reviews = user_review_counts[user_review_counts >= min_reviews_per_user].index

filtered_df_avan = df[df['user_id'].isin(users_with_min_reviews)]

item_review_counts = filtered_df_avan.groupby('parent_asin')['user_id'].nunique()
products_with_min_reviews = item_review_counts[item_review_counts >= min_reviews_per_product].index

filtered_df_avan = filtered_df_avan[filtered_df_avan['parent_asin'].isin(products_with_min_reviews)]
filtered_df_avan = filtered_df_avan[filtered_df_avan['verified_purchase'] == True]
num_products = filtered_df_avan['parent_asin'].nunique()
num_users = filtered_df_avan['user_id'].nunique()
num_reviews = len(filtered_df_avan)

print(f'Numero di prodotti: {num_products}')
print(f'Numero di utenti: {num_users}')
print(f'Numero di recensioni totali: {num_reviews}')

Numero di prodotti: 3672
Numero di utenti: 6513
Numero di recensioni totali: 91791


In [11]:
import pandas as pd
merged_df = pd.merge(df_meta, filtered_df_avan, on='parent_asin')
# Funzione per convertire liste in stringhe
def list_to_str(lst):
    return str(lst)
# Converti le colonne con liste in stringhe
merged_df['description'] = merged_df['description'].apply(list_to_str)

# Rimuovi i duplicati dal DataFrame risultante
#merged_df = merged_df.drop_duplicates()


merged_df


Unnamed: 0,title_x,description,parent_asin,rating_number,rating,user_id,verified_purchase,title_y,text
0,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AGPAKMKDJXOR47OUALYZSVWMLO4Q,True,Five Stars,"Excellent game, hands down the best basketball..."
1,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AFIAQIEOMSH77WE44HNZ5YX73H7A,True,Love the game - no killing and shooting and th...,Love the game - no killing and shooting and th...
2,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AGQ22LX6AG4JXHRO6KQZ7X5LVBZA,True,NICE,I love this game. Graphics and gameplay are to...
3,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AH2G73SA2D2ESTV4PDHWQZNIXI6Q,True,NBA 2K17- Early Tip-Off Edition,I just recently received this game. I'm still ...
4,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,4.0,AG5BPUUAI5DCU7GHJ24KIALYA4BA,True,Four Stars,Good nba game.
...,...,...,...,...,...,...,...,...,...
91786,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,5.0,AHF2KKGKOEYSYKJLGZQRRX4R2K4A,True,Its a Good Game :),Its worth a buy :) I like it just like the fir...
91787,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,5.0,AGNCIIB5HZZDDBECJNEU7HLVM6LA,True,One great zombie game,I played and beat the 1st Dying Light. This on...
91788,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,4.0,AEUGKS4VF3SPY5FS4FJOWBS5UK2A,True,Good Game but with some glitches.,Huge fan of the original Dying Light!! The Pa...
91789,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,1.0,AHHUAOW6QEXCAK2PT3SH7YZTVA7Q,True,did not work on playstation 2,Did not work on playstation


### Tokenizing 

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [13]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/Gabriele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Gabriele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Gabriele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
lemmatizer = WordNetLemmatizer() # meglio dello stemmer
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalnum()]
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    else:
        return ''
# ho tolto le colonne title e description rating_number, helpful_vote, verified_purchase e lasciato solo quelle processate
merged_df['title_processed'] = merged_df['title_y'].apply(preprocess_text)
merged_df['text_processed'] = merged_df['text'].apply(preprocess_text)
merged_df.drop_duplicates()
merged_df.head


<bound method NDFrame.head of                                                 title_x  \
0      NBA 2K17 - Early Tip Off Edition - PlayStation 4   
1      NBA 2K17 - Early Tip Off Edition - PlayStation 4   
2      NBA 2K17 - Early Tip Off Edition - PlayStation 4   
3      NBA 2K17 - Early Tip Off Edition - PlayStation 4   
4      NBA 2K17 - Early Tip Off Edition - PlayStation 4   
...                                                 ...   
91786          Dying Light 2 Stay Human - PlayStation 4   
91787          Dying Light 2 Stay Human - PlayStation 4   
91788          Dying Light 2 Stay Human - PlayStation 4   
91789          Dying Light 2 Stay Human - PlayStation 4   
91790          Dying Light 2 Stay Human - PlayStation 4   

                                             description parent_asin  \
0      ['Following the record-breaking launch of NBA ...  B00Z9TLVK0   
1      ['Following the record-breaking launch of NBA ...  B00Z9TLVK0   
2      ['Following the record-breaking launch

### Applicazione etichette per la sentiment

In [15]:
def sentiment_label(rating):
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2
merged_df['sentiment'] = merged_df['rating'].apply(sentiment_label)

In [16]:
merged_df

Unnamed: 0,title_x,description,parent_asin,rating_number,rating,user_id,verified_purchase,title_y,text,title_processed,text_processed,sentiment
0,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AGPAKMKDJXOR47OUALYZSVWMLO4Q,True,Five Stars,"Excellent game, hands down the best basketball...",five star,excellent game hand best basketball video game...,2
1,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AFIAQIEOMSH77WE44HNZ5YX73H7A,True,Love the game - no killing and shooting and th...,Love the game - no killing and shooting and th...,love game killing shooting graphic amazing,love game killing shooting graphic amazing,2
2,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AGQ22LX6AG4JXHRO6KQZ7X5LVBZA,True,NICE,I love this game. Graphics and gameplay are to...,nice,love game graphic gameplay top notch play onli...,2
3,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,5.0,AH2G73SA2D2ESTV4PDHWQZNIXI6Q,True,NBA 2K17- Early Tip-Off Edition,I just recently received this game. I'm still ...,nba early edition,recently received game still learning use cont...,2
4,NBA 2K17 - Early Tip Off Edition - PlayStation 4,['Following the record-breaking launch of NBA ...,B00Z9TLVK0,223,4.0,AG5BPUUAI5DCU7GHJ24KIALYA4BA,True,Four Stars,Good nba game.,four star,good nba game,2
...,...,...,...,...,...,...,...,...,...,...,...,...
91786,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,5.0,AHF2KKGKOEYSYKJLGZQRRX4R2K4A,True,Its a Good Game :),Its worth a buy :) I like it just like the fir...,good game,worth buy like like first game,2
91787,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,5.0,AGNCIIB5HZZDDBECJNEU7HLVM6LA,True,One great zombie game,I played and beat the 1st Dying Light. This on...,one great zombie game,played beat 1st dying light one seems like goi...,2
91788,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,4.0,AEUGKS4VF3SPY5FS4FJOWBS5UK2A,True,Good Game but with some glitches.,Huge fan of the original Dying Light!! The Pa...,good game glitch,huge fan original dying light parkour side que...,2
91789,Dying Light 2 Stay Human - PlayStation 4,"[""Over twenty years ago in Harran, we fought t...",B0862GHVT9,1351,1.0,AHHUAOW6QEXCAK2PT3SH7YZTVA7Q,True,did not work on playstation 2,Did not work on playstation,work playstation 2,work playstation,0


In [17]:
merged_df = merged_df.drop(columns=[ 'title_x','title_y','description','rating_number', 'rating' ,'user_id', 'verified_purchase' ,'text'])

In [18]:
merged_df = merged_df.sample(frac=0.6, random_state=42)

In [19]:
merged_df["text"] = merged_df["title_processed"] + " " + merged_df["text_processed"]


In [20]:
merged_df = merged_df.drop(columns=['title_processed' , 'text_processed'])

In [22]:
merged_df

Unnamed: 0,parent_asin,sentiment,text
19372,B001TOMQVC,2,life name many game soul caliber legend tried ...
2362,B0083GAF12,2,four star little old generation like fut ball ...
43099,B004NBXRCU,2,liked first game bought one darkness 2 good l...
86546,B0017HW5LM,2,solid game solid game one best solid improveme...
35965,B08Y5DN2FX,0,super hard get dock fit well bought suppose do...
...,...,...,...
47999,B0051D8PGM,2,good game good game enjoying lot son almost ps...
54142,B006JKARN0,2,four star ok
25728,B07X3ZWL7X,2,witcher gwent awesome one best game made love ...
80993,B00BGAA3S2,2,decent accessory wait find game much use yet s...


In [23]:
import pandas as pd
import numpy as np

neutral_count = len(merged_df[merged_df['sentiment'] == 1])
negative_count = len(merged_df[merged_df['sentiment'] == 0])
min_count = min(neutral_count, negative_count)
positive_sample = merged_df[merged_df['sentiment'] == 2].sample(neutral_count + negative_count, random_state=42)
neutral_sample = merged_df[merged_df['sentiment'] == 1].sample(min_count, random_state=42)
negative_sample = merged_df[merged_df['sentiment'] == 0].sample(min_count, random_state=42)
balanced_df = pd.concat([positive_sample, neutral_sample, negative_sample])
merged_df = balanced_df.dropna()

In [24]:
merged_df

Unnamed: 0,parent_asin,sentiment,text
68813,B003Q9RGRW,2,cool thing came handy needed br br used thing ...
59114,B00D5SZ04K,2,waiting game played series th game like lose l...
87003,B00KUZEFBK,2,collector edition kid pleased got package got ...
66117,B017W175Y8,2,five star love buy
21110,B005OGKYVK,2,five star love love love
...,...,...,...
446,B01GY35HKE,0,okay game getting play whole thing disk strict...
8783,B015XC3B46,0,one star second one buy constantly break time ...
84104,B08B6PLXHC,0,crappy collector edition buy game forego colle...
40587,B07BDJHFQD,0,switch plan take case hard put even harder tak...


## EMBEDDING WITH BoW

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [34]:
merged_df

Unnamed: 0,parent_asin,sentiment,text
68813,B003Q9RGRW,2,cool thing came handy needed br br used thing ...
59114,B00D5SZ04K,2,waiting game played series th game like lose l...
87003,B00KUZEFBK,2,collector edition kid pleased got package got ...
66117,B017W175Y8,2,five star love buy
21110,B005OGKYVK,2,five star love love love
...,...,...,...
446,B01GY35HKE,0,okay game getting play whole thing disk strict...
8783,B015XC3B46,0,one star second one buy constantly break time ...
84104,B08B6PLXHC,0,crappy collector edition buy game forego colle...
40587,B07BDJHFQD,0,switch plan take case hard put even harder tak...


In [36]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression  # Ad esempio, puoi scegliere un modello di classificazione
from sklearn.metrics import classification_report, accuracy_score


In [37]:
# Vettorizza il testo usando CountVectorizer
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(merged_df["text"])
bow_dataset = pd.DataFrame(bow_model.toarray(), columns=vectorizer.get_feature_names_out())
bow_dataset["parent_asin"] = merged_df["parent_asin"].values
bow_dataset["sentiment"] = merged_df["sentiment"].values

In [38]:
bow_dataset["sentiment"]

0        2
1        2
2        2
3        2
4        2
        ..
13630    0
13631    0
13632    0
13633    0
13634    0
Name: sentiment, Length: 13635, dtype: int64

In [39]:
merged_df

Unnamed: 0,parent_asin,sentiment,text
68813,B003Q9RGRW,2,cool thing came handy needed br br used thing ...
59114,B00D5SZ04K,2,waiting game played series th game like lose l...
87003,B00KUZEFBK,2,collector edition kid pleased got package got ...
66117,B017W175Y8,2,five star love buy
21110,B005OGKYVK,2,five star love love love
...,...,...,...
446,B01GY35HKE,0,okay game getting play whole thing disk strict...
8783,B015XC3B46,0,one star second one buy constantly break time ...
84104,B08B6PLXHC,0,crappy collector edition buy game forego colle...
40587,B07BDJHFQD,0,switch plan take case hard put even harder tak...


In [40]:
X = bow_dataset.drop(columns=["parent_asin", "sentiment"])  # Features
y = bow_dataset["sentiment"]  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=2000)  # Esempio di modello di regressione logistica
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Valutazione del modello
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67       623
           1       0.57      0.47      0.52       662
           2       0.82      0.86      0.84      1442

    accuracy                           0.73      2727
   macro avg       0.68      0.68      0.68      2727
weighted avg       0.72      0.73      0.72      2727

Accuracy: 0.7282728272827282


In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(bow_dataset.drop(columns=["parent_asin", "sentiment"]),
                                                    bow_dataset["sentiment"],
                                                    test_size=0.3,
                                                    random_state=42)
neigh = KNeighborsClassifier(n_neighbors=30)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
bow_dataset

## EMBEDDING WITH TRASFOMERS

In [26]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [42]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [43]:
input_texts = "classification: " + merged_df["text"]
input_texts = input_texts.tolist()
input_texts[:5]

['classification: five star limb rubber easily pose ever need',
 'classification: bad shooter hang call duty battlefield game whould like weapon game dame good story line game remembering miltary people try',
 'classification: good legend many annoyance game take away fun factor incredibly frustratingly stupid camera lara getting stuck terrain invisible wall height change ground hour become quite tired playing contrast legend none issue solid gameplay underworld feel fighting engine every step let need proceed game none fun legend overall good game sad see gameplay take step backwards',
 'classification: best 3d mario game since super mario 64 best 3d mario game since super mario actually 2d super mario fan love',
 'classification: one best lego game date one great fun xbox one quite frankly fewer bug lego game date one polish lego marvel lot fun keep lego gamer busy time']

In [44]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True, safe_serialization=True)

<All keys matched successfully>


In [45]:
batch_dict = tokenizer(input_texts, padding=True, truncation=True, return_tensors='pt')
print("input_ids:", batch_dict["input_ids"].shape)
print("attention_mask:", batch_dict["attention_mask"].shape)

input_ids: torch.Size([682, 512])
attention_mask: torch.Size([682, 512])


In [41]:
merged_df = merged_df.sample(frac=0.05)

In [46]:
batch_size = 100
n_instance = batch_dict["input_ids"].shape[0]
n_batch = n_instance // batch_size + 1

embeddings = torch.empty((0, 768))
for i in range(n_batch):
  start = i * batch_size
  end = (i + 1) * batch_size
  print(f"{start} -> {end}")
  with torch.no_grad():
      model_output = model(input_ids=batch_dict["input_ids"][start:end],
                           token_type_ids=batch_dict["token_type_ids"][start:end],
                           attention_mask=batch_dict["attention_mask"][start:end])
  output_pooled = mean_pooling(model_output, batch_dict['attention_mask'][start:end])
  embeddings = torch.cat([embeddings, output_pooled])

0 -> 100
100 -> 200
200 -> 300
300 -> 400
400 -> 500
500 -> 600
600 -> 700


## PREDIZIONE DEL SENTIMENT

In [48]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, y_train, y_test = train_test_split(embeddings,
                                                    merged_df["sentiment"],
                                                    test_size=0.2,
                                                    random_state=42)
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

y_pred = neigh.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.72      0.68        32
           1       0.64      0.51      0.57        35
           2       0.86      0.90      0.88        70

    accuracy                           0.76       137
   macro avg       0.71      0.71      0.71       137
weighted avg       0.75      0.76      0.75       137

Accuracy: 0.7591240875912408
