# Content-Based Approach

### Pre-Processing

In [None]:
from datasets import load_dataset
import numpy as np
import pandas as pd

In [None]:
# Load the "full" split of the "raw_meta_Video_Games" configuration
# (product metadata) from the Amazon-Reviews-2023 dataset repository.
dataset_meta = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Video_Games",
    split="full",
    trust_remote_code=True,
)

In [None]:
# Materialize the metadata as a DataFrame and keep only the columns
# used by the content-based pipeline.
temp_df_meta = pd.DataFrame(dataset_meta)
df_meta = temp_df_meta.loc[:, ['title', 'description', 'parent_asin', 'rating_number']]

In [None]:
# Step 1: keep products that received more than 10 ratings.
df_meta_filtered = df_meta.loc[df_meta['rating_number'] > 10]

# Step 2: keep products whose description has a length above 15.
# The filters are applied one after the other on purpose: the length check
# only ever sees rows that passed the rating filter.
# NOTE(review): `description` is only converted to a string in a later cell;
# if it holds a list of paragraphs here, `len(x)` counts list elements rather
# than characters -- confirm which one is intended.
long_enough = df_meta_filtered['description'].apply(lambda x: len(x) > 15)
df_meta_filtered = df_meta_filtered.loc[long_enough].reset_index(drop=True)

# Report how many products survive the filtering, then display the result.
print(f"Numero totale di prodotti prima dell'applicazione dei filtri: {len(df_meta):>10}")
print(f"Numero totale di prodotti dopo l'applicazione dei filtri: {len(df_meta_filtered):>11}")
df_meta_filtered

In [None]:
# Minimum number of (de-duplicated, verified) reviews a user must have written
# to be kept.
min_reviews_per_user = 30

# `df` is expected to be the reviews DataFrame defined outside this section.
# Clean it once: drop exact duplicate rows and keep verified purchases only.
df_filtring_meta = df.drop_duplicates()
df_filtring_meta = df_filtring_meta[df_filtring_meta['verified_purchase'] == True]

# Users with at least `min_reviews_per_user` cleaned reviews.
user_review_counts = df_filtring_meta['user_id'].value_counts()
users_with_min_reviews = user_review_counts[user_review_counts >= min_reviews_per_user].index

# Number of distinct selected users per product, counted over all of their
# reviews (verified or not), exactly as before.
item_review_counts = (
    df[df['user_id'].isin(users_with_min_reviews)]
    .groupby('parent_asin')['user_id']
    .nunique()
)

# Select the reviews from the cleaned frame. Previously the selection was made
# from the raw `df`, which re-introduced the duplicate rows that had just been
# dropped for the per-user counts.
filtered_df_meta_avan = df_filtring_meta[df_filtring_meta['user_id'].isin(users_with_min_reviews)]

num_products = filtered_df_meta_avan['parent_asin'].nunique()
num_users = filtered_df_meta_avan['user_id'].nunique()
num_reviews = len(filtered_df_meta_avan)

print(f'Numero di prodotti: {num_products}')
print(f'Numero di utenti: {num_users}')
print(f'Numero di recensioni totali: {num_reviews}')


In [None]:
# Keep only the reviews whose product is still present in the filtered metadata.
has_metadata = filtered_df_meta_avan['parent_asin'].isin(df_meta_filtered['parent_asin'])
filtered_df = filtered_df_meta_avan.loc[has_metadata]

In [None]:
# Display the filtered reviews for visual inspection.
filtered_df

In [None]:
# Display the filtered product metadata for visual inspection.
df_meta_filtered

## Processing text column

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the tokenizer, the stop-word filter and the
# WordNet lemmatizer. 'punkt_tab' is the tokenizer resource that
# `word_tokenize` loads on NLTK 3.9+, while older releases load 'punkt';
# requesting both keeps the notebook runnable on either.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
def list_to_str(lst):
    """Convert a description value into plain text.

    Lists and tuples are flattened by joining their elements with a single
    space. Calling ``str()`` on the container instead would embed Python repr
    syntax (brackets, quotes, escape sequences) into the text.

    Args:
        lst: The value to convert; usually a list of strings.

    Returns:
        str: The joined elements for a list/tuple, the value itself for a
        string (so applying the function twice is harmless), an empty string
        for ``None``, and ``str(lst)`` for anything else.
    """
    if lst is None:
        return ''
    if isinstance(lst, str):
        return lst
    if isinstance(lst, (list, tuple)):
        return ' '.join(str(item) for item in lst)
    return str(lst)

# Turn every description into a string so that it can later be concatenated
# with the title.
df_meta_filtered['description'] = df_meta_filtered['description'].apply(list_to_str)

In [None]:
# Shared NLP helpers used by preprocess_text.
lemmatizer = WordNetLemmatizer() # preferred over the stemmer
# English stop words, stored in a set for fast membership tests.
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are stop words are discarded, and the remaining
    tokens are lemmatized.

    Args:
        text: The raw text. Any non-string value (e.g. NaN or None) is
            treated as missing.

    Returns:
        str: The processed tokens joined by single spaces, or an empty
        string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(lemmas)

# NOTE(review): the original note here said that the title, description,
# rating_number, helpful_vote and verified_purchase columns had been removed
# and only the processed ones kept; no column is dropped in this cell, so the
# note looks stale -- confirm.
# Build the processed text from title + description. Rows where the
# concatenation is not a string (e.g. a missing title) become ''.
df_meta_filtered['text'] = (df_meta_filtered['title'] + ' ' + df_meta_filtered["description"]).apply(preprocess_text)
# drop_duplicates() returns a new frame, so the result must be assigned back
# (the bare call was a no-op). The index is reset so that later cells, which
# align freshly built 0..n-1 frames with this one by index, stay consistent.
df_meta_filtered = df_meta_filtered.drop_duplicates().reset_index(drop=True)
df_meta_filtered.sample(1)

## Text Embedding - BoW Model

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import string

In [None]:
# Hand-rolled bag-of-words: one {word: count} mapping per document plus the
# global vocabulary.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
punctuation = set(string.punctuation)

raw_text = df_meta_filtered["text"]
bow_model = []
vocab = set()
for text in raw_text:
    # Tokenize, keep alphanumeric non-stop-word tokens, and lemmatize them.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

    # Count the occurrences of every kept token in this document.
    word_counts = defaultdict(int)
    for word in tokens:
        word_counts[word] += 1

    bow_model.append(word_counts)
    vocab.update(tokens)

vocab = list(vocab)

# Rank the whole vocabulary by its count in the first document (0 if absent).
top_words = sorted(vocab, key=lambda x: bow_model[0].get(x, 0), reverse=True)[:10]
print(f"Numero di parole nel vocabolario: {len(vocab)}")
print(f"Le 10 parole più frequenti nel primo documento: {top_words}")


In [None]:
# Build the dense document-term matrix (one row per document, one column per
# vocabulary word). The counts are written into a NumPy array by position
# instead of assigning row by row through `.loc` with dict views, which is
# slow and depends on pandas accepting dict_keys/dict_values as indexer/value.
column_position = {term: position for position, term in enumerate(vocab)}
bow_counts = np.zeros((len(raw_text), len(vocab)), dtype="int64")
for row, doc_counts in enumerate(bow_model):
    for term, count in doc_counts.items():
        bow_counts[row, column_position[term]] = count

bow_data = pd.DataFrame(bow_counts, index=range(len(raw_text)), columns=list(vocab))
bow_data


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words via scikit-learn: fit the vocabulary on the processed text and
# turn the resulting matrix into a dense DataFrame, one column per term.
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(df_meta_filtered['text'])
feature_names = vectorizer.get_feature_names_out()
bow_dataset = pd.DataFrame(data=bow_model.toarray(), columns=feature_names)
# Attach the product identifier (aligned on the row index).
bow_dataset = bow_dataset.assign(parent_asin=df_meta_filtered["parent_asin"])

In [None]:
# Display the bag-of-words feature table for visual inspection.
bow_dataset

In [None]:
# Build the training table for one hand-picked user: the bag-of-words
# features of the items they rated, joined with their reviews.
user_id = 'AHLK5V5OBWUPTZZMJ2XIKBR4LUHA'
print(f'User: {user_id}')

user_ratings = filtered_df.loc[filtered_df['user_id'] == user_id]
rated_mask = bow_dataset['parent_asin'].isin(user_ratings['parent_asin'])
rated_items = bow_dataset.loc[rated_mask]
print(f'# rated items: {len(rated_items)}')

dataset = rated_items.merge(user_ratings, on="parent_asin")
# Drop identifiers and review-side columns. The `_y` suffix is what the merge
# gives the right-hand (review) column when both frames share a column name.
dataset = dataset.drop(
    columns=["parent_asin", "user_id", "verified_purchase", "title_y", "text_y"]
)
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the user's rated items and predict their ratings with a
# cosine-distance k-NN regressor fitted on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns="rating_y"),
                                                    dataset['rating_y'],
                                                    test_size=0.20,
                                                    random_state=0)
# Cap k at the number of training rows (as the per-user loops do): asking for
# more neighbours than there are training samples makes predict() fail.
neigh_reg = KNeighborsRegressor(n_neighbors=min(10, len(X_train)), metric="cosine")
neigh_reg.fit(X_train, y_train)
y_pred = neigh_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'MSE = {mse:.6f}')
print(f'RMSE = {rmse:.6f}')

In [None]:
# Evaluate the bag-of-words k-NN regressor independently for every user and
# collect one MSE per user.
mse_users = []
# groupby(sort=False) yields each user's reviews in order of first appearance,
# avoiding a full scan of `filtered_df` per user.
for user_id, user_ratings in filtered_df.groupby("user_id", sort=False):
    rated_items = bow_dataset[bow_dataset['parent_asin'].isin(user_ratings['parent_asin'])]
    dataset = pd.merge(rated_items, user_ratings, on="parent_asin")
    dataset = dataset.drop(columns=["parent_asin", "user_id", "verified_purchase", "title_y", "text_y"])
    if len(dataset) < 2:
        # With fewer than two rows there is nothing to split into a
        # non-empty train set and test set.
        continue
    try:
        X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns="rating_y"),
                                                            dataset['rating_y'],
                                                            test_size=0.20,
                                                            random_state=0)
        # k can never exceed the number of training samples.
        neigh_reg = KNeighborsRegressor(n_neighbors=min(20, len(X_train)),
                                        metric="cosine")
        neigh_reg.fit(X_train, y_train)
        y_pred = neigh_reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_users.append(mse)
    except Exception as e:
        # Still best-effort (the user is skipped), but the failure is reported
        # instead of being swallowed by a bare `except`, which would also
        # have caught KeyboardInterrupt.
        print(f'Error for user {user_id}: {e}')
        continue

In [None]:
# Summarize the per-user errors: the mean MSE and the square root of that mean.
mean_mse = np.mean(mse_users)
print(f"Average MSE over users: {mean_mse:.2f}")
print(f"Average RMSE over users: {np.sqrt(mean_mse):.2f}")

## Text Embedding - Transformers Models

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer

In [None]:
# Load the sentence-transformers model used to embed the product texts.
model = SentenceTransformer('sentence-transformers/average_word_embeddings_komninos')

In [None]:
# Encode every processed product text with the loaded model.
embeddings = model.encode(df_meta_filtered["text"])

In [None]:
# Display the product metadata for visual inspection.
df_meta_filtered

In [None]:
# Wrap the embeddings in a DataFrame and attach the product identifier
# (aligned on the row index).
embeddings_dataset = pd.DataFrame(embeddings).assign(
    parent_asin=df_meta_filtered["parent_asin"]
)
embeddings_dataset

In [None]:
# Evaluate the embedding-based k-NN regressor independently for every user
# and collect one MSE per user.
mse_users = []
for user_id in filtered_df["user_id"].unique():
    # Embeddings of the items this user rated, joined with the user's reviews.
    user_ratings = filtered_df.loc[filtered_df['user_id'] == user_id]
    rated_mask = embeddings_dataset['parent_asin'].isin(user_ratings['parent_asin'])
    rated_items = embeddings_dataset.loc[rated_mask]
    dataset_rec = rated_items.merge(user_ratings, on="parent_asin")
    dataset_rec = dataset_rec.drop(columns=["parent_asin", "user_id"])

    # One-hot encode the remaining object columns, drop incomplete rows, and
    # make every column label a string.
    object_columns = dataset_rec.select_dtypes(include=['object']).columns
    dataset_rec = pd.get_dummies(dataset_rec, columns=object_columns).dropna()
    dataset_rec.columns = dataset_rec.columns.astype(str)

    # Nothing to learn from without rows or without the target column.
    if dataset_rec.shape[0] == 0 or 'rating' not in dataset_rec.columns:
        continue

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            dataset_rec.drop(columns="rating"),
            dataset_rec['rating'],
            test_size=0.20,
            random_state=0,
        )
        if len(X_train) < 2:
            continue
        # k can never exceed the number of training samples.
        neighbours = min(40, len(X_train))
        neigh_reg = KNeighborsRegressor(n_neighbors=neighbours, metric="cosine")
        neigh_reg.fit(X_train, y_train)
        y_pred = neigh_reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mse_users.append(mse)
    except Exception as e:
        # Best effort: report the failure and move on to the next user.
        print(f'Error for user {user_id}: {e}')
        continue

if not mse_users:
    print('No MSE values calculated.')
else:
    average_mse = np.mean(mse_users)
    print(f'Average MSE: {average_mse:.6f}')


In [None]:
# Summarize the per-user errors: the mean MSE and the square root of that mean.
mean_mse = np.mean(mse_users)
print(f"Average MSE over users: {mean_mse:.2f}")
print(f"Average RMSE over users: {np.sqrt(mean_mse):.2f}")