# Progetto Intermedio - Parte 2

# Indice

- [Inizializzazione del notebook](#inizial)
  - [Installazioni](#instal)
  - [Import](#import)
  - [Utils](#utils)
- [Importazione ed elaborazione dei dati](#dati)
  - [Lettura dati](#lett_dati)
- [Sistema di raccomandazione](#sis)
  - [TFIDF lemmatizing](#tfidf_lemm)
  - [KNN TFIDF lemmatizing](#knn_tfidf_lemm)
  - [TFIDF stemming](#tfidf_stem)
  - [KNN TFIDF stemming](#knn_tfidf_stem)
  - [Transformers](#transformers)


# Inizializzazione del notebook <a class="anchor"  id="inizial"></a>


## Installazioni <a class="anchor"  id="instal"></a>

In [1]:
!pip install datasets



In [2]:
!pip install nltk
import nltk
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('punkt')

!pip install sentence-transformers

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

## Import <a class="anchor"  id="import"></a>

In [4]:
from datasets import load_dataset

from surprise.model_selection import train_test_split

from bokeh.io import output_notebook
output_notebook()

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sentence_transformers import SentenceTransformer

2024-06-13 07:59:25.883241: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 07:59:25.883383: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 07:59:26.066949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# English stop words from NLTK, stored in a set; text_processing() tests tokens against it.
stop_words = set(stopwords.words("english"))
# Stemmer and lemmatizer instances used by text_processing().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [6]:
# Extra tokens to discard on top of the NLTK stop words: punctuation, HTML leftovers
# such as 'br' and '&#34', and tokenizer fragments such as "'s", "n't", "''" and '``'.
# The backtick used to be written '\`', an invalid escape sequence that Python reads as
# the two characters backslash + backtick, so a plain backtick token was never matched.
# That two-character entry is kept (spelled '\\`') and the plain backtick is added.
stop_words.update([',','.','!','?',';','-','...','[',']','{','}','(',')','&',':', '--','<','>','^','#','\\','/','\'','\"','br', '<br />', '\'s', '&#34', '34', 'n\'t', '\\`', '`', "''", '``'])

## Utils <a class="anchor"  id="utils"></a>

In [7]:
def text_processing(sentences, method="lemmatize"):
    """Tokenize each sentence, drop stop words and normalise the remaining tokens.

    Parameters
    ----------
    sentences : iterable of str
        Texts to process.
    method : str
        "lemmatize" applies the module-level ``lemmatizer``; any other value
        applies the module-level ``stemmer``.

    Returns
    -------
    list of list of str
        One list of normalised tokens per input sentence, in input order.
    """
    normalise = lemmatizer.lemmatize if method == "lemmatize" else stemmer.stem
    return [
        [
            normalise(token)
            for token in word_tokenize(sentence)
            # Stop-word matching is case-insensitive; the token itself keeps its case.
            if token.casefold() not in stop_words
        ]
        for sentence in sentences
    ]

In [8]:
def cut_dataset(reviews, min_n_reviews_user, min_n_reviews_item):
    """Keep only the reviews of sufficiently active users on sufficiently reviewed items.

    Both thresholds are strict: a user (item) is kept only when it has MORE than
    ``min_n_reviews_user`` (``min_n_reviews_item``) reviews in the input frame.
    The counts before and after thresholding are printed.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain ``user_id`` and ``parent_asin`` columns.
    min_n_reviews_user : int
        Strict lower bound on the number of reviews per user.
    min_n_reviews_item : int
        Strict lower bound on the number of reviews per item.

    Returns
    -------
    tuple
        ``(filtered_reviews, user_ids, item_ids)`` where the ids are the unique
        values still present in the filtered frame.
    """
    # Reviews per user.
    per_user = reviews.groupby("user_id").count()[["parent_asin"]].reset_index()
    active_users = per_user["parent_asin"] > min_n_reviews_user
    print("numero di utenti: ", int((per_user["parent_asin"] > 0).sum()))
    print("numero di utenti con un numero di recensioni > ", min_n_reviews_user, " : ", int(active_users.sum()))
    kept_users = per_user.loc[active_users, "user_id"].unique().tolist()

    # Reviews per item.
    per_item = reviews.groupby("parent_asin").count()[["user_id"]].reset_index()
    popular_items = per_item["user_id"] > min_n_reviews_item
    print("numero di item: ", int((per_item["user_id"] > 0).sum()))
    print("numero di item con un numero di recensioni > ", min_n_reviews_item, " : ", int(popular_items.sum()))
    kept_items = per_item.loc[popular_items, "parent_asin"].tolist()

    # A review survives only if both its user and its item passed the threshold.
    keep_row = reviews["user_id"].isin(kept_users) & reviews["parent_asin"].isin(kept_items)
    filtered = reviews[keep_row].reset_index(drop=True)

    # The user and item ids are recomputed from the filtered frame: a user may pass the
    # user threshold but have all of their reviews on items that were dropped (and vice
    # versa), so the final id lists can be shorter than the thresholded ones.
    return filtered, filtered["user_id"].unique(), filtered["parent_asin"].unique()

In [9]:
def add_title(df):
    """Prefix every row's ``text`` with its ``title`` and a single space.

    The ``text`` column of ``df`` is overwritten in place; the same DataFrame
    object is returned.
    """
    titled_text = df["title"] + " " + df["text"]
    df["text"] = titled_text
    return df

In [10]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed as the ``tokenizer`` of ``TfidfVectorizer`` in create_TFIDF, whose
    input is already a list of tokens per document.
    """
    return text

In [11]:
def create_TFIDF(tokenized_sentences):
    """Compute a dense TF-IDF matrix from documents that are already tokenized.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        One token list per document.

    Returns
    -------
    tuple
        ``(matrix, vocabulary)``: the dense TF-IDF array (one row per document)
        and the feature names returned by ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=identity_tokenizer,
        # The input is pre-tokenized, so the default regex token pattern is unused;
        # setting it to None avoids scikit-learn's "token_pattern will not be used" warning.
        token_pattern=None,
        # Tokens keep the case they were given.
        lowercase=False,
    )
    tfidf_matrix = vectorizer.fit_transform(tokenized_sentences)
    return tfidf_matrix.toarray(), vectorizer.get_feature_names_out()

In [12]:
def create_tfidf_data(vocab, tfidf_descriptions, items_info):
    """Build a DataFrame with one TF-IDF column per vocabulary term plus ``parent_asin``.

    The frame is created directly from the TF-IDF matrix instead of being filled
    one row at a time through ``.loc``.

    Parameters
    ----------
    vocab : sequence of str
        Column names, one per column of ``tfidf_descriptions``.
    tfidf_descriptions : 2-D array-like of float
        TF-IDF weights, one row per item. It must have exactly
        ``len(items_info)`` rows.
    items_info : object supporting ``len()`` and ``["parent_asin"]``
        Source of the item ids, in the same order as the rows of
        ``tfidf_descriptions``.

    Returns
    -------
    pandas.DataFrame
        Float feature columns named after ``vocab`` followed by a ``parent_asin``
        column, indexed ``0 .. len(items_info) - 1``.

    Raises
    ------
    ValueError
        If the shape of ``tfidf_descriptions`` does not match
        ``len(items_info)`` rows by ``len(vocab)`` columns.
    """
    data = pd.DataFrame(
        np.asarray(tfidf_descriptions, dtype=float),
        index=range(len(items_info)),
        columns=list(vocab),
        # Copy so the returned frame is independent of the input matrix.
        copy=True,
    )
    data["parent_asin"] = items_info["parent_asin"]
    return data

In [13]:
def split_and_test(data, rating_col_name, test_size = 0.20, random_state = 0,n_neighbors = 30, metric = "cosine"):
    """Fit and score one k-NN rating regressor per user, then print the average error.

    The function reads two notebook-level variables that must exist before it is
    called: ``users_id`` (the users to iterate over) and ``reviews`` (a DataFrame
    with ``user_id``, ``parent_asin`` and ``rating`` columns).

    For each user, the rows of ``data`` for the items that user reviewed are merged
    with the user's ratings and split into train and test parts. A
    ``KNeighborsRegressor`` fitted on the train part is scored on the test part
    with the mean squared error. A progress counter is printed every 100 users.

    Parameters
    ----------
    data : pandas.DataFrame
        Item features; must contain a ``parent_asin`` column. Every other column
        is used as a feature.
    rating_col_name : str
        Name of the rating column in the merged per-user frame. This is "rating"
        unless ``data`` itself has a column called "rating", in which case
        ``pandas.merge`` renames the user's rating to "rating_y".
    test_size : float
        Fraction of each user's rated items held out for testing.
    random_state : int
        Seed passed to ``train_test_split``.
    n_neighbors : int
        Upper bound on k; the number of training rows is used when it is smaller.
    metric : str
        Distance metric passed to ``KNeighborsRegressor``.

    Returns
    -------
    None
        The average MSE and RMSE over the evaluated users are printed.

    Raises
    ------
    KeyError
        If ``rating_col_name`` is not a column of the merged per-user frame.
    """
    mse_users = []
    skipped_users = 0
    for i, user_id in enumerate(users_id, start=1):
        if i % 100 == 0:
            print(i)

        # Items rated by this user, and their feature rows.
        user_ratings = reviews[reviews['user_id'] == user_id][["parent_asin", "rating", "user_id"]]
        rated_items = data[data['parent_asin'].isin(user_ratings['parent_asin'])]
        if len(rated_items) == 0:
            skipped_users += 1
            continue

        # Per-user dataset: item features plus the user's rating.
        dataset = pd.merge(rated_items, user_ratings, on="parent_asin")
        dataset = dataset.drop(columns=["parent_asin", "user_id"])
        if rating_col_name not in dataset.columns:
            # A wrong column name would fail identically for every user, so it is
            # reported instead of being skipped silently.
            raise KeyError(
                f"rating column {rating_col_name!r} not found after merging; "
                "check whether the feature frame already has a 'rating' column"
            )

        try:
            # train_size is left unset: scikit-learn then uses the complement of test_size.
            X_train, X_test, y_train, y_test = train_test_split(
                dataset.drop(columns=[rating_col_name]),
                dataset[rating_col_name],
                test_size=test_size,
                random_state=random_state,
            )
            # Train k-NN; k cannot exceed the number of training rows.
            neigh_reg = KNeighborsRegressor(n_neighbors=min(n_neighbors, len(X_train)),
                                            metric=metric, n_jobs = -1)
            neigh_reg.fit(X_train, y_train)
            # Test k-NN
            y_pred = neigh_reg.predict(X_test)
            mse_users.append(mean_squared_error(y_test, y_pred))
        except ValueError:
            # Raised for instance when the user has too few rated items to form both
            # a train and a test part. Only ValueError is caught, so KeyboardInterrupt
            # and unrelated errors are no longer swallowed.
            skipped_users += 1
            continue

    if skipped_users:
        print(f"Skipped users (no rated item in data, or not enough to split/fit): {skipped_users}")
    if not mse_users:
        # Avoid np.mean([]) which returns nan with a RuntimeWarning.
        print("No user could be evaluated.")
        return

    print(f"Average MSE over users: {np.mean(mse_users):.2f}")
    print(f"Average RMSE over users: {np.sqrt(np.mean(mse_users)):.2f}")

# Importazione ed elaborazione dei dati <a class="anchor"  id="dati"></a>

## Lettura dati <a class="anchor"  id="lett_dati"></a>

In [14]:
# Reviews of the "Software" category, converted from the "full" split to a pandas DataFrame.
# NOTE(review): trust_remote_code=True lets `datasets` run the builder script downloaded
# with the dataset — confirm the source is trusted.
reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Software", trust_remote_code=True)
reviews = reviews["full"].to_pandas()
# Item metadata of the same category; this one is NOT converted to pandas.
items_info = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Software", split="full", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/256M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [15]:
# Keep users with more than 30 reviews and items with more than 60 reviews (both
# comparisons in cut_dataset are strict). `reviews` is overwritten with the filtered
# frame, so re-running this cell filters the already filtered data again.
reviews, users_id, items_id = cut_dataset(reviews, 30, 60)

numero di utenti:  2589466
numero di utenti con un numero di recensioni >  30  :  3027
numero di item:  89246
numero di item con un numero di recensioni >  60  :  7007


In [16]:
print(len(users_id))
print(len(items_id))

3025
6059


In [17]:
# Remove from items_info every row (item) whose parent_asin is not in items_id.
# items_id is the array returned by cut_dataset; testing `in` against it scans the whole
# array for every row, so the ids are put in a set for constant-time lookups.
kept_item_ids = set(items_id)
items_info = items_info.filter(lambda row: row["parent_asin"] in kept_item_ids)

Filter:   0%|          | 0/89251 [00:00<?, ? examples/s]

In [18]:
len(items_info)

6059

In [19]:
descriptions = items_info["description"]

In [20]:
# An item's description is a list of text fragments: merge them into one string, each
# fragment followed by a space (an empty list becomes '').
# The list is rebuilt from items_info on every run. The previous in-place loop, when the
# cell was executed twice, treated the flattened strings as sequences and inserted a
# space after every character.
descriptions = [
    "".join(fragment + " " for fragment in description)
    for description in items_info["description"]
]

In [21]:
print(descriptions[:5])

["Just Escape, whether it's a medieval castle or an abandoned space station you'll need to solve puzzles and find clues to unlock the door and Just Escape. ", 'Tax Software that helps you\xa0get your taxes done right and your maximum refund Tax Software that helps you\xa0get your taxes done right and your maximum refund We’ll search more than 350 deductions and credits (1040, Schedule A) to help make tax preparation easy We’ll search more than 350 deductions and credits (1040, Schedule A) to help make tax preparation easy Get expert answers to your questions by phone (fees may apply) by Intuit Get expert answers to your questions by phone (fees may apply) by Intuit Income Tax Software that accurately deduct mortgage interest and property taxes Income Tax Software that accurately deduct mortgage interest and property taxes Includes one TurboTax State product download (State efile not included) Includes one TurboTax State product download (State efile not included) ', 'Frozen Characters 

In [22]:
# Prepend each item's title to its description.
# The title column is read once instead of fetching a whole row with items_info[i]
# for every item.
# NOTE: re-running this cell prepends the title again.
titles = items_info["title"]
descriptions = [title + " " + description for title, description in zip(titles, descriptions)]

In [23]:
descriptions[:5]

["Just Escape Just Escape, whether it's a medieval castle or an abandoned space station you'll need to solve puzzles and find clues to unlock the door and Just Escape. ",
 'TurboTax Deluxe 2014 Fed + State + Fed Efile Tax Software - Win [Download] OLD VERSION Tax Software that helps you\xa0get your taxes done right and your maximum refund Tax Software that helps you\xa0get your taxes done right and your maximum refund We’ll search more than 350 deductions and credits (1040, Schedule A) to help make tax preparation easy We’ll search more than 350 deductions and credits (1040, Schedule A) to help make tax preparation easy Get expert answers to your questions by phone (fees may apply) by Intuit Get expert answers to your questions by phone (fees may apply) by Intuit Income Tax Software that accurately deduct mortgage interest and property taxes Income Tax Software that accurately deduct mortgage interest and property taxes Includes one TurboTax State product download (State efile not incl

# Sistema di raccomandazione <a class="anchor"  id="sis"></a>

## TFIDF lemmatizing <a class="anchor"  id="tfidf_lemm"></a>

In [24]:
descriptions_tokens = text_processing(descriptions,"lemmatize")

In [25]:
tfidf_lemmatized_descriptions, tfidf_lemmatized_vocab = create_TFIDF(descriptions_tokens)



## KNN TFIDF lemmatizing <a class="anchor"  id="knn_tfidf_lemm"></a>

In [26]:
tfidf_lemmatized_data = create_tfidf_data(tfidf_lemmatized_vocab, tfidf_lemmatized_descriptions, items_info)

0
1000
2000
3000
4000
5000
6000


In [27]:
tfidf_lemmatized_data

Unnamed: 0,$,%,''Who,'+,'-,'.GIMP,'.LibreOffice,'00,'06,'08,...,������Note,����������������������������������������������,🎎,🐾,📈,📊,📋,🔭🌘,🖇,parent_asin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00K7BMELK
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00NG7JVSQ
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00JVOJ5T8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B06XXH983G
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B008K6IB5C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B002ABOYXG
6055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B005ZKC4FO
6056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B077HV61JX
6057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B07H4PVKFL


In [28]:
# NOTE(review): the rating column is passed as "rating_y" here, whereas the stemming and
# transformer runs pass "rating". pandas.merge appends "_x"/"_y" to column names present
# in both frames, so presumably the lemmatized vocabulary contains a "rating" feature
# column — confirm.
split_and_test(tfidf_lemmatized_data, "rating_y", test_size = 0.20, random_state = 0, n_neighbors = 10, metric = "cosine")

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Average MSE over users: 1.26
Average RMSE over users: 1.12


## TFIDF stemming <a class="anchor"  id="tfidf_stem"></a>

In [29]:
descriptions_tokens = text_processing(descriptions,"stem") 

In [30]:
tfidf_stemmed_descriptions, tfidf_stemmed_vocab = create_TFIDF(descriptions_tokens)



## KNN TFIDF stemming <a class="anchor"  id="knn_tfidf_stem"></a>

In [31]:
tfidf_stemmed_data = create_tfidf_data(tfidf_stemmed_vocab, tfidf_stemmed_descriptions, items_info)

0
1000
2000
3000
4000
5000
6000


In [32]:
tfidf_stemmed_data

Unnamed: 0,$,%,''who,'+,'-,'.gimp,'.libreoffic,'00,'06,'08,...,������note,����������������������������������������������,🎎,🐾,📈,📊,📋,🔭🌘,🖇,parent_asin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00K7BMELK
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00NG7JVSQ
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B00JVOJ5T8
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B06XXH983G
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B008K6IB5C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B002ABOYXG
6055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B005ZKC4FO
6056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B077HV61JX
6057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B07H4PVKFL


In [33]:
split_and_test(tfidf_stemmed_data, "rating", test_size = 0.20, random_state = 0, n_neighbors = 10, metric = "cosine")

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Average MSE over users: 1.26
Average RMSE over users: 1.12


## Transformers <a class="anchor"  id="transformers"></a>

In [34]:
model = SentenceTransformer("all-mpnet-base-v2", trust_remote_code=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [35]:
descriptions_embeddings = model.encode(descriptions)

Batches:   0%|          | 0/190 [00:00<?, ?it/s]

In [36]:
transformer_data = pd.DataFrame(descriptions_embeddings)
transformer_data["parent_asin"] = items_info["parent_asin"]
transformer_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,parent_asin
0,0.040341,0.003383,-0.000643,0.032872,-0.055705,0.003810,-0.067916,-0.002483,0.001360,0.026387,...,-0.052795,0.026210,0.038070,0.053189,0.004155,0.027578,0.010805,-0.012228,0.027441,B00K7BMELK
1,-0.053503,0.076281,-0.032436,-0.031746,-0.044706,0.028633,0.035499,0.036215,0.005998,0.000930,...,-0.009944,-0.050364,0.052189,0.003987,-0.005688,0.026414,-0.025215,-0.031102,-0.019407,B00NG7JVSQ
2,0.045250,-0.021313,-0.011630,0.049763,0.032782,0.002181,0.059741,-0.034130,0.011219,0.017865,...,0.030342,0.048949,0.025312,0.012506,0.165004,-0.022514,0.020248,0.008357,-0.012365,B00JVOJ5T8
3,-0.009961,0.038883,-0.013869,0.044603,-0.017418,-0.010695,-0.025698,0.040974,0.012944,0.022994,...,0.030675,-0.005606,0.058243,0.021328,0.042942,0.013901,0.007631,-0.028027,-0.011741,B06XXH983G
4,0.023801,0.027598,-0.004035,0.013692,0.006188,0.007105,0.023307,0.012472,-0.021750,0.026763,...,0.016769,-0.003526,-0.023551,0.002363,0.022564,0.016859,0.011603,-0.020284,-0.023535,B008K6IB5C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6054,-0.022524,-0.035725,-0.000186,0.024922,-0.056619,0.007904,0.061026,0.021636,-0.082136,-0.017370,...,0.057404,0.041782,0.009636,0.041862,-0.048541,-0.007210,0.000780,-0.048705,-0.034009,B002ABOYXG
6055,-0.018085,0.020532,-0.014997,0.005117,-0.050530,0.037537,0.028906,0.018427,0.076708,-0.015832,...,0.002027,0.068017,-0.009821,0.010703,0.010819,-0.006718,0.009925,0.020336,-0.016233,B005ZKC4FO
6056,-0.029699,0.070605,-0.013625,-0.039177,0.014352,0.012115,0.079701,-0.026488,0.031717,0.023494,...,0.037632,0.001447,-0.027095,-0.026205,0.052085,0.004683,0.004958,0.014593,-0.031256,B077HV61JX
6057,-0.022301,0.052907,-0.020814,-0.062655,-0.045985,0.008763,0.021693,0.059787,0.052451,-0.014686,...,-0.011488,-0.015151,0.062959,-0.015746,0.032045,-0.021656,-0.013567,-0.045351,-0.002954,B07H4PVKFL


In [37]:
split_and_test(transformer_data, "rating", test_size = 0.20, random_state = 0, n_neighbors = 10, metric = "cosine")

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Average MSE over users: 1.26
Average RMSE over users: 1.12
