In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
import numpy
import scipy

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Download WordNet and other required corpora
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('pre_processed.csv')
df = df[:20000]
df.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,...,Hotels & Travel,Arts & Entertainment,Burgers,Shopping,Cafes,Pizza,Cocktail Bars,Italian,Mexican,Event Planning & Services
0,0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,...,0,0,0,0,0,0,0,0,0,0
1,1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18,...,0,0,0,0,0,0,0,0,0,0
2,2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,...,0,0,0,0,0,0,0,0,0,0
3,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,...,0,0,0,0,0,0,0,0,0,0
4,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def normalize_text(text):

    text = text.lower()

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalpha()]


    return ' '.join(filtered_tokens)


In [5]:
df['text_normalize'] = df['text'].apply(lambda x : normalize_text(x))
df[['text', 'text_normalize']]

Unnamed: 0,text,text_normalize
0,"If you decide to eat here, just be aware it is...",decide eat aware going take hours beginning en...
1,I've taken a lot of spin classes over the year...,taken lot spin classes years nothing compares ...
2,Family diner. Had the buffet. Eclectic assortm...,family diner buffet eclectic assortment large ...
3,"Wow! Yummy, different, delicious. Our favo...",wow yummy different delicious favorite lamb cu...
4,Cute interior and owner (?) gave us tour of up...,cute interior owner gave us tour upcoming area...
...,...,...
19995,Manager is a complete asshole. If you have a ...,manager complete asshole brain value money sta...
19996,"If you're looking for Bingo around St Louis, t...",looking bingo around st louis place large buil...
19997,Unfortunately we had a bad experience here...w...,unfortunately bad experience actually ate outs...
19998,Meh. This pizza was basically a deep-dish grea...,meh pizza basically grease pie style wings ok ...


In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text_normalize'])


In [7]:
embeddings.shape

(20000, 384)

In [8]:
df = df
X, y = embeddings, df['stars_x']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [9]:
#vectorizer = TfidfVectorizer()
#X_train_transformed = vectorizer.fit_transform(X_train)
#X_test_transformed = vectorizer.transform(X_test)

In [10]:
clustering = AgglomerativeClustering(n_clusters=5, linkage='ward')
labels = clustering.fit_predict(X_train)

In [11]:
from sklearn.metrics import (rand_score, adjusted_mutual_info_score,
                             homogeneity_score, completeness_score,
                             v_measure_score, fowlkes_mallows_score)


def score_clustering(true_labels, predicted_labels):
  d = {
      "Rand index": rand_score(true_labels, predicted_labels),
      "Adjusted Mutual Info": adjusted_mutual_info_score(true_labels, predicted_labels),
      "Homogeneity": homogeneity_score(true_labels, predicted_labels),
      "Completeness": completeness_score(true_labels, predicted_labels),
      "V measure": v_measure_score(true_labels, predicted_labels),
      "Fowlkes Mallows": fowlkes_mallows_score(true_labels, predicted_labels)
}
  for k, v in d.items():
    print(f"{k:21s}:{v}")

In [14]:
#y_pred = clustering.predict(X_test)
score_clustering(labels, y_train)

Rand index           :0.6130090094895215
Adjusted Mutual Info :0.049760598919980976
Homogeneity          :0.049152202692392996
Completeness         :0.05112156894531099
V measure            :0.050117546746556274
Fowlkes Mallows      :0.3079437522997316


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
linkage_matrix = linkage(X, method='ward', metric='euclidean')

# Crea il dendrogramma
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix, labels=df['stars_x'], leaf_rotation=90, leaf_font_size=10)
plt.title('Dendrogramma Gerarchico')
plt.xlabel('Indice del campione')
plt.ylabel('Distanza')
plt.show()

ValueError: could not convert string to float: "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker."