## Upload dataset and cleaning

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
dataset = pd.read_csv("/users/antoniozappia/Desktop/Reviews.csv")

In [3]:
len(dataset)

568454

In [4]:
dataset.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [5]:
dataset['ProfileName'].fillna('', inplace=True)
dataset['Summary'].fillna('', inplace=True)
dataset.set_index('Id', inplace=True)
#dataset['Datetime'] = dataset.Time.apply(lambda value: pd.to_datetime(value, unit='s'))

In [6]:
dataset.isnull().sum()

ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [7]:
dataset

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...
568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


##  Pre-process Dataset

In [8]:
## We represent th data with TF-IDF
## How do you pre-process data? Since you aim to characterize products by their review, do you want to consider words that appear in too many or too few documents?
## Svd method

### Represent Data with TF-IDF

Our goal in this part of Homework è definire da zero un algoritmo KMeans Clustering considerando come dataset iniziale il dataset proposto Amazon Food. Per fare questo è necessario preparare il dataset al Clustering. La preparazione è fortemente influenzata dalle nostre scelte, da quali variabili considerare nel Clustering. Scegliamo come colonne del dataset l'ID e il Text. Scegliendo queste due colonne è necessario rappresentare i dati (Text column) with TF-IDF per creare una matrice dove per ogni parola riporto il tf-idf associato a ogni documento. 

In [20]:
new_dataset = dataset.copy()

In [21]:
new_dataset = new_dataset.reset_index()

In [22]:
new_dataset = new_dataset.drop(["Id","UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Score", "Summary", "Time"], axis =1)

In [23]:
new_dataset

Unnamed: 0,ProductId,Text
0,B001E4KFG0,I have bought several of the Vitality canned d...
1,B00813GRG4,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,This is a confection that has been around a fe...
3,B000UA0QIQ,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,Great taffy at a great price. There was a wid...
...,...,...
568449,B001EO7N10,Great for sesame chicken..this is a good if no...
568450,B003S1WTCU,I'm disappointed with the flavor. The chocolat...
568451,B004I613EE,"These stars are small, so you can give 10-15 o..."
568452,B004I613EE,These are the BEST treats for training and rew...


In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [25]:
lemmatizer = WordNetLemmatizer()

In [26]:
stop_words = set(stopwords.words("english"))

In [27]:
def clean_text(text):
    words = pos_tag(word_tokenize(text)) 
    possible_tag = ["J", "R", "V", "N"]
    filtered_words = []
    for word,pos in words:
        if word.lower() not in stop_words and word.isalpha() and pos[0] in possible_tag:
            filtered_words.append(lemmatizer.lemmatize(word.lower(),"v"))
    return ' '.join(filtered_words)

In [29]:
new_dataset["Text"] = new_dataset["Text"].apply(lambda text: (clean_text(text)))

In [30]:
new_dataset.head()

Unnamed: 0,ProductId,Text
0,B001E4KFG0,buy several vitality can dog food products fin...
1,B00813GRG4,product arrive label jumbo salt peanuts peanut...
2,B000LQOCH0,confection centuries light pillowy citrus gela...
3,B000UA0QIQ,look secret ingredient robitussin believe find...
4,B006K2ZZ7K,great taffy great price wide assortment yummy ...


In [31]:
new_dataset['Text'].fillna('', inplace=True)
for i in range(len(new_dataset)):
    if new_dataset.loc[i,"Text"] == "" :
        new_dataset.drop(i,inplace=True)

In [None]:
with open("New_database.csv", "w") as text_file:
    text_file.write(new_dataset.to_csv(index=False))

## Tf-Idf Vectorizer

In [3]:
new_dataset = pd.read_csv("New_database.csv")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidf_vectorizer=TfidfVectorizer(max_features=30000, stop_words='english',use_idf=True) 
tfidf_new=tfidf_vectorizer.fit_transform(new_dataset["Text"])

In [6]:
print(tfidf_new.shape)

(568452, 30000)


## SVD Method

In [9]:
from sklearn.decomposition import TruncatedSVD
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

### Trade OFF Number Components - Variance

In [10]:
def best_compenents (tf_idf,n_components_initial,n_components_final,goal):
    variance_init = 0
    components = 0
    for k in tqdm(range(n_components_initial, n_components_final)):
        components += 1
        svd = TruncatedSVD(n_components=k)
        svd.fit(tf_idf)
        variance = float(np.cumsum(np.round(svd.explained_variance_ratio_, decimals=3)*100)[-1:])
        print(variance, components)
        if variance > variance_init:
            variance_init = variance
            if variance_init >= goal:
                break
    return components

###

In [None]:
best_compenents(tfidf_new,500,700,60)

  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def best_compenents2 (tf_idf,n_components,step):
    lista = []
    for k in range (1, n_components,step):
        lista.append(k)
    dictionary = dict.fromkeys(lista, 0)
    for k in tqdm(range (1, n_components,step)):
        svd = TruncatedSVD(n_components=k)
        svd.fit(tf_idf)
        variance = float(np.cumsum(np.round(svd.explained_variance_ratio_, decimals=3)*100)[-1:])
        dictionary[k] = variance
        del svd
    
    return dictionary
    
def plot_components (dictionary):
    fig_dims = (12, 8)
    fig, ax = plt.subplots(figsize=fig_dims)
    dataset = pd.DataFrame.from_dict(dictionary, orient='index').reset_index()
    dataset.columns = ['Number of Components', 'Variance']
    ax = sns.barplot(data = dataset, x = "Number of Components", y= "Variance",palette="ch:.25")
    plt.ylim(0,100)
    sns.set_style("whitegrid")
    plt.title('Trade-Off Number of Components/Variacne')
    ax.set(xlabel='Number of components', ylabel='Variance')
    plt.show()
    

In [None]:
match = best_compenents2 (tfidf_new,1200,100)

In [None]:
match

In [None]:
plot_components(match)

In [19]:
svd = TruncatedSVD(n_components=K)
new_matrix = svd.fit(tfidf_new)

KeyboardInterrupt: 

## Analysis Clustering