In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

import matplotlib.pyplot as plt

import time
from datetime import datetime

import tba3102

In [None]:
# Notebook-wide display configuration (helper from the course module tba3102).
tba3102.set_default_pandas_options(max_columns=11)

# Seed NumPy's global generator from the wall clock (whole seconds), so any
# draw from np.random differs between runs. The row sample taken further down
# passes its own explicit random_state and is therefore unaffected by this seed.
clock_seed = int(round(time.time()))
np.random.seed(clock_seed)

In [None]:
# Read the pre-cleaned reviews; the first CSV column becomes the row index.
cleaned_reviews_csv = '../data/very_cleaned.csv'
df = pd.read_csv(cleaned_reviews_csv, index_col=0)

# Summarise the loaded frame with the course helper.
tba3102.data_quality_report(df)

In [None]:
# The sampling seed is pinned so the same rows (and hence the same clusters)
# are obtained on every run. To draw a fresh seed instead, swap in:
# random_state = np.random.randint(2**31-1)
random_state = 945649140
print(f'random_state: {random_state}')

In [None]:
# Keep a random half of the rows (without replacement), then move the old row
# labels into a regular column so the frame is re-labelled 0..n-1.
# Note: this rebinds `df`, so re-running the cell halves the data again.
df = (
    df
    .sample(frac=0.50, replace=False, random_state=random_state)
    .reset_index()
)
tba3102.data_quality_report(df)

In [None]:
# Timestamp marking the start of the text-processing stage.
print(f'Text processing started at {datetime.now()}')

In [None]:
# TF-IDF features: terms must appear in at least 1% of documents (min_df=0.01);
# rows are L2-normalised and IDF is smoothed.
tv = TfidfVectorizer(
    min_df=0.01,
    max_df=1.0,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)

# Fit on the cleaned review text and densify the result in one step.
tv_matrix = tv.fit_transform(df['reviews.text_very_cleaned']).toarray()
vocab = tv.get_feature_names_out()

# Two-decimal copy of the weights, one column per vocabulary term, saved to disk.
# The unrounded tv_matrix is what later cells keep working with.
df_tfidf = pd.DataFrame(tv_matrix.round(2), columns=vocab)
df_tfidf.to_csv('../data/tfidf.csv')

print(f'Number of terms extracted is {len(df_tfidf.columns)}')

In [None]:
# Cosine similarity between every pair of TF-IDF rows (document x document).
similarity_matrix = cosine_similarity(tv_matrix)

# Same values as a DataFrame with default integer row/column labels.
df_similarity = pd.DataFrame(data=similarity_matrix)

# Last expression of the cell: rendered as the cell output.
df_similarity

In [None]:
# Find the pair of distinct documents with the highest cosine similarity.
#
# Only the strict lower triangle (j < i) of the similarity matrix is scanned:
# the diagonal compares a document with itself and the upper triangle mirrors
# the lower one. Each row slice is examined with NumPy instead of one
# label-based `.loc` lookup per pair, which keeps the scan practical for
# thousands of documents. The values left in highest_similarity / highest_i /
# highest_j and the messages printed are the same as for an element-by-element
# scan in (i ascending, j ascending) order with a strict `>` comparison.
check_pairwise_similarity = True
highest_similarity = 0.0
highest_i = 0
highest_j = 0

# Number of documents (rows of the similarity matrix); bound of the row scan.
stop_index = len(df_similarity)

if check_pairwise_similarity:

    # df_similarity was built with default integer labels, so position (i, j)
    # in the underlying array is the same cell as label (i, j).
    similarity_values = df_similarity.to_numpy()

    # Row 0 has no column j < 0, so the scan starts at row 1.
    for i in range(1, stop_index):

        # Similarities of document i to every earlier document j < i.
        row = similarity_values[i, :i]

        # Cheap skip: nothing in this row beats the best found so far.
        if not (row > highest_similarity).any():
            continue

        # best_before[j] = best similarity known just before column j is
        # visited (previous best, then the running maximum of row[:j]).
        # fmax ignores NaN, matching a `>` comparison that NaN never wins.
        best_before = np.fmax.accumulate(
            np.concatenate(([highest_similarity], row[:-1]))
        )

        # Every column that strictly improves on everything seen before it is
        # a new record; report each one in visiting order.
        for j in np.flatnonzero(row > best_before):

            highest_similarity = row[j]
            highest_i = i
            highest_j = int(j)

            print('Current highest pairwise document similarity is {} between review {} and review {}'.format(highest_similarity, df.loc[highest_i,'index'], df.loc[highest_j,'index']))

    print('Highest pairwise document similarity is {} between review {} and review {}'.format(highest_similarity, df.loc[highest_i,'index'], df.loc[highest_j,'index']))

else:

    print('Skipping pairwise similarity checking...')

In [None]:
# Agglomerative clustering with Ward linkage.
# NOTE(review): linkage() is given the square similarity matrix as a 2-D array,
# which SciPy interprets as one observation vector per row (not as precomputed
# pairwise distances) -- confirm this is the intended input.
Z = linkage(similarity_matrix, 'ward')

# Height at which the tree is cut into flat clusters. It is the same value the
# following cell passes to fcluster, so the dashed line on the plot marks the
# cut that is actually applied; keep the two in sync when tuning.
max_dist = 230.0

plt.figure(figsize=(15, 12))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Data point')
plt.ylabel('Distance')
dendrogram(Z)

# Dashed horizontal reference line at the cut height.
plt.axhline(y=max_dist, c='k', ls='--', lw=0.5)

In [None]:
# Cut height chosen for the sample drawn with random_state = 945649140.
max_dist = 230.0

# Flat cluster id per document: merges above max_dist are not applied.
cluster_labels = pd.DataFrame(
    fcluster(Z, t=max_dist, criterion='distance'),
    columns=['ClusterLabel'],
)

# Attach the labels column-wise to the sampled reviews and save the result.
df_cluster = pd.concat([df, cluster_labels], axis=1)
df_cluster.to_csv('../data/cluster.csv')

In [None]:
# Timestamp marking the end of the text-processing stage.
print(f'Text processing ended at {datetime.now()}')