In [1]:
# external libraries
!python -m pip install -U plotly
!python -m pip install -U gensim
!python -m pip install -U kneed
!python -m pip install -U scikit-learn
!python -m pip install -U pyLDAvis
!python -m pip install -U torchvision 
!python -m pip install -U spacy 



### Data source
>### <span style="color:#AF33FF">'a'   :   'THE COMMON LAW.txt'</span>

>### <span style="color:#FF5533">'b'   :   'THE CONSTITUTION OF THE UNITED STATES OF AMERICA.txt'</span>

>### <span style="color:#3BE53C">'c'   :   'THE-ENGLISH-CONSTITUTION.txt'</span>

>### <span style="color:#E53BD5">'d'   :   'THE-LIFE-OF-THE-BEE.txt'</span>

>### <span style="color:#E5C33B">'e'   :   'THE STANDARD ELECTRICAL DICTIONARY.txt'</span>

>### <span style="color:#3BE3E5">'f'   :   'THE-PHILOSOPHY-OF-MATHEMATICS.txt'</span>

>### <span style="color:#FF5BC8">'g'   :   'WHITE-HOUSE-COOK-BOOK.txt</span>

In [674]:
# libraries used
import pandas as pd
import numpy as np 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt 
import seaborn as sns

#!pip install kneed
from kneed import KneeLocator

from sklearn.cluster import KMeans 
from scipy.cluster import hierarchy 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.mixture import GaussianMixture

from gensim.models.doc2vec import Doc2Vec

from gensim.models.coherencemodel import CoherenceModel
from scipy.cluster.hierarchy import linkage as lkg

from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis

import pyLDAvis.gensim_models as gensimvis
%matplotlib inline
from sklearn.cluster import KMeans
from gensim.matutils import corpus2csc
import torch
import torchvision

# customized helper functions
import measures

In [671]:
# render setting for vscode
import plotly.io as pio
pio.renderers.default = "notebook"

### Helper functions

In [4]:
def make_SVDtsne_2d(df, label=None, books=('a','b','c','d','e','f','g')):
  reduced = TruncatedSVD(n_components=7, random_state=0).fit_transform(df)

  tsne_results = TSNE(n_components=2, 
                      perplexity=40, 
                      init='pca', # globally more stable to initialization
                      learning_rate = 'auto',
                      n_iter=1000, 
                      random_state=0).fit_transform(reduced)

  df = pd.DataFrame()
  df['tsne-2d-one'] = tsne_results[:,0]
  df['tsne-2d-two'] = tsne_results[:,1]
  df['label_True'] = df_out['label'].loc[df_out['label'].isin(books)].reset_index(drop=True)
  print(df)
  if label is not None:
    dic = dict(enumerate(df_out['label'].loc[df_out['label'].isin(books)].unique()))
    lbl = list(label)
    for i, l in zip(range(0,len(lbl)), lbl):
      try:
        lbl[i] = dic[l].upper()
      except:
        lbl[i] = 'Outlier'
    df['label_Pred'] = lbl
  
  print(df)
  if label is None: fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color="label_True", symbol="label_True", width=1200, height=900)
  if label is not None: fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color="label_Pred", symbol="label_True", width=1200, height=900)

  fig.show()

In [5]:
def make_tsne_2d(df, label=None, books=('a','b','c','d','e','f','g')):
    # need to read raw our.csv first
    # df must be vectorized DataFrame
    
    tsne = TSNE(n_components=2, perplexity=40, n_iter=1000, random_state=0)
    tsne_results = tsne.fit_transform(df)
    
    df = pd.DataFrame()
    df['tsne-2d-one'] = tsne_results[:,0]
    df['tsne-2d-two'] = tsne_results[:,1]
    df['label_True'] = df_out['label'].loc[df_out['label'].isin(books)].reset_index(drop=True)
    if label is not None:
      dic = dict(enumerate(df_out['label'].unique()))
      lbl = list(label)
      for i, l in zip(range(0,len(lbl)), lbl):
        try:
          lbl[i] = dic[l].upper()
        except:
          lbl[i] = 'Outlier'
      df['label_Pred'] = lbl
    
    if label is None: fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color="label_True", symbol="label_True", width=1200, height=900)
    if label is not None: fig = px.scatter(df, x='tsne-2d-one', y='tsne-2d-two', color="label_Pred", symbol="label_True", width=1200, height=900)
    fig.show()

In [6]:
def elbow_method(data, number):
    wcss = []
    for i in range(1, number+1):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    kn = KneeLocator(range(1, 10+1), wcss, curve='convex', direction='decreasing')

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(1, number+1)),
                            y=wcss))
    fig.add_vline(x=kn.knee, line_width=3, line_dash="dash", line_color="green")

    fig.update_layout(title='Elbow Method',
                      xaxis_title='Number of clusters',
                      yaxis_title='WCSS',
                      title_x=0.5,
                      height=500, 
                      width=800)
    fig.show()

In [685]:
def compare_predict(trans_data, data, modl, vidl = None,
  n_clusters=5,
  linkage='ward',
  n_components = 2,
  covariance_type = 'diag',
  random_state = 0,
  books=('a','b','c','d','e','f','g')):

  X = trans_data[data['label'].isin(books)]

  # clustering parameters
  if 'n_clusters' in modl.get_params():
    modl.set_params(n_clusters = n_clusters)
  if 'linkage' in modl.get_params():
    modl.set_params(linkage = linkage)
  # em parameters
  if 'n_components' in modl.get_params():
    modl.set_params(n_components = n_components)
  if 'covariance_type' in modl.get_params():
    modl.set_params(covariance_type = covariance_type)
  if 'random_state' in modl.get_params():
    modl.set_params(random_state = random_state)

  pred = modl.fit_predict(X)

  pred_map = measures.label_to_cluster_num(pred=pred, k=len(books), books= books)
  label_data_in_books = data[data['label'].isin(books)]['label'] # retrieve label rows that belongs to any of the books
  mapped_label = [pred_map[label] for label in label_data_in_books]

  if vidl is not None:
    if vidl == 'tsne':
      make_tsne_2d(X, pred, books)
    if vidl == 'SVDtsne':
      make_SVDtsne_2d(X, pred, books)
    if vidl == 'agglom':
      fig = ff.create_dendrogram(X, orientation='left', labels=pred, linkagefun=lambda x: lkg(X, linkage, metric='euclidean'))
      fig.update_layout(height=2000, width=1200)
      fig.show()

  return modl, pred, np.array(mapped_label)

In [8]:
# Fetch data from google drive. equivalent to out.csv
url='https://drive.google.com/file/d/1IBelm4cZNHpl8336gQ6VTIYzZrtC02ln/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df_out = pd.read_csv(url)

df_out.drop('Unnamed: 0', axis=1, inplace=True)
df_out

Unnamed: 0,segment,label
0,act known circumst harm may accept gener test ...,a
1,aris whether represent part contract If contra...,a
2,section cite godbolt see F. N. B G fitzh abr d...,a
3,row bell c.c see treasur hidden anoth land D. ...,a
4,give trespass pretti nearli thing action case ...,a
...,...,...
1395,lamb chop fri tomato potato la crãªme rais bis...,g
1396,glass rhine wine IV glass water V glass champa...,g
1397,cover cloth set away cool place hour dinner ti...,g
1398,transpar veil egg cook white firm lift griddl ...,g


### BOW

In [9]:
# Create CountVectorizer object
bow_vec = CountVectorizer()
# Generate matrix of word vectors
bow_matrix = bow_vec.fit_transform(df_out['segment'])
# print(bow_matrix.toarray())
df_bow = pd.DataFrame(bow_matrix.toarray())
df_bow.columns = bow_vec.get_feature_names_out()
# df_bow


In [10]:
# elbow_method(df_bow, 10)

##### K Means

In [11]:
bow_kmeans5, bow_kmeans5_pred, bow_kmeans5_cluster_num = compare_predict(df_bow, df_out, KMeans(), n_clusters= 5)

a:[  0   0   0 200   0   0   0]	a mapped to 3
b:[132   0   0  68   0   0   0]	b mapped to 0
c:[  1   0   0 199   0   0   0]	c mapped to 3
d:[  0   0   0 200   0   0   0]	d mapped to 3
e:[  0   0 194   6   0   0   0]	e mapped to 2
f:[  0 197   0   3   0   0   0]	f mapped to 1
g:[  0   0   0  18 182   0   0]	g mapped to 4


In [12]:
bow_kmeans7, bow_kmeans7_pred, bow_kmeans7_cluster_num = compare_predict(df_bow, df_out, KMeans(), n_clusters=7)

a:[200   0   0   0   0   0   0]	a mapped to 0
b:[ 28   0   0  16 156   0   0]	b mapped to 4
c:[199   0   0   0   1   0   0]	c mapped to 0
d:[200   0   0   0   0   0   0]	d mapped to 0
e:[  3   0   0   1   0 164  32]	e mapped to 5
f:[  3   0 197   0   0   0   0]	f mapped to 2
g:[ 18 182   0   0   0   0   0]	g mapped to 1


##### EM

In [13]:
bow_gmm5, bow_gmm5_pred, bow_gmm5_cluster_num = compare_predict(df_bow, df_out, GaussianMixture(), n_components=5)

a:[  0   0   0   0 200   0   0]	a mapped to 4
b:[  0   0   0 151  49   0   0]	b mapped to 3
c:[  0   0   0   1 199   0   0]	c mapped to 4
d:[  0   0   0   0 200   0   0]	d mapped to 4
e:[196   0   0   0   4   0   0]	e mapped to 0
f:[  0   0 197   0   3   0   0]	f mapped to 2
g:[  0 178   0   0  22   0   0]	g mapped to 1


In [14]:
bow_gmm7, bow_gmm7_pred, bow_gmm7_cluster_num = compare_predict(df_bow, df_out, GaussianMixture(), n_components=7)

a:[  0   0   0   0 199   1   0]	a mapped to 4
b:[  0   0   0 152  42   6   0]	b mapped to 3
c:[  0   0   0   1  13 186   0]	c mapped to 5
d:[  0   0   0   0 200   0   0]	d mapped to 4
e:[196   0   0   0   4   0   0]	e mapped to 0
f:[  0   0 197   0   3   0   0]	f mapped to 2
g:[ 0 96  0  0 23  0 81]	g mapped to 1


##### Hierarchical 

In [15]:
bow_agglom5, bow_agglom5_pred, bow_agglom5_cluster_num = compare_predict(df_bow, df_out, AgglomerativeClustering(), n_clusters=5, linkage='ward')

a:[  0   0 200   0   0   0   0]	a mapped to 2
b:[  0 192   8   0   0   0   0]	b mapped to 1
c:[  0   2 197   1   0   0   0]	c mapped to 2
d:[  0   0 200   0   0   0   0]	d mapped to 2
e:[193   1   1   0   5   0   0]	e mapped to 0
f:[  0   0   0   0 200   0   0]	f mapped to 4
g:[  0   0   0 200   0   0   0]	g mapped to 3


In [16]:
bow_agglom7, bow_agglom7_pred, bow_agglom7_cluster_num = compare_predict(df_bow, df_out, AgglomerativeClustering(), n_clusters=7, linkage='ward')

a:[  0   0 200   0   0   0   0]	a mapped to 2
b:[192   0   8   0   0   0   0]	b mapped to 0
c:[  2   0 197   1   0   0   0]	c mapped to 2
d:[  0   0 200   0   0   0   0]	d mapped to 2
e:[  1 154   1   0   5  38   1]	e mapped to 1
f:[  0   0   0   0 200   0   0]	f mapped to 4
g:[  0   0   0 200   0   0   0]	g mapped to 3


### TF-IDF

In [17]:
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(df_out['segment'])
df_tfidf = pd.DataFrame(tfidf_matrix.toarray())
df_tfidf.columns = tfidf_vec.get_feature_names_out()

##### K Means

In [18]:
tfidf_kmeans5, tfidf_kmeans5_pred, tfidf_kmeans5_cluster_num = compare_predict(df_tfidf, df_out, KMeans(), n_clusters= 5)

a:[  0   0   0 200   0   0   0]	a mapped to 3
b:[  0   0 188  12   0   0   0]	b mapped to 2
c:[  0   0   4 196   0   0   0]	c mapped to 3
d:[  0   0   0 200   0   0   0]	d mapped to 3
e:[  0   0   0   4 196   0   0]	e mapped to 4
f:[197   0   0   3   0   0   0]	f mapped to 0
g:[  0 186   0  14   0   0   0]	g mapped to 1


In [19]:
tfidf_kmeans7, tfidf_kmeans7_pred, tfidf_kmeans7_cluster_num = compare_predict(df_tfidf, df_out, KMeans(), n_clusters= 7)

a:[  0   0   0   0 199   1   0]	a mapped to 4
b:[  0 196   0   0   1   3   0]	b mapped to 1
c:[  0   1   0   0   1 198   0]	c mapped to 5
d:[200   0   0   0   0   0   0]	d mapped to 0
e:[  0   0 200   0   0   0   0]	e mapped to 2
f:[  0   0   0   0   0   3 197]	f mapped to 6
g:[  2   0   0 195   2   1   0]	g mapped to 3


##### EM

In [20]:
tfidf_gmm5, tfidf_gmm5_pred, tfidf_gmm5_cluster_num = compare_predict(df_tfidf, df_out, GaussianMixture(), n_components=5)

a:[  0   0 178  22   0   0   0]	a mapped to 2
b:[  0   0   6 194   0   0   0]	b mapped to 3
c:[  0   0 197   3   0   0   0]	c mapped to 2
d:[  0   0 200   0   0   0   0]	d mapped to 2
e:[  0   0   2   0 198   0   0]	e mapped to 4
f:[  0 197   3   0   0   0   0]	f mapped to 1
g:[186   0  14   0   0   0   0]	g mapped to 0


In [21]:
tfidf_gmm7, tfidf_gmm7_pred, tfidf_gmm7_cluster_num = compare_predict(df_tfidf, df_out, GaussianMixture(), n_components=7)

a:[  0   0   1   0   0 199   0]	a mapped to 5
b:[  0   0   2 193   0   5   0]	b mapped to 3
c:[  0   0 197   3   0   0   0]	c mapped to 2
d:[200   0   0   0   0   0   0]	d mapped to 0
e:[  0   0   0   0 198   2   0]	e mapped to 4
f:[  0 197   2   0   0   1   0]	f mapped to 1
g:[  2   0   0   0   0   4 194]	g mapped to 6


##### Hierarchical

In [22]:
tfidf_agglom5, tfidf_agglom5_pred, tfidf_agglom5_cluster_num = compare_predict(df_tfidf, df_out, AgglomerativeClustering(), n_clusters=5, linkage='ward')

a:[167   0   0   0  33   0   0]	a mapped to 0
b:[  5   2   0   0 193   0   0]	b mapped to 4
c:[189   0   0   0  11   0   0]	c mapped to 0
d:[200   0   0   0   0   0   0]	d mapped to 0
e:[  0 195   5   0   0   0   0]	e mapped to 1
f:[  3   0 197   0   0   0   0]	f mapped to 2
g:[  0   2   0 198   0   0   0]	g mapped to 3


In [23]:
tfidf_agglom7, tfidf_agglom7_pred, tfidf_agglom7_cluster_num = compare_predict(df_tfidf, df_out, AgglomerativeClustering(), n_clusters=7, linkage='ward')

a:[  0   0 164   0  33   0   3]	a mapped to 2
b:[  0   2   0   0 193   0   5]	b mapped to 4
c:[  0   0   0   0  11   3 186]	c mapped to 6
d:[  0   0   0   0   0 199   1]	d mapped to 5
e:[  5 195   0   0   0   0   0]	e mapped to 1
f:[197   0   0   0   0   0   3]	f mapped to 0
g:[  0   2   0 198   0   0   0]	g mapped to 3


### LDA

In [24]:
def lda_corpus(data):
  for i in range(len(data.index)):
    data['segment'][i] = data['segment'][i].split()
  data_lemmatized = data['segment'].tolist()
  # Create Dictionary 
  id2word = corpora.Dictionary(data_lemmatized)  
  # Create Corpus 
  texts = data_lemmatized  
  # Term Document Frequency 
  corpus = [id2word.doc2bow(text) for text in texts]  
  return corpus

In [25]:
lda_model = torch.load('./LDA/lda_model.pt')
lda_df_out = df_out.copy()
corpus = lda_corpus(lda_df_out)

In [26]:
df_lda = np.zeros(shape=(1400,4))
for i, row in enumerate(lda_model[corpus]):
  for  j, (topic_num, prop_topic) in enumerate(row):
    df_lda[i, topic_num] = prop_topic

##### K Means

In [27]:
lda_kmeans5, lda_kmeans5_pred, lda_kmeans5_cluster_num = compare_predict(df_lda, df_out, KMeans(), n_clusters= 5)

a:[ 21 117   1   2  59   0   0]	a mapped to 1
b:[197   1   0   0   2   0   0]	b mapped to 0
c:[191   1   0   0   8   0   0]	c mapped to 0
d:[  0 188   1   2   9   0   0]	d mapped to 1
e:[  0   0 199   0   1   0   0]	e mapped to 2
f:[  0 199   0   0   1   0   0]	f mapped to 1
g:[  0   0   1 199   0   0   0]	g mapped to 3


In [28]:
lda_kmeans7, lda_kmeans7_pred, lda_kmeans7_cluster_num = compare_predict(df_lda, df_out, KMeans(), n_clusters= 7)

a:[47  1 14  2 41 86  9]	a mapped to 5
b:[  1   0 183   0   1   0  15]	b mapped to 2
c:[  0   0 136   0   5   1  58]	c mapped to 2
d:[ 48   1   0   2   3 146   0]	d mapped to 5
e:[  0 199   0   0   1   0   0]	e mapped to 1
f:[  3   0   0   0   1 196   0]	f mapped to 5
g:[  0   1   0 199   0   0   0]	g mapped to 3


##### EM

In [29]:
lda_gmm5, lda_gmm5_pred, lda_gmm5_cluster_num = compare_predict(df_lda, df_out, GaussianMixture(), n_components=5)

a:[ 56   0  18   0 126   0   0]	a mapped to 4
b:[  0   0 178   0  22   0   0]	b mapped to 2
c:[  0   0 183   0  17   0   0]	c mapped to 2
d:[ 81   0   0   0 119   0   0]	d mapped to 4
e:[  0 184   0   0  16   0   0]	e mapped to 1
f:[189   0   0   0  11   0   0]	f mapped to 0
g:[  0   0   0 173  27   0   0]	g mapped to 3


In [30]:
lda_gmm7, lda_gmm7_pred, lda_gmm7_cluster_num = compare_predict(df_lda, df_out, GaussianMixture(), n_components=7)

a:[ 11   0  10   0 114  56   9]	a mapped to 4
b:[  1   0 160   0  14   0  25]	b mapped to 2
c:[  0   0 114   0   9   0  77]	c mapped to 2
d:[96  0  0  0 23 81  0]	d mapped to 0
e:[  7 166   0   0  27   0   0]	e mapped to 1
f:[  1   0   0   0  10 189   0]	f mapped to 5
g:[ 27   0   0 173   0   0   0]	g mapped to 3


##### Hierarchical

In [31]:
lda_agglom5, lda_agglom5_pred, lda_agglom5_cluster_num = compare_predict(df_lda, df_out, AgglomerativeClustering(), n_clusters=5, linkage='ward')

a:[ 23   2 103   1  71   0   0]	a mapped to 2
b:[199   0   1   0   0   0   0]	b mapped to 0
c:[194   0   6   0   0   0   0]	c mapped to 0
d:[  0   2  69   1 128   0   0]	d mapped to 4
e:[  0   0   1 199   0   0   0]	e mapped to 3
f:[  0   0   4   0 196   0   0]	f mapped to 4
g:[  0 200   0   0   0   0   0]	g mapped to 1


In [32]:
lda_agglom7, lda_agglom7_pred, lda_agglom7_cluster_num = compare_predict(df_lda, df_out, AgglomerativeClustering(), n_clusters=7, linkage='ward')

a:[77  2 13  1 71 10 26]	a mapped to 0
b:[  1   0  33   0   0 166   0]	b mapped to 5
c:[  1   0  79   0   0 115   5]	c mapped to 5
d:[ 69   2   0   1 128   0   0]	d mapped to 4
e:[  0   0   0 199   0   0   1]	e mapped to 3
f:[  3   0   0   0 196   0   1]	f mapped to 4
g:[  0 200   0   0   0   0   0]	g mapped to 1


### Doc2Vec

In [33]:
# Load the pretrained Doc2Vec transformer
d2v_model = Doc2Vec.load('./Doc2Vec_model/doc2vec.model')
d2v_model_high = Doc2Vec.load('./Doc2Vec_model/doc2vec_high.model')
d2v_model_low = Doc2Vec.load('./Doc2Vec_model/doc2vec_low.model')

In [34]:
#generate the list of tokenized words
text_list = df_out['segment'].values.tolist()

tokenized_list = []
for segment in text_list:
    tokenized_segment = segment.split()
    tokenized_list.append(tokenized_segment)

In [35]:
#generate the list of word vectors
x_d2v = [d2v_model.infer_vector(d) for d in tokenized_list]
df_d2v = pd.DataFrame(x_d2v)
# df_d2v

##### K Means

In [39]:
d2v_kmeans5, d2v_kmeans5_pred, d2v_kmeans5_cluster_num = compare_predict(df_d2v, df_out, KMeans(), n_clusters= 5)

a:[  1 170  22   1   6   0   0]	a mapped to 1
b:[  0 195   2   0   3   0   0]	b mapped to 1
c:[  0  21 179   0   0   0   0]	c mapped to 2
d:[  0   7 182   1  10   0   0]	d mapped to 2
e:[186   5   4   1   4   0   0]	e mapped to 0
f:[  0   6   0 194   0   0   0]	f mapped to 3
g:[  0   7   1   2 190   0   0]	g mapped to 4


In [40]:
d2v_kmeans7, d2v_kmeans7_pred, d2v_kmeans7_cluster_num = compare_predict(df_d2v, df_out, KMeans(), n_clusters= 7)

a:[ 37   4   1   1   7 147   3]	a mapped to 5
b:[ 87   0   0   0   5 108   0]	b mapped to 5
c:[  1  15   0   0 170  14   0]	c mapped to 4
d:[  1 167   1   0  13  12   6]	d mapped to 1
e:[  6   4   1 184   0   2   3]	e mapped to 3
f:[  1   0 194   0   0   5   0]	f mapped to 2
g:[  6   1   2   0   0   3 188]	g mapped to 6


##### Hierarchical

In [41]:
d2v_agglom5, d2v_agglom5_pred, d2v_agglom5_cluster_num = compare_predict(df_d2v, df_out, AgglomerativeClustering(), n_clusters=5, linkage='ward')

a:[196   1   1   0   2   0   0]	a mapped to 0
b:[190   1   0   1   8   0   0]	b mapped to 0
c:[ 52 148   0   0   0   0   0]	c mapped to 1
d:[ 20 175   0   0   5   0   0]	d mapped to 1
e:[  5   4   0 185   6   0   0]	e mapped to 3
f:[  4   0 194   0   2   0   0]	f mapped to 2
g:[  3   1   0   0 196   0   0]	g mapped to 4


In [42]:
d2v_agglom7, d2v_agglom7_pred, d2v_agglom7_cluster_num = compare_predict(df_d2v, df_out, AgglomerativeClustering(), n_clusters=7, linkage='ward')

a:[  0   2   1 178   1  18   0]	a mapped to 3
b:[  1   8   0 155   0  35   1]	b mapped to 3
c:[  0   0   0  52  25   0 123]	c mapped to 6
d:[  0   5   0  20 169   0   6]	d mapped to 4
e:[185   6   0   3   4   2   0]	e mapped to 0
f:[  0   2 194   4   0   0   0]	f mapped to 2
g:[  0 196   0   3   1   0   0]	g mapped to 1


##### EM

In [43]:
d2v_gmm5, d2v_gmm5_pred, d2v_gmm5_cluster_num = compare_predict(df_d2v, df_out, GaussianMixture(), n_components=5)

a:[ 16   4   1 172   7   0   0]	a mapped to 3
b:[  3   3   0 192   2   0   0]	b mapped to 3
c:[182   0   0  18   0   0   0]	c mapped to 0
d:[181   0   1   6  12   0   0]	d mapped to 0
e:[  4 193   1   1   1   0   0]	e mapped to 1
f:[  0   1 194   5   0   0   0]	f mapped to 2
g:[  0   0   0   3 197   0   0]	g mapped to 4


In [44]:
d2v_gmm7, d2v_gmm7_pred, d2v_gmm7_cluster_num = compare_predict(df_d2v, df_out, GaussianMixture(), n_components=7)

a:[  6   0   1  46 139   3   5]	a mapped to 4
b:[ 3  1  0 98 97  0  1]	b mapped to 3
c:[175   0   0   3  22   0   0]	c mapped to 0
d:[178   0   1   2   8   0  11]	d mapped to 0
e:[  4 128   1   3   0  63   1]	e mapped to 1
f:[  0   0 195   3   1   1   0]	f mapped to 2
g:[  0   0   0   7   0   1 192]	g mapped to 6


### Evaluation

##### Kappa Score

In [45]:
print("Kappa score for BOW----------------------------------")
bow_kmeans5.kappa = measures.get_kappa(bow_kmeans5_pred, bow_kmeans5_cluster_num)
bow_kmeans7.kappa = measures.get_kappa(bow_kmeans7_pred, bow_kmeans7_cluster_num)
bow_agglom5.kappa = measures.get_kappa(bow_agglom5_pred, bow_agglom5_cluster_num)
bow_agglom7.kappa = measures.get_kappa(bow_agglom7_pred, bow_agglom7_cluster_num)
bow_gmm5.kappa = measures.get_kappa(bow_gmm5_pred, bow_gmm5_cluster_num)
bow_gmm7.kappa = measures.get_kappa(bow_gmm7_pred, bow_gmm7_cluster_num)
print("Kappa score for TFIDF--------------------------------")
tfidf_kmeans5.kappa = measures.get_kappa(tfidf_kmeans5_pred, tfidf_kmeans5_cluster_num)
tfidf_kmeans7.kappa = measures.get_kappa(tfidf_kmeans7_pred, tfidf_kmeans7_cluster_num)
tfidf_agglom5.kappa = measures.get_kappa(tfidf_agglom5_pred, tfidf_agglom5_cluster_num)
tfidf_agglom7.kappa = measures.get_kappa(tfidf_agglom7_pred, tfidf_agglom7_cluster_num)
tfidf_gmm5.kappa = measures.get_kappa(tfidf_gmm5_pred, tfidf_gmm5_cluster_num)
tfidf_gmm7.kappa = measures.get_kappa(tfidf_gmm7_pred, tfidf_gmm7_cluster_num)
print("Kappa score for LDA----------------------------------")
lda_kmeans5.kappa = measures.get_kappa(lda_kmeans5_pred, lda_kmeans5_cluster_num)
lda_kmeans7.kappa = measures.get_kappa(lda_kmeans7_pred, lda_kmeans7_cluster_num)
lda_agglom5.kappa = measures.get_kappa(lda_agglom5_pred, lda_agglom5_cluster_num)
lda_agglom7.kappa = measures.get_kappa(lda_agglom7_pred, lda_agglom7_cluster_num)
lda_gmm5.kappa = measures.get_kappa(lda_gmm5_pred, lda_gmm5_cluster_num)
lda_gmm7.kappa = measures.get_kappa(lda_gmm7_pred, lda_gmm7_cluster_num)
print("Kappa score for Doc2Vec------------------------------")
d2v_kmeans5.kappa = measures.get_kappa(d2v_kmeans5_pred, d2v_kmeans5_cluster_num)
d2v_kmeans7.kappa = measures.get_kappa(d2v_kmeans7_pred, d2v_kmeans7_cluster_num)
d2v_agglom5.kappa = measures.get_kappa(d2v_agglom5_pred, d2v_agglom5_cluster_num)
d2v_agglom7.kappa = measures.get_kappa(d2v_agglom7_pred, d2v_agglom7_cluster_num)
d2v_gmm5.kappa = measures.get_kappa(d2v_gmm5_pred, d2v_gmm5_cluster_num)
d2v_gmm7.kappa = measures.get_kappa(d2v_gmm7_pred, d2v_gmm7_cluster_num)

Kappa score for BOW----------------------------------
kappa: 0.9041642897889333
kappa: 0.9000979431929481
kappa: 0.9824707846410684
kappa: 0.9447903694479037
kappa: 0.9215157536190747
kappa: 0.8476928848318119
Kappa score for TFIDF--------------------------------
kappa: 0.9637356482777933
kappa: 0.9875
kappa: 0.9412816281628162
kappa: 0.9433333333333334
kappa: 0.9513888888888888
kappa: 0.9816666666666667
Kappa score for LDA----------------------------------
kappa: 0.8909193936818246
kappa: 0.7601451222789573
kappa: 0.833047832388984
kappa: 0.7163363821138211
kappa: 0.7740465963816218
kappa: 0.6698675094202018
Kappa score for Doc2Vec------------------------------
kappa: 0.9041979207790498
kappa: 0.7946168768186227
kappa: 0.8931578947368422
kappa: 0.8258706467661692
kappa: 0.9182092687409742
kappa: 0.7451246605776352


In [46]:
model_name = [['bow kmean5', 'bow kmean7', 'bow agglom5', 'bow agglom7', 'bow em5', 'bow em7'],
                ['tfidf kmean5', 'tfidf kmean7', 'tfidf agglom5', 'tfidf agglom7', 'tfidf em5', 'tfidf em7'],
        ['lda kmean5', 'lda kmean7', 'lda agglom5', 'lda agglom7', 'lda em5', 'lda em7'],
        ['d2v kmean5', 'd2v kmean7', 'd2v agglom5', 'd2v agglom7', 'd2v em5', 'd2v em7']]

scores_kappa = [[bow_kmeans5.kappa, bow_kmeans7.kappa, bow_agglom5.kappa, bow_agglom7.kappa, bow_gmm5.kappa, bow_gmm7.kappa],
        [tfidf_kmeans5.kappa, tfidf_kmeans7.kappa, tfidf_agglom5.kappa, tfidf_agglom7.kappa, tfidf_gmm5.kappa, tfidf_gmm7.kappa],
        [lda_kmeans5.kappa, lda_kmeans7.kappa, lda_agglom5.kappa, lda_agglom7.kappa, lda_gmm5.kappa, lda_gmm7.kappa],
        [d2v_kmeans5.kappa, d2v_kmeans7.kappa, d2v_agglom5.kappa, d2v_agglom7.kappa, d2v_gmm5.kappa, d2v_gmm7.kappa]
        ]

In [678]:
import plotly.graph_objects as go
fig = go.Figure()
COLOR_FEATURE = np.arange(100, 300, 50)
COLOR_MODEL = np.arange(40, 280,40)

for i, color_feature in enumerate(COLOR_FEATURE):
    for j, color_model in enumerate(COLOR_MODEL):
        fig.add_trace(go.Bar(x = [model_name[i][j]],
                            y = [scores_kappa[i][j]],
                            name = model_name[i][j],
                            marker_color= 'rgb('+str(color_feature)+', 120,'+str(color_model)+')',
                            text= [scores_kappa[i][j]]
                            ))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    title_text='Kappa Scores for Different Models'
)
fig.show()

##### Silhouette Score

In [679]:
print("silhouette score for BOW----------------------------------")
bow_kmeans5.silhouette = measures.get_silhouette_score(df_bow, bow_kmeans5_cluster_num)
bow_kmeans7.silhouette = measures.get_silhouette_score(df_bow, bow_kmeans7_cluster_num)
bow_agglom5.silhouette = measures.get_silhouette_score(df_bow, bow_agglom5_cluster_num)
bow_agglom7.silhouette = measures.get_silhouette_score(df_bow, bow_agglom7_cluster_num)
bow_gmm5.silhouette = measures.get_silhouette_score(df_bow, bow_gmm5_cluster_num)
bow_gmm7.silhouette = measures.get_silhouette_score(df_bow, bow_gmm7_cluster_num)
print("silhouette score for TFIDF--------------------------------")
tfidf_kmeans5.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_kmeans5_cluster_num)
tfidf_kmeans7.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_kmeans7_cluster_num)
tfidf_agglom5.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_agglom5_cluster_num)
tfidf_agglom7.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_agglom7_cluster_num)
tfidf_gmm5.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_gmm5_cluster_num)
tfidf_gmm7.silhouette = measures.get_silhouette_score(df_tfidf, tfidf_gmm7_cluster_num)
print("silhouette score for LDA----------------------------------")
lda_kmeans5.silhouette = measures.get_silhouette_score(df_lda, lda_kmeans5_cluster_num)
lda_kmeans7.silhouette = measures.get_silhouette_score(df_lda, lda_kmeans7_cluster_num)
lda_agglom5.silhouette = measures.get_silhouette_score(df_lda, lda_agglom5_cluster_num)
lda_agglom7.silhouette = measures.get_silhouette_score(df_lda, lda_agglom7_cluster_num)
lda_gmm5.silhouette = measures.get_silhouette_score(df_lda, lda_gmm5_cluster_num)
lda_gmm7.silhouette = measures.get_silhouette_score(df_lda, lda_gmm7_cluster_num)
print("silhouette score for Doc2Vec------------------------------")
d2v_kmeans5.silhouette = measures.get_silhouette_score(df_d2v, d2v_kmeans5_cluster_num)
d2v_kmeans7.silhouette = measures.get_silhouette_score(df_d2v, d2v_kmeans7_cluster_num)
d2v_agglom5.silhouette = measures.get_silhouette_score(df_d2v, d2v_agglom5_cluster_num)
d2v_agglom7.silhouette = measures.get_silhouette_score(df_d2v, d2v_agglom7_cluster_num)
d2v_gmm5.silhouette = measures.get_silhouette_score(df_d2v, d2v_gmm5_cluster_num)
d2v_gmm7.silhouette = measures.get_silhouette_score(df_d2v, d2v_gmm7_cluster_num)

silhouette score for BOW----------------------------------
silhouette score: 0.018804597541345527
silhouette score: 0.018804597541345527
silhouette score: 0.018804597541345527
silhouette score: 0.018804597541345527
silhouette score: 0.018804597541345527
silhouette score: 0.007131359440162388
silhouette score for TFIDF--------------------------------
silhouette score: 0.030931113887031663
silhouette score: 0.03567439495290303
silhouette score: 0.030931113887031663
silhouette score: 0.03567439495290303
silhouette score: 0.030931113887031663
silhouette score: 0.03567439495290303
silhouette score for LDA----------------------------------
silhouette score: 0.8363730788298622
silhouette score: 0.8363730788298622
silhouette score: 0.6557344486375999
silhouette score: 0.6557344486375999
silhouette score: 0.4937617455925064
silhouette score: 0.4988500934439026
silhouette score for Doc2Vec------------------------------
silhouette score: 0.3014127314090729
silhouette score: 0.3014127314090729
sil

In [680]:
scores_silhouette = [[bow_kmeans5.silhouette, bow_kmeans7.silhouette, bow_agglom5.silhouette, bow_agglom7.silhouette, bow_gmm5.silhouette, bow_gmm7.silhouette],
        [tfidf_kmeans5.silhouette, tfidf_kmeans7.silhouette, tfidf_agglom5.silhouette, tfidf_agglom7.silhouette, tfidf_gmm5.silhouette, tfidf_gmm7.silhouette],
        [lda_kmeans5.silhouette, lda_kmeans7.silhouette, lda_agglom5.silhouette, lda_agglom7.silhouette, lda_gmm5.silhouette, lda_gmm7.silhouette],
        [d2v_kmeans5.silhouette, d2v_kmeans7.silhouette, d2v_agglom5.silhouette, d2v_agglom7.silhouette, d2v_gmm5.silhouette, d2v_gmm7.silhouette]
        ]

In [681]:
fig = go.Figure()

for i, color_feature in enumerate(COLOR_FEATURE):
    for j, color_model in enumerate(COLOR_MODEL):
        fig.add_trace(go.Bar(x = [model_name[i][j]],
                            y = [scores_silhouette[i][j]],
                            name = model_name[i][j],
                            marker_color= 'rgb('+str(color_feature)+', 120,'+str(color_model)+')',
                            text= [scores_silhouette[i][j]]
                            ))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    title_text='Silhouette Scores for Different Models'
)
fig.show()

##### Coherence

In [51]:
# detailed coherence score vs num_top see LDA/LDA_ipynb.ipynb, under compute_coherence_values(...)
print("Coherence score for LDA----------------------------------")
data_lemmatized = lda_df_out['segment'].tolist()
id2word = corpora.Dictionary(data_lemmatized)  
cm = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
lda_coherence = measures.get_coherence(cm)

Coherence score for LDA----------------------------------
coherence: -1.252494182666128


##### Rand Index

In [682]:
print("Rand score for BOW----------------------------------")
bow_kmeans5.rand = measures.get_rand_score(bow_kmeans5_pred, bow_kmeans5_cluster_num)
bow_kmeans7.rand = measures.get_rand_score(bow_kmeans7_pred, bow_kmeans7_cluster_num)
bow_agglom5.rand = measures.get_rand_score(bow_agglom5_pred, bow_agglom5_cluster_num)
bow_agglom7.rand = measures.get_rand_score(bow_agglom7_pred, bow_agglom7_cluster_num)
bow_gmm5.rand = measures.get_rand_score(bow_gmm5_pred, bow_gmm5_cluster_num)
bow_gmm7.rand = measures.get_rand_score(bow_gmm7_pred, bow_gmm7_cluster_num)
print("Rand score for TFIDF--------------------------------")
tfidf_kmeans5.rand = measures.get_rand_score(tfidf_kmeans5_pred, tfidf_kmeans5_cluster_num)
tfidf_kmeans7.rand = measures.get_rand_score(tfidf_kmeans7_pred, tfidf_kmeans7_cluster_num)
tfidf_agglom5.rand = measures.get_rand_score(tfidf_agglom5_pred, tfidf_agglom5_cluster_num)
tfidf_agglom7.rand = measures.get_rand_score(tfidf_agglom7_pred, tfidf_agglom7_cluster_num)
tfidf_gmm5.rand = measures.get_rand_score(tfidf_gmm5_pred, tfidf_gmm5_cluster_num)
tfidf_gmm7.rand = measures.get_rand_score(tfidf_gmm7_pred, tfidf_gmm7_cluster_num)
print("Rand score for LDA----------------------------------")
lda_kmeans5.rand = measures.get_rand_score(lda_kmeans5_pred, lda_kmeans5_cluster_num)
lda_kmeans7.rand = measures.get_rand_score(lda_kmeans7_pred, lda_kmeans7_cluster_num)
lda_agglom5.rand = measures.get_rand_score(lda_agglom5_pred, lda_agglom5_cluster_num)
lda_agglom7.rand = measures.get_rand_score(lda_agglom7_pred, lda_agglom7_cluster_num)
lda_gmm5.rand = measures.get_rand_score(lda_gmm5_pred, lda_gmm5_cluster_num)
lda_gmm7.rand = measures.get_rand_score(lda_gmm7_pred, lda_gmm7_cluster_num)
print("Rand score for Doc2Vec------------------------------")
d2v_kmeans5.rand = measures.get_rand_score(d2v_kmeans5_pred, d2v_kmeans5_cluster_num)
d2v_kmeans7.rand = measures.get_rand_score(d2v_kmeans7_pred, d2v_kmeans7_cluster_num)
d2v_agglom5.rand = measures.get_rand_score(d2v_agglom5_pred, d2v_agglom5_cluster_num)
d2v_agglom7.rand = measures.get_rand_score(d2v_agglom7_pred, d2v_agglom7_cluster_num)
d2v_gmm5.rand = measures.get_rand_score(d2v_gmm5_pred, d2v_gmm5_cluster_num)
d2v_gmm7.rand = measures.get_rand_score(d2v_gmm7_pred, d2v_gmm7_cluster_num)

Rand score for BOW----------------------------------
rand score: 0.8168571082835502
rand score: 0.8713383588318966
rand score: 0.9690152037238356
rand score: 0.9528611923907633
rand score: 0.8447856048292877
rand score: 0.796117045427862
Rand score for TFIDF--------------------------------
rand score: 0.9240297055025066
rand score: 0.9750631378402579
rand score: 0.8866794156279854
rand score: 0.8930313871582037
rand score: 0.8989430684765087
rand score: 0.9635165812111574
Rand score for LDA----------------------------------
rand score: 0.8324318760679159
rand score: 0.6898040085069513
rand score: 0.7604005477262062
rand score: 0.6616742900535785
rand score: 0.6368507158309153
rand score: 0.5539978103468054
Rand score for Doc2Vec------------------------------
rand score: 0.803579616844753
rand score: 0.803579616844753
rand score: 0.7780992217117533
rand score: 0.7085335321325159
rand score: 0.8297366873266565
rand score: 0.6968318358335472


In [683]:
scores_rand = [[bow_kmeans5.rand, bow_kmeans7.rand, bow_agglom5.rand, bow_agglom7.rand, bow_gmm5.rand, bow_gmm7.rand],
        [tfidf_kmeans5.rand, tfidf_kmeans7.rand, tfidf_agglom5.rand, tfidf_agglom7.rand, tfidf_gmm5.rand, tfidf_gmm7.rand],
        [lda_kmeans5.rand, lda_kmeans7.rand, lda_agglom5.rand, lda_agglom7.rand, lda_gmm5.rand, lda_gmm7.rand],
        [d2v_kmeans5.rand, d2v_kmeans7.rand, d2v_agglom5.rand, d2v_agglom7.rand, d2v_gmm5.rand, d2v_gmm7.rand]
        ]

In [724]:
fig = go.Figure()

for i, color_feature in enumerate(COLOR_FEATURE):
    for j, color_model in enumerate(COLOR_MODEL):
        fig.add_trace(go.Bar(x = [model_name[i][j]],
                            y = [scores_rand[i][j]],
                            name = model_name[i][j],
                            marker_color= 'rgb('+str(color_feature)+', 120,'+str(color_model)+')',
                            text= [scores_rand[i][j]]
                            ))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    title_text='Rand Index for Different Models'
)
fig.show()

### Error Analysis on book c being in the same cluster with book d on model doc2vec kmeans(n_cluster = 5)

In [655]:
d2v_kmeans5, d2v_kmeans5_pred, d2v_kmeans5_cluster_num = compare_predict(df_d2v, df_out, KMeans(), n_clusters=5)

a:[  1 170  22   1   6   0   0]	a mapped to 1
b:[  0 195   2   0   3   0   0]	b mapped to 1
c:[  0  21 179   0   0   0   0]	c mapped to 2
d:[  0   7 182   1  10   0   0]	d mapped to 2
e:[186   5   4   1   4   0   0]	e mapped to 0
f:[  0   6   0 194   0   0   0]	f mapped to 3
g:[  0   7   1   2 190   0   0]	g mapped to 4


In [749]:
topn95 = 100 # topn for similarity above .95
test_cluster = 2
while d2v_model.wv.most_similar(positive=d2v_kmeans5.cluster_centers_[test_cluster], topn=topn95)[-1][1] > 0.95:
    topn95 = topn95 + 5

most_similar_words_5 = d2v_model.wv.most_similar(positive=d2v_kmeans5.cluster_centers_[test_cluster], topn=topn95)
list_most_similar_words_5 = [word[0] for word in most_similar_words_5]

a_dict = dict()
for k in range(0,7):
    for i, segment in enumerate(tokenized_list[k*200:k*200+200]):
        for word in segment:
            if word in list_most_similar_words_5:
                if k not in a_dict:
                    a_dict[k] = dict()
                if word not in a_dict[k]:
                    a_dict[k][word] = 1
                else:
                    a_dict[k][word] = a_dict[k][word] + 1
for k in range(0, 7):
    total_count = sum(a_dict[k].values())
    print("[{}to{}]len set: {} -------- len list: {}".
    format(k*200, k*200+200, len(a_dict[k]), total_count))

[0to200]len set: 82 -------- len list: 1508
[200to400]len set: 82 -------- len list: 1025
[400to600]len set: 98 -------- len list: 2907
[600to800]len set: 91 -------- len list: 2603
[800to1000]len set: 50 -------- len list: 404
[1000to1200]len set: 52 -------- len list: 606
[1200to1400]len set: 69 -------- len list: 1066


In [755]:
COLOR_LABEL = np.arange(30, 230, 30)
list_unique_similar_word = dict()
list_unique_similar_word_freq = dict()
for i in range(0, 7):
    list_unique_similar_word[i] = list()
    list_unique_similar_word_freq[i] = list()
    for word in a_dict[i]:
        list_unique_similar_word[i].append(word)
        list_unique_similar_word_freq[i].append(a_dict[i][word])


In [760]:
fig = go.Figure()
for i, color_label in enumerate(COLOR_LABEL):
    fig.add_trace(go.Bar(x = list_unique_similar_word[i],
                            y = list_unique_similar_word_freq[i],
                            name = chr(ord('a') + i),
                            marker_color= 'rgb('+str(color_label)+', 120,120)',
                            ))
fig.show()

In [None]:
# draw word cloud
temp = set()
for word in a_dict[2]:
    if (word not in a_dict[1] or word not in a_dict[0]) and word in a_dict[3]:
        temp.add(word)
print("c ∩ d NOT A NOT B: {}".format(temp))

We pulled three law books to examine the clustering algorithms as if they can distinguish the law topic frmo others. However, We are troubled about why as a law book as it is constantly clustered with a bee book. Therefore, we investigate the word composition of book a, b, c and d to see if there is any pattern. By collecting the centroid of cluster 2 where lives book c and d, we are able to get top 135 the most representative and similar words for the cluster. We noticed that book c and d contain a large amount of similar words, rather than book a and b. As we latter discover that because book c is an English law book, it contains a lot of terms that associated with the royal, which is much similar to the bee society, as terms "queen", "royal" often appear in these two books. Although doc2vec uncovers the latent relationship between each word vector, it can be easily confused by the similarity vectors as it disregards the structure and semantic meaning of the context.
