In [11]:
from sklearn.manifold import TSNE
import plotly.express as px
from scipy.spatial import distance
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import pickle

In [13]:
columns = list(range(0, 383))
columns = list(map(str, columns))


In [12]:
with open('dataset/dataset_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)

In [2]:
def plot_clusters(df, projections):
    
    fig = px.scatter(projections, x=0, y=1, color=df.label)
    fig.show()

In [7]:
X_test_intro = pd.read_csv("dataset/embed_bert_{}_test.csv".format('introduction'))
X_test_mat = pd.read_csv("dataset/embed_bert_{}_test.csv".format('materials'))
X_test_conc = pd.read_csv("dataset/embed_bert_{}_test.csv".format('conclusion'))

In [14]:
X_test_intro = X_test_intro[columns]
X_test_mat = X_test_mat[columns]
X_test_conc = X_test_conc[columns]

In [15]:
section = 'introduction'
features_intro = dataset[section][5][['sentences', 'articles', 'rouge_1', 'bin']]
features_intro = features_intro.reset_index(drop=True)

X_test_intro['articles'] = features_intro['articles']
X_test_intro['sentences'] = features_intro['sentences']
X_test_intro['rouge_1'] = features_intro['rouge_1']
X_test_intro['bin'] = features_intro['bin']

In [16]:
section = 'materials'
features_mat = dataset[section][5][['sentences', 'articles', 'rouge_1', 'bin']]
features_mat = features_mat.reset_index(drop=True)

X_test_mat['articles'] = features_mat['articles']
X_test_mat['sentences'] = features_mat['sentences']
X_test_mat['rouge_1'] = features_mat['rouge_1']
X_test_mat['bin'] = features_mat['bin']

In [17]:
section = 'conclusion'
features_conc = dataset[section][5][['sentences', 'articles', 'rouge_1', 'bin']]
features_conc = features_conc.reset_index(drop=True)

X_test_conc['articles'] = features_conc['articles']
X_test_conc['sentences'] = features_conc['sentences']
X_test_conc['rouge_1'] = features_conc['rouge_1']
X_test_conc['bin'] = features_conc['bin']

In [87]:
df = pd.concat([X_test_intro, X_test_mat, X_test_conc])

In [19]:
import sys
import joblib
import six
sys.modules['sklearn.externals.joblib'] = joblib
sys.modules['sklearn.externals.six'] = six

In [20]:
import hdbscan

In [45]:
summ_items = pd.read_csv("dataset/indices_summ.csv")['summ']

In [88]:
def cluster_analysis(df, summ_items, columns):

    result = []
    num_clusters = {'articles': [], 'n': []}
    
    for i in summ_items:

        cluster_result = {}

        aux  = df.loc[df['articles'] == i]

        clusterer = hdbscan.HDBSCAN(min_cluster_size=2, alpha=1.3)
        clusterer.fit(aux[columns])
        
        num_clusters['articles'].append(i)
        num_clusters['n'].append(len(np.unique(clusterer.labels_)))

        cluster_result = {'sentences': aux['sentences'],
                          'articles': len(aux)*[i],
                          'rouge_1': aux['rouge_1'], 
                          'label': aux['bin'],
                          'cluster': clusterer.labels_}
        
        

        result.append(pd.DataFrame(cluster_result))
        
    return pd.concat(result), num_clusters

### Todas as seções

In [89]:
result, num_clusters = cluster_analysis(df, summ_items, columns)

In [90]:
pd.DataFrame(num_clusters).describe()

Unnamed: 0,n
count,924.0
mean,4.6829
std,3.014247
min,1.0
25%,3.0
50%,3.0
75%,5.0
max,28.0


### Introdução

In [77]:
intro_result, num_clusters_intro = cluster_analysis(X_test_intro, summ_items, columns)

In [78]:
pd.DataFrame(num_clusters_intro).describe()

Unnamed: 0,n
count,924.0
mean,2.450216
std,1.192569
min,1.0
25%,1.0
50%,3.0
75%,3.0
max,7.0


### Materials

In [79]:
mat_result, num_clusters_mat = cluster_analysis(X_test_mat, summ_items, columns)

In [80]:
pd.DataFrame(num_clusters_mat).describe()

Unnamed: 0,n
count,924.0
mean,2.681818
std,1.563552
min,1.0
25%,1.0
50%,3.0
75%,3.0
max,16.0


### Conclusion

In [81]:
conc_result, num_clusters_conc = cluster_analysis(X_test_conc, summ_items, columns)

In [82]:
pd.DataFrame(num_clusters_conc).describe()

Unnamed: 0,n
count,924.0
mean,3.225108
std,2.04425
min,1.0
25%,1.0
50%,3.0
75%,4.0
max,18.0
