# Document Clustering

In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

import plotly.plotly as py
from plotly.graph_objs import *
from url2vec.util.plotter import *
from url2vec.util.seqmanager import *

from sklearn import metrics
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from __future__ import print_function
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

---

In [2]:
# Domain of the crawled web site to analyse; it is used below to build the
# dataset paths. Alternatives listed by the author (presumably the other
# crawled sites available in the dataset directory -- TODO confirm):
# cs.illinois.edu    cs.stanford.edu    eecs.mit.edu    cs.princeton.edu    cs.ox.ac.uk
site = "cs.ox.ac.uk"

## TF-IDF matrix

Here we define the **term frequency - inverse document frequency** (tf-idf) vectorizer parameters and then convert the list of documents (web pages) into a tf-idf matrix.

To get a tf-idf matrix, first count word occurrences by document. This is transformed into a **document-term matrix** (dtm). ![Document-term matrix example](http://www.codeproject.com/KB/WPF/NNMFSearchResultClusterin/table.jpg "Document-term matrix example")

This is also just called a term frequency matrix.
Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.

A couple things to note about the parameters defined below:

**max_df**: this is the maximum frequency within the documents a given feature can have to be used in the tf-idf matrix. If the term is in more than 80% of the documents it probably carries little meaning.

**min_df**: this could be an integer (e.g. 5), in which case the term would have to be in at least 5 of the documents to be considered. Here I pass 0.1; the term must be in at least 10% of the documents.

**ngram_range**: this just means I'll look at unigrams, bigrams and trigrams.

In [3]:
# A single vectorizer configuration, fitted separately on each crawl below.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,  # presumably provided by the url2vec star imports
    stop_words='english',
    ngram_range=(1, 3),           # unigrams, bigrams and trigrams
    min_df=0.1,                   # keep terms found in at least 10% of the documents
    max_df=0.8,                   # drop terms found in more than 80% of the documents
    max_features=200000,
    use_idf=True,
)

The crawling process has been done in two different ways:

- **No constraint**: the crawler follows a random outlink chosen among all of the outlinks in a given page
- **List constraint**: the crawler follows a random outlink, but only among the outlinks found in "lists"

## No-constraint documents

In [4]:
# Locate the no-constraint crawl of `site`, relative to the working directory.
# os.path.join replaces hand-written "/" concatenation; the trailing "" keeps
# the directory separator at the end of the path, as before.
nocostraint_path = os.path.join(
    os.getcwd(), "..", "dataset", site, "no_constraint", "words10000_depth5", ""
)
vertex_nc_path = os.path.join(nocostraint_path, "vertex.txt")
map_nc_path    = os.path.join(nocostraint_path, "urlsMap.txt")

# Mappings keyed by page code (presumably code -> page text and code -> url;
# verify against url2vec.util.seqmanager).
codecontent_map_nc = get_content_map(vertex_nc_path)
urlmap_nc          = get_urlmap(map_nc_path)

# Three parallel lists: position i in each one refers to the same page.
# Deriving documents and urls from `codes_nc` makes that alignment explicit.
codes_nc     = list(codecontent_map_nc)
documents_nc = [codecontent_map_nc[code] for code in codes_nc]
urls_nc      = [urlmap_nc[code] for code in codes_nc]

#### Cosine-Similarity
Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus.

Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.

In [5]:
%time tfidf_matrix_nc = tfidf_vectorizer.fit_transform(documents_nc)
dist_nc = 1 - cosine_similarity(tfidf_matrix_nc)

print(pd.DataFrame(
        {"documents":tfidf_matrix_nc.shape[0], "terms":tfidf_matrix_nc.shape[1]}, 
        index=[""]).T)
print("\n\n")

CPU times: user 29.3 s, sys: 220 ms, total: 29.5 s
Wall time: 29.3 s
               
documents  3951
terms       363





## Clustering on No-constraint documents
#### K-Means

K-means initializes with a pre-determined number of clusters. Each observation is assigned to a cluster (cluster assignment) so as to minimize the within cluster sum of squares. Next, the mean of the clustered observations is calculated and used as the new cluster centroid. Then, observations are reassigned to clusters and centroids recalculated in an iterative process until the algorithm reaches convergence.

In [6]:
kmeans = KMeans(n_clusters=30)
%time kmeans_labels_nc = kmeans.fit_predict(tfidf_matrix_nc)

docs_nc = { 
    'code': codes_nc,
    'document': documents_nc
}
frame_nc = pd.DataFrame(docs_nc, index=[kmeans_labels_nc] , columns=['document', 'code'])

frame_nc[:5]

CPU times: user 7.21 s, sys: 0 ns, total: 7.21 s
Wall time: 7.25 s


Unnamed: 0,document,code
1,department of computer science ventsi chonev s...,3734
26,department of computer science university of o...,4026
2,joel ouaknine ultimate positivity is decidable...,3724
26,department of computer science university of o...,4024
22,article title in vivo and in silico investigat...,4025


#### Topic modeling
Some fancy indexing and sorting on each cluster to identify the top n words that are nearest to the cluster centroid. This gives an idea of the main topic of each cluster.

In [7]:
# NOTE: this cell reads the fitted state of `kmeans` and `tfidf_vectorizer`.
# Both objects are re-fitted on the list-constraint corpus later in the
# notebook, so this cell only gives the no-constraint topics when it is run
# before those cells.

# map -> {code: token_list}
tokens_nc_map = to_tokens_map(codecontent_map_nc)
# map -> {code: stem_list}
stems_nc_map = to_stems_map(codecontent_map_nc)

# total vocabulary, list of stems
totalvocab_nc_stemmed = [stem for key in codecontent_map_nc for stem in stems_nc_map[key]]
# total vocabulary, list of tokens
# (assumed to line up one-to-one with the stems above -- TODO confirm)
totalvocab_nc_tokenized = [token for key in codecontent_map_nc for token in tokens_nc_map[key]]

# lookup table: stem (index) -> original word
vocab_nc_frame = pd.DataFrame({'words': totalvocab_nc_tokenized}, index=totalvocab_nc_stemmed)
terms_nc = tfidf_vectorizer.get_feature_names()

# for every centroid, the feature indices sorted by decreasing weight
order_centroids_nc = kmeans.cluster_centers_.argsort()[:, ::-1]

num_clusters_nc = len(set(kmeans_labels_nc))
words_matrix_nc = [None] * num_clusters_nc
top_n = 7

for i in range(num_clusters_nc):
    # A term may be an n-gram of stems; it is displayed as the first word
    # recorded for its first stem, so only that stem needs to be looked up.
    # `.loc` replaces the deprecated `.ix` indexer.
    cluster_chart = [vocab_nc_frame.loc[terms_nc[ind].split(' ')[:1]].values.tolist()[0][0]
                     for ind in order_centroids_nc[i, :top_n]]
    words_matrix_nc[i] = cluster_chart

pd.DataFrame(
    words_matrix_nc,
    index=["Cluster " + str(i) + " - Top Words" for i in range(num_clusters_nc)],
    columns=list(range(1, top_n + 1))
)

Unnamed: 0,1,2,3,4,5,6,7
Cluster 0 - Top Words,signed,oxford,services,university,university,activities,help
Cluster 1 - Top Words,students,people,research,vacancies,university,oxford,university
Cluster 2 - Top Words,problems,complexity,time,algorithms,m,paper,research
Cluster 3 - Top Words,course,terms,lecture,timetables,students,group,science
Cluster 4 - Top Words,projects,research,verification,software,activities,automated,university
Cluster 5 - Top Words,publications,security,university,oxford,engineering,software,software
Cluster 6 - Top Words,programme,course,website,software,engineering,software,using
Cluster 7 - Top Words,booktitle,publisher,author,title,proceedings,pages,year
Cluster 8 - Top Words,students,projects,online,online,computer,science,online
Cluster 9 - Top Words,programming,research,group,paper,abstract,available,using


### K-Means Plot
Applying t-SNE for dimensionality reduction. We need two dimensional vectors for visualization purposes.

In [8]:
%time tsne = TSNE(n_components=2, random_state=1)
twodim_docs_nc = tsne.fit_transform(dist_nc)

clusters_colors_nc = [ get_color(i) for i in kmeans_labels_nc]

#kmeans_data = scatter_plot(twodim_docs_nc, word_labels=urls_nc, colors=clusters_colors_nc)
#py.iplot(kmeans_data, filename="K-Means t-SNE nocostraint - Doc Clustering")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16 µs


<div>
    <a href="https://plot.ly/~chrispolo/70" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/nc_docs_wordvectors_scatter_plot_KMEANS.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:70"  src="https://plot.ly/embed.js" async></script>
</div>

---

## List-constraint documents

In [9]:
# Locate the list-constraint crawl of `site`, relative to the working directory.
# os.path.join replaces hand-written "/" concatenation; the trailing "" keeps
# the directory separator at the end of the path, as before.
listcostraint_path = os.path.join(
    os.getcwd(), "..", "dataset", site, "list_constraint", "words10000_depth5", ""
)
vertex_lc_path     = os.path.join(listcostraint_path, "vertex.txt")
map_lc_path        = os.path.join(listcostraint_path, "urlsMap.txt")

# Mappings keyed by page code (presumably code -> page text and code -> url;
# verify against url2vec.util.seqmanager).
codecontent_map_lc = get_content_map(vertex_lc_path)
urlmap_lc = get_urlmap(map_lc_path)

# Three parallel lists: position i in each one refers to the same page.
# Deriving documents and urls from `codes_lc` makes that alignment explicit.
codes_lc     = list(codecontent_map_lc)
# document list
documents_lc = [codecontent_map_lc[code] for code in codes_lc]
urls_lc      = [urlmap_lc[code] for code in codes_lc]

#### Cosine-Similarity
Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus.

Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.

In [10]:
%time tfidf_matrix_lc = tfidf_vectorizer.fit_transform(documents_lc)

dist_lc = 1 - cosine_similarity(tfidf_matrix_lc)

print(pd.DataFrame({"documents":tfidf_matrix_lc.shape[0], "terms":tfidf_matrix_lc.shape[1]}, index=[""]).T)
print("\n\n")

CPU times: user 21.7 s, sys: 128 ms, total: 21.8 s
Wall time: 21.6 s
               
documents  3279
terms       366





## Clustering on List-constraint documents
#### K-Means
K-means initializes with a pre-determined number of clusters. Each observation is assigned to a cluster (cluster assignment) so as to minimize the within cluster sum of squares. Next, the mean of the clustered observations is calculated and used as the new cluster centroid. Then, observations are reassigned to clusters and centroids recalculated in an iterative process until the algorithm reaches convergence.

In [11]:
kmeans = KMeans(n_clusters=25)
%time kmeans_labels_lc = kmeans.fit_predict(tfidf_matrix_lc)
docs_lc = {
    'code': codes_lc,
    'document': documents_lc
}

frame_lc = pd.DataFrame(docs_lc, index = [kmeans_labels_lc] , columns = ['document', 'code'])
frame_lc[:5]

CPU times: user 4.68 s, sys: 0 ns, total: 4.68 s
Wall time: 4.68 s


Unnamed: 0,document,code
18,department of computer science university of o...,3724
18,department of computer science university of o...,3725
19,skip to main content we use cookies on this we...,344
1,skip to main content we use cookies on this we...,345
19,skip to main content we use cookies on this we...,346


#### Topic modeling
Some fancy indexing and sorting on each cluster to identify the top n words that are nearest to the cluster centroid. This gives an idea of the main topic of each cluster.

In [12]:
# NOTE: this cell reads the fitted state of `kmeans` and `tfidf_vectorizer`;
# both must currently hold the list-constraint fit, i.e. the list-constraint
# vectorizing and K-Means cells must be the last ones that fitted them.

# map -> {code: token_list}
tokens_lc_map = to_tokens_map(codecontent_map_lc)
# map -> {code: stem_list}
stems_lc_map = to_stems_map(codecontent_map_lc)

# total vocabulary, list of stems
totalvocab_lc_stemmed = [stem for key in codecontent_map_lc for stem in stems_lc_map[key]]
# total vocabulary, list of tokens
# (assumed to line up one-to-one with the stems above -- TODO confirm)
totalvocab_lc_tokenized = [token for key in codecontent_map_lc for token in tokens_lc_map[key]]

# lookup table: stem (index) -> original word
vocab_lc_frame = pd.DataFrame({'words': totalvocab_lc_tokenized}, index=totalvocab_lc_stemmed)
terms_lc = tfidf_vectorizer.get_feature_names()

# for every centroid, the feature indices sorted by decreasing weight
order_centroids_lc = kmeans.cluster_centers_.argsort()[:, ::-1]

num_clusters_lc = len(set(kmeans_labels_lc))
words_matrix_lc = [None] * num_clusters_lc
top_n = 7

for i in range(num_clusters_lc):
    # A term may be an n-gram of stems; it is displayed as the first word
    # recorded for its first stem, so only that stem needs to be looked up.
    # `.loc` replaces the deprecated `.ix` indexer.
    cluster_chart = [vocab_lc_frame.loc[terms_lc[ind].split(' ')[:1]].values.tolist()[0][0]
                     for ind in order_centroids_lc[i, :top_n]]
    words_matrix_lc[i] = cluster_chart

pd.DataFrame(
    words_matrix_lc,
    index=["Cluster " + str(i) + " - Top Words" for i in range(num_clusters_lc)],
    columns=list(range(1, top_n + 1))
)

Unnamed: 0,1,2,3,4,5,6,7
Cluster 0 - Top Words,computer,computer,science,courses,mathematics,students,degrees
Cluster 1 - Top Words,data,computer,projects,research,programming,information,university
Cluster 2 - Top Words,j,d,m,c,author,title,p
Cluster 3 - Top Words,data,research,models,computer,students,bibtex,oxford
Cluster 4 - Top Words,publications,university,oxford,computer,university,university,oxford
Cluster 5 - Top Words,oxford,services,university,university,activities,helps,please
Cluster 6 - Top Words,students,cookies,website,ox,research,road,parks
Cluster 7 - Top Words,new,share,research,workshop,uk,science,october
Cluster 8 - Top Words,booktitle,year,inproceedings,publisher,author,title,proceedings
Cluster 9 - Top Words,journal,author,title,year,numbers,volume,doi


### K-Means Plot
Applying t-SNE for dimensionality reduction. We need two dimensional vectors for visualization purposes.

In [13]:
# Reduce the list-constraint distance matrix to two dimensions for plotting.
tsne = TSNE(random_state=1, n_components=2)
twodim_docs_lc = tsne.fit_transform(dist_lc)

# one colour per document, chosen from its K-Means cluster label
clusters_colors_lc = list(map(get_color, kmeans_labels_lc))

#k_tsne_data_lc = scatter_plot(twodim_docs_lc, word_labels=urls_lc, colors=clusters_colors_lc)
#py.iplot(k_tsne_data_lc, filename="K-Means listcostraint - Doc Clustering")

<div>
    <a href="https://plot.ly/~chrispolo/74" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/lc_docs_wordvectors_scatter_plot_KMEANS.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:74"  src="https://plot.ly/embed.js" async></script>
</div>

---

## Evaluation
Evaluating the performance of a clustering algorithm is not as trivial as counting the number of errors or the precision and recall of a supervised classification algorithm. In particular, an evaluation metric should not take the absolute values of the cluster labels into account, but rather whether the clustering defines separations of the data similar to some ground truth set of classes, or satisfies some assumption such that members belonging to the same class are more similar than members of different classes according to some similarity metric.

See the [scikit-learn documentation](http://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation "Clustering performance evaluation") for further information.

### Ground Truth

In [14]:
# Both crawls are labelled from the same membership file, so it is loaded once
# and the lookup object is shared (the original built it twice from the same path).
gt = GroundTruth(
    os.path.join(os.getcwd(), "..", "dataset", site, "ground_truth", "urlToMembership.txt")
)

# Ground-truth class of every document, in the same order as the document lists.
ground_truth_lc = [int(gt.get_groundtruth(urlmap_lc[key])) for key in codecontent_map_lc]
ground_truth_nc = [int(gt.get_groundtruth(urlmap_nc[key])) for key in codecontent_map_nc]

# Labels are printed sorted so the listing does not depend on set ordering.
print("Clusters found manually for no-costraint documents:", len(set(ground_truth_nc)))
print(sorted(set(ground_truth_nc)))
print()
print("Clusters found manually for list-costraint documents:", len(set(ground_truth_lc)))
print(sorted(set(ground_truth_lc)))
print("\n\n")

Url not found
Url not found
Url not found
Url not found
Clusters found manually for no-costraint documents: 27
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1]

Clusters found manually for list-costraint documents: 22
[0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 24, -1]





## DBSCAN and HDBSCAN
Applying other clustering algorithms for evaluation purposes.

**DBSCAN** - Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density.

**params**:

- **eps** : The maximum distance between two samples for them to be considered as in the same neighborhood.
- **min_samples** : The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.


**HDBSCAN** - Hierarchical Density-Based Spatial Clustering of Applications with Noise. Performs DBSCAN over varying epsilon values and integrates the result to find a clustering that gives the best stability over epsilon. This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), and be more robust to parameter selection.

**params**:

- **min_cluster_size** : minimum nodes to form a cluster

###  No-constraint

In [18]:
dbscan = DBSCAN(eps=0.3, min_samples=5)
%time dbscan_labels_nc = dbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_nc)))
print ([label for label in set(dbscan_labels_nc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=15)
%time hdbscan_labels_nc = hdbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_nc)))
print([label for label in set(hdbscan_labels_nc)])
print("\n\n")

CPU times: user 2.29 s, sys: 28 ms, total: 2.32 s
Wall time: 2.32 s
Clusters found with DBSCAN: 28
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, -1]


CPU times: user 2.75 s, sys: 76 ms, total: 2.83 s
Wall time: 2.83 s
Clusters found with HDBSCAN: 20
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, -1]





### List-constraint

In [29]:
dbscan = DBSCAN(eps=0.7, min_samples=7)
%time dbscan_labels_lc = dbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_lc)))
print ([label for label in set(dbscan_labels_lc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=7)
%time hdbscan_labels_lc = hdbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_lc)))
print([label for label in set(hdbscan_labels_lc)])
print("\n\n")

CPU times: user 1.43 s, sys: 32 ms, total: 1.46 s
Wall time: 1.46 s
Clusters found with DBSCAN: 24
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -1]


CPU times: user 1.89 s, sys: 32 ms, total: 1.92 s
Wall time: 1.92 s
Clusters found with HDBSCAN: 28
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, -1]





### Metrics:

- **Homogeneity**: each cluster contains only members of a single class


- **Completeness**: all members of a given class are assigned to the same cluster


- **Adjusted Rand index**: Given the knowledge of the *ground truth* class assignments and our clustering algorithm assignments of the same samples, the adjusted Rand index is a function that measures the similarity of the two assignments, ignoring permutations and with chance normalization


- **V-measure**: the harmonic mean of homogeneity and completeness. It is equivalent to the normalized mutual information (NMI, described below) when the mutual information is normalized by the sum of the label entropies


- **Mutual Information based scores**: Given the knowledge of the ground truth class assignments and our clustering algorithm assignments of the same samples, the Mutual Information is a function that measures the agreement of the two assignments, ignoring permutations. Two different normalized versions of this measure are available, Normalized Mutual Information(NMI) and Adjusted Mutual Information(AMI). NMI is often used in the literature while AMI was proposed more recently and is normalized against chance


- **Silhouette**: If the ground truth labels are not known, evaluation must be performed using the model itself. The Silhouette Coefficient is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with better defined clusters. The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. Scores around zero indicate overlapping clusters. The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.

In [30]:
def clustering_scores(truth, labels, features):
    """Score one clustering against the ground truth.

    Args:
        truth: ground-truth class label of every document.
        labels: cluster label assigned to every document, in the same order.
        features: the document vectors the clustering was computed on; only
            the silhouette score reads them.

    Returns:
        list: homogeneity, completeness, V-measure, adjusted Rand index,
        adjusted mutual information and silhouette, in that order (the
        column order of `metrics_df`).
    """
    # NOTE(review): DBSCAN/HDBSCAN labels contain -1 for noise; every score
    # here receives it like any other label -- confirm that is intended.
    return [
        metrics.homogeneity_score(truth, labels),
        metrics.completeness_score(truth, labels),
        metrics.v_measure_score(truth, labels),
        metrics.adjusted_rand_score(truth, labels),
        metrics.adjusted_mutual_info_score(truth, labels),
        metrics.silhouette_score(features, labels, metric='euclidean'),
    ]


# One entry per evaluated clustering: (row label, ground truth, labels, vectors).
evaluated_runs = [
    ("NoCostraint - DBSCAN", ground_truth_nc, dbscan_labels_nc, tfidf_matrix_nc),
    ("NoCostraint - HDBSCAN", ground_truth_nc, hdbscan_labels_nc, tfidf_matrix_nc),
    ("NoCostraint - K-MEANS", ground_truth_nc, kmeans_labels_nc, tfidf_matrix_nc),
    ("ListCostraint - DBSCAN", ground_truth_lc, dbscan_labels_lc, tfidf_matrix_lc),
    ("ListCostraint - HDBSCAN", ground_truth_lc, hdbscan_labels_lc, tfidf_matrix_lc),
    ("ListCostraint - K-MEANS", ground_truth_lc, kmeans_labels_lc, tfidf_matrix_lc),
]

metrics_df = pd.DataFrame(
    [clustering_scores(truth, labels, features)
     for _, truth, labels, features in evaluated_runs],
    index=[run_name for run_name, _, _, _ in evaluated_runs],
    columns=[
        "Homogeneity",
        "Completeness",
        "V-Measure score",
        "Adjusted Rand index",
        # this column holds the *adjusted* mutual information score
        "Mutual Information",
        "Silhouette"
    ])

metrics_df

Unnamed: 0,Homogeneity,Completeness,V-Measure score,Adjusted Rand index,Mutual Information,Silhouette
NoCostraint - DBSCAN,0.201409,0.508282,0.288499,0.065801,0.181278,-0.119022
NoCostraint - HDBSCAN,0.342136,0.473136,0.397111,0.106186,0.326098,0.059871
NoCostraint - K-MEANS,0.702142,0.497646,0.582466,0.271434,0.482105,0.15551
ListCostraint - DBSCAN,0.496163,0.483704,0.489854,0.261695,0.468999,0.131982
ListCostraint - HDBSCAN,0.4613,0.474449,0.467782,0.12197,0.441864,0.052641
ListCostraint - K-MEANS,0.731496,0.49463,0.590184,0.262358,0.481351,0.160001
