## **4. Keywords Clustering** 
We will compare different models implemeting each of these parameters:
- **Algorithm**: K-Means vs Expectation maximization VS Agglomerative
- **Embedding** : One-hot vs Sentence transformers
- **Number of clusters** : ranging from N/10, N/2 the total number of terms)

In [27]:
acteur = 'chum'

In [28]:
import glob
from pandas import *
from collections import Counter
import numpy as np
from sklearn.decomposition import TruncatedSVD

In [29]:
# # get data file names
# path ='../05-transformation'
# filenames = glob.glob(path + "/*.csv")

# dfs = []
# for filename in filenames:
#     dfs.append(read_csv(filename))

# # Concatenate all data into one DataFrame
# df = concat(dfs, ignore_index=True)

In [30]:
base_path ='../05-transformation/'
file_path = base_path + acteur + '_weighting_OKapiBM25.csv'

with open(file_path, encoding='utf-8') as f:
    df = read_csv(f).drop(columns=["Unnamed: 0", 'Structure syntaxique', 'LLR', 'TF (sklearn)', 'DF (sklearn)', 'TF-IDF', 'OkapiBM25', 'Terme formatté'])
df['Fréquence totale (TF)'] = df.groupby(['Terme'])['Fréquence (TF)'].transform('sum')
df['Fréquence documentaire totale (DF)'] = df.groupby(['Terme'])['Fréquence documentaire (DF)'].transform('sum')
df = df.drop(columns = ['Corpus', 'Fréquence (TF)', 'Fréquence documentaire (DF)'])
df = df.drop_duplicates(subset=['Terme'])
df['Terme'] = df['Terme'].astype('str')
df['TF + DF'] = df['Fréquence totale (TF)'] + df['Fréquence documentaire totale (DF)']

df

Unnamed: 0,Terme,isMeSHTerm,isTaxoTerm,Fréquence totale (TF),Fréquence documentaire totale (DF),TF + DF
0,chirurgiens du canada,False,False,46,46,92
1,réunions hebdomadaires,False,False,86,86,172
2,centre hospitalier de l'université,False,False,115,88,203
3,activité de développement professionnel,False,False,43,43,86
4,centre de recherche du centre,False,False,78,68,146
...,...,...,...,...,...,...
179,professeur au département,False,False,34,26,60
180,chercheurs du crchum,False,False,37,30,67
181,recherche chirurgie,False,False,33,32,65
182,calendrier des conférences,False,False,40,40,80


In [31]:
algorithmes = ['K-Means', 'Expectation-Maximization', 'AgglomerativeClustering']
embeddings = ['One-Hot', 'Sentence transformers']

nb_termes = len(df['Terme'].tolist())
clusters = range(round(nb_termes/10), round(nb_termes/2))

results = []
for algorithme in algorithmes:
    for embedding in embeddings:
        for cluster in clusters:
            results.append(\
            {'algorithme' : algorithme,\
                'embedding': embedding, \
                'K (nb clusters)' : cluster, \
                'Score Silhouette': None})


# On va remplir ce dictionnaire avec les bons scores au fur et à mesure qu'on expérimente
results = DataFrame(results)
results

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,
1,K-Means,One-Hot,19,
2,K-Means,One-Hot,20,
3,K-Means,One-Hot,21,
4,K-Means,One-Hot,22,
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


In [32]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("(\w+\'|\w+-\w+|\(|\)|\w+)")

file_path = "../04-filtrage/stopwords.txt"
with open(file_path, 'r', encoding="utf-8") as f:
    stopwords = [t.lower().strip('\n') for t in f.readlines()]

def to_tokens(kw, min_chars=2):
    tokens = tokenizer.tokenize(str(kw)) # split the string into a list of words
    tokens = [word for word in tokens if len(word) > min_chars] 
    tokens = [str(word) for word in tokens if word not in stopwords] 
    
    tokens = set(tokens) # to remove duplicates
    tokens = sorted(tokens) # converts our set back to a list and sorts words in alphabetical order
    return tokens

In [33]:
df['tokens'] = df["Terme"].apply(lambda x: to_tokens(
    x,
    min_chars=2,
))


# # Test - Seulement retenir des n-grammes où n est au-dessus de 2 - en fait pas besoin
# df["len"] = df["tokens"].apply(lambda x : len(x))
# df = df[df['len'] > 1].drop(columns=["len"])

df

Unnamed: 0,Terme,isMeSHTerm,isTaxoTerm,Fréquence totale (TF),Fréquence documentaire totale (DF),TF + DF,tokens
0,chirurgiens du canada,False,False,46,46,92,"[canada, chirurgiens]"
1,réunions hebdomadaires,False,False,86,86,172,"[hebdomadaires, réunions]"
2,centre hospitalier de l'université,False,False,115,88,203,"[centre, hospitalier, université]"
3,activité de développement professionnel,False,False,43,43,86,"[activité, développement, professionnel]"
4,centre de recherche du centre,False,False,78,68,146,"[centre, recherche]"
...,...,...,...,...,...,...,...
179,professeur au département,False,False,34,26,60,"[département, professeur]"
180,chercheurs du crchum,False,False,37,30,67,"[chercheurs, crchum]"
181,recherche chirurgie,False,False,33,32,65,"[chirurgie, recherche]"
182,calendrier des conférences,False,False,40,40,80,"[calendrier, conférences]"


In [34]:
vocab = sorted(set(df["tokens"].explode()))
len(vocab)

dim = len(vocab)
dim

158

In [35]:
vocab

['activité',
 'adjoint',
 'adjointe',
 'affiliations',
 'agrégé',
 'agrégée',
 'alzheimer',
 'amphithéâtre',
 'animaux',
 'anne-marie',
 'appropriées',
 'artificielle',
 'assistance',
 'associé',
 'atteints',
 'axe',
 'besoin',
 'biologie',
 'cabinet',
 'calendrier',
 'canada',
 'canadienne',
 'cancer',
 'cancéreuses',
 'cardiaque',
 'cardiométabolique',
 'cardiovasculaires',
 'carrefour',
 'cellulaire',
 'cellules',
 'central',
 'centre',
 'chaire',
 'chaires',
 'chercheur',
 'chercheurs',
 'chirurgie',
 'chirurgiens',
 'chum',
 'clinique',
 'cliniques',
 'collège',
 'conférence',
 'conférences',
 'conférenciers',
 'continu',
 'contribution',
 'crchum',
 'demande',
 'diabète',
 'directeur',
 'département',
 'développement',
 'effets',
 'erik',
 'essais',
 'excellence',
 'faculté',
 'fondamentale',
 'fonds',
 'frcpc',
 'frcsc',
 'gratuites',
 'hebdomadaires',
 'hospitalier',
 'imagerie',
 'immunitaire',
 'immunopathologie',
 'immédiate',
 'infirmiers',
 'infirmières',
 'information',
 

In [36]:
def to_vector(keyword,vocab):
    """
    Calculates vector of keyword on given vocabulary.

    Returns vector as a list of values.  
    """
    vector = []
    for word in vocab:
        if word in keyword:
            vector.append(1)
        else:
            vector.append(0)
    return vector

## **Kmeans**

### One-Hot embedding

> One Hot encoding is a representation of categorical variables as binary vectors. Each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1.

In [37]:
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import calinski_harabasz_score


algo = 'K-Means'
embed = 'One-Hot'


counter = Counter(df["tokens"].explode().to_list())
vocab = []
for key in counter:
    vocab.append(key)

df["vector"] = df["tokens"].apply(lambda x: to_vector(x,vocab))
X = df['vector'].tolist()

df

Unnamed: 0,Terme,isMeSHTerm,isTaxoTerm,Fréquence totale (TF),Fréquence documentaire totale (DF),TF + DF,tokens,vector
0,chirurgiens du canada,False,False,46,46,92,"[canada, chirurgiens]","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,réunions hebdomadaires,False,False,86,86,172,"[hebdomadaires, réunions]","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,centre hospitalier de l'université,False,False,115,88,203,"[centre, hospitalier, université]","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,activité de développement professionnel,False,False,43,43,86,"[activité, développement, professionnel]","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ..."
4,centre de recherche du centre,False,False,78,68,146,"[centre, recherche]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
179,professeur au département,False,False,34,26,60,"[département, professeur]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
180,chercheurs du crchum,False,False,37,30,67,"[chercheurs, crchum]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
181,recherche chirurgie,False,False,33,32,65,"[chirurgie, recherche]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
182,calendrier des conférences,False,False,40,40,80,"[calendrier, conférences]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [38]:
# Run NMF
# we redo the normalization to improve the k-means result.
nmf = NMF(n_components= round(len(vocab)/5))
normalizer = Normalizer(copy=False)
pip = make_pipeline(nmf, normalizer)

X = pip.fit_transform(X)
for k in clusters:
    kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
    labels = kmeans.labels_

    score  =  silhouette_score(X, labels)
    results.loc[((results['algorithme'] == algo) & \
                (results['embedding'] == embed) & \
                (results['K (nb clusters)'] == k)), 'Score Silhouette'] = score


results

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.484659
1,K-Means,One-Hot,19,0.496138
2,K-Means,One-Hot,20,0.517902
3,K-Means,One-Hot,21,0.548243
4,K-Means,One-Hot,22,0.561945
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


*Stock resulting clusters into a CSV file*

In [39]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))]
                  
# Essayer d'extraire des courbes / graphiques à partir de ce tableau
tab

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.484659
1,K-Means,One-Hot,19,0.496138
2,K-Means,One-Hot,20,0.517902
3,K-Means,One-Hot,21,0.548243
4,K-Means,One-Hot,22,0.561945
...,...,...,...,...
69,K-Means,One-Hot,87,0.647394
70,K-Means,One-Hot,88,0.641593
71,K-Means,One-Hot,89,0.635404
72,K-Means,One-Hot,90,0.632076


In [40]:
params = tab[((tab['algorithme'] == algo) & \
                  (tab['embedding'] == embed) & \
                 (tab['Score Silhouette'] == tab['Score Silhouette'].max()))]

params

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
47,K-Means,One-Hot,65,0.680822


In [41]:
k = params['K (nb clusters)'].values[0]
k

65

In [42]:
kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
labels = kmeans.labels_
df["kmeans"] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(kmeans.labels_.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["kmeans"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['kmeans'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

### Sentence transformers embedding

> "A **transformer** is a deep learning model that adopts the mechanism of self-attention, differentially weighting the significance of each part  of the input data.
Transformers are increasingly the model of choice for NLP problems, replacing RNN models such as long short-term memory (LSTM). The additional  training parallelization allows training on larger datasets. This led to the development of pretrained systems such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which were trained with large language datasets, such as the Wikipedia Corpus and Common Crawl, and can be fine-tuned for specific tasks."   
  
(https://en.wikipedia.org/wiki/Transformer_(machine_learning_model))


In [43]:
from sentence_transformers import SentenceTransformer, models
import torch

# On va utiliser un modèle BERT/sentence transformers (fr) pour extraire nos embeddings plutôt que des simples one-hot encoding
model =  SentenceTransformer("dangvantuan/sentence-camembert-base")

sentences = df['Terme'].tolist()
embeddings_st = model.encode(sentences, convert_to_numpy=True)



In [44]:
algo = 'K-Means'
embed = 'Sentence transformers'

X = embeddings_st

for k in clusters:
    kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
    labels = kmeans.labels_

    score  =  silhouette_score(X, labels)
    results.loc[((results['algorithme'] == algo) & \
                (results['K (nb clusters)'] == k) & \
                (results['embedding'] == embed)), 'Score Silhouette'] = score


results
    

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.484659
1,K-Means,One-Hot,19,0.496138
2,K-Means,One-Hot,20,0.517902
3,K-Means,One-Hot,21,0.548243
4,K-Means,One-Hot,22,0.561945
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


*Stock resulting clusters into a CSV file*

In [45]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))]
                  
# Essayer d'extraire des courbes / graphiques à partir de ce tableau
tab

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
74,K-Means,Sentence transformers,18,0.165266
75,K-Means,Sentence transformers,19,0.168499
76,K-Means,Sentence transformers,20,0.175649
77,K-Means,Sentence transformers,21,0.166413
78,K-Means,Sentence transformers,22,0.168207
...,...,...,...,...
143,K-Means,Sentence transformers,87,0.258137
144,K-Means,Sentence transformers,88,0.261278
145,K-Means,Sentence transformers,89,0.261667
146,K-Means,Sentence transformers,90,0.262704


In [46]:
params = tab[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed) & \
                 (tab['Score Silhouette'] == tab['Score Silhouette'].max()))]

params

  params = tab[((results['algorithme'] == algo) & \


Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
147,K-Means,Sentence transformers,91,0.263134


In [47]:
k = params['K (nb clusters)'].values[0]
kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
labels = kmeans.labels_
df["kmeans"] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(kmeans.labels_.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["kmeans"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['kmeans'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

## **Expectation-Maximization**

In [48]:
from sklearn.mixture import GaussianMixture

### One-Hot embedding

In [49]:
algo = 'Expectation-Maximization'
embed = 'One-Hot'


X = df['vector'].tolist()
for k in clusters:
    gmm = GaussianMixture(n_components=k, init_params='k-means++', covariance_type='diag').fit(X) # diag pour gérer MemoryError
    labels = gmm.predict(X)

    score  =  silhouette_score(X, labels)
    results.loc[((results['algorithme'] == algo) & \
                (results['K (nb clusters)'] == k) & \
                (results['embedding'] == embed)), 'Score Silhouette'] = score

results

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.484659
1,K-Means,One-Hot,19,0.496138
2,K-Means,One-Hot,20,0.517902
3,K-Means,One-Hot,21,0.548243
4,K-Means,One-Hot,22,0.561945
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


*Stock resulting clusters into a CSV file*

In [50]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))]
                  
# Essayer d'extraire des courbes / graphiques à partir de ce tableau
tab

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
148,Expectation-Maximization,One-Hot,18,0.071438
149,Expectation-Maximization,One-Hot,19,0.076719
150,Expectation-Maximization,One-Hot,20,0.077082
151,Expectation-Maximization,One-Hot,21,0.093755
152,Expectation-Maximization,One-Hot,22,0.080677
...,...,...,...,...
217,Expectation-Maximization,One-Hot,87,0.133336
218,Expectation-Maximization,One-Hot,88,0.111278
219,Expectation-Maximization,One-Hot,89,0.131585
220,Expectation-Maximization,One-Hot,90,0.125427


In [51]:
params = tab[(tab['Score Silhouette'] == tab['Score Silhouette'].max())]

params

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
205,Expectation-Maximization,One-Hot,75,0.14966


In [52]:
k = params['K (nb clusters)'].values[0]
gmm = GaussianMixture(n_components=k, init_params='k-means++', covariance_type='diag').fit(X) # diag pour gérer MemoryError
labels = gmm.predict(X)
df["clusters"] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(kmeans.labels_.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["clusters"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['clusters'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

IndexError: index 0 is out of bounds for axis 0 with size 0

### Sentence transformers embedding

In [None]:
algo = 'Expectation-Maximization'
embed = 'Sentence transformers'

X = embeddings_st

for k in clusters:
    gmm = GaussianMixture(n_components=k, init_params='k-means++', covariance_type='diag').fit(X) # diag pour gérer MemoryError
    labels = gmm.predict(X)

    score  =  silhouette_score(X, labels)
    results.loc[((results['algorithme'] == 'Expectation-Maximization') & \
                (results['K (nb clusters)'] == k) & \
                (results['embedding'] == 'Sentence transformers')), 'Score Silhouette'] = score

results
    

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.439495
1,K-Means,One-Hot,19,0.462772
2,K-Means,One-Hot,20,0.480734
3,K-Means,One-Hot,21,0.499245
4,K-Means,One-Hot,22,0.511111
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


*Stock resulting clusters into a CSV file*

In [None]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))]
                  
# Essayer d'extraire des courbes / graphiques à partir de ce tableau
tab

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
222,Expectation-Maximization,Sentence transformers,18,0.148081
223,Expectation-Maximization,Sentence transformers,19,0.138617
224,Expectation-Maximization,Sentence transformers,20,0.142843
225,Expectation-Maximization,Sentence transformers,21,0.162941
226,Expectation-Maximization,Sentence transformers,22,0.16109
...,...,...,...,...
291,Expectation-Maximization,Sentence transformers,87,0.252432
292,Expectation-Maximization,Sentence transformers,88,0.253908
293,Expectation-Maximization,Sentence transformers,89,0.244674
294,Expectation-Maximization,Sentence transformers,90,0.235876


In [None]:
params = tab[(tab['Score Silhouette'] == tab['Score Silhouette'].max())]

params

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
290,Expectation-Maximization,Sentence transformers,86,0.272454


In [None]:
k = params['K (nb clusters)'].values[0]
gmm = GaussianMixture(n_components=k, init_params='k-means++', covariance_type='diag').fit(X) # diag pour gérer MemoryError
labels = gmm.predict(X)
df["clusters"] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(kmeans.labels_.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["clusters"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['clusters'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

## **Agglomerative Clustering**

### One-Hot embedding

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
algo = 'AgglomerativeClustering'
embed = 'One-Hot'

for k in clusters:
    model = AgglomerativeClustering().fit(X)
    labels = model.labels_

    score  =  silhouette_score(X, labels)
    results.loc[((results['algorithme'] == 'AgglomerativeClustering') &
                (results['K (nb clusters)'] == k) & \
                (results['embedding'] == 'One-Hot')), 'Score Silhouette'] = score

results

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.439495
1,K-Means,One-Hot,19,0.462772
2,K-Means,One-Hot,20,0.480734
3,K-Means,One-Hot,21,0.499245
4,K-Means,One-Hot,22,0.511111
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,
440,AgglomerativeClustering,Sentence transformers,88,
441,AgglomerativeClustering,Sentence transformers,89,
442,AgglomerativeClustering,Sentence transformers,90,


*Stock resulting clusters into a CSV file*

In [None]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))]
                  
# Essayer d'extraire des courbes / graphiques à partir de ce tableau
tab

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
296,AgglomerativeClustering,One-Hot,18,0.027938
297,AgglomerativeClustering,One-Hot,19,0.027938
298,AgglomerativeClustering,One-Hot,20,0.027938
299,AgglomerativeClustering,One-Hot,21,0.027938
300,AgglomerativeClustering,One-Hot,22,0.027938
...,...,...,...,...
365,AgglomerativeClustering,One-Hot,87,0.027938
366,AgglomerativeClustering,One-Hot,88,0.027938
367,AgglomerativeClustering,One-Hot,89,0.027938
368,AgglomerativeClustering,One-Hot,90,0.027938


In [None]:
params = tab[(tab['Score Silhouette'] == tab['Score Silhouette'].max())]

params

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
296,AgglomerativeClustering,One-Hot,18,0.027938
297,AgglomerativeClustering,One-Hot,19,0.027938
298,AgglomerativeClustering,One-Hot,20,0.027938
299,AgglomerativeClustering,One-Hot,21,0.027938
300,AgglomerativeClustering,One-Hot,22,0.027938
...,...,...,...,...
365,AgglomerativeClustering,One-Hot,87,0.027938
366,AgglomerativeClustering,One-Hot,88,0.027938
367,AgglomerativeClustering,One-Hot,89,0.027938
368,AgglomerativeClustering,One-Hot,90,0.027938


In [None]:
k = params['K (nb clusters)'].values[0]
model = AgglomerativeClustering().fit(X)
labels = model.labels_
df["kmeans"] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(kmeans.labels_.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["kmeans"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['kmeans'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

### Sentence transformers embedding

In [None]:
algo = 'AgglomerativeClustering'
embed = 'Sentence transformers'


# Normalize the embeddings to unit length
X = embeddings_st /  np.linalg.norm(embeddings_st, axis=1, keepdims=True)

# Perform kmean clustering
gmm = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5) #, affinity='cosine', linkage='average', distance_threshold=0.4)
gmm.fit(X)

labels = gmm.labels_

score  =  silhouette_score(X, labels)
results.loc[((results['algorithme'] == algo) & \
            (results['embedding'] == embed)), 'Score Silhouette'] = score


# for k in clusters:
#     gaac = AgglomerativeClustering().fit(X)
#     labels = gaac.labels_

#     score  =  silhouette_score(X, labels)
#     results.loc[((results['algorithme'] == algo) & \
#                 (results['K (nb clusters)'] == k) & \
#                 (results['embedding'] == embed)), 'Score Silhouette'] = score

results
    

Unnamed: 0,algorithme,embedding,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,18,0.439495
1,K-Means,One-Hot,19,0.462772
2,K-Means,One-Hot,20,0.480734
3,K-Means,One-Hot,21,0.499245
4,K-Means,One-Hot,22,0.511111
...,...,...,...,...
439,AgglomerativeClustering,Sentence transformers,87,0.237712
440,AgglomerativeClustering,Sentence transformers,88,0.237712
441,AgglomerativeClustering,Sentence transformers,89,0.237712
442,AgglomerativeClustering,Sentence transformers,90,0.237712


*Stock resulting clusters into a CSV file*

In [None]:
tab = results[((results['algorithme'] == algo) & \
                  (results['embedding'] == embed))].drop(columns=['K (nb clusters)']).drop_duplicates()
                  
tab

Unnamed: 0,algorithme,embedding,Score Silhouette
370,AgglomerativeClustering,Sentence transformers,0.237712


In [None]:
df['clusters'] = labels

# Pour mieux interpréter, on assigne un label significatif à nos clusters
# On retient le terme pour chaque cluster dont la valeur TF + DF est la plus élevée
current_labels = set(labels.tolist())

desired_labels = {x : None for x in current_labels} # (on initialise à None)

for label in current_labels:
    cluster = df[df["clusters"] == label]
    max_freq = cluster['TF + DF'].max()
    new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values[0]

    desired_labels[label] = new_label

df['Cluster'] = df['clusters'].map(desired_labels)

df.sort_values(["Cluster"], 
        axis=0,
        ascending=[False], 
        inplace=True)


# On stocke les résultats dans un CSV
base_path = '../06-clustering/'
file_path = base_path + algo + '_' + embed + '.csv'
df[['Cluster', 'Terme', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)', 'TF + DF']].sort_values(['Cluster', 'Fréquence totale (TF)', 'Fréquence documentaire totale (DF)'],
            ascending = [True, False, False]).to_csv(file_path)

df = df.drop(columns=['Cluster'])

In [None]:
base_path = '../06-clustering/'
file_path = base_path + 'results_clustering_silhouette.csv'
results.to_csv(file_path)