In [2]:
import glob
from pandas import *
from collections import Counter

In [3]:
algorithmes = ['K-Means', 'Expectation-Maximization']
embeddings = ['One-Hot', 'Sentence transformers']
features = ['10%', '20%', '30%', '40%', '50%', '75%', '100%'] # (100% = aucune réduction de dimensionnalité)
clusters = [50, 100, 150, 200]

results = []
for algorithme in algorithmes:
    for embedding in embeddings:
        for feature in features:
            for cluster in clusters:
                results.append(\
                {'algorithme' : algorithme,\
                    'embedding': embedding, \
                    'N features': feature, \
                    'K (nb clusters)' : cluster,\
                    'Score Silhouette': None})


# On va remplir ce dictionnaire avec les bons scores au fur et à mesure qu'on expérimente
results = DataFrame(results)
results

Unnamed: 0,algorithme,embedding,N features,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,10%,50,
1,K-Means,One-Hot,10%,100,
2,K-Means,One-Hot,10%,150,
3,K-Means,One-Hot,10%,200,
4,K-Means,One-Hot,20%,50,
...,...,...,...,...,...
75,Expectation-Maximization,Sentence transformers,40%,200,
76,Expectation-Maximization,Sentence transformers,50%,50,
77,Expectation-Maximization,Sentence transformers,50%,100,
78,Expectation-Maximization,Sentence transformers,50%,150,


In [4]:
# get data file names
path ='../05-transformation'
filenames = glob.glob(path + "/*.csv")

dfs = []
for filename in filenames:
    dfs.append(read_csv(filename))

# Concatenate all data into one DataFrame
big_frame = concat(dfs, ignore_index=True).drop(columns=["Unnamed: 0", 'Structure syntaxique', 'LLR', 'TF (sklearn)', 'DF (sklearn)', 'TF-IDF', 'OkapiBM25', 'Terme formatté'])
big_frame['Fréquence totale (TF)'] = big_frame.groupby(['Terme'])['Fréquence (TF)'].transform('sum')
big_frame['Fréquence documentaire totale (DF)'] = big_frame.groupby(['Terme'])['Fréquence documentaire (DF)'].transform('sum')
big_frame = big_frame.drop(columns = ['Corpus', 'Fréquence (TF)', 'Fréquence documentaire (DF)'])
big_frame = big_frame.drop_duplicates(subset=['Terme'])
big_frame['Terme'] = big_frame['Terme'].astype('str')

big_frame

Unnamed: 0,Terme,isMeSHTerm,isTaxoTerm,Fréquence totale (TF),Fréquence documentaire totale (DF)
0,clinique de cognition,False,False,24,6
1,problèmes liés,False,False,42,30
2,usagers du sud-ouest-verdun,False,False,24,24
3,accès aux services secteurs des faubourgs,False,False,58,58
4,gynécologie,False,True,20,14
...,...,...,...,...,...
12641,vie professionnelle favorisant la formation,False,False,68,68
12643,jours de congé,False,False,69,61
12645,jour d'utilisation,False,False,61,61
12646,statut temps complet permanent salaire,False,False,19,19


In [5]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("(\w+\'|\w+-\w+|\(|\)|\w+)")

file_path = "../04-filtrage/stopwords.txt"
with open(file_path, 'r', encoding="utf-8") as f:
    stopwords = [t.lower().strip('\n') for t in f.readlines()]

def to_tokens(kw, min_chars=2):
    tokens = tokenizer.tokenize(str(kw)) # split the string into a list of words
    tokens = [word for word in tokens if len(word) > min_chars] 
    tokens = [str(word) for word in tokens if word not in stopwords] 
    
    tokens = set(tokens) # to remove duplicates
    tokens = sorted(tokens) # converts our set back to a list and sorts words in alphabetical order
    return tokens

In [6]:
big_frame['tokens'] = big_frame["Terme"].apply(lambda x: to_tokens(
    x,
    min_chars=2,
)).astype(str)

big_frame

Unnamed: 0,Terme,isMeSHTerm,isTaxoTerm,Fréquence totale (TF),Fréquence documentaire totale (DF),tokens
0,clinique de cognition,False,False,24,6,"['clinique', 'cognition']"
1,problèmes liés,False,False,42,30,"['liés', 'problèmes']"
2,usagers du sud-ouest-verdun,False,False,24,24,"['sud-ouest', 'usagers', 'verdun']"
3,accès aux services secteurs des faubourgs,False,False,58,58,"['accès', 'faubourgs', 'secteurs', 'services']"
4,gynécologie,False,True,20,14,['gynécologie']
...,...,...,...,...,...,...
12641,vie professionnelle favorisant la formation,False,False,68,68,"['favorisant', 'formation', 'professionnelle',..."
12643,jours de congé,False,False,69,61,"['congé', 'jours']"
12645,jour d'utilisation,False,False,61,61,"['jour', 'utilisation']"
12646,statut temps complet permanent salaire,False,False,19,19,"['complet', 'permanent', 'salaire', 'statut', ..."


In [7]:
vocab = sorted(set(big_frame["tokens"].explode()))
len(vocab)

dim = len(vocab)
dim

11530

In [8]:
def to_vector(keyword,vocab):
    """
    Calculates vector of keyword on given vocabulary.

    Returns vector as a list of values.  
    """
    vector = []
    for word in vocab:
        if word in keyword:
            vector.append(1)
        else:
            vector.append(0)
    return vector

## **Kmeans**

### One-Hot embedding

In [9]:
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

algo = 'K-Means'
embed = 'One-Hot'

for ratio in features:
    r = float(ratio.strip('%'))/100
    n = round(r * dim)
    counter = Counter(big_frame["tokens"].explode().to_list())
    vocab = []
    for key,value in counter.most_common(n):
        vocab.append(key)
    
    big_frame["vector"] = big_frame["tokens"].apply(lambda x: to_vector(x,vocab))

    X = big_frame['vector'].tolist()
    for k in clusters:
        kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
        labels = kmeans.labels_

        score  =  silhouette_score(X, labels)
        results.loc[((results['algorithme'] == algo) & \
                    (results['N features'] == ratio) & \
                    (results['K (nb clusters)'] == k) & \
                    (results['embedding'] == embed)), 'Score Silhouette'] = score

results
    

Unnamed: 0,algorithme,embedding,N features,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,10%,50,0.815169
1,K-Means,One-Hot,10%,100,0.82806
2,K-Means,One-Hot,10%,150,0.843009
3,K-Means,One-Hot,10%,200,0.854241
4,K-Means,One-Hot,20%,50,0.669468
...,...,...,...,...,...
75,Expectation-Maximization,Sentence transformers,40%,200,
76,Expectation-Maximization,Sentence transformers,50%,50,
77,Expectation-Maximization,Sentence transformers,50%,100,
78,Expectation-Maximization,Sentence transformers,50%,150,


### Sentence transformers embedding

In [10]:
# On va utiliser un modèle BERT/sentence transformers (fr) pour extraire nos embeddings plutôt que des simples one-hot encoding
from sentence_transformers import SentenceTransformer
model =  SentenceTransformer("dangvantuan/sentence-camembert-base")

sentences = big_frame['Terme'].tolist()
embeddings_st = model.encode(sentences, convert_to_numpy=True)

In [11]:
# from collections import Counter
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# import numpy as np

# for ratio in features:
#     r = float(ratio.strip('%'))/100
#     n = round(r * dim)
#     counter = Counter(big_frame["tokens"].explode().to_list())
#     vocab = []
#     for key,value in counter.most_common(n):
#         vocab.append(key)
    
#     big_frame["vector"] = big_frame["tokens"].apply(lambda x: to_vector(x,vocab))

#     X = embeddings_st

#     ###### APPLIQUER PCA AVEC UN NB DE DIMENSION CORRESPONDANT À R POUR FAIRE VARIER LA DIMENSION DES VECTEURS #########

#     for k in clusters:
#         kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)

#         score  =  silhouette_score(X, kmeans.labels_)
#         results.loc[(results['algorithme'] == 'K-Means') & \
#                     ((results['N features'] == ratio) & \
#                     (results['K (nb clusters)'] == k) & \
#                     results['embedding' == 'Sentence transformers']), 'Score Silhouette'] = score

# results
    

## **Expectation-Maximization**

In [12]:
from sklearn.mixture import GaussianMixture

### One-Hot embedding

In [14]:
for ratio in features:
    r = float(ratio.strip('%'))/100
    n = round(r * dim)
    counter = Counter(big_frame["tokens"].explode().to_list())
    vocab = []
    for key,value in counter.most_common(n):
        vocab.append(key)
    
    big_frame["vector"] = big_frame["tokens"].apply(lambda x: to_vector(x,vocab))

    X = big_frame['vector'].tolist()
    for k in clusters:
        gmm = GaussianMixture(n_components=k, init_params='k-means++', covariance_type='diag').fit(X) # diag pour gérer MemoryError
        labels = gmm.predict(X)

        score  =  silhouette_score(X, labels)
        results.loc[((results['algorithme'] == 'Expectation-Maximization') &
                    (results['N features'] == ratio) & \
                    (results['K (nb clusters)'] == k) & \
                    (results['embedding'] == 'One-Hot')), 'Score Silhouette'] = score

results
    

Unnamed: 0,algorithme,embedding,N features,K (nb clusters),Score Silhouette
0,K-Means,One-Hot,10%,50,0.815169
1,K-Means,One-Hot,10%,100,0.82806
2,K-Means,One-Hot,10%,150,0.843009
3,K-Means,One-Hot,10%,200,0.854241
4,K-Means,One-Hot,20%,50,0.669468
...,...,...,...,...,...
75,Expectation-Maximization,Sentence transformers,40%,200,
76,Expectation-Maximization,Sentence transformers,50%,50,
77,Expectation-Maximization,Sentence transformers,50%,100,
78,Expectation-Maximization,Sentence transformers,50%,150,


### Sentence transformers embedding

In [None]:
##########

## **Agglomerative Clustering**

### One-Hot embedding

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
for ratio in features:
    r = float(ratio.strip('%'))/100
    n = round(r * dim)
    counter = Counter(big_frame["tokens"].explode().to_list())
    vocab = []
    for key,value in counter.most_common(n):
        vocab.append(key)
    
    big_frame["vector"] = big_frame["tokens"].apply(lambda x: to_vector(x,vocab))

    X = big_frame['vector'].tolist()
    for k in clusters:
        model = AgglomerativeClustering().fit(X)
        labels = model.labels_

        score  =  silhouette_score(X, labels)
        results.loc[((results['algorithme'] == 'AgglomerativeClustering') &
                    (results['N features'] == ratio) & \
                    (results['K (nb clusters)'] == k) & \
                    (results['embedding'] == 'One-Hot')), 'Score Silhouette'] = score

results
    

### Sentence transformers embedding

In [None]:
##########