## **4. Keywords Clustering**
One-Hot embeddings | K-Means

We will compare models implemeting different combinations of these parameters:
- **Number of features** : ranging from 60% to 100% of total features (100 % = no dimensionality reduction)
- **Number of clusters** : ranging from N/10, N/2 the total number of terms to be clustered

Online Tutorial followed :   
https://colab.research.google.com/drive/1HHNFjKlip1AaFIuvvn0AicWyv6egLOZw?usp=sharing#scrollTo=Ya0TkMAJYvAM

________________________________________________________________________________________________________________________________________

### **Embedding** : One-Hot Encoding

> One Hot encoding is a representation of categorical variables as binary vectors. Each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1.

In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score
from nltk.tokenize import RegexpTokenizer
regex = "[\w+-]+|\([\s+\w+\d+-]+\)|\w+|\w"
tokenizex = RegexpTokenizer(regex)

In [18]:
file_path = "../06-clustering/candidate_terms.csv"
file_path = '../06-clustering/candidate_terms.csv'
with open(file_path, encoding='utf-8') as f:
    df = pd.read_csv(f).drop(columns=["Unnamed: 0"])
    df['TF + DF'] = df['TF'] + df['DF']
    terms = df['Terme'].tolist()

In [19]:
file_path = "../04-filtrage/stopwords.txt"
with open(file_path, 'r', encoding="utf-8") as f:
    stop = [t.lower().strip('\n') for t in f.readlines()]

def to_tokens(kw, min_chars=2):
    tokens = tokenizex.tokenize(str(kw)) # split the string into a list of words
    tokens = [word for word in tokens if len(word) > min_chars] 
    tokens = [str(word) for word in tokens if word not in stop] 
    
    tokens = set(tokens) # to remove duplicates
    tokens = sorted(tokens) # converts our set back to a list and sorts words in alphabetical order
    return tokens

df["tokens"] = df["Terme"].apply(lambda x: to_tokens(
    x,
    min_chars=3,
))

In [20]:
vocab = sorted(set(df["tokens"].astype('str').explode()))

In [21]:
def to_vector(keyword,vocab):
    """
    Calculates vector of keyword on given vocabulary.

    Returns vector as a list of values.  
    """
    vector = []
    for word in vocab:
        if word in keyword:
            vector.append(1)
        else:
            vector.append(0)
    return vector

df["vector"] = df["tokens"].astype('str').apply(lambda x: to_vector(x,vocab))

Unnamed: 0,Corpora,Terme,Structure syntaxique,Forme lemmatisée,isMeSHTerm,MeSHID,MesH_prefLabel_fr,MesH_prefLabel_en,isTaxoTerm,Log Likelihood,TF,DF,TF*IDF,OKapiBM25,TF + DF,tokens,vector
0,"['chum', 'chuqc', 'chusj', 'cisss_ca', 'cisss_...",services sociaux,NOM ADJ,service social,True,D012947,Services sociaux et travail social (activité),Social Work,True,1674.908057,40189,15418,1.000000,26.361483,55607,"[services, sociaux]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"['chum', 'chuqc', 'cisss_ca', 'cisss_cotenord'...",santé publique,NOM ADJ,santé public,True,D011634,Santé publique,Public Health,True,1572.987576,32510,11194,1.000000,18.947189,43704,"[publique, santé]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"['chum', 'chuqc', 'chusj', 'cisss_ca', 'cisss_...",santé mentale,NOM ADJ,santé mental,True,D008603,Santé mentale,Mental Health,True,1579.080827,13229,4795,1.000000,24.142062,18024,"[mentale, santé]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"['chuqc', 'chusj', 'cisss_ca', 'cisss_cotenord...",ministère de la santé,NOM PRP DET:ART NOM,ministère de le santé,False,,,,False,1411.378559,10741,7142,0.553007,21.734060,17883,"[ministère, santé]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"['chuqc', 'chusj', 'cisss_ca', 'cisss_cotenord...",ministère de la santé et des services,NOM PRP DET:ART NOM KON PRP:det NOM,ministère de le santé et des service,False,,,,False,1107.888160,10560,7061,0.553007,31.111185,17621,"[ministère, santé, services]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12609,['santeestrie'],centre de crise,NOM PRP NOM,centre de crise,False,,,,True,84.828851,3,3,0.136894,8.582629,6,"[centre, crise]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12610,['inesss'],carcinome rénal,NOM ADJ,carcinome rénal,True,D002292,Néphrocarcinome,"Carcinoma, Renal Cell",False,61.813956,3,3,0.141561,18.359316,6,"[carcinome, rénal]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12611,['inesss'],durée de vie,NOM PRP NOM,durée de vie,True,D008136,Longévité,Longevity,False,434.083437,3,3,0.184051,21.441105,6,[durée],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12612,['laval_sante'],commotion cérébrale,NOM ADJ,commotion cérébral,True,D001924,Commotion de l'encéphale,Brain Concussion,True,388.209903,3,3,0.557684,13.725435,6,"[commotion, cérébrale]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [25]:
df[['Terme','MesH_prefLabel_en', 'TF', 'vector']].to_csv('../06-clustering/embeddings_oneHot.csv')

In [None]:
features = ['60%', '70%', '80%', '90%', '100%'] # (100% = aucune réduction de dimensionnalité)
nb_termes = len(df['Terme'].tolist())
clusters = range(round(nb_termes/10), round(nb_termes/2), 50)

results = []
for feature in features:
    for cluster in clusters:
        results.append({'N features': feature, 'K (nb clusters)' : cluster})


# On va remplir ce dictionnaire avec les bons scores au fur et à mesure qu'on expérimente
results = pd.DataFrame(results)
results

#### **K-Means**

In [None]:
from collections import Counter
import numpy as np

algo = 'K-Means'

for k in clusters:
    for ratio in features:
        r = float(ratio.strip('%'))/100
        n = round(r * len(vocab))
        
        counter = Counter(df["tokens"].explode().to_list())
        vocab = []
        for key,value in counter.most_common(n):
            vocab.append(key)
        
        df["vector"] = df["tokens"].apply(lambda x: to_vector(x,vocab))

        X = df['vector'].tolist()
        kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
        labels = kmeans.labels_

        silhouette  =  silhouette_score(X, labels)
        davies = davies_bouldin_score(X, labels)
        results.loc[((results['N features'] == ratio) & \
                    (results['K (nb clusters)'] == k)), 'Score Silhouette'] = silhouette

        results.loc[((results['N features'] == ratio) & \
                    (results['K (nb clusters)'] == k)), 'Davies–Bouldin index'] = davies        

        # results
        print(k, silhouette, davies)

In [None]:
file_path = '../06-Clustering/results_One-Hot_K-means.csv'
results.dropna().to_csv(file_path)
results.dropna()

In [14]:
############################
# Choix des paramètres

k = 500
n_features = 100

############################

X = df['vector'].tolist() # Aucune dimension de réductionnalité
kmeans = KMeans(n_clusters=k, init='k-means++', algorithm='elkan', random_state=0, n_init=1, max_iter=200).fit(X)
labels = kmeans.labels_
df["kmeans"] = labels

# Attribuer un label pertinent 
# Attribuer un label significatif aux clusters 
# Deux options :
#   1 - S'il y a un token en commun  entre les termes d'un même clusters, le clusters sera désigné par celui-ci
#   2 - Sinon, on prend le terme dont la somme TF + DF est la plus élevée 
# rename the clusters to the shortest keyword in the cluster

labels = set(df['kmeans'].tolist())
desired_labels = {x : None for x in labels} # (on initialise à None)
for label in labels:
    d = df[df['kmeans'] == label]['tokens'].tolist()
    new_label = list(set.intersection(*map(set,d)))
    try:
        desired_labels[label] = new_label[0]
    except:
            cluster = df[df["kmeans"] == label]
            max_freq = cluster['TF + DF'].max()
            new_label = cluster[cluster['TF + DF'] == max_freq]['Terme'].values
            desired_labels[label] = new_label[0]

df['Cluster'] = df['kmeans'].map(desired_labels)

# move the cluster and Term columns to the front
col = df.pop("Terme")
df.insert(0, col.name, col)

col = df.pop('Cluster')
df.insert(0, col.name, col)

df.sort_values(["Cluster", "Terme"], ascending=[True, True], inplace=True)
df = df.drop(columns=['tokens', 'vector', 'kmeans'])
df

Unnamed: 0,Cluster,Terme,Corpora,Structure syntaxique,Forme lemmatisée,isMeSHTerm,MeSHID,MesH_prefLabel_fr,MesH_prefLabel_en,isTaxoTerm,Log Likelihood,TF,DF,TF*IDF,OKapiBM25,TF + DF
11186,(france),cancer (france),['iucpq'],NOM PUN NOM PUN,cancer (france ),False,,,,False,394.812287,41,27,0.540614,17.687338,68
9554,(loretteville),clsc de la jacques-cartier (loretteville),['ciusss_cn'],NOM PRP DET:ART NOM PUN NOM PUN,clsc de le jacques-cartier (loretteville ),False,,,,False,459.466800,52,52,0.120608,18.372014,104
5410,accès,centre d'accès,['ciusss_estmtl'],NOM PRP NOM,centre de accès,False,,,,False,1103.615521,100,81,0.861189,4.907043,181
4882,actif,audit de potentiel piétonnier actif,['sante_mtl'],NOM PRP NOM ADJ NOM,audit de potentiel piétonnier actif,False,,,,False,-inf,113,90,0.340833,6.736669,203
11309,actif,jeu actif,['ciusss_mcq'],NOM ADJ,jeu actif,False,,,,False,287.046764,40,40,0.147041,11.835291,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8377,évaluation,évaluation oncaspar pegaspargase servier trait...,['inesss'],NOM ADJ NOM ADJ NOM,évaluation oncaspar pegaspargase servier trait...,False,,,,False,619.808154,63,63,0.008053,17.295587,126
7960,évaluations,mode continu autres évaluations,['inesss'],NOM ADJ ADJ NOM,mode continu autre évaluation,False,,,,False,914.220458,66,66,0.007977,10.422017,132
7010,évaluations,évaluations en mode continu,['inesss'],NOM PRP NOM ADJ,évaluation en mode continu,False,,,,False,1238.025325,68,68,0.038305,18.633351,136
7398,évaluations,évaluations en mode continu autres évaluations,['inesss'],NOM PRP NOM ADJ ADJ NOM,évaluation en mode continu autre évaluation,False,,,,False,914.220458,66,66,0.007977,7.090685,132


In [15]:
file_path = '../06-clustering/clusters_One-Hot_K-Means.csv'
df.to_csv(file_path, index=False)