## **3. Pondération statistique** (TF-IDF / OKapiBM25)  

https://stackoverflow.com/questions/46580932/calculate-tf-idf-using-sklearn-for-n-grams-in-python  
http://scikit-learn.sourceforge.net/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn-feature-extraction-text-tfidfvectorizer  
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://pypi.org/project/rank-bm25/

In [1]:
path = '../04-filtrage/output/'
acteur = 'msss'
tag = ''

if tag:
    csv_file = acteur + '_' + tag + '_significant-collocations.csv'

else:
    csv_file = acteur + '_significant-collocations.csv'

### **Lire le vocabulaire** (termes retenus au prétraitement)

In [2]:
from pandas import *

with open(path+csv_file, encoding='utf-8') as f:
    csv = read_csv(f)[["Collocation", "Structure syntaxique", "Fréquence", "LLR", "p-value"]]

csv

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value
0,services sociaux,NOM ADJ,10268,-inf,1.000000e+00
1,ministère de la santé,NOM PRP DET:ART NOM,3698,-inf,1.000000e+00
2,santé publique,NOM ADJ,3559,-inf,1.000000e+00
3,indicateurs de gestion,NOM PRP NOM,2876,-inf,1.000000e+00
4,répertoire des indicateurs,NOM PRP:det NOM,2542,-inf,1.000000e+00
...,...,...,...,...,...
40764,commande de fmo,NOM PRP NOM,4,76.534351,2.164156e-18
40765,idée d'une augmentation,NOM PRP DET:ART NOM,4,67.186266,2.470290e-16
40766,compat utf 8'communiqué,NOM ADJ ADJ,0,98.970715,2.562634e-23
40767,compat utf 8'communiqué de presse,NOM ADJ ADJ PRP NOM,0,116.413102,3.859238e-27


In [3]:
vocabulaire = [t.lower() for t in list(csv['Collocation'])]

In [4]:
print('On a un vocabulaire de {} formes.'.format(len(vocabulaire)))
vocabulaire

On a un vocabulaire de 40769 formes.


['services sociaux',
 'ministère de la santé',
 'santé publique',
 'indicateurs de gestion',
 'répertoire des indicateurs',
 'répertoire des indicateurs de gestion',
 'pandémie de la covid',
 'santé et services',
 'santé et services sociaux',
 'gestion en santé et services sociaux',
 'gestion en santé',
 'gestion en santé et services',
 'indicateurs de gestion en santé',
 'réseau de la santé',
 'soutien à domicile',
 'méthode de calcul',
 'bilan de la dernière journée',
 'bilan de la dernière',
 'santé mentale',
 'prélèvements réalisés',
 'santé physique',
 'disponibilité des données',
 'soins intensifs',
 'décision de la demande',
 'ministre de la santé',
 'institut national',
 'nombre total',
 'santé publique du québec',
 'année financière',
 'gestionnaire principal',
 'institut national de santé',
 'institut national de santé publique',
 'doses administrées',
 'jeunes en difficulté',
 'services sociaux msss',
 'renseignements administratifs',
 'objectif cible',
 "statut de l'indicat

### **Lire le corpus**

In [6]:
import os, shutil, re
from pathlib import Path
from os import path
from pandas import *

base_path = '../03-corpus/2-data/1-fr/'
if tag:
    base_path = path.join(base_path, acteur, acteur + '_' + tag + '.csv')

else:
    base_path = path.join(base_path, acteur +  '.csv')
        
with open(base_path, "r", encoding = "UTF-8") as f:
    data = read_csv(base_path, sep=',')
    text = data['text'].tolist()

In [7]:
text = text[:round(len(text))]

nb_docs = len(text)

print("On a donc un corpus de {} documents.".format(nb_docs))

On a donc un corpus de 4859 documents.


### **Nettoyage**

In [9]:
corpus = [str(t).strip('\n').lower().replace('’', '\'') for t in text]
    
punct = '[!#$%&\(\)•►*+,-\/:;<=>?@[\]^_{|}~©«»—“”–—]'
spaces = '\s+'
postals = '([a-zA-Z]+\d+|\d+[a-zA-Z]+)+'
phones = '\d{3}\s\d{3}-\d{4}' #très simple (trop)

corpus = [re.sub(punct, ' ', t).replace("' ", "'" ).replace("'", "'") for t in corpus]
corpus = [re.sub(spaces, ' ', t) for t in corpus]
corpus = [str(t).strip('\n').lower().replace('’', '\'') for t in corpus]
corpus = [re.sub(spaces, ' ', t) for t in corpus]
corpus = [re.sub(phones, ' ', t) for t in corpus]
corpus = [re.sub(postals, ' ', t) for t in corpus]
corpus = [re.sub(punct, ' ', t) for t in corpus]
corpus = [t.replace("  ", " " ) for t in corpus]

### **Appliquer le prétraitement**
Si les termes passées comme vocabulaire sont lemmatisés, changer le paramètre lem pour True au moment d'appliquer la fonction nlp(corpus)  
Le TfIdfVectorizer de sklearn va extraire lui-mêmes les ngrammes, faire le filtrage des mots fonctionnels et calculer le tf-idf pour nos termes d'intérêt ;  
Or, si les termes qu'on lui donne comme vocabulaire ont été lemmatisés, on veut donc aussi lui passer un corpus lemmatisé.

In [10]:
import nltk
from nltk.tokenize import RegexpTokenizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

def nlp(corpus, lem=False): 
    if not lem:
        # Tokenisation
        tokenizer = RegexpTokenizer(r"\w\'|\w+")

        tokens = [tokenizer.tokenize(doc) for  doc in corpus]
        len_corpus = len(nltk.flatten(tokens))
        print("Avec le RegExpTokenizer, notre corpus contient {} tokens.".format(len_corpus))

        return tokens

    else:
        # POS tagging
        input = [" ".join(nltk.flatten(doc)).replace("' ", "'") for doc in tokens]
        import treetaggerwrapper
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

        path = '../04-filtrage/mapping_treeTagger_lefff.csv'

        with open(path) as f:
            csv = read_csv(f)

        treeTag = [term for term in csv['TreeTagger'].tolist()] 
        lefff = [term for term in csv['Lefff'].tolist()]

        mapping = {term : lefff[treeTag.index(term)] for term in treeTag}

        tagged= [tagger.tag_text(doc) for doc in input]

        tuples_doc = []
        for doc in tagged:
            tuples = []
            for t in doc:
                token = t.split('\t')[0]
                pos = mapping[t.split('\t')[1]]

                tuples.append([token, pos])
            tuples_doc.append(tuples)

        #Lemmatisation
        lemmatizer = FrenchLefffLemmatizer()
        docs_lemmas = []

        for doc in tuples_doc:
            doc_lemma = []
            for t in doc:
                term_lemmatized = ""
                if(lemmatizer.lemmatize(t[0], t[1]) == []):
                    term_lemmatized = lemmatizer.lemmatize(t[0])
                else:
                    term_lemmatized = lemmatizer.lemmatize(t[0], t[1])[0][0] # [0][0] pour avoir le lemme seul et non (lemme, pos)
            
                if len(term_lemmatized) >1 :
                    doc_lemma.append(term_lemmatized)
            docs_lemmas.append(doc_lemma)

        docs_lemmas = [" ".join(doc) for doc in docs_lemmas]

        return docs_lemmas

In [11]:
corpus = nlp(corpus)

Avec le RegExpTokenizer, notre corpus contient 3129514 tokens.


In [12]:
file_path = '../04-filtrage/mwe_stopwords.txt'

with open (file_path, 'r', encoding='utf-8') as f:
    mwe_sw = [t.lower().strip('\n') for t in f.readlines()]

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df : ignore words that appear in 85% of documents, 
# min df:  ignore words that appear in less than 1% of documents 
# vocabulary = vocabulaire

# Sans utiliser le vocabulaire
# tfidf = TfidfVectorizer(min_df=0.1, stop_words=None, ngram_range=(2,4), max_df=0.85, use_idf=True)

def identity_tokenizer(text):
    return text

# vocabulary = vocabulaire
tfidf = TfidfVectorizer(vocabulary = vocabulaire, tokenizer=identity_tokenizer, ngram_range=(2,5), use_idf=True, lowercase=False, stop_words= mwe_sw)
tfs = tfidf.fit_transform(corpus)



In [14]:
features_names = tfidf.get_feature_names_out()
corpus_index = [corpus.index(n) for n in corpus]

import pandas as pd
df = pd.DataFrame(tfs.T.todense(), index=features_names, columns=corpus_index).transpose()

In [15]:
df

Unnamed: 0,services sociaux,ministère de la santé,santé publique,indicateurs de gestion,répertoire des indicateurs,répertoire des indicateurs de gestion,pandémie de la covid,santé et services,santé et services sociaux,gestion en santé et services sociaux,...,criblage des variants,var au lieu,vaccination 18 ans,ordonnance de garde,différence significative entre les hommes,commande de fmo,idée d'une augmentation,compat utf 8'communiqué,compat utf 8'communiqué de presse,compat utf 8'le
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.082813,0.033003,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.167933,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.043752,0.000000,0.059336,0.000000,0.012523,0.000000,0.0,0.012008,0.012008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.078762,0.000000,0.000000,0.045122,0.045089,0.045236,0.0,0.086468,0.086468,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3760,0.041411,0.019804,0.000000,0.028469,0.028448,0.028541,0.0,0.013639,0.013639,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3801,0.025803,0.015425,0.000000,0.022173,0.022157,0.022229,0.0,0.010623,0.010623,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3899,0.146246,0.037467,0.000000,0.035907,0.035881,0.035998,0.0,0.017202,0.017202,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3911,0.075254,0.000000,0.000000,0.043112,0.043081,0.043222,0.0,0.020654,0.020654,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# from pathlib import Path

# base_path = '../05-transformation/' + acteur + '/'
# Path(base_path).mkdir(parents=True, exist_ok=True)

# if sous_corpus:
#     path = base_path + tag + '/'
#     titre = tag

# else:
#     titre = acteur

# df.to_csv(base_path + titre + '_matrice-TFIDF.csv')

In [17]:
terms_weighted = []
rows, cols = tfs.nonzero()
for row, col in zip(rows,cols):
    terms_weighted.append([features_names[col], tfs[row,col]])

terms_weighted = DataFrame(terms_weighted, columns=['Collocation', 'TF-IDF'])
terms_weighted.sort_values(["TF-IDF"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

In [None]:
terms_weighted = terms_weighted.drop_duplicates(keep='first')

In [None]:
terms_weighted = pd.merge(csv, terms_weighted, on='Collocation').drop_duplicates(
  subset = ['Collocation', 'Fréquence'],
  keep = 'first').reset_index(drop = True)

In [None]:
terms_weighted.to_csv('C:/Users/p1115145/Documents/text-mining-project/05-transformation/msss/' + 'cancer-TEST' + '_weighting_TF-IDF.csv')

**Test : clustering (documents)**

In [None]:
def cluster_text(corpus):
    # vocabulary = vocabulaire
    vectorizer = TfidfVectorizer(vocabulary = vocabulaire, tokenizer=identity_tokenizer, ngram_range=(2,5), use_idf=True, lowercase=False, stop_words= mwe_sw)
    X = vectorizer.fit_transform(corpus)

    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    Sum_of_squared_distances = []
    K = range(2,10)
    for k in K:
       km = KMeans(n_clusters=k, max_iter=200, n_init=10)
       km = km.fit(X)
       Sum_of_squared_distances.append(km.inertia_)
    plt.plot(K, Sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

    print('How many clusters do you want to use?')
    true_k = int(input())
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
    model.fit(X)

    labels=model.labels_
    clusters=pd.DataFrame(list(zip(text,labels)),columns=['title','cluster'])
    #print(clusters.sort_values(by=['cluster']))

    for i in range(true_k):
        print(clusters[clusters['cluster'] == i])
        clusters.to_csv('../06-clustering/' + acteur + '_clusters.csv')
        
    return

In [None]:
cluster_text(corpus)

## **OKapi BM25**
https://hal.archives-ouvertes.fr/hal-00760158 

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
bm25 = BM25Okapi(corpus)

In [None]:
tokenizer = RegexpTokenizer(r"\w\'|\w+")
tokenized_queries = [tokenizer.tokenize(t) for t in vocabulaire]

features_names = [t for t in vocabulaire]
corpus_index = [corpus.index(n) for n in corpus]

tab = [bm25.get_scores(query) for query in tokenized_queries]
df = pd.DataFrame(tab, index=features_names, columns=corpus_index).transpose()

In [None]:
# df.to_csv(base_path + titre + '_matrice-OkapiBM25.csv') # Si on veut avoir la matrice (mais le fichier peut être très volumineux)

In [None]:
terms_okapi = {term: df[term].max() for term in df}

In [None]:
tab = DataFrame(terms_okapi.items(), columns=['Collocation', 'OkapiBM25'])
tab.sort_values(["OkapiBM25"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

tab = pd.merge(terms_weighted, tab, on="Collocation")

In [112]:
base_path = '../05-transformation/'

In [113]:
if tag:
    file_path = base_path + acteur + '_' + tag + '_weighting_OKapiBM25.csv'

else: 
    file_path = base_path + acteur  + '_weighting_OKapiBM25.csv'
tab.to_csv(file_path)

In [114]:
tab

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value,TF-IDF,OkapiBM25
0,centre de recherche,NN NP NN,862,-inf,1.000000e+00,0.202834,8.660473
1,fondation du chum,NN NP NN,773,-inf,1.000000e+00,0.119745,8.187498
2,commissaire local aux plaintes,NN JJ NP NP,767,-inf,1.000000e+00,0.119745,9.172096
3,commissaire local,NN JJ,767,-inf,1.000000e+00,0.119745,4.396333
4,enseignement et académie,NN CC NN,766,-inf,1.000000e+00,0.279188,7.996926
...,...,...,...,...,...,...,...
2978,drug thrombolysis,NN NN,4,55.712485,8.388384e-14,0.395491,15.105375
2979,cancer with surgery total,NN IN NN NN,4,74.977692,4.760628e-18,0.272092,14.416272
2980,elie bou,NN NN,4,78.609905,7.566905e-19,0.513619,15.203312
2981,spectral computed,NN VVD,4,71.093893,3.406228e-17,0.454122,17.986251
