## **3. Pondération statistique** (TF-IDF / Okapi BM25)  

https://stackoverflow.com/questions/46580932/calculate-tf-idf-using-sklearn-for-n-grams-in-python  
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html  
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://pypi.org/project/rank-bm25/

In [117]:
# Run parameters shared by every cell of this notebook.
acteur = 'chum'                  # actor whose files are processed; appears in every path
sous_corpus = False              # True: work on the sub-corpus identified by `tag`
tag = ''                         # sub-corpus name; only read when sous_corpus is True
path = '../04-filtrage/output/'  # folder holding the collocation CSV files

### **Lire le vocabulaire** (termes retenus au prétraitement)

In [118]:
from pandas import *

# The collocation file is named after the actor for the full corpus, and after
# the tag (inside a tag sub-folder) for a sub-corpus.
# For the lemmatised terms, point to the '-lemmatized.csv' file instead.
if not sous_corpus:
    file_path = path + acteur + '/' + acteur + '_significant-collocations.csv'
else:
    file_path = path + acteur + '/' + tag + '/' + tag + '_significant-collocations.csv'

with open(file_path, encoding='utf-8') as f:
    csv = read_csv(f)

# Keep only the columns used downstream.
csv = csv[["Collocation","Fréquence", "LLR", "p-value"]]

csv

Unnamed: 0,Collocation,Fréquence,LLR,p-value
0,médecine nucléaire,110,1431.000339,0.0
1,conférenciers scientifiques de renom,83,1631.779876,0.0
2,scientifiques de renom,83,1571.365126,0.0
3,reconstruction mammaire,94,1511.762445,0.0
4,conférenciers scientifiques de renom issus,83,1590.583152,0.0
...,...,...,...,...
2007,plaintes politique d'approvisionnement,2295,-inf,1.0
2008,commissaire local,2303,-inf,1.0
2009,évaluation en santé,167,-inf,1.0
2010,crchum sont des réunions,86,-inf,1.0


In [119]:
# Lowercased collocations, in the order they appear in the CSV.
vocabulaire = [collocation.lower() for collocation in csv['Collocation'].tolist()]

In [120]:
# Report the vocabulary size, then display the vocabulary itself.
vocabulary_size = len(vocabulaire)
print('On a un vocabulaire de {} formes.'.format(vocabulary_size))
vocabulaire

On a un vocabulaire de 2012 formes.


['médecine nucléaire',
 'conférenciers scientifiques de renom',
 'scientifiques de renom',
 'reconstruction mammaire',
 'conférenciers scientifiques de renom issus',
 'répertoire patients',
 'renom issus',
 'prendre soin',
 'pool room',
 'essais cliniques',
 'centre hospitalier',
 'professeur agrégé',
 'sclérose en plaques',
 'indexées sur researchgate',
 'clinique département',
 'système immunitaire',
 'radio oncologie',
 'pizza pizza',
 'centre intégré de cancérologie',
 'recherche du centre hospitalier',
 'syndromes parkinsoniens',
 'professeur adjoint',
 'recherche en santé du canada',
 'intégré de cancérologie',
 'conférenciers scientifiques',
 'effets secondaires',
 'maladie de parkinson',
 'recherche en santé',
 'chaire de recherche',
 'médecine université',
 'syndromes parkinsoniens atypiques',
 'plateformes et services',
 'activité physique',
 'régulier centre de recherche',
 'fibrose kystique',
 'santé du canada',
 'santé mentale',
 'mega brands',
 'département de médecine un

### **Lire le corpus**

In [121]:
import os, shutil, re
from pathlib import Path
from os import path
from pandas import *

# Locate the corpus file: a per-tag entry for a sub-corpus, otherwise the
# actor's CSV.
# NOTE(review): the sub-corpus path has no '.csv' extension — confirm that it
# points to a readable CSV file.
if sous_corpus:
    base_path = '../03-corpus/2-sous-corpus/'
    base_path = path.join(base_path, acteur, tag)
else:
    base_path = '../03-corpus/2-data/1-fr/'
    base_path = path.join(base_path, acteur + '.csv')

# read_csv opens (and closes) the file itself; no separate open() is needed.
data = read_csv(base_path, encoding='utf-8')
text = data['text'].tolist()

# Normalise each document: trim surrounding newlines, lowercase, replace the
# typographic apostrophe with a plain one, then remove every digit.
# The pattern is a raw string so that \d is not read as a string escape.
corpus = [re.sub(r'\d', '', t.strip('\n').lower().replace('’', '\'')) for t in text]

In [122]:
# The slice bound equals the corpus length, so every document is kept.
full_length = round(len(corpus))
corpus = corpus[:full_length]

nb_docs = len(corpus)
print("On a donc un corpus de {} documents.".format(nb_docs))

On a donc un corpus de 2299 documents.


### **Appliquer le prétraitement**
Si les termes passés comme vocabulaire sont lemmatisés, changer le paramètre `lem` pour `True` au moment d'appliquer la fonction : `nlp(corpus, lem=True)`.  
Le `TfidfVectorizer` de sklearn va extraire lui-même les n-grammes, faire le filtrage des mots fonctionnels et calculer le TF-IDF pour nos termes d'intérêt.  
Par conséquent, si les termes qu'on lui donne comme vocabulaire ont été lemmatisés, il faut aussi lui passer un corpus lemmatisé.

In [123]:
import nltk
from nltk.tokenize import RegexpTokenizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

def nlp(corpus, lem=False): 
    """Tokenise the corpus and, optionally, lemmatise it.

    Parameters
    ----------
    corpus : list of str
        The documents to process.
    lem : bool, default False
        When False, only tokenise. When True, additionally POS-tag each
        document with TreeTagger, translate the tags to Lefff tags through
        ``../04-filtrage/mapping_treeTagger_lefff.csv`` and lemmatise the
        tokens with FrenchLefffLemmatizer.

    Returns
    -------
    list
        ``lem=False``: one list of tokens per document.
        ``lem=True``: one string per document, made of the space-joined
        lemmas (lemmas of a single character are dropped).
        Note that the two modes return different element types.

    Raises
    ------
    KeyError
        If TreeTagger emits a tag that is absent from the mapping file.
    """
    # Tokenisation is needed by both modes, so it is done before branching.
    # The pattern keeps an elided form such as "l'" as its own token.
    tokenizer = RegexpTokenizer(r"\w\'|\w+")
    tokens = [tokenizer.tokenize(doc) for doc in corpus]

    if not lem:
        len_corpus = sum(len(doc) for doc in tokens)
        print("Avec le RegExpTokenizer, notre corpus contient {} tokens.".format(len_corpus))
        return tokens

    # POS tagging: rebuild plain text from the tokens and re-attach elided
    # forms to the following word ("l' hôpital" -> "l'hôpital").
    documents = [" ".join(doc).replace("' ", "'") for doc in tokens]

    import treetaggerwrapper  # local import: only required when lemmatising
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

    mapping_path = '../04-filtrage/mapping_treeTagger_lefff.csv'
    with open(mapping_path, encoding='utf-8') as f:
        mapping_table = read_csv(f)

    # TreeTagger tag -> Lefff tag. If a TreeTagger tag is listed more than
    # once, the first row wins.
    mapping = {}
    for tree_tag, lefff_tag in zip(mapping_table['TreeTagger'].tolist(),
                                   mapping_table['Lefff'].tolist()):
        mapping.setdefault(tree_tag, lefff_tag)

    # Lemmatisation
    lemmatizer = FrenchLefffLemmatizer()
    docs_lemmas = []

    for doc in documents:
        doc_lemma = []
        for tagged_line in tagger.tag_text(doc):
            fields = tagged_line.split('\t')
            if len(fields) < 2:
                # No tab-separated POS field on this line: nothing to map or
                # lemmatise, so skip it instead of raising IndexError.
                continue

            token = fields[0]
            pos = mapping[fields[1]]

            candidates = lemmatizer.lemmatize(token, pos)
            if candidates == []:
                # No lemma for this (token, POS) pair: fall back to the
                # POS-less lookup.
                term_lemmatized = lemmatizer.lemmatize(token)
            else:
                # [0][0] keeps the lemma alone rather than (lemma, pos).
                term_lemmatized = candidates[0][0]

            if len(term_lemmatized) > 1:
                doc_lemma.append(term_lemmatized)

        docs_lemmas.append(" ".join(doc_lemma))

    return docs_lemmas

In [124]:
# Replace the documents with the output of nlp() (lem defaults to False).
# NOTE(review): this rebinds `corpus`, so re-running this cell without
# re-running the corpus-loading cell would pass the previous output back into
# nlp() — confirm this is acceptable.
corpus = nlp(corpus)

Avec le RegExpTokenizer, notre corpus contient 868105 tokens.


In [125]:
# Stop words, one per line, lowercased.
file_path = '../04-filtrage/mwe_stopwords.txt'

mwe_sw = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        mwe_sw.append(line.lower().strip('\n'))

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Alternative without a fixed vocabulary (kept for reference, not executed):
#   max_df=0.85 ignores terms that appear in more than 85% of the documents,
#   min_df=0.1 ignores terms that appear in fewer than 10% of the documents.
# The active call further down passes vocabulary=vocabulaire instead.

# tfidf = TfidfVectorizer(min_df=0.1, stop_words=None, ngram_range=(2,4), max_df=0.85, use_idf=True)

def identity_tokenizer(text):
    """Return the document exactly as received.

    Passed as ``tokenizer`` to TfidfVectorizer so that the vectorizer does
    not tokenise the documents itself.
    """
    return text

# Weight only the retained collocations: the vocabulary is fixed, documents
# are passed through identity_tokenizer, and n-grams of 2 to 5 tokens are
# considered.
tfidf = TfidfVectorizer(
    vocabulary=vocabulaire,
    tokenizer=identity_tokenizer,
    ngram_range=(2,5),
    use_idf=True,
    lowercase=False,
    stop_words=mwe_sw,
)
tfs = tfidf.fit_transform(corpus)



In [127]:
features_names = tfidf.get_feature_names_out()

# One integer label per document, in corpus order. Using positions directly
# keeps labels unique even when two documents are identical, and avoids a
# quadratic scan of the corpus.
corpus_index = list(range(len(corpus)))

import pandas as pd

# Rows are documents, columns are vocabulary terms.
df = pd.DataFrame(tfs.toarray(), index=corpus_index, columns=features_names)

In [128]:
df

Unnamed: 0,médecine nucléaire,conférenciers scientifiques de renom,scientifiques de renom,reconstruction mammaire,conférenciers scientifiques de renom issus,répertoire patients,renom issus,prendre soin,pool room,essais cliniques,...,mesures appropriées,fibrillation auriculaire,retraités du chum,chum chercheurs,répertoire centre,plaintes politique d'approvisionnement,commissaire local,évaluation en santé,crchum sont des réunions,patients témoignages
0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.067944,0.0,0.026040,0.065560,0.0,0.0,0.026040,0.0,0.00000,0.0
1,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.047716,0.000000,0.0,0.0,0.047716,0.0,0.00000,0.0
2,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.128903,0.0,0.049402,0.124379,0.0,0.0,0.049402,0.0,0.00000,0.0
3,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.066877,0.000000,0.0,0.0,0.066877,0.0,0.00000,0.0
4,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.136307,0.000000,0.0,0.0,0.136307,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2294,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.061854,0.000000,0.0,0.0,0.061854,0.0,0.00000,0.0
2295,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.007308,0.000000,0.0,0.0,0.007308,0.0,0.00000,0.0
2296,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.117658,0.0,0.045093,0.113529,0.0,0.0,0.045093,0.0,0.00000,0.0
2297,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.055448,0.000000,0.0,0.0,0.055448,0.0,0.00000,0.0


In [129]:
from pathlib import Path

# Output folder for this actor.
base_path = '../05-transformation/' + acteur + '/'

if sous_corpus:
    # Sub-corpus results go into their own sub-folder. Extend base_path itself:
    # it is the variable used for every export below, whereas `path` has been
    # rebound to the os.path module by the corpus-loading cell.
    base_path = base_path + tag + '/'
    titre = tag
else:
    titre = acteur

# Create the folder only once its final location is known.
Path(base_path).mkdir(parents=True, exist_ok=True)

df.to_csv(base_path + titre + '_matrice-TFIDF.csv')

In [130]:
# Collect every non-zero (document, term) weight of the TF-IDF matrix.
# The COO form exposes row indices, column indices and values as parallel
# arrays, which avoids one sparse lookup per entry in a Python loop. The
# entries come out in the same order as tfs.nonzero().
tfs_coo = tfs.tocoo()
nonzero_mask = tfs_coo.data != 0
rows, cols = tfs_coo.row[nonzero_mask], tfs_coo.col[nonzero_mask]

terms_weighted = DataFrame({
    'Collocation': [features_names[col] for col in cols],
    'TF-IDF': tfs_coo.data[nonzero_mask],
})

# Highest weights first; the original positions are kept as the index.
terms_weighted = terms_weighted.sort_values("TF-IDF", ascending=False)

In [131]:
terms_weighted

Unnamed: 0,Collocation,TF-IDF
142439,école secondaire,0.998040
61082,radio canada,0.993851
46790,épi tavie,0.990278
70866,père noël,0.989402
14847,haute fidélité,0.966337
...,...,...
87940,laboratoires fondation,0.001999
87939,comité des usagers,0.001999
87938,retraités du chum,0.001999
87973,regroupement des retraités du chum,0.001999


In [132]:
# Highest TF-IDF weight reached by each term over all documents.
termes = set(features_names)

liste_filtre = {}
for term in termes:
    liste_filtre[term] = df[term].max()

In [133]:
termes_tries = pd.DataFrame(liste_filtre.items(), columns=['Collocation', 'TF-IDF'])

# The vocabulary was lowercased before vectorising, so join on lowercased
# collocations; otherwise any collocation containing an uppercase letter
# would silently drop out of the merge.
collocation_stats = csv.assign(Collocation=csv['Collocation'].str.lower())
termes_tries = pd.merge(collocation_stats, termes_tries, on="Collocation")

# Sort after the merge: the merge orders rows like its left operand, so a
# sort done beforehand would be lost. The stable sort keeps ties in file order.
termes_tries = termes_tries.sort_values("TF-IDF", ascending=False,
                                        kind="stable", ignore_index=True)

termes_tries.to_csv(base_path + titre + '_weighting_TF-IDF.csv')

## **Okapi BM25**
https://hal.archives-ouvertes.fr/hal-00760158 

In [134]:
from rank_bm25 import BM25Okapi

In [135]:
# Build the BM25 index over the corpus.
# NOTE(review): presumably `corpus` must hold one token list per document
# here (the output of nlp with lem=False) — confirm before using lem=True.
bm25 = BM25Okapi(corpus)

In [136]:
# Tokenise each vocabulary term with the same pattern used for the corpus,
# so that query tokens are comparable with document tokens.
tokenizer = RegexpTokenizer(r"\w\'|\w+")
tokenized_queries = [tokenizer.tokenize(t) for t in vocabulaire]

features_names = list(vocabulaire)

# One integer label per document, in corpus order. Using positions directly
# keeps labels unique even when two documents are identical, and avoids a
# quadratic scan of the corpus.
corpus_index = list(range(len(corpus)))

# One row of scores per term (one score per document), then transposed so
# that rows are documents and columns are terms.
tab = [bm25.get_scores(query) for query in tokenized_queries]
df = pd.DataFrame(tab, index=features_names, columns=corpus_index).transpose()

In [137]:
# df.to_csv(base_path + titre + '_matrice-OkapiBM25.csv')  # Uncomment to export the full matrix (the file can be very large)

In [138]:
# Highest BM25 score reached by each term over all documents.
terms_weighted = {}
for term in df:
    terms_weighted[term] = df[term].max()

In [139]:
tab = DataFrame(terms_weighted.items(), columns=['Collocation', 'OkapiBM25'])

# Attach the BM25 score to the table that already holds the collocation
# statistics and the TF-IDF weight.
tab = pd.merge(termes_tries, tab, on="Collocation")

# Sort after the merge: the merge orders rows like its left operand, so a
# sort done beforehand would be lost. The stable sort keeps ties in their
# current order.
tab = tab.sort_values("OkapiBM25", ascending=False,
                      kind="stable", ignore_index=True)

In [140]:
# Export the merged table (collocation statistics, TF-IDF and Okapi BM25).
okapi_output = base_path + titre + '_weighting_OKapiBM25.csv'
tab.to_csv(okapi_output)