## **3. Pondération statistique** (TF-IDF / Okapi BM25)  

https://stackoverflow.com/questions/46580932/calculate-tf-idf-using-sklearn-for-n-grams-in-python  
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html  
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://pypi.org/project/rank-bm25/

In [305]:
# Notebook parameters.
# `path`: directory holding the filtering-step outputs.
path = "../04-filtrage/output/"
# `acteur`: identifier used to build the per-actor file names.
acteur = "msss"
# `sous_corpus`: True to work on the sub-corpus identified by `tag`.
sous_corpus = True
tag = "cancer"

### **Lire le vocabulaire** (termes retenus au prétraitement)

In [306]:
from pandas import *

# Build the path of the collocation file from the notebook parameters.
# A machine-specific absolute path used to overwrite `file_path` here, which
# made the computed path dead code and the cell unusable on another machine.
# For the lemmatized terms, use the `-lemmatized.csv` suffix instead.
if sous_corpus:
    file_path = path + acteur + '/' + tag + '/' + tag + '_significant-collocations.csv'
else:
    file_path = path + acteur + '/' + acteur + '_significant-collocations.csv'

# Keep only the columns used by the rest of the notebook.
with open(file_path, encoding='utf-8') as f:
    csv = read_csv(f)[["Collocation", "Structure syntaxique", "Fréquence", "LLR", "p-value"]]

# Last expression of the cell: displays the table in the notebook.
csv

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value
0,données probantes,NOM ADJ,864,-inf,1.000000e+00
1,lutte contre le cancer,VER:pres PRP DET:ART NOM,835,-inf,1.000000e+00
2,survie globale,NOM ADJ,807,-inf,1.000000e+00
3,services sociaux,NOM ADJ,660,-inf,1.000000e+00
4,effets indésirables,NOM ADJ,568,-inf,1.000000e+00
...,...,...,...,...,...
2115,beam radiotherapy,NOM ADJ,11,161.088249,6.544612e-37
2116,durée de séjour,VER:pper PRP NOM,11,174.409432,8.056989e-40
2117,élevé chez les patients,VER:pper PRP DET:ART NOM,11,88.418330,5.297762e-21
2118,endoscopie digestive,NOM ADJ,11,175.263715,5.243464e-40


In [307]:
# Lower-cased collocations, in the order of the `Collocation` column.
vocabulaire = [terme.lower() for terme in csv['Collocation']]

In [308]:
# Report the vocabulary size, then display the list (last expression of the cell).
_message = 'On a un vocabulaire de {} formes.'
print(_message.format(len(vocabulaire)))
vocabulaire

On a un vocabulaire de 2120 formes.


['données probantes',
 'lutte contre le cancer',
 'survie globale',
 'services sociaux',
 'effets indésirables',
 'soins palliatifs',
 'phase iii',
 'patients atteints',
 'probantes de niveau',
 'données probantes de niveau',
 'ministère de la santé',
 'clin oncol',
 'atteintes de cancer',
 'dépistage du cancer',
 'bouffées de chaleur',
 'survie médiane',
 'taux de réponse',
 'programme québécois',
 'cancer du poumon',
 'recommandation de grade',
 'qualité de vie',
 'traitement du cancer',
 "comité de l'évolution",
 'étude de phase',
 'patients traités',
 'pratique clinique',
 'cancer colorectal',
 'lung cancer',
 'médecin de famille',
 'traitement adjuvant',
 'suivi médian',
 'fin de vie',
 'oncologue médical',
 'patients du groupe',
 'réponse complète',
 'professionnels de la santé',
 'taux de survie',
 'groupe témoin',
 'évolution des pratiques',
 'hématologue et oncologue',
 'étude randomisée',
 'centre hospitalier',
 'équipes interdisciplinaires',
 'survie sans maladie',
 'contrôl

### **Lire le corpus**

In [309]:
import os, shutil, re
from pathlib import Path
from os import path
from pandas import *

# Locate the corpus CSV: the tagged sub-corpus or the actor's full corpus.
# NOTE: `from os import path` in this cell rebinds the name `path` that the
# parameter cell set to a string; `os.path` is used explicitly here.
if sous_corpus:
    base_path = os.path.join('../03-corpus/2-sous-corpus/', acteur, acteur + '_' + tag + '.csv')
else:
    base_path = os.path.join('../03-corpus/2-data/1-fr/', acteur + '.csv')

# Let pandas open the file itself: the previous `with open(...)` handle was
# never passed to `read_csv`, so it was opened for nothing.
data = read_csv(base_path, encoding='utf-8')
text = data['text'].tolist()

# Normalise each document: trim surrounding newlines, lower-case, turn
# typographic apostrophes into straight ones, then drop every digit.
# The pattern is a raw string so that `\d` is not an invalid string escape.
corpus = [re.sub(r'\d', '', t.strip('\n').lower().replace('’', '\'')) for t in text]

In [310]:
# Slice bound equal to the corpus length, so every document is kept.
_limite = round(len(corpus))
corpus = corpus[:_limite]

nb_docs = len(corpus)
_message = "On a donc un corpus de {} documents."
print(_message.format(nb_docs))

On a donc un corpus de 252 documents.


### **Appliquer le prétraitement**
Si les termes passés comme vocabulaire sont lemmatisés, mettre le paramètre `lem` à `True` au moment d'appeler la fonction `nlp(corpus)`.  
Le `TfidfVectorizer` de sklearn va extraire lui-même les n-grammes, filtrer les mots fonctionnels et calculer le TF-IDF pour nos termes d'intérêt.  
Si les termes qu'on lui donne comme vocabulaire ont été lemmatisés, il faut donc aussi lui passer un corpus lemmatisé.

In [311]:
import nltk
from nltk.tokenize import RegexpTokenizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

def nlp(corpus, lem=False):
    """Tokenize every document of ``corpus`` and optionally lemmatize it.

    Args:
        corpus: Iterable of document strings.
        lem: ``False`` (default) stops after tokenization. ``True`` goes on
            to POS-tag the documents with TreeTagger and to lemmatize them
            with ``FrenchLefffLemmatizer``.

    Returns:
        With ``lem=False``: a list holding one list of tokens per document.
        With ``lem=True``: a list holding one string per document, made of
        its lemmas longer than one character joined by single spaces.
        The two modes therefore return different shapes.

    Raises:
        KeyError: if a tag returned by TreeTagger is absent from the
            TreeTagger -> Lefff mapping file.
    """
    # Tokenization is shared by both modes. It used to live only in the
    # ``lem=False`` branch, so ``lem=True`` failed on the undefined ``tokens``.
    tokenizer = RegexpTokenizer(r"\w\'|\w+")
    tokens = [tokenizer.tokenize(doc) for doc in corpus]
    len_corpus = sum(len(doc) for doc in tokens)
    print("Avec le RegExpTokenizer, notre corpus contient {} tokens.".format(len_corpus))

    if not lem:
        return tokens

    # POS tagging
    import treetaggerwrapper  # local import: only needed when lemmatizing

    # Rebuild one string per document, re-attaching the elided forms that the
    # tokenizer split off ("l' évolution" -> "l'évolution").
    texts = [" ".join(doc).replace("' ", "'") for doc in tokens]
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

    # TreeTagger tag -> Lefff tag. On duplicated TreeTagger tags the first
    # row wins, as in the previous index()-based lookup.
    mapping_path = '../04-filtrage/mapping_treeTagger_lefff.csv'
    tag_table = read_csv(mapping_path)
    mapping = {}
    for tree_tag, lefff_tag in zip(tag_table['TreeTagger'].tolist(), tag_table['Lefff'].tolist()):
        mapping.setdefault(tree_tag, lefff_tag)

    tagged = [tagger.tag_text(text) for text in texts]

    # Each tagger line is tab-separated: field 0 is the token, field 1 its tag.
    tuples_doc = []
    for tagged_doc in tagged:
        pairs = []
        for line in tagged_doc:
            fields = line.split('\t')
            pairs.append([fields[0], mapping[fields[1]]])
        tuples_doc.append(pairs)

    # Lemmatization
    lemmatizer = FrenchLefffLemmatizer()
    docs_lemmas = []
    for pairs in tuples_doc:
        doc_lemma = []
        for token, pos in pairs:
            candidates = lemmatizer.lemmatize(token, pos)
            if candidates == []:
                # No lemma for this part of speech: retry without it.
                term_lemmatized = lemmatizer.lemmatize(token)
            else:
                # [0][0] keeps the lemma alone instead of (lemma, pos).
                term_lemmatized = candidates[0][0]

            # Single-character lemmas are discarded.
            if len(term_lemmatized) > 1:
                doc_lemma.append(term_lemmatized)
        docs_lemmas.append(" ".join(doc_lemma))

    return docs_lemmas

In [312]:
# Tokenize only; `lem=False` is spelled out because it must be switched to
# True when the vocabulary is lemmatized.
corpus = nlp(corpus, lem=False)

Avec le RegExpTokenizer, notre corpus contient 919141 tokens.


In [313]:
# Load the stop-word list: one lower-cased entry per line of the file.
file_path = '../04-filtrage/mwe_stopwords.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    mwe_sw = [ligne.lower().strip('\n') for ligne in f]

In [314]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df : ignore words that appear in 85% of documents, 
# min df:  ignore words that appear in less than 1% of documents 
# vocabulary = vocabulaire

# Sans utiliser le vocabulaire
# tfidf = TfidfVectorizer(min_df=0.1, stop_words=None, ngram_range=(2,4), max_df=0.85, use_idf=True)

def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed as ``tokenizer`` to ``TfidfVectorizer`` so that the vectorizer
    uses the documents as they are handed to it instead of tokenizing them
    itself.
    """
    return text

# Weight only the collocations of `vocabulaire`; the documents are passed
# through `identity_tokenizer` and are not lower-cased again.
# NOTE(review): scikit-learn applies `stop_words` to individual tokens before
# the n-grams are assembled - confirm that the entries of `mwe_sw` are
# matched as intended.
tfidf = TfidfVectorizer(
    vocabulary=vocabulaire,
    tokenizer=identity_tokenizer,
    lowercase=False,
    stop_words=mwe_sw,
    ngram_range=(2, 5),
    use_idf=True,
)
tfs = tfidf.fit_transform(corpus)



In [315]:
import pandas as pd

features_names = tfidf.get_feature_names_out()

# Positional document ids. `corpus.index(n)` compared every document with all
# the others and returned the first position for documents with identical
# content, which produced duplicate labels.
corpus_index = list(range(len(corpus)))

# One row per document, one column per collocation.
df = pd.DataFrame(tfs.toarray(), index=corpus_index, columns=features_names)

In [316]:
df

Unnamed: 0,données probantes,lutte contre le cancer,survie globale,services sociaux,effets indésirables,soins palliatifs,phase iii,patients atteints,probantes de niveau,données probantes de niveau,...,ordonnances collectives,symptômes à distance,traitement chez les patients,multicenter randomized,prévention et traitement,beam radiotherapy,durée de séjour,élevé chez les patients,endoscopie digestive,formation en ligne sur les normes
0,0.002043,0.215788,0.0,0.016755,0.011586,0.267138,0.0,0.006795,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.033079,0.211061,0.0,0.054266,0.000000,0.141530,0.0,0.026898,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.004874,0.0,0.0,0.0
2,0.000000,0.249884,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,0.000000,0.000000,0.0,0.000000,0.000000,0.344054,0.0,0.105018,0.0,0.0,...,0.0,0.200262,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
248,0.000000,0.000000,0.0,0.022130,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
249,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
250,0.000000,0.000000,0.0,0.091559,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [317]:
# from pathlib import Path

# base_path = '../05-transformation/' + acteur + '/'
# Path(base_path).mkdir(parents=True, exist_ok=True)

# if sous_corpus:
#     path = base_path + tag + '/'
#     titre = tag

# else:
#     titre = acteur

# df.to_csv(base_path + titre + '_matrice-TFIDF.csv')

In [318]:
# One (collocation, weight) row per non-zero cell of the TF-IDF matrix,
# ordered from the highest weight to the lowest.
rows, cols = tfs.nonzero()
terms_weighted = DataFrame(
    [[features_names[col], tfs[row, col]] for row, col in zip(rows, cols)],
    columns=['Collocation', 'TF-IDF'],
)
terms_weighted = terms_weighted.sort_values(by='TF-IDF', ascending=False)

In [319]:
# Remove rows that are identical on every column (the first occurrence is kept).
terms_weighted = terms_weighted.drop_duplicates()

In [320]:
# Attach the weights to the collocation table, then keep the first row of
# each (Collocation, Fréquence) pair and renumber the rows.
_fusion = csv.merge(terms_weighted, on='Collocation')
_fusion = _fusion.drop_duplicates(subset=['Collocation', 'Fréquence'], keep='first')
terms_weighted = _fusion.reset_index(drop=True)

In [321]:
# NOTE(review): the output location is a hard-coded, machine-specific
# absolute path with a "-TEST" name - confirm before reusing elsewhere.
_dossier_sortie = 'C:/Users/p1115145/Documents/text-mining-project/05-transformation/msss/'
_fichier_sortie = _dossier_sortie + 'cancer-TEST' + '_weighting_TF-IDF.csv'
terms_weighted.to_csv(_fichier_sortie)

## **Okapi BM25**
https://hal.archives-ouvertes.fr/hal-00760158 

In [322]:
from rank_bm25 import BM25Okapi

In [323]:
# Build the BM25 index over `corpus`, which at this point holds the value
# returned by the earlier `nlp(corpus)` call.
bm25 = BM25Okapi(corpus)

In [324]:
# Tokenize each collocation with the same pattern as the corpus so that it
# can be used as a BM25 query.
tokenizer = RegexpTokenizer(r"\w\'|\w+")
tokenized_queries = [tokenizer.tokenize(t) for t in vocabulaire]

features_names = list(vocabulaire)
# Positional document ids. `corpus.index(n)` compared every document with all
# the others and returned the first position for documents with identical
# content, which produced duplicate labels.
corpus_index = list(range(len(corpus)))

# One score vector (one value per document) per collocation.
# NOTE(review): rank_bm25 presumably scores the query tokens independently,
# i.e. without requiring them to be adjacent - confirm this is intended.
tab = [bm25.get_scores(query) for query in tokenized_queries]
# Transposed so that documents are rows and collocations are columns.
df = pd.DataFrame(tab, index=features_names, columns=corpus_index).transpose()

In [325]:
# df.to_csv(base_path + titre + '_matrice-OkapiBM25.csv') # Si on veut avoir la matrice (mais le fichier peut être très volumineux)

In [326]:
# Highest BM25 score reached by each collocation over all documents.
terms_okapi = {}
for term in df.columns:
    terms_okapi[term] = df[term].max()

In [327]:
# Table of the best BM25 score per collocation, highest first...
tab = DataFrame(
    list(terms_okapi.items()),
    columns=['Collocation', 'OkapiBM25'],
).sort_values(by="OkapiBM25", ascending=False)

# ...then joined to the TF-IDF table on the collocation.
tab = terms_weighted.merge(tab, on="Collocation")

'../05-transformation/msss/'

In [328]:
from pathlib import Path

# Write to the transformation output directory of the current actor.
# `base_path` is not used: the corpus-reading cell leaves it pointing at the
# corpus CSV file, so appending a file name to it yields a wrong path.
output_dir = Path('../05-transformation') / acteur
output_dir.mkdir(parents=True, exist_ok=True)
tab.to_csv(output_dir / (acteur + '_weighting_OKapiBM25.csv'))

In [329]:
tab

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value,TF-IDF,OkapiBM25
0,données probantes,NOM ADJ,864,-inf,1.000000e+00,0.406957,5.154878
1,lutte contre le cancer,VER:pres PRP DET:ART NOM,835,-inf,1.000000e+00,0.720513,10.419937
2,survie globale,NOM ADJ,807,-inf,1.000000e+00,0.523127,5.500395
3,services sociaux,NOM ADJ,660,-inf,1.000000e+00,0.376075,5.650148
4,effets indésirables,NOM ADJ,568,-inf,1.000000e+00,0.518971,5.369203
...,...,...,...,...,...,...,...
1970,prévention et traitement,NOM KON NOM,11,58.218195,2.345992e-14,0.171505,7.148890
1971,beam radiotherapy,NOM ADJ,11,161.088249,6.544612e-37,0.028164,9.550761
1972,durée de séjour,VER:pper PRP NOM,11,174.409432,8.056989e-40,0.189593,12.084283
1973,élevé chez les patients,VER:pper PRP DET:ART NOM,11,88.418330,5.297762e-21,0.023727,9.131247
