## **3. Pondération statistique** (TF-IDF / OKapiBM25)  

https://stackoverflow.com/questions/46580932/calculate-tf-idf-using-sklearn-for-n-grams-in-python  
http://scikit-learn.sourceforge.net/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn-feature-extraction-text-tfidfvectorizer  
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://pypi.org/project/rank-bm25/

In [26]:
path = '../04-filtrage/output/'
acteur = 'iucpq'
sous_corpus = False
tag = ''

### **Lire le vocabulaire** (termes retenus au prétraitement)

In [27]:
from pandas import *

if sous_corpus:
    file_path = path + acteur + '/' + tag + '/' + tag + '_significant-collocations.csv' # si on veut les termes lemmatisés : -lemmatized.csv
    file_path = 'C:/Users/p1115145/Documents/text-mining-project/04-filtrage/output/msss/cancer/cancer_significant-collocations.csv'
else:
    file_path = path + acteur + '/' + acteur + '_significant-collocations.csv' # si on veut les termes lemmatisés : -lemmatized.csv
    
with open(file_path, encoding='utf-8') as f:
    csv = read_csv(f)[["Collocation", "Structure syntaxique", "Fréquence", "LLR", "p-value"]]

csv

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value
0,centre de recherche,NOM PRP NOM,5438,-inf,1.000000e+00
1,fin de vie,NOM PRP NOM,4871,-inf,1.000000e+00
2,soins palliatifs,NOM ADJ,3335,-inf,1.000000e+00
3,cliniques spécialisées,NOM VER:pper,3322,-inf,1.000000e+00
4,soins respiratoires,NOM ADJ,3276,-inf,1.000000e+00
...,...,...,...,...,...
54617,chercheur inserm,ADJ NOM,1,12.816181,3.436345e-04
54618,pearson chargée de projet,NOM VER:pper PRP NOM,1,16.285143,5.448952e-05
54619,athérosclérose avec dr éric larose,NOM PRP NOM ADJ NOM,1,20.218262,6.909023e-06
54620,implantation canadienne de la valve,NOM ADJ PRP DET:ART NOM,1,17.427491,2.984778e-05


In [28]:
vocabulaire = [t.lower() for t in list(csv['Collocation'])]

In [29]:
print('On a un vocabulaire de {} formes.'.format(len(vocabulaire)))
vocabulaire

On a un vocabulaire de 54622 formes.


['centre de recherche',
 'fin de vie',
 'soins palliatifs',
 'cliniques spécialisées',
 'soins respiratoires',
 'soins respiratoires spécialisés',
 'service régional',
 'soins respiratoires spécialisés à domicile',
 'spécialisés à domicile',
 'régional de soins respiratoires',
 'régional de soins',
 'service régional de soins',
 'service régional de soins respiratoires',
 'domicile srsrsd',
 'prévention des infections',
 'rapports annuels',
 'cardiologie pneumologie',
 'cardiologie pneumologie obésité',
 'pneumologie obésité',
 'département de médecine',
 "recherche de l'institut",
 "centre de recherche de l'institut",
 'oncologie thoracique',
 'maladies cardiaques',
 'comité des usagers',
 'prévention des maladies',
 'diabète de type',
 'pavillon de prévention',
 'pavillon de prévention des maladies',
 'pavillon de prévention des maladies cardiaques',
 'prévention des maladies cardiaques',
 'centre de soins',
 'prévention des maladies cardiaques ppmc',
 'cardiaques ppmc',
 'maladies c

### **Lire le corpus**

In [30]:
import os, shutil, re
from pathlib import Path
from os import path
from pandas import *

# Change the directory
if sous_corpus:
    base_path = '../03-corpus/2-sous-corpus/'
    base_path = path.join(base_path, acteur, acteur + '_' + tag + '.csv') 
    

else: 
    base_path = '../03-corpus/2-data/1-fr/'
    base_path = path.join(base_path, acteur + '.csv')
        
with open(base_path, "r", encoding = "UTF-8") as f:
    data = read_csv(base_path)
    text = data['text'].tolist()

corpus = [t.strip('\n').lower().replace('’', '\'').replace("œ", "oe") for t in data]
    
punct = '[!#$%&\(\)*+,-/:;<=>?@[\]^_{|}~©«»—“”–—]'
spaces = '\s+'

corpus = [re.sub(punct, ' ', t).replace("' ", "'" ).replace("'", "'") for t in text]
corpus = [re.sub(spaces, ' ', t) for t in corpus]

In [31]:
corpus = corpus[:round(len(corpus))]

nb_docs = len(corpus)

print("On a donc un corpus de {} documents.".format(nb_docs))

On a donc un corpus de 1657 documents.


### **Appliquer le prétraitement**
Si les termes passées comme vocabulaire sont lemmatisés, changer le paramètre lem pour True au moment d'appliquer la fonction nlp(corpus)  
Le TfIdfVectorizer de sklearn va extraire lui-mêmes les ngrammes, faire le filtrage des mots fonctionnels et calculer le tf-idf pour nos termes d'intérêt ;  
Or, si les termes qu'on lui donne comme vocabulaire ont été lemmatisés, on veut donc aussi lui passer un corpus lemmatisé.

In [32]:
import nltk
from nltk.tokenize import RegexpTokenizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

def nlp(corpus, lem=False): 
    if not lem:
        # Tokenisation
        tokenizer = RegexpTokenizer(r"\w\'|\w+")

        tokens = [tokenizer.tokenize(doc) for  doc in corpus]
        len_corpus = len(nltk.flatten(tokens))
        print("Avec le RegExpTokenizer, notre corpus contient {} tokens.".format(len_corpus))

        return tokens

    else:
        # POS tagging
        input = [" ".join(nltk.flatten(doc)).replace("' ", "'") for doc in tokens]
        import treetaggerwrapper
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')

        path = '../04-filtrage/mapping_treeTagger_lefff.csv'

        with open(path) as f:
            csv = read_csv(f)

        treeTag = [term for term in csv['TreeTagger'].tolist()] 
        lefff = [term for term in csv['Lefff'].tolist()]

        mapping = {term : lefff[treeTag.index(term)] for term in treeTag}

        tagged= [tagger.tag_text(doc) for doc in input]

        tuples_doc = []
        for doc in tagged:
            tuples = []
            for t in doc:
                token = t.split('\t')[0]
                pos = mapping[t.split('\t')[1]]

                tuples.append([token, pos])
            tuples_doc.append(tuples)

        #Lemmatisation
        lemmatizer = FrenchLefffLemmatizer()
        docs_lemmas = []

        for doc in tuples_doc:
            doc_lemma = []
            for t in doc:
                term_lemmatized = ""
                if(lemmatizer.lemmatize(t[0], t[1]) == []):
                    term_lemmatized = lemmatizer.lemmatize(t[0])
                else:
                    term_lemmatized = lemmatizer.lemmatize(t[0], t[1])[0][0] # [0][0] pour avoir le lemme seul et non (lemme, pos)
            
                if len(term_lemmatized) >1 :
                    doc_lemma.append(term_lemmatized)
            docs_lemmas.append(doc_lemma)

        docs_lemmas = [" ".join(doc) for doc in docs_lemmas]

        return docs_lemmas

In [33]:
corpus = nlp(corpus)

Avec le RegExpTokenizer, notre corpus contient 1728102 tokens.


In [34]:
file_path = '../04-filtrage/mwe_stopwords.txt'

with open (file_path, 'r', encoding='utf-8') as f:
    mwe_sw = [t.lower().strip('\n') for t in f.readlines()]

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df : ignore words that appear in 85% of documents, 
# min df:  ignore words that appear in less than 1% of documents 
# vocabulary = vocabulaire

# Sans utiliser le vocabulaire
# tfidf = TfidfVectorizer(min_df=0.1, stop_words=None, ngram_range=(2,4), max_df=0.85, use_idf=True)

def identity_tokenizer(text):
    return text

# vocabulary = vocabulaire
tfidf = TfidfVectorizer(vocabulary = vocabulaire, tokenizer=identity_tokenizer, ngram_range=(2,5), use_idf=True, lowercase=False, stop_words= mwe_sw)
tfs = tfidf.fit_transform(corpus)



In [36]:
features_names = tfidf.get_feature_names_out()
corpus_index = [corpus.index(n) for n in corpus]

import pandas as pd
df = pd.DataFrame(tfs.T.todense(), index=features_names, columns=corpus_index).transpose()

In [37]:
df

Unnamed: 0,centre de recherche,fin de vie,soins palliatifs,cliniques spécialisées,soins respiratoires,soins respiratoires spécialisés,service régional,soins respiratoires spécialisés à domicile,spécialisés à domicile,régional de soins respiratoires,...,effets de la cigarette,radial approach,ans entrevue avec une patiente,hypothèse d'une réduction,formation de surspécialisation en pathologie,chercheur inserm,pearson chargée de projet,athérosclérose avec dr éric larose,implantation canadienne de la valve,effort de redressement
0,0.049103,0.147309,0.0,0.0,0.098206,0.098206,0.0,0.098206,0.098206,0.098206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.049103,0.147309,0.0,0.0,0.098206,0.098206,0.0,0.098206,0.098206,0.098206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.007410,0.029639,0.0,0.0,0.014820,0.014820,0.0,0.014820,0.014820,0.014820,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.109382,0.328145,0.0,0.0,0.218764,0.218764,0.0,0.218764,0.218764,0.218764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.054909,0.164728,0.0,0.0,0.164728,0.164728,0.0,0.164728,0.164728,0.164728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,0.022005,0.066016,0.0,0.0,0.044011,0.044011,0.0,0.044011,0.044011,0.044011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1653,0.017314,0.051942,0.0,0.0,0.034628,0.034628,0.0,0.034628,0.034628,0.034628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1654,0.046251,0.069376,0.0,0.0,0.046251,0.046251,0.0,0.046251,0.046251,0.046251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1655,0.020002,0.060007,0.0,0.0,0.040005,0.040005,0.0,0.040005,0.040005,0.040005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# from pathlib import Path

# base_path = '../05-transformation/' + acteur + '/'
# Path(base_path).mkdir(parents=True, exist_ok=True)

# if sous_corpus:
#     path = base_path + tag + '/'
#     titre = tag

# else:
#     titre = acteur

# df.to_csv(base_path + titre + '_matrice-TFIDF.csv')

In [39]:
terms_weighted = []
rows, cols = tfs.nonzero()
for row, col in zip(rows,cols):
    terms_weighted.append([features_names[col], tfs[row,col]])

terms_weighted = DataFrame(terms_weighted, columns=['Collocation', 'TF-IDF'])
terms_weighted.sort_values(["TF-IDF"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

In [40]:
terms_weighted = terms_weighted.drop_duplicates(keep='first')

In [41]:
terms_weighted = pd.merge(csv, terms_weighted, on='Collocation').drop_duplicates(
  subset = ['Collocation', 'Fréquence'],
  keep = 'first').reset_index(drop = True)

In [42]:
terms_weighted.to_csv('C:/Users/p1115145/Documents/text-mining-project/05-transformation/msss/' + 'cancer-TEST' + '_weighting_TF-IDF.csv')

## **OKapi BM25**
https://hal.archives-ouvertes.fr/hal-00760158 

In [43]:
from rank_bm25 import BM25Okapi

In [44]:
bm25 = BM25Okapi(corpus)

In [45]:
tokenizer = RegexpTokenizer(r"\w\'|\w+")
tokenized_queries = [tokenizer.tokenize(t) for t in vocabulaire]

features_names = [t for t in vocabulaire]
corpus_index = [corpus.index(n) for n in corpus]

tab = [bm25.get_scores(query) for query in tokenized_queries]
df = pd.DataFrame(tab, index=features_names, columns=corpus_index).transpose()

In [46]:
# df.to_csv(base_path + titre + '_matrice-OkapiBM25.csv') # Si on veut avoir la matrice (mais le fichier peut être très volumineux)

In [47]:
terms_okapi = {term: df[term].max() for term in df}

In [48]:
tab = DataFrame(terms_okapi.items(), columns=['Collocation', 'OkapiBM25'])
tab.sort_values(["OkapiBM25"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

tab = pd.merge(terms_weighted, tab, on="Collocation")

In [51]:
base_path

'../03-corpus/2-data/1-fr/iucpq.csv'

In [49]:
tab.to_csv(base_path + acteur +'_weighting_OKapiBM25.csv')

In [50]:
tab

Unnamed: 0,Collocation,Structure syntaxique,Fréquence,LLR,p-value,TF-IDF,OkapiBM25
0,centre de recherche,NOM PRP NOM,5438,-inf,1.000000e+00,0.258199,9.458484
1,fin de vie,NOM PRP NOM,4871,-inf,1.000000e+00,0.443513,9.877333
2,soins palliatifs,NOM ADJ,3335,-inf,1.000000e+00,0.574180,6.253255
3,cliniques spécialisées,NOM VER:pper,3322,-inf,1.000000e+00,0.173394,5.587348
4,soins respiratoires,NOM ADJ,3276,-inf,1.000000e+00,0.313948,6.034909
...,...,...,...,...,...,...,...
24739,transport des patients sur civière,NOM PRP:det NOM PRP NOM,1,25.265177,4.996486e-07,0.077468,22.996460
24740,effets de la cigarette,NOM PRP DET:ART NOM,1,16.302214,5.400083e-05,0.118055,21.226215
24741,formation de surspécialisation en pathologie,NOM PRP NOM PRP NOM,1,20.499982,5.963180e-06,0.137844,21.764966
24742,implantation canadienne de la valve,NOM ADJ PRP DET:ART NOM,1,17.427491,2.984778e-05,0.104569,24.162951
