## **3. Pondération statistique** (TF-IDF / OKapiBM25)  

https://stackoverflow.com/questions/46580932/calculate-tf-idf-using-sklearn-for-n-grams-in-python  
http://scikit-learn.sourceforge.net/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn-feature-extraction-text-tfidfvectorizer  
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://pypi.org/project/rank-bm25/

In [215]:
path = '../04-filtrage/output/'
acteur = 'msss'
tag = 'deficiences-et-handicaps'

if tag:
    csv_file = acteur + '_' + tag + '_significant-collocations.csv'

else:
    csv_file = acteur + '_significant-collocations.csv'

### **Lire le vocabulaire** (termes retenus au prétraitement)

In [216]:
from pandas import *

with open(path+csv_file, encoding='utf-8') as f:
    csv = read_csv(f).drop(columns = ['Unnamed: 0'])
    # On va traiter les apostrophes et les parenthèses avec le underscore ("_") ; le tokenizer de sklearn ne les aime pas 
    csv['Terme']= csv["Terme"].apply(lambda x: x.replace("'", "_").replace("(", "_").replace(")", "_"))

csv

Unnamed: 0,Terme,Structure syntaxique,Fréquence (TF),LLR,p-value,Fréquence documentaire (DF),isMeSHTerm,isTaxoTerm
0,services,NOM,1071,-,-,94,False,False
1,domicile,NOM,440,-,-,41,False,False
2,santé,NOM,405,-,-,72,False,False
3,déficience,NOM,393,-,-,81,False,True
4,pmatcom,NOM,350,-,-,32,False,False
...,...,...,...,...,...,...,...,...
605,composant,NOM,16,-,-,27,False,False
606,déploiement,NOM,16,-,-,14,False,False
607,gestionnaire secondaire daniel garneau,NOM ADJ ADJ NOM,16,272.9682413133986,2.5586261803234317e-61,16,False,False
608,types d_intervention,NOM PRP NOM,16,187.82116953292174,9.50466374208189e-43,6,False,False


In [217]:
vocabulaire = [t.lower() for t in list(csv['Terme'])]

In [218]:
print('On a un vocabulaire de {} formes.'.format(len(vocabulaire)))

On a un vocabulaire de 610 formes.


In [219]:
vocabulaire = [t.replace("'", "_").replace("(", "_").replace(")", "_") for t in vocabulaire]
vocabulaire

['services',
 'domicile',
 'santé',
 'déficience',
 'pmatcom',
 'tsa',
 'soutien',
 'accès',
 'intervention',
 'services sociaux',
 'sce',
 'trouble',
 'autisme',
 'projet',
 'spectre de l_autisme',
 'spectre',
 'trouble du spectre de l_autisme',
 'trouble du spectre',
 'usager',
 'nombre',
 'besoins',
 'aide',
 'indicateur',
 'données',
 'programme',
 'usagers',
 'communication',
 'ramq',
 'famille',
 'année',
 'type',
 'profil',
 'intellectuelle',
 'système',
 'contrôle',
 'déficience physique',
 'soutien à domicile',
 'msss',
 'évaluation',
 'clsc',
 'aides',
 'équipements',
 'québec',
 'cours',
 'cadre',
 'déficience intellectuelle',
 'adultes',
 'commande',
 'durée',
 'santé physique',
 'forum',
 'pilote',
 'projet pilote',
 'appareil',
 'dispositifs',
 'environnement',
 'service',
 'bluetooth',
 'appareils',
 'gestion',
 'indicateurs',
 'enfants',
 'utilisation',
 'programmes',
 'ministère',
 'alimentation',
 'maladies',
 'établissement',
 'maladies chroniques',
 'calcul',
 'attr

### **Lire le corpus**

In [220]:
import os, shutil, re
from pathlib import Path
from os import path
from pandas import *

base_path = '../03-corpus/2-data/1-fr/'
if tag:
    base_path = path.join(base_path, acteur, acteur + '_' + tag + '.csv')

else:
    base_path = path.join(base_path, acteur +  '.csv')
        
with open(base_path, "r", encoding = "UTF-8") as f:
    data = read_csv(base_path, sep=',')
    text = data['text'].tolist()

In [221]:
text = text[:round(len(text))]

nb_docs = len(text)

print("On a donc un corpus de {} documents.".format(nb_docs))

On a donc un corpus de 112 documents.


### **Nettoyage**

In [222]:
corpus = [str(t).strip('\n').lower().replace('’', '\'') for t in text]
    
punct = '[!#$%&•►*+,;<=>?@[\]^_{|}~©«»—“”–—]'
spaces = '\s+'
postals = '([a-zA-Z]+\d+|\d+[a-zA-Z]+)+'
phones = '\d{3}\s\d{3}-\d{4}' #très simple (trop)

corpus = [str(t).strip('\n').lower().replace('’', '\'') for t in corpus]
corpus = [re.sub(spaces, ' ', t) for t in corpus]
corpus = [re.sub(punct, ' ', t).replace("' ", "'" ) for t in corpus]
corpus = [re.sub(phones, ' STOP ', t) for t in corpus]
corpus = [re.sub(postals, ' STOP ', t) for t in corpus]
corpus = [t.replace("  ", " " ) for t in corpus]
corpus = [t.replace("'", "_").replace("(", "_").replace(")", "_") for t in corpus]


On va commencer par utiliser le CountVectorizer pour valider que l'implémentation de sklearn arrive bien au même compte que nous 

In [223]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=vocabulaire, ngram_range=(1,10), token_pattern='\w+')
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

features_names = vectorizer.get_feature_names_out()
corpus_index = [corpus.index(n) for n in corpus]

import pandas as pd
df = pd.DataFrame(X.T.todense(), index=features_names, columns=corpus_index).transpose()

In [224]:
csv['TF (sklearn)'] = 0
for t in vocabulaire:
    csv.loc[csv['Terme'] == t, 'TF (sklearn)'] = df[t].sum()


csv['DF (sklearn)'] = 0
for t in vocabulaire:
    freqdoc = len(df[df[t] != 0])
    csv.loc[csv['Terme'] == t, 'DF (sklearn)'] = freqdoc
csv

Unnamed: 0,Terme,Structure syntaxique,Fréquence (TF),LLR,p-value,Fréquence documentaire (DF),isMeSHTerm,isTaxoTerm,TF (sklearn),DF (sklearn)
0,services,NOM,1071,-,-,94,False,False,1233,94
1,domicile,NOM,440,-,-,41,False,False,424,41
2,santé,NOM,405,-,-,72,False,False,401,72
3,déficience,NOM,393,-,-,81,False,True,374,81
4,pmatcom,NOM,350,-,-,32,False,False,317,30
...,...,...,...,...,...,...,...,...,...,...
605,composant,NOM,16,-,-,27,False,False,14,10
606,déploiement,NOM,16,-,-,14,False,False,16,14
607,gestionnaire secondaire daniel garneau,NOM ADJ ADJ NOM,16,272.9682413133986,2.5586261803234317e-61,16,False,False,16,16
608,types d_intervention,NOM PRP NOM,16,187.82116953292174,9.50466374208189e-43,6,False,False,16,6


In [225]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import RegexpTokenizer

#toknizer = RegexpTokenizer()

# max_df : ignore words that appear in 85% of documents, 
# min df:  ignore words that appear in less than 1% of documents 
# vocabulary = vocabulaire

# Sans utiliser le vocabulaire
# tfidf = TfidfVectorizer(min_df=0.1, stop_words=None, ngram_range=(2,4), max_df=0.85, use_idf=True)

# vocabulary = vocabulaire
tfidf_vectorizer = TfidfVectorizer(vocabulary=vocabulaire, ngram_range=(1,10), lowercase=False, token_pattern='\w+')
tfidf = tfidf_vectorizer.fit_transform(corpus)

In [226]:
features_names = tfidf_vectorizer.get_feature_names_out()
corpus_index = [corpus.index(n) for n in corpus]

import pandas as pd
df = pd.DataFrame(tfidf.T.todense(), index=features_names, columns=corpus_index).transpose()

In [227]:
terms_tfidf = {term: df[term].max() for term in df}
zeros = {term : value for term, value in terms_tfidf.items() if terms_tfidf[term] == 0}

In [228]:
zeros

{'alimentation': 0.0,
 'maladies': 0.0,
 'maladies chroniques': 0.0,
 'autisme _tsa_': 0.0,
 'projet-pilote': 0.0,
 'élément': 0.0,
 'i-clsc': 0.0,
 'profil iso-smaf': 0.0,
 'autonomie des personnes âgées': 0.0,
 'attribution du pmatcom': 0.0,
 'alimentation en données': 0.0,
 'accompagnement': 0.0,
 'heures de service de soutien à domicile': 0.0,
 'accès à un système': 0.0,
 'projet-pilote visant l_accès à un système': 0.0,
 'heures de service de soutien': 0.0,
 'projet-pilote visant l_accès': 0.0,
 'projet-pilote visant': 0.0,
 'accès à un système de contrôle de l_environnement': 0.0,
 'accès à un système de contrôle': 0.0,
 'amélioration': 0.0,
 'assurance maladie': 0.0,
 'assurance maladie du québec': 0.0,
 'main-d_œuvre': 0.0,
 'sous-centres': 0.0,
 'alimentation électrique': 0.0,
 'autisme gestionnaire principal': 0.0,
 'autisme gestionnaire': 0.0,
 'avenir': 0.0,
 'admissibilité': 0.0,
 'autisme _di-tsa_': 0.0,
 'résidence privée pour personnes âgées': 0.0,
 'résultats source d_

In [229]:
len(zeros) / len(vocabulaire) *100

5.737704918032787

In [230]:
terms_tfidf

{'services': 0.5349847438692181,
 'domicile': 0.5376780962869535,
 'santé': 0.3390994261903309,
 'déficience': 0.46159725612702607,
 'pmatcom': 0.5999527313511764,
 'tsa': 0.28308803333971555,
 'soutien': 0.3120198289926014,
 'accès': 0.06841604972113328,
 'intervention': 0.25134582288792345,
 'services sociaux': 0.2225789253184717,
 'sce': 0.39250876000876767,
 'trouble': 0.3716580473314477,
 'autisme': 0.0834043019529732,
 'projet': 0.46127020890493226,
 'spectre de l_autisme': 0.24383624125377332,
 'spectre': 0.24383624125377332,
 'trouble du spectre de l_autisme': 0.24383624125377332,
 'trouble du spectre': 0.24383624125377332,
 'usager': 0.0843038520906856,
 'nombre': 0.18403848670773515,
 'besoins': 0.26720286634421364,
 'aide': 0.18538832306398134,
 'indicateur': 0.1308569710076092,
 'données': 0.2170812846816091,
 'programme': 0.41580206485777693,
 'usagers': 0.18570542804036808,
 'communication': 0.2965078204093053,
 'ramq': 0.22433411457019783,
 'famille': 0.2368476579584293,

In [231]:
terms_weighted = DataFrame(terms_tfidf.items(), columns=['Terme', 'TF-IDF'])

In [232]:
terms_weighted.sort_values(["TF-IDF"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

terms_weighted = terms_weighted.drop_duplicates(keep='first')
terms_weighted

Unnamed: 0,Terme,TF-IDF
120,guide,0.661131
236,cliniques,0.614395
305,dépistage,0.607883
4,pmatcom,0.599953
322,places,0.543512
...,...,...
377,heures de service de soutien à domicile,0.000000
65,alimentation,0.000000
295,alimentation en données,0.000000
275,attribution du pmatcom,0.000000


In [233]:
terms_weighted = pd.merge(csv, terms_weighted, on='Terme').drop_duplicates(
  subset = ['Terme', 'Fréquence (TF)'],
  keep = 'first').reset_index(drop = True)

## **OKapi BM25**
https://hal.archives-ouvertes.fr/hal-00760158 

In [234]:
from rank_bm25 import BM25Okapi

In [235]:
bm25 = BM25Okapi([t.split() for t in corpus])

In [236]:
vocabulaire

['services',
 'domicile',
 'santé',
 'déficience',
 'pmatcom',
 'tsa',
 'soutien',
 'accès',
 'intervention',
 'services sociaux',
 'sce',
 'trouble',
 'autisme',
 'projet',
 'spectre de l_autisme',
 'spectre',
 'trouble du spectre de l_autisme',
 'trouble du spectre',
 'usager',
 'nombre',
 'besoins',
 'aide',
 'indicateur',
 'données',
 'programme',
 'usagers',
 'communication',
 'ramq',
 'famille',
 'année',
 'type',
 'profil',
 'intellectuelle',
 'système',
 'contrôle',
 'déficience physique',
 'soutien à domicile',
 'msss',
 'évaluation',
 'clsc',
 'aides',
 'équipements',
 'québec',
 'cours',
 'cadre',
 'déficience intellectuelle',
 'adultes',
 'commande',
 'durée',
 'santé physique',
 'forum',
 'pilote',
 'projet pilote',
 'appareil',
 'dispositifs',
 'environnement',
 'service',
 'bluetooth',
 'appareils',
 'gestion',
 'indicateurs',
 'enfants',
 'utilisation',
 'programmes',
 'ministère',
 'alimentation',
 'maladies',
 'établissement',
 'maladies chroniques',
 'calcul',
 'attr

In [237]:
#tokenizer = RegexpTokenizer(r"\w\'|\w+")
## Revoir ça
tokenized_queries = [t.split() for t in set(vocabulaire)]

features_names = [t for t in set(vocabulaire)]
corpus_index = [corpus.index(n) for n in corpus]

tab = [bm25.get_scores(query) for query in tokenized_queries]
df = pd.DataFrame(tab, index=features_names, columns=corpus_index).transpose()

In [238]:
print(len(tokenized_queries))

tokenized_queries

610


[['trouble', 'envahissant'],
 ['matière'],
 ['contribution'],
 ['contexte'],
 ['soutien'],
 ['dispositifs', 'd_alimentation'],
 ['allocations'],
 ['projet-pilote', 'visant'],
 ['déficience', 'intellectuelle', 'et', 'trouble', 'du', 'spectre'],
 ['communication', 'et', 'sce'],
 ['module'],
 ['système', 'd_information', 'sur', 'la', 'clientèle'],
 ['nombre', 'total', 'd_heures'],
 ['dispositifs', 'de', 'montage'],
 ['difficulté'],
 ['autonomie'],
 ['équipement'],
 ['heures', 'de', 'service'],
 ['portrait'],
 ['sat'],
 ['projet-pilote'],
 ['autisme', 'gestionnaire'],
 ['accès', 'à', 'un', 'système', 'de', 'contrôle', 'de', 'l_environnement'],
 ['propulsion'],
 ['ressource'],
 ['internet'],
 ['nutrition'],
 ['heures', 'de', 'service', 'de', 'soutien'],
 ['source'],
 ['transmission'],
 ['tablette'],
 ['cliniques'],
 ['accès', 'à', 'un', 'système', 'de', 'contrôle'],
 ['continuité'],
 ['suppléance', 'à', 'la', 'communication'],
 ['installation'],
 ['moment'],
 ['inhalothérapie', 'à', 'domici

In [239]:
df

Unnamed: 0,trouble envahissant,matière,contribution,contexte,soutien,dispositifs d_alimentation,allocations,projet-pilote visant,déficience intellectuelle et trouble du spectre,communication et sce,...,limites,répertoire des indicateurs de gestion en santé,vigueur,aide technique,numéro de formulaire gestred,calcul formule nombre,programmes,territoire,eésad,gestion en santé
0,1.320640,0.000000,0.000000,1.186355,0.515800,0.240302,0.000000,0.000000,9.348663,1.805457,...,0.580496,8.418974,1.012115,0.000000,6.630706,1.863801,0.752928,0.00000,0.000000,3.036550
1,1.107872,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.170283,1.767370,...,0.876256,9.204401,0.000000,0.000000,1.857857,1.912129,0.730181,0.00000,0.000000,3.375750
2,1.068933,0.000000,2.481539,0.000000,1.705414,0.263997,0.000000,0.000000,8.106618,1.811597,...,0.000000,8.884283,0.000000,1.158420,1.870125,1.033199,0.792302,0.00000,4.377613,3.347299
3,1.521525,0.000000,0.000000,0.000000,0.000000,0.387408,0.000000,0.000000,9.623338,1.801846,...,0.935858,9.730335,0.000000,0.000000,3.887400,1.516194,0.880984,0.00000,0.000000,3.795469
4,1.238219,0.855284,0.000000,0.000000,1.591295,0.337345,3.120176,0.000000,8.814438,1.805103,...,0.814920,9.160715,0.000000,0.697773,5.383942,1.635820,0.000000,2.14853,0.000000,3.390130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,0.000000,0.728572,0.000000,1.418707,0.616822,0.000000,0.000000,1.789693,4.179338,5.109696,...,0.694188,5.996857,0.000000,4.766581,2.666148,0.000000,0.406538,0.00000,0.000000,2.375085
59,0.000000,0.000000,0.000000,0.000000,0.000000,1.626046,0.000000,2.051250,4.183793,5.709833,...,0.000000,6.369299,1.387229,0.000000,1.865501,0.000000,1.094572,0.00000,0.000000,2.698741
60,0.000000,1.409520,0.000000,0.000000,0.865736,0.000000,0.000000,1.621597,3.623734,5.604604,...,0.628987,5.315335,0.000000,4.001059,2.355443,0.000000,0.368354,0.00000,0.000000,1.690994
61,0.000000,0.000000,0.000000,0.000000,0.000000,3.101932,0.000000,1.379636,3.564119,5.983508,...,0.000000,5.239154,1.495610,5.053124,2.251827,0.000000,0.313391,0.00000,0.000000,1.677164


In [240]:
#df.to_csv(base_path + titre + '_matrice-OkapiBM25.csv') # Si on veut avoir la matrice (mais le fichier peut être très volumineux)

In [241]:
terms_okapi = {term: df[term].max() for term in df}

In [242]:
terms_okapi

{'trouble envahissant': 5.251591807475578,
 'matière': 1.790931290162034,
 'contribution': 2.9478616636204586,
 'contexte': 2.4312735645118657,
 'soutien': 1.7684872829760538,
 'dispositifs d_alimentation': 4.596593719879635,
 'allocations': 3.1497308409390263,
 'projet-pilote visant': 4.164884928898214,
 'déficience intellectuelle et trouble du spectre': 10.410995707583346,
 'communication et sce': 6.2876381188978065,
 'module': 3.2957140199709656,
 'système d_information sur la clientèle': 6.33120861357348,
 'nombre total d_heures': 7.105812115196743,
 'dispositifs de montage': 8.760162895365598,
 'difficulté': 4.2535667731343265,
 'autonomie': 3.512520729713149,
 'équipement': 4.840690409599016,
 'heures de service': 7.398686065921494,
 'portrait': 1.4450404297083403,
 'sat': 4.374620697378904,
 'projet-pilote': 2.189823439853786,
 'autisme gestionnaire': 3.3674022430400976,
 'accès à un système de contrôle de l_environnement': 13.531472943446488,
 'propulsion': 2.0720520054530907,


In [243]:
tab = DataFrame(terms_okapi.items(), columns=['Terme', 'OkapiBM25'])
tab

Unnamed: 0,Terme,OkapiBM25
0,trouble envahissant,5.251592
1,matière,1.790931
2,contribution,2.947862
3,contexte,2.431274
4,soutien,1.768487
...,...,...
605,calcul formule nombre,1.912129
606,programmes,1.094572
607,territoire,2.420975
608,eésad,4.377613


In [244]:
tab.sort_values(["OkapiBM25"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

tab

Unnamed: 0,Terme,OkapiBM25
498,services psychosociaux pour les jeunes en diff...,17.975599
274,contrôle de l_environnement et à la technologi...,14.941249
236,forum québécois sur le trouble du spectre de l...,14.675228
377,heures de service de soutien à domicile,14.149632
338,réseau de la santé et des services sociaux,14.057097
...,...,...
386,amélioration,0.000000
166,accompagnement,0.000000
185,avenir,0.000000
356,alimentation,0.000000


In [245]:
terms_weighted

Unnamed: 0,Terme,Structure syntaxique,Fréquence (TF),LLR,p-value,Fréquence documentaire (DF),isMeSHTerm,isTaxoTerm,TF (sklearn),DF (sklearn),TF-IDF
0,services,NOM,1071,-,-,94,False,False,1233,94,0.534985
1,domicile,NOM,440,-,-,41,False,False,424,41,0.537678
2,santé,NOM,405,-,-,72,False,False,401,72,0.339099
3,déficience,NOM,393,-,-,81,False,True,374,81,0.461597
4,pmatcom,NOM,350,-,-,32,False,False,317,30,0.599953
...,...,...,...,...,...,...,...,...,...,...,...
605,composant,NOM,16,-,-,27,False,False,14,10,0.120842
606,déploiement,NOM,16,-,-,14,False,False,16,14,0.079319
607,gestionnaire secondaire daniel garneau,NOM ADJ ADJ NOM,16,272.9682413133986,2.5586261803234317e-61,16,False,False,16,16,0.043476
608,types d_intervention,NOM PRP NOM,16,187.82116953292174,9.50466374208189e-43,6,False,False,16,6,0.089036


In [246]:
tab = pd.merge(terms_weighted, tab, on="Terme")
tab.sort_values(["OkapiBM25"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)

In [247]:
base_path = '../05-transformation/'

In [248]:
if tag:
    file_path = base_path + acteur + '_' + tag + '_weighting_OKapiBM25--NO-VOC.csv'

else: 
    file_path = base_path + acteur  + '_weighting_OKapiBM25-NO-VOC.csv'
tab.to_csv(file_path)

In [249]:
tab

Unnamed: 0,Terme,Structure syntaxique,Fréquence (TF),LLR,p-value,Fréquence documentaire (DF),isMeSHTerm,isTaxoTerm,TF (sklearn),DF (sklearn),TF-IDF,OkapiBM25
390,services psychosociaux pour les jeunes en diff...,NOM ADJ PRP DET:ART NOM PRP NOM,24,408.6314092088783,7.278380157027306e-91,4,False,False,24,4,0.193916,17.975599
380,contrôle de l_environnement et à la technologi...,NOM PRP DET:ART NOM KON PRP DET:ART NOM ADJ,24,330.6300707157602,7.0074096349940544e-74,24,False,False,24,24,0.256496,14.941249
154,forum québécois sur le trouble du spectre de l...,NOM ADJ PRP DET:ART NOM PRP:det NOM PRP DET:AR...,53,648.7195797703292,4.242069274655342e-143,34,False,False,53,34,0.235635,14.675228
377,heures de service de soutien à domicile,NOM PRP NOM PRP NOM PRP NOM,24,263.5405998774374,2.9024822958443215e-59,6,False,False,0,0,0.000000,14.149632
283,réseau de la santé et des services sociaux,NOM PRP DET:ART NOM KON PRP:det NOM ADJ,33,389.11574663313723,1.2893797391311264e-86,19,False,False,33,19,0.232596,14.057097
...,...,...,...,...,...,...,...,...,...,...,...,...
373,accompagnement,NOM,24,-,-,15,False,False,0,0,0.000000,0.000000
65,alimentation,NOM,107,-,-,49,False,False,0,0,0.000000,0.000000
19,nombre,NOM,218,-,-,62,False,False,212,56,0.184038,0.000000
55,environnement,NOM,121,-,-,34,False,False,2,2,0.038477,0.000000
