In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd
def read_file(path):
    with open(path, encoding="utf-8", errors="ignore") as f:
        chaine = f.read()
    return chaine

def get_list_lang(lang="fr"):
    liste_langues_fr_txt = read_file(f"liste_langues_{lang}.txt")
    liste_langues_fr_txt = liste_langues_fr_txt.replace("\n",",")
    liste_langues_fr_txt = liste_langues_fr_txt.replace("  ","")
    liste_langues_fr_txt = liste_langues_fr_txt.replace('"',"")
    liste_langues_fr_txt = liste_langues_fr_txt.lower()
    liste_langues_fr_txt = liste_langues_fr_txt.split(",")
    return liste_langues_fr_txt

def br_appliquee(liste_chemins,liste_langues,langue="french"):
    """Prend en entrée la liste de chemins des articles -créée avec la fonction chemin_corpus(chemin)- 
    + la liste de langues -créée avec la fonction liste_langues(fichierjson)
    Renvoie un dictionnaire de dictionnaires de forme {chemin: {langue_citée: nb_occurrence}}"""
    
    dico_br_app = {}
    for chemin in liste_chemins:
        chemin_propre = chemin.split("\\")[-1]
        chaine = read_file(chemin).lower()
        chaine_l_propre = re.sub(r'[^\w]',' ', chaine)
        liste_mots_tok = word_tokenize(chaine_l_propre, language=langue)
        
        for mot in liste_mots_tok:
            for lg in liste_langues:
                if mot == lg:
                    dico_br_app.setdefault(chemin_propre, {})
                    dico_br_app[chemin_propre].setdefault(lg, 0)
                    dico_br_app[chemin_propre][lg]+=1

    return dico_br_app
def chemin_corpus(chemin_dossier,langue="fr",taln_ou_acl="taln"):
    """Prend en entrée le chemin du corpus ("archives_Boudin_txt/*/*/actes/*")
    Renvoie une liste contenant les chemins des articles en français"""
    
    liste_chemins = []

    for chemin in glob.glob(chemin_dossier):
        chaine = read_file(chemin)
        if detect(chaine)==langue:
            ###POUR EXCLURE LES DOCUMENTS QUI NE SONT PAS VRAIMENT ARTICLES (souvent très courts)
            if taln_ou_acl=="taln":
                tirets = chemin.split("-")
                #if len(tirets)>1:
                type_art = tirets[-2]
                if type_art != "demo" and type_art !="invite":
                    liste_chemins.append(chemin)
            if taln_ou_acl=="acl":
                if "P" in chemin:
                    if "the" in chaine:
                        liste_chemins.append(chemin)
                        
            if taln_ou_acl=="lrec":
                if "the" in chaine:
                    liste_chemins.append(chemin)
            
    return liste_chemins

def chemins_annotes(fichier_csv, cas_bender,nbheader=2,taln_ou_acl="taln"):
    """Prend en entrée le chemin vers le fichier csv qui contient le tableau d'annotation manuelle ("annotation_csv.csv")
    + le cas de figure d'application de la règle de Bender ('Appliquée', 'Non-appliquée', ...)
    + le nb_header à préciser ntmt pour langue traitée car une ligne en-dessous 
    Retourne la liste des chemins d'articles concernés"""
    
    dataset = pd.read_csv(fichier_csv,header=nbheader)
    annotes_cas = dataset.loc[dataset[cas_bender].notnull()]
    liste_annotes_cas = list(annotes_cas.iloc[:,1].values)
    if taln_ou_acl == "taln":
        url = "http://talnarchives.atala.org/TALN/"
    if taln_ou_acl == "acl":
        url = "https://aclanthology.org/"
    if taln_ou_acl == "lrec":
        url = "http://www.lrec-conf.org/proceedings/"
    
    chemins_annotes_cas = []
    for chemin in liste_annotes_cas:
        if type(chemin)==str:
            if url not in chemin and taln_ou_acl=="lrec":
                chemin = "http://www.lrec-conf.org/proceedings/"+chemin
            if url in chemin:
                dossiers = chemin.split("/")
                chemin_propre = dossiers[-1]
                chemin_propre = chemin_propre.replace(".pdf",".txt")
                chemins_annotes_cas.append(chemin_propre)
                
    return chemins_annotes_cas



In [2]:
#from tools_FD import *
from langdetect import detect
import glob
import re

In [3]:
def dico_br_appliquee_phrases(liste_chemins_annotes,liste_langues,chemins_annotes_appliquee):#,langue="french"):
    #faire deux dicos : 
#un où on envoie les phrases qui contiennent un nom de langue qui est bien la langue étudiée (annotée manuellement comme appliquée)
#un où on envoie les phrases qui contiennent un nom de langue mais qui n'est pas la langue étudiée (annotée manuellement comme autre chose qu'appliquée)
    dico_phrases_nomlg = {"appliquée":[], "non-appliquée": []}
    stats = {}
    for chemin in liste_chemins_annotes:
            #nettoyer articles et chemins
        chemin_propre = chemin.split("/")[-1]
        chaine = read_file(chemin).lower()
        liste_phrases = chaine.split(".")
        for phrase in liste_phrases:
            for mot in phrase.split():
                for l in liste_langues:
                    if mot == l:
                        stats.setdefault(mot, 0)
                        stats[mot]+=1
                        if chemin_propre in chemins_annotes_appliquee:
                            if phrase not in dico_phrases_nomlg["appliquée"]:
                                dico_phrases_nomlg["appliquée"].append(phrase)
                        else: #if chemin_propre not in chemins_annotes_appliquee:
                            if phrase not in dico_phrases_nomlg["non-appliquée"]:
                                dico_phrases_nomlg["non-appliquée"].append(phrase)
    L = [[eff, mot] for mot, eff in stats.items()]
    print(sorted(L)[-10:])
    return dico_phrases_nomlg

liste_chemins_annotes_taln = chemin_corpus("articles_annotes/*/*","fr","taln")
liste_lang = get_list_lang("fr")


In [4]:
dico_br_app_taln_a = br_appliquee(liste_chemins_annotes_taln,liste_lang,"french")
data_annote_all = dico_br_appliquee_phrases(liste_chemins_annotes_taln,liste_lang, dico_br_app_taln_a)

[[13, 'vote'], [15, 'espagnol'], [22, 'chinois'], [29, 'arabe'], [37, 'multilingue'], [64, 'allemand'], [70, 'tchèque'], [211, 'anglais'], [1038, 'français'], [1780, 'langues']]


In [6]:
import json
with open("data_classif133b.json") as f:
    data_annote = json.load(f)
textes_fr = [x[0] for x in data_annote]
classes_fr = [x[1] for x in data_annote]

In [7]:
from sklearn.linear_model import LogisticRegression
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

classifiers = []
for pen in ["l1", "l2"]:#"elasticnet",
    classifiers.append(["LogReg_saga_pen=%s"%pen, 
                        LogisticRegression(multi_class="auto", solver="saga", 
                                           max_iter=500, class_weight="balanced",
                                          penalty = pen)])

    
vectorizers=[
    ["Tfidf",TfidfVectorizer()],
    ["Count", CountVectorizer()]
]
for nom_classif, algo in classifiers:
    print(nom_classif)
    for name, V in vectorizers:
        X1_fr = V.fit_transform(textes_fr)
        Train_V = V.fit(textes_fr)
        Y1_fr = classes_fr
        X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(X1_fr, Y1_fr, test_size=0.3, random_state=0)
        start = time.perf_counter()
        clf_fr = algo.fit(X_train_fr, y_train_fr)
        end = time.perf_counter()
        print(name, "... %.2f seconds"%(round(end-start,3)))
        pred_fr = clf_fr.predict(X_test_fr)

        conf_matrix_fr = confusion_matrix(y_test_fr, pred_fr)
        report_fr = classification_report(y_test_fr,pred_fr)
        print(report_fr)


LogReg_saga_pen=l1




Tfidf ... 0.38 seconds
               precision    recall  f1-score   support

    appliquée       0.98      0.92      0.95       234
non-appliquée       0.10      0.29      0.15         7

    micro avg       0.90      0.90      0.90       241
    macro avg       0.54      0.60      0.55       241
 weighted avg       0.95      0.90      0.93       241

Count ... 0.54 seconds
               precision    recall  f1-score   support

    appliquée       0.97      0.94      0.96       234
non-appliquée       0.07      0.14      0.10         7

    micro avg       0.92      0.92      0.92       241
    macro avg       0.52      0.54      0.53       241
 weighted avg       0.95      0.92      0.93       241

LogReg_saga_pen=l2
Tfidf ... 0.24 seconds
               precision    recall  f1-score   support

    appliquée       0.97      1.00      0.99       234
non-appliquée       0.00      0.00      0.00         7

    micro avg       0.97      0.97      0.97       241
    macro avg       0.49

  'precision', 'predicted', average, warn_for)


Count ... 0.22 seconds
               precision    recall  f1-score   support

    appliquée       0.97      0.98      0.97       234
non-appliquée       0.00      0.00      0.00         7

    micro avg       0.95      0.95      0.95       241
    macro avg       0.49      0.49      0.49       241
 weighted avg       0.94      0.95      0.95       241



In [8]:
with open("corpus_lremap.json",encoding="utf-8")as f:
    names_LRE = json.load(f)
    
stats_LRE = {}
#data_annote = [["ACL", "420-acl_annotation_bender.json"], ["LREC", "550-lrec_annotation_bender.json"]]4
data_annote = [["TALN","annotation_taln.json"]]
resultats = {}
for conf_name, json_path in data_annote:
    print(conf_name)
    resultats[conf_name] = {"Sentence":[], "Baseline":[], "Baseline_LRE":[], "Sentence_LRE":[]}
    with open(json_path) as f:
        dic_conf = json.load(f)
    for path_article, infos_article in dic_conf.items():
        classe = infos_article["class"]
        if classe =="Non-déductible" or classe =="Non-Aplicable":
            classe="N/A"
        chaine = read_file(path_article)
        has_LRE = False
        for name in names_LRE:
            if name in chaine:
                has_LRE = True
                stats_LRE.setdefault(name,0)
                stats_LRE[name]+=1
        chaine = chaine.lower()
        phrases = chaine.split(".")
        a_tester = []
        for phrase in phrases:
            for mot in phrase.split():
                for l in liste_lang:
                    if mot == l:
                        a_tester.append(phrase)
        if len(a_tester)>0:
            pred_baseline = "Appliquée"
            pred_LRE = pred_baseline
            if classe =="Déductible":#pour avoir aussi les déductibles
                pred_baseline = classe
            X_test2 = Train_V.transform(a_tester)
            Y_pred2 = clf_fr.predict(X_test2)
        else:
            pred_baseline = "N/A"
            Y_pred2= [["N/A"]]
            if has_LRE==True:
                pred_LRE = "Déductible"
            else:
                pred_LRE = pred_baseline

        NB_pos = len([x for x in Y_pred2 if x== "appliquée"])
        if "appliquée" in Y_pred2:
            pred_sentence = "Appliquée"
            if classe =="Déductible":#pour avoir aussi les déductibles
                pred_sentence = classe
        else:
            pred_sentence = "N/A"
        pred_sentence_LRE = pred_sentence
        if pred_sentence!="Appliquée":
            if pred_LRE=="Déductible":
                pred_sentence_LRE = pred_LRE
            "Sentence_LRE"
        resultats[conf_name]["Sentence"].append([classe, pred_sentence])
        #, {"Phr_lg":len(a_tester), "Phr_lg_pos":NB_pos}])
        resultats[conf_name]["Baseline"].append([classe, pred_baseline])
        resultats[conf_name]["Baseline_LRE"].append([classe, pred_LRE])
        resultats[conf_name]["Sentence_LRE"].append([classe, pred_sentence_LRE])


TALN


In [9]:
from scipy.stats import spearmanr
#"Sentence":[], "Baseline":[], "Baseline_LRE":[]

for class_name, dic_res_taln in resultats["TALN"].items():
    print("-"*20)
    print(class_name+"\n")
    y_test, y_pred = [x[0] for x in dic_res_taln], [x[1] for x in dic_res_taln]
    report_taln = classification_report(y_test, y_pred, output_dict=True)
    print("TALN", str(spearmanr(y_test,y_pred)))
    #print(report_lrec)
#     dic_res_acl = resultats["ACL"][class_name]
#     y_test, y_pred = [x[0] for x in dic_res_acl], [x[1] for x in dic_res_acl]
#     report_acl = classification_report(y_test, y_pred, output_dict=True)
#     print("ACL", str(spearmanr(y_test,y_pred)))
    #print(report_acl)
    dic_classes = {"Appliquée":"Applied\t", "Déductible":"Deducible", "N/A":"N/A\t", "macro avg":"macro avg", "weighted avg":"micro avg"}
    for classe in ["Appliquée", "Déductible", "N/A", "macro avg", "weighted avg"]:
        liste_ligne = [dic_classes[classe]+"\t"]
        for this_res in [report_taln]:
            for mesure in ["precision", "recall", "f1-score"]:
                liste_ligne.append(round(this_res[classe][mesure], 3))
            liste_ligne.append(this_res[classe]["support"])
        print(" & ".join([str(x) for x in liste_ligne])+"\\\\")
        


--------------------
Sentence

TALN SpearmanrResult(correlation=0.6110282391708873, pvalue=2.3703778353816496e-14)
Applied		 & 0.9 & 0.947 & 0.923 & 95\\
Deducible	 & 1.0 & 0.538 & 0.7 & 13\\
N/A		 & 0.2 & 0.5 & 0.286 & 8\\
macro avg	 & 0.525 & 0.496 & 0.477 & 127\\
micro avg	 & 0.788 & 0.795 & 0.78 & 127\\
--------------------
Baseline

TALN SpearmanrResult(correlation=0.44302828982052134, pvalue=1.8297752879229262e-07)
Applied		 & 0.85 & 0.958 & 0.901 & 95\\
Deducible	 & 1.0 & 0.615 & 0.762 & 13\\
N/A		 & 0.167 & 0.25 & 0.2 & 8\\
macro avg	 & 0.504 & 0.456 & 0.466 & 127\\
micro avg	 & 0.749 & 0.795 & 0.765 & 127\\
--------------------
Baseline_LRE

TALN SpearmanrResult(correlation=0.271258387838038, pvalue=0.0020372095726604823)
Applied		 & 0.791 & 0.958 & 0.867 & 95\\
Deducible	 & 0.714 & 0.385 & 0.5 & 13\\
N/A		 & 0.4 & 0.25 & 0.308 & 8\\
macro avg	 & 0.476 & 0.398 & 0.419 & 127\\
micro avg	 & 0.69 & 0.772 & 0.719 & 127\\
--------------------
Sentence_LRE

TALN SpearmanrResult(corr



In [10]:
print(stats_LRE)

{'AAC': 41, 'RSC': 6, 'Automatic Content Extraction': 4, 'ESTER': 12, 'British National Corpus': 25, 'BNC': 25, 'UE': 67, 'ODIN': 1, 'EAGLE': 1, 'CID': 12, 'Le Monde': 12, 'AMI': 46, 'C 3': 3, 'Switchboard': 1, 'GMA': 4, 'RTE': 20, 'ERJ': 6, 'Le Robert': 2, 'NSP': 3, 'UMLS': 14, 'DUC': 15, 'BART': 4, 'BAT': 9, 'FEAT': 2, 'Semi': 13, 'TLFi': 2, 'Lefff': 7, 'CSI': 2, 'Ngram Statistics Package': 1, 'TUT': 4, 'NYT': 2, 'ELC': 1, 'TAC KBP': 2, 'Unified Medical Language System': 3, 'COW': 2, 'MALLET': 2, 'WSJ': 1, 'Machine Learning for Language Toolkit': 1, 'WOLF': 1, 'SUMO': 6, 'ESO': 2, 'frmg': 3, 'MIM': 2, 'ESLO': 1, 'ANNODIS': 2, 'DiSCo': 1, 'FCE': 1, 'ACD': 3, 'DMC': 1, 'MPQA': 2, 'SUC': 1}


In [11]:
from scipy.stats import spearmanr

a = ["a", "b", "c"]
b = ["a", "a", "c"]

print(spearmanr(a,b))

SpearmanrResult(correlation=0.8660254037844387, pvalue=0.3333333333333332)


In [28]:
#### Tester vectoriseurs
algo = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=500, class_weight="balanced")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
#Balanced est indispensable
classifiers = [
    #["base+bal", LogisticRegression(multi_class="auto", solver="lbfgs", class_weight="balanced")
    #],
    #["base+bal+iter", LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=1000, class_weight="balanced")
    #],
        ["base+bal+saga", LogisticRegression(multi_class="auto", solver="saga", class_weight="balanced")
    ],
    ["base+bal+iter+saga", LogisticRegression(multi_class="auto", solver="saga", max_iter=1000, class_weight="balanced")
    ],
    #["LogisticRegr_Bal_C=0.01", LogisticRegression(multi_class="auto", solver="saga", C=0.01, n_jobs=2, max_iter=1000, class_weight="balanced")],
    ["LogisticRegr_Bal_C=0.05", LogisticRegression(multi_class="auto", solver="saga", C=0.05, n_jobs=2, max_iter=1000, class_weight="balanced")],
    ["Decision Tree", DecisionTreeClassifier()],
    #["MNB", MultinomialNB()],
    #["RF_depth=10_estimators=100", RandomForestClassifier(max_depth=10, n_estimators = 100, random_state=0)],
]
    

#V = CountVectorizer(ngram_range=(3,5), analyzer="char")
list_vectorizer = [
    ["Count 1-1\t", CountVectorizer()],
    ["Count 3-7--char", CountVectorizer(ngram_range=(3,7), analyzer="char")],
    #["tfidf", TfidfVectorizer()],
    #["tfidf 3-7--char", TfidfVectorizer(ngram_range=(3,7), analyzer="char")],
]
list_res = []
for name, clf in classifiers:
  for nom, this_V in list_vectorizer:
    X1_fr = this_V.fit_transform(textes_fr)
    Train_V = this_V.fit(textes_fr)
    Y1_fr = classes_fr
    
    X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(X1_fr, Y1_fr, test_size=0.3, random_state=0)
    start = time.perf_counter()
    clf_new = clf.fit(X_train_fr, y_train_fr)
    end = time.perf_counter()
    #print("... %.2f seconds"%(round(end-start,2)))
    pred_fr = clf_new.predict(X_test_fr)

    conf_matrix_fr = confusion_matrix(y_test_fr, pred_fr)
    #print("Classifieur 1\n",conf_matrix_en) #na/app 229/3 2/9
    report_fr = classification_report(y_test_fr,pred_fr,output_dict=True)
    print(name, nom, report_fr["macro avg"]["f1-score"])

    list_res.append([report_fr["macro avg"]["f1-score"], name, nom] )
for a in sorted(list_res):
    print(a)



base+bal+saga Count 1-1	 0.5305194805194806
base+bal+saga Count 3-7--char 0.565195998031819
base+bal+iter+saga Count 1-1	 0.4861407249466951
base+bal+iter+saga Count 3-7--char 0.4904862579281184
LogisticRegr_Bal_C=0.05 Count 1-1	 0.5176491994177583
LogisticRegr_Bal_C=0.05 Count 3-7--char 0.4904862579281184
Decision Tree Count 1-1	 0.6121244635193133
Decision Tree Count 3-7--char 0.4883227176220807
[0.4861407249466951, 'base+bal+iter+saga', 'Count 1-1\t']
[0.4883227176220807, 'Decision Tree', 'Count 3-7--char']
[0.4904862579281184, 'LogisticRegr_Bal_C=0.05', 'Count 3-7--char']
[0.4904862579281184, 'base+bal+iter+saga', 'Count 3-7--char']
[0.5176491994177583, 'LogisticRegr_Bal_C=0.05', 'Count 1-1\t']
[0.5305194805194806, 'base+bal+saga', 'Count 1-1\t']
[0.565195998031819, 'base+bal+saga', 'Count 3-7--char']
[0.6121244635193133, 'Decision Tree', 'Count 1-1\t']


In [29]:
### ROC curve
#data_annote = [["ACL", "420-acl_annotation_bender.json"], ["LREC", "551-lrec_annotation_bender.json"]]
data_annote = [["TALN","annotation_taln.json"]]

res_ROC = {}
for tranche in range(10, 110, 10):
  dic_resultats = {}
  for conf_name, json_path in data_annote:
    print(conf_name, tranche)
    dic_resultats[conf_name] = {"Sentence":[], "Baseline":[], "Baseline_LRE":[]}
    with open(json_path) as f:
        dic_conf = json.load(f)
    for path_article, infos_article in dic_conf.items():
        classe = infos_article["class"]
        if classe =="Non-déductible" or classe =="Non-Applicable":
            classe="N/A"
        chaine = read_file(path_article)
        has_LRE = False
        chaine = chaine.lower()
        phrases = chaine.split(".")
        a_tester = []
        for cpt, phrase in enumerate(phrases):
            if cpt>(len(phrases)*tranche/100):
                break
            for mot in phrase.split():
                for l in liste_lang:
                    if mot == l:
                        a_tester.append(phrase)
        if len(a_tester)>0:
            pred_baseline = "Appliquée"
            pred_LRE = pred_baseline
            if classe =="Déductible":#pour avoir aussi les déductibles
                pred_baseline = classe
            X_test2 = Train_V.transform(a_tester)
            Y_pred2 = clf_fr.predict(X_test2)
        else:
            pred_baseline = "N/A"
            Y_pred2= [["N/A"]]
            pred_LRE = pred_baseline
            if has_LRE==True:
                pred_LRE = "Déductible"
        if "appliquée" in Y_pred2:
            pred_sentence = "Appliquée"
            if classe =="Déductible":#pour avoir aussi les déductibles
                pred_sentence = classe
        else:
            pred_sentence = "N/A"        
        dic_resultats[conf_name]["Sentence"].append([classe, pred_sentence])
        dic_resultats[conf_name]["Baseline"].append([classe, pred_baseline])
        dic_resultats[conf_name]["Baseline_LRE"].append([classe, pred_LRE])
    res_ROC[tranche] = dic_resultats

TALN 10


ValueError: X has 217808 features per sample; expecting 4936

In [18]:
print(res_ROC.keys())

dict_keys([])


In [17]:
#with open("resultats_roc.json", "w") as w:
#    w.write(json.dumps(res_ROC, indent = 2))

In [19]:
print(clf_fr)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='auto', n_jobs=None, penalty='l2', random_state=None,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)


In [21]:
liste_data_conf = [["TALN", "archives_Boudin_txt/archives_Boudin_txt/*/actes"]]#, ["ACL"]]
for name_conf, path_conf in liste_data_conf:
    liste_path = glob.glob(path_conf)