In [1]:
lng = 'fr'
#tag = 'sante-mentale'      Si on veut traiter un seul fichier à la fois
#csv_file = tag + '.csv'    Si on veut traiter un seul fichier à la fois

import shutil, re, random, os
from os import listdir, chdir, path
from pathlib import Path
from pandas import *

import nltk
#nltk.download(['popular'])
from nltk.tokenize import RegexpTokenizer
tokenizer_re = RegexpTokenizer(r"\w\'|\w+")
from nltk import bigrams, trigrams, ngrams, everygrams
from nltk.probability import FreqDist


import treetaggerwrapper
tagger = treetaggerwrapper.TreeTagger(TAGLANG=lng)


from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import binom, chi2

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


In [11]:
def lire_corpus(csv_file, langue=lng):
    base_path = '../03-corpus/2-data/'
    file_path = path.join(base_path, '1-' + langue, csv_file)

    with open(file_path, "r", encoding = 'iso-8859-1') as f:
        data = read_csv(file_path)
        data = data[~data["text"].str.contains('ã')]['text']
        #data = data[~data["url"].str.contains('pdf')] # Si on veut exclure les PDFs 
        
    return data

def clean_data(data):
    text = [str(t).strip('\n').lower().replace('’', '\'').replace("œ", "oe") for t in data]
    punct = '[!#$%&\(\)•*+,-\/:;<=>?@[\]^_{|}~©«»—“”–—]'
    spaces = '\s+'
    urls = 'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    postal = '/^[ABCEGHJ-NPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ -]?\d[ABCEGHJ-NPRSTV-Z]\d$/i'
    phones = '(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}'

    text = [re.sub(spaces, ' ', t) for t in text]
    text = [re.sub(phones, " ", t).replace("  ", " " ) for t in text]
    text = [re.sub(postal, " ", t).replace("  ", " " ) for t in text]
    text = [re.sub(urls, " ", t).replace("  ", " " ) for t in text]
    text = [re.sub(punct, ' ', t).replace("  ", " " ) for t in text]
    
    return text


def sample_corpus(corpus, ratio):
    n = round(ratio * len(corpus))
    corpus = random.sample(corpus, n)
    print("On va travailler sur un échantillon correspondant à environ " + str(ratio * 100) + " % des documents du corpus, soit {} documents". format(len(corpus)))
    
    return " ".join(corpus)


def filter_mwesw(corpus):
    file_mwesw = '../04-filtrage/mwe_stopwords.txt'
    with open (file_mwesw, 'r', encoding='utf-8') as f:
        mwe_sw = [t.lower().strip('\n') for t in f.readlines()]
    for mwe in mwe_sw:
        corpus = corpus.replace(mwe, ' MWE_STOP ').replace('  ', " ")
    
    return corpus


def tok(corpus):
    # Seulement les caractères alphabétiques
    tokens = tokenizer_re.tokenize(corpus)
    print("Avec le RegExpTokenizer, notre corpus contient {} tokens.".format(len(tokens)))
    temps = round(len(tokens) / 15000 / 60)
    print('Le POS tagging devrait prendre environ {} minutes.'.format(temps))
    
    return tokens


def tagging(corpus): 
    return [(t.split('\t')[0], t.split('\t')[1]) for t in tagger.tag_text(corpus)]


def extr_ngrams(tagged):
    ngrammes= list(everygrams(tagged, min_len=2, max_len=6))
    print("Avant filtrage, on a {} ngrammes.".format(len(ngrammes)))
    
    return ngrammes

def extract_patterns(ngrammes):
    patterns = []
    for ng in ngrammes:
        phrase = [t[0] for t in ng]
        pattern = [t[1] for t in ng]
        patterns.append([phrase, pattern])

    return patterns


def freq(phrases):
    return FreqDist([" ".join(t[0]).replace("' ", "'") for t in phrases])


def import_stopwords():
    # Stopwords fréquents en français (non lemmatisés)
    file_path = "../04-filtrage/stopwords.txt"
    with open(file_path, 'r', encoding="utf-8") as f:
        stopwords = [t.lower().strip('\n') for t in f.readlines()]

    # Stopwords fréquents en anglais (non lemmatisés)
    file_path = '../04-filtrage/stop_words_english.txt'
    with open(file_path, 'r', encoding="utf-8") as f:
        stopwords += [t.lower().strip('\n') for t in f.readlines()]
    return stopwords

stopwords = import_stopwords()

def filtrer_stopwords(x):
    return [term for term in x if not 'MWE_STOP' in term[0] and not term[0][0] in stopwords and not term[0][-1] in stopwords]

def filter_num(x):
    return [term for term in x if not term[0][0].isnumeric() and not term[0][-1].isnumeric()]

def filter_len(x):
    return [term for term in x if \
        len(term[0][0]) > 2 and len(term[0][0]) < 18 and \
        len(term[0][-1]) > 2 and len(term[0][-1]) < 18]

def tabCSV(phrases):
    tab = DataFrame(phrases, columns=["Expression", "Patron syntaxique", "Fréquence"]).drop_duplicates()
    tab.sort_values(["Fréquence"], 
                        axis=0,
                        ascending=[False], 
                        inplace=True)
    return tab.values.tolist()

def import_patterns():
    file_patterns = '../04-filtrage/MeSH/mesh_patterns-fr.csv'
    with open (file_patterns, 'r') as f:
        patterns = read_csv(f)
        patterns = patterns['Structure'].tolist() #[:200]
    return patterns

patterns = import_patterns()

def filter_patterns(phrases):
    return [t for t in phrases if t[1] in patterns and not 'NOM NOM' in t[1]]


def loglikelihood_ratio(c_prior, c_n, c_ngram, N):
    """
    Compute the ratio of two hypotheses of likelihood and return the ratio.
    The formula here and test verification values are taken from 
    Manning & Schūtze _Foundations of Statistical Natural Language Processing_ p.172-175
    Parameters:
    c_prior: count of word 1 if bigrams or count of [w1w2 .. w(n-1)] if ngram
    c_n : count of word 2 if bigrams or count of wn if ngram
    c12: count of bigram (w1, w2) if bigram or count of ngram if ngram
    N: the number of words in the corpus
    """

    p = c_n / N
    p1 = c_ngram / c_prior
    p2 = (c_n - c_ngram) / (N - c_prior)   
    # We proactively trap a runtimeWarning: divide by zero encountered in log,
    # which may occur with extreme collocations
    # import warnings
    # with warnings.catch_warnings(): # this will reset our filterwarnings setting
    #     warnings.filterwarnings('error')
    try:
        return (np.log(binom.pmf(c_ngram, c_prior, p)) 
                + np.log(binom.pmf(c_n - c_ngram, N - c_prior, p)) 
                - np.log(binom.pmf(c_ngram, c_prior, p1) )
                - np.log(binom.pmf(c_n - c_ngram, N - c_prior, p2)))             
    except Exception as e:
        return np.inf 

In [12]:
def nlp(x):
    print('CORPUS - ' + x)
    tag = x.strip('.csv')
    
    print('Lecture du corpus')
    data = lire_corpus(x)
    text = clean_data(data)
    corpus = sample_corpus(text, 1)

    tokens=tok(corpus)
    corpus="  ".join(tokens).replace("' ", "'")
    corpus = filter_mwesw(corpus)


    print('Tagging du corpus')
    tagged=tagging(corpus)
    ngrammes=extr_ngrams(tagged)

    print('Extraction des patrons syntaxiques')
    phrases=extract_patterns(ngrammes)
    frequencies=freq(phrases)

    print('Filtrage des termes extraits')
    phrases=filtrer_stopwords(phrases)
    phrases=filter_num(phrases)
    phrases=filter_len(phrases)
    phrases=[[" ".join(term[0]).replace("' ", "'"), " ".join(term[1])] for term in phrases]

    for phrase in phrases:
        phrase.append(frequencies[phrase[0]])

    phrases = tabCSV(phrases)

    terms = filter_patterns(phrases)

    print('Test statistique (calcul du log-likelihood ratio)')
    # Log-likelihood ratio

    terms_patterns = DataFrame(terms, columns = ["Expression", "Structure syntaxique", "Fréquence"])
    terms_patterns = terms_patterns.to_dict('records')
    dict_patterns = {}
    for term in terms_patterns:
        exp = term['Expression']
        pattern = term['Structure syntaxique']
        dict_patterns[exp] = pattern



    N = len(tokens)
    print(str(N))
    fd_tokens = nltk.FreqDist(tokens)

    def llr_ngrammes(n):
        llr = []

        for i in range(2, n+1):
            ngrammes = set([tuple(tokenizer_re.tokenize(term[0])) for term in terms if len(tokenizer_re.tokenize(term[0])) == i])
            fd = nltk.FreqDist(ngrams(tokens, n=i))
            fd_prior = nltk.FreqDist(ngrams(tokens, n=i-1))
            
            for t in ngrammes:
                c_prior = fd_prior[t[:i-1]] # Antécédent = P(w1w2..w_n-1) (si on considère que P(w1w2...wn) = P(wn) | P(w1w2...w_n-1)
                c_n = fd_tokens[t[i-1]]     # Dernier mot du ngramme  P(wn)
                c_ngram = fd[t]             # Le ngramme lui-même P(w1w2w3..wn)

                res = -2 * loglikelihood_ratio(c_prior, c_n, c_ngram, N)
                p = chi2.sf(res, 1) # 1 degrees of freedom
                #if res == float('-inf') :
                #    res = 50000

                if p < 0.001 or (res == float('-inf')):
                    llr.append({'Collocation' : " ".join(t).replace("' ", "'"), 'Structure syntaxique': dict_patterns[" ".join(t).replace("' ", "'")], 'Fréquence' : c_ngram, 'LLR': res, 'p-value': p})

        return llr
    terms = llr_ngrammes(6)

    df = DataFrame(terms)
    df.sort_values(['Fréquence'], 
                axis=0,
                ascending=[False], 
                inplace=True)

    output_path = path.join('../04-filtrage/output/', tag + '_significant-collocations.csv') 
    df.to_csv(output_path)

    
    
    print('CORPUS - ' + x + ' Terminé ✓')


In [13]:
lng = 'fr'
fichiers = os.listdir('../03-corpus/2-data/1-' + lng + '/')

In [14]:
for file in fichiers:
    nlp(file)

CORPUS - acces-aux-services.csv
Lecture du corpus
On va travailler sur un échantillon correspondant à environ 100 % des documents du corpus, soit 117 documents
Avec le RegExpTokenizer, notre corpus contient 106397 tokens.
Le POS tagging devrait prendre environ 0 minutes.
Tagging du corpus
Avant filtrage, on a 542675 ngrammes.
Extraction des patrons syntaxiques
Filtrage des termes extraits
Test statistique (calcul du log-likelihood ratio)
106397


  return (np.log(binom.pmf(c_ngram, c_prior, p))


ZeroDivisionError: division by zero