Log likelihood ratio - Bigrams

Lire le corpus

In [12]:
from collections import Counter
from random import sample

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import *
from tqdm import tqdm
from scipy.stats import binom, chi2

import shutil, re
from os import listdir, chdir, path
from pathlib import Path

# Actor identifiers; the one selected below names the CSV file that is loaded.
acteurs = [
    'asso_ordres', 'chsld', 'chu_iu', 'cisss_ciusss', 'cliniques_medicales',
    'csbe', 'gmf', 'inesss', 'inspq', 'msss', 'ophq', 'quebec_sante',
    'ramq', 'sante_mtl', 'urgence_sante',
]
# Actor analysed in this run.
# NOTE(review): 'chum' is not listed in `acteurs` -- confirm this is intended.
acteur = 'chum'
sous_corpus = False
tag = ''

# Build the path to the actor's CSV file (the working directory is not changed).
base_path = '../03-corpus/2-data/1-fr/'
file_path = path.join(base_path, acteur + '.csv')

In [13]:
# Load the corpus CSV and normalise each document: trim newlines at both
# ends, lowercase, map the typographic apostrophe (’) to a plain one and
# remove digits.
# pandas opens the path itself, so no separate open() handle is needed.
data = pd.read_csv(file_path, encoding="UTF-8")
# Rows with a missing 'text' value load as float NaN, on which the string
# methods below would raise AttributeError; skip them.
text = data['text'].dropna().tolist()
# Raw string for the regex: '\d' in a normal string is an invalid escape.
corpus = [re.sub(r'\d', '', t.strip('\n').lower().replace('’', '\'')) for t in text]

Extraire des bigrammes

In [15]:
import nltk

from nltk.tokenize import RegexpTokenizer

# Token pattern: an elided form (one word character followed by an
# apostrophe, e.g. "l'") or, failing that, a run of word characters.
tokenizer_re = RegexpTokenizer(r"\w\'|\w+")

# Tokenise every document and gather all tokens in a single flat list.
tokens = [tok for doc in corpus for tok in tokenizer_re.tokenize(doc)]

In [18]:
# Total number of tokens in the corpus; passed as N to the likelihood-ratio test.
N = len(tokens)

In [4]:
from nltk.util import bigrams

In [19]:
# Every pair of adjacent tokens, in corpus order, as (w1, w2) tuples.
bg = list(zip(tokens, tokens[1:]))

In [20]:
# Frequent French stopwords (not lemmatised), one per line in the file.
# NOTE: this rebinds `file_path`, which previously held the corpus CSV path.
file_path = "../04-filtrage/stopwords.txt"
with open(file_path, 'r', encoding="utf-8") as f:
    stopwords = [t.lower().strip('\n') for t in f.readlines()]
    # Elided forms produced by the tokenizer.
    stopwords += ["l'", "d'", "s'"]

# A membership test on a list scans it end to end; a set makes each lookup
# constant-time on average. `stopwords` itself stays a list for other users.
stopword_set = set(stopwords)

# Keep a bigram only if neither word is a stopword and both words are longer
# than two characters.
bg = [
    (w1, w2)
    for (w1, w2) in bg
    if w1 not in stopword_set
    and w2 not in stopword_set
    and len(w1) > 2
    and len(w2) > 2
]

In [21]:
# Count how often each remaining bigram occurs.
fd_bg = nltk.FreqDist(bg)

# One row per distinct bigram, most frequent first.
tab = pd.DataFrame(fd_bg.items(), columns=["Bigramme", "Fréquence"]).sort_values(
    "Fréquence", ascending=False
)

tab.to_csv('../00-Jupyter-Notebooks/test_bigrams.csv')

In [22]:
# Number of bigram occurrences (not distinct bigrams) in `bg` at this point.
len_prior = len(bg)
print("Au départ, on a {} bigrammes.".format(len_prior))

Au départ, on a 156470 bigrammes.


In [23]:
def loglikelihood_ratio(c1, c2, c12, N):
    """
    Return the log of the likelihood ratio of two hypotheses about a bigram.

    Under the independence hypothesis (H0) we assume that there is no
    association between w1 and w2: the probability of seeing w2 is the same
    (p = c2 / N) whether or not the preceding word is w1.
    Under the alternative hypothesis (H1) the probability of w2 after w1
    (p1 = c12 / c1) differs from its probability after any other word
    (p2 = (c2 - c12) / (N - c1)).
    If the observed counts are significantly more likely under H1 than under
    H0, then (w1, w2) is a collocation.

    The formula and the verification values are taken from Manning & Schütze,
    _Foundations of Statistical Natural Language Processing_, p. 172-175.

    The binomial terms are evaluated directly in log space (binom.logpmf).
    Taking np.log of binom.pmf underflows to log(0) for the strongest
    collocations, which made them come out as infinite instead of as a large
    finite statistic.

    Parameters:
    c1: count of word 1
    c2: count of word 2
    c12: count of bigram (w1, w2)
    N: the number of words in the corpus

    Returns:
    The log likelihood ratio log(L(H0) / L(H1)), normally <= 0.
    np.inf is returned if a RuntimeWarning (e.g. an invalid floating-point
    operation) is raised while combining the four terms.

    Raises:
    ZeroDivisionError: with plain Python numbers, when c1 == 0 or c1 == N.

    The value: -2 * loglikelihood_ratio is asymptotically Chi-squared distributed
    so we can use Chi-squared table values to test the Null-Hypothesis
    against the second Hypothesis--the observed values--as represented by the ratio.

    The following example uses the 1990 NYT counts given by Manning and Schütze:
    >>> N = 14_307_668
    >>> c1 = 932 # powerful
    >>> c2 = 934 # computers
    >>> c12 = 10 # bigram count
    >>> res = -2 * loglikelihood_ratio(c1, c2, c12, N)
    >>> round(float(res), 2)
    82.38
    >>> # e.g. for an alpha value of 0.005,
    >>> # 1 degree of freedom requires a chi-squared value of 7.88
    >>> from scipy.stats import chi2
    >>> bool(chi2.sf(res, 1) < 0.005) # 1 degree of freedom
    True

    """
    import warnings

    p = c2 / N
    p1 = c12 / c1
    p2 = (c2 - c12) / (N - c1)
    # Only floating-point RuntimeWarnings are escalated and trapped; unrelated
    # warnings (e.g. deprecations) no longer turn the result into np.inf.
    with warnings.catch_warnings(): # restores the previous warning filters on exit
        warnings.simplefilter('error', RuntimeWarning)
        try:
            return (binom.logpmf(c12, c1, p)
                    + binom.logpmf(c2 - c12, N - c1, p)
                    - binom.logpmf(c12, c1, p1)
                    - binom.logpmf(c2 - c12, N - c1, p2))
        except RuntimeWarning:
            return np.inf

Maintenant, on va le tester sur une liste de bigrammes pour voir comment on pourra l'automatiser dans une fonction

In [25]:
# Score every distinct filtered bigram with the log-likelihood ratio test.
# Frequencies are tabulated once up front: calling list.count() inside the
# loop rescans the whole token / bigram list for every bigram (quadratic time).
token_counts = Counter(tokens)
bigram_counts = Counter(bg)

output = []
for b, c12 in bigram_counts.items():
    c1 = token_counts[b[0]]
    c2 = token_counts[b[1]]

    res = -2 * loglikelihood_ratio(c1, c2, c12, N)
    p = chi2.sf(res, 1) # 1 degree of freedom

    # Keep significant bigrams, plus those for which loglikelihood_ratio fell
    # back to np.inf (res is then -inf and the p-value comes out as 1).
    if p < 0.05 or (res == float('-inf') and p == 1):
        output.append([b, res, p])

In [None]:
output_path = '../00-Jupyter-Notebooks/test_LLR_bigrams.csv'

# Tabulate the retained bigrams, highest statistic first, and save them.
df = pd.DataFrame(
    output, columns=['Bigramme', 'Log-Likelihood Ratio', 'p-value']
).sort_values('Log-Likelihood Ratio', ascending=False)
df.to_csv(output_path)

In [None]:
# Number of bigrams retained by the filter applied when building `output`.
len_after = len(output)

Maintenant, on va le tester sur tous les bigrammes du corpus et filtrer pour ne conserver que ceux pour lesquels p est significatif (< 0.05) OU ceux pour lesquels LLR = -inf ET p-value = 1.0.