Log likelihood ratio - Bigrams

Lire le corpus

In [20]:
from collections import Counter
from random import sample

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import *
from tqdm import tqdm
from scipy.stats import binom, chi2

import shutil, re
from os import listdir, chdir, path
from pathlib import Path

acteur = 'pinel'
sous_corpus = False 
tag = ''

# Change the directory
base_path = '../03-corpus/2-data/1-fr/'
file_path = path.join(base_path, acteur) + '.csv'

In [21]:
with open(file_path, "r", encoding = "UTF-8") as f:
        data = read_csv(file_path)
        text = data['text'].tolist()
        corpus = [(re.sub('\d', '', t.strip('\n').lower().replace('’', '\''))) for t in text]

Extraire des bigrammes

In [22]:
import nltk

from nltk.tokenize import RegexpTokenizer

# Seulement les caractères alphabétiques
tokenizer_re = RegexpTokenizer(r"\w\'|\w+")

tokens = nltk.flatten([tokenizer_re.tokenize(doc) for doc in corpus])

In [23]:
N = len(tokens)

In [24]:
from nltk.util import bigrams, trigrams, ngrams
from nltk.probability import FreqDist

In [25]:
bg = list(bigrams(tokens))
tg = list(trigrams(tokens))
qg = list(ngrams(tokens, n=4))

In [26]:
# Stopwords fréquents en français (non lemmatisés)
file_path = "../04-filtrage/stopwords.txt"
with open(file_path, 'r', encoding="utf-8") as f:
    stopwords = [t.lower().strip('\n') for t in f.readlines()]
    stopwords += ["l'", "d'", "s'"]

# Filtrage des ngrammes
bg = [t for t in bg if not t[0] in stopwords and not t[-1] in stopwords and len(t[0]) > 2 and len(t[-1]) > 2]
tg = [t for t in tg if not t[0] in stopwords and not t[-1] in stopwords and len(t[0]) > 2 and len(t[-1]) > 2]
qg = [t for t in qg if not t[0] in stopwords and not t[-1] in stopwords and len(t[0]) > 2 and len(t[-1]) > 2]

In [27]:
N = len(tokens)
fd_ug = nltk.FreqDist(tokens)

# On prend les distributions de fréquences à partir des ngrammes issus de la liste des tokens parce que si on le fait à partir de la liste filtrée ci-dessus,
# on risque de se retrouver avec des divisions par zéro à l'étape suivante
fd_bg = nltk.FreqDist(bigrams(tokens)) #Bigrammes
fd_tg = nltk.FreqDist(trigrams(tokens)) #Trigrammes
fd_qg = nltk.FreqDist(ngrams(tokens, n=4)) # 4-grammes

tab_b = pd.DataFrame(fd_bg.items(), columns=["Collocation","Fréquence"]) 
tab_t = pd.DataFrame(fd_tg.items(), columns=["Collocation","Fréquence"])
tab_q = pd.DataFrame(fd_qg.items(), columns=["Collocation","Fréquence"])

In [28]:
tab_n = tab_b.append(tab_t).append(tab_q).drop_duplicates()
tab_n.sort_values(["Fréquence"], 
                    axis=0,
                    ascending=[False], 
                    inplace=True)


tab_n.to_csv('../00-Jupyter-Notebooks/test_ngrams.csv')

  tab_n = tab_b.append(tab_t).append(tab_q).drop_duplicates()


In [29]:
len_prior = len(tab_n)
print("Au départ, on a {} ngrammes.".format(len_prior))

Au départ, on a 65200 ngrammes.


In [30]:
def loglikelihood_ratio(c_prior, c_n, c_ngram, N):
    """
    Compute the ratio of two hypotheses of likelihood and return the ratio.
    The formula here and test verification values are taken from 
    Manning & Schūtze _Foundations of Statistical Natural Language Processing_ p.172-175
    Parameters:
    c_prior: count of word 1 if bigrams or count of [w1w2 .. w(n-1)] if ngram
    c_n : count of word 2 if bigrams or count of wn if ngram
    c12: count of bigram (w1, w2) if bigram or count of ngram if ngram
    N: the number of words in the corpus
    """

    p = c_n / N
    p1 = c_ngram / c_prior
    p2 = (c_n - c_ngram) / (N - c_prior)   
    # We proactively trap a runtimeWarning: divide by zero encountered in log,
    # which may occur with extreme collocations
    import warnings
    with warnings.catch_warnings(): # this will reset our filterwarnings setting
        warnings.filterwarnings('error')
        try:
            return (np.log(binom.pmf(c_ngram, c_prior, p)) 
                    + np.log(binom.pmf(c_n - c_ngram, N - c_prior, p)) 
                    - np.log(binom.pmf(c_ngram, c_prior, p1) )
                    - np.log(binom.pmf(c_n - c_ngram, N - c_prior, p2)))             
        except Warning:
            return np.inf 

In [40]:
llr_bigrammes = []

for b in set(bg):
    c1 = fd_ug[b[0]]
    c2 = fd_ug[b[1]]
    c12 = fd_bg[b]

    res = -2 * loglikelihood_ratio(c1, c2, c12, N)
    p = chi2.sf(res, 1) # 1 degrees of freedom

    if p < 0.05 or (res == float('-inf')):
        llr_bigrammes.append({'Collocation' : b, 'LLR': res, 'p-value': p})

In [41]:
llr_trigrammes = []

for t in set(tg):
    c_prior = fd_bg[t[:2]] # Antécédent = P(w1w2) (si on considère que P (w1w2w3) = P(w3) | P(w1w2)
    c_n = fd_ug[t[2]]
    c_ngram = fd_tg[t] 

    res = -2 * loglikelihood_ratio(c_prior, c_n, c_ngram, N)
    p = chi2.sf(res, 1) # 1 degrees of freedom

    if p < 0.05 or (res == float('-inf')):
        llr_trigrammes.append({'Collocation' : t, 'LLR': res, 'p-value': p})

In [42]:
llr_quadgrammes = []

for q in set(qg):
    c_prior = fd_tg[q[:3]] # Antécédent = P(w1w2w3) si on considère que P (w1w2w3w4) = P(w4 | P(w1w2w3)
    c_n = fd_ug[q[3]]
    c_ngram = fd_qg[q]

    res = -2 * loglikelihood_ratio(c_prior, c_n, c_ngram, N)
    p = chi2.sf(res, 1) # 1 degrees of freedom

    if p < 0.05 or (res == float('-inf')):
        llr_quadgrammes.append({'Collocation' : q, 'LLR': res, 'p-value': p})

In [43]:
df_bg = pd.DataFrame(llr_bigrammes)
df_tg = pd.DataFrame(llr_trigrammes)
df_qg = pd.DataFrame(llr_quadgrammes)

df = df_bg.append(df_tg).append(df_qg)
df.sort_values(['p-value'], 
            axis=0,
            ascending=[True], 
            inplace=True)

output_path = '../00-Jupyter-Notebooks/test_LLR_ngrams_CHUM.csv'
df.to_csv(output_path)

  df = df_bg.append(df_tg).append(df_qg)


Maintenant, on va le tester sur tous les bigrammes du corpus et filtrer pour ne conserver uniquement que ceux pour lesquels p est significatif (< 0.05) OU ceux pour lesquels LLR = -ing ET p-value = 1.0