In [None]:
import itertools
import os
import string
import timeit
from collections import Counter
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandarallel import pandarallel

In [None]:
from textblob import TextBlob
import nltk
from nltk import word_tokenize
from nltk.util import ngrams

In [None]:
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

In [None]:
import spacy
# Load the spaCy pipeline "fr_core_news_sm"; `nlp` is used below to parse the contributions.
nlp = spacy.load("fr_core_news_sm")

In [None]:
# Seed constant; not referenced in the cells visible here — presumably meant for
# later stochastic steps (TODO confirm where it is consumed).
seed = 1

In [None]:
# Date stamps (dd-mm-yy) listed for the GDN data.
dates_GDN = [
    "31-01-19",
    "06-02-19",
    "17-02-19",
    "02-03-19",
    "08-03-19",
    "21-03-19",
]

# Short topic key -> full GDN topic identifier.
sujets_GDN = dict(
    ecologie="LA_TRANSITION_ECOLOGIQUE",
    democratie="DEMOCRATIE_ET_CITOYENNETE",
    fisc="LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES",
    etat="ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS",
)
# Entries found in the data/VD directory. os.listdir() returns them in arbitrary
# order, so sort them to get the same list on every run and every machine.
sujets_VD = sorted(os.listdir("data/VD"))

In [None]:
#flatten = lambda l: [item for sublist in l for item in sublist]

In [None]:
# More efficient than a nested list comprehension.
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one nesting level is removed; deeper nesting is left untouched.
    """
    return list(itertools.chain.from_iterable(l))

In [None]:
def filter_flatten_tags(docs):
    """Return one flat list of the tokens of all ``docs``, skipping noise tokens.

    A token is skipped when any of its ``is_punct``, ``is_space`` or ``is_stop``
    attributes is truthy; the remaining tokens keep their original order.
    """
    kept = []
    for tokens in docs:
        for t in tokens:
            if t.is_punct or t.is_space or t.is_stop:
                continue
            kept.append(t)
    return kept

In [None]:
# Let pandas display up to 70 columns before eliding the middle ones.
pd.options.display.max_columns = 70

--------------------

---------

### Fichier utilisé actuellement pour les tests:

`2019-03-04_justice-police-armee_consultation-3.csv`

In [None]:
# Load the consultation export currently used for the tests.
# (Plain string: the previous f-string prefix had no placeholders to interpolate.)
df = pd.read_csv("data/VD/2019-03-04_justice-police-armee_consultation-3.csv")

In [None]:
# Show the raw DataFrame as the cell output for a quick visual check.
df

In [None]:
# Keep only the free-text column, drop rows without text and renumber the rows from 0.
body = (
    df[["contributions_bodyText"]]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Number of propositions that have a non-empty text.
len(body)

-> 1400 Propositions au total

In [None]:
# Parse every contribution in one batched spaCy pass and time parsing + filtering.
start = time()
docs = list(nlp.pipe(body["contributions_bodyText"]))
# Per document, keep the tokens whose POS tag is neither SPACE nor PUNCT.
filtered_docs = [
    [t for t in tokens if t.pos_ not in {"SPACE", "PUNCT"}]
    for tokens in docs
]
# Stop the clock before printing so that console I/O is not part of the measurement.
elapsed = time() - start
print(filtered_docs)
print(f"Time: {elapsed:.2f}s")

In [None]:
# Flat list of every token, across all documents, whose POS tag is neither SPACE nor PUNCT.
print([
    token
    for doc in docs
    for token in doc
    if not (token.pos_ == "SPACE" or token.pos_ == "PUNCT")
])

In [None]:
# Benchmark 1: filter() + lambda per document, then flatten (total seconds for 100 runs).
# globals=globals() lets timeit resolve `docs` and `flatten` from the notebook namespace.
timeit.timeit(
    'flatten([list(filter(lambda t: t.pos_ not in {"SPACE", "PUNCT"}, tokens)) for tokens in docs])',
    globals=globals(),
    number=100,
)

In [None]:
# Benchmark 2: single nested comprehension filtering on the POS tag (total seconds for 100 runs).
# globals=globals() exposes `docs` to the statement; no helper import is needed here.
timeit.timeit(
    '[t for tokens in docs for t in tokens if t.pos_ not in {"SPACE","PUNCT"} ]',
    globals=globals(),
    number=100,
)

In [None]:
# Benchmark 3: nested comprehension using the boolean token flags combined with `or`
# (total seconds for 100 runs). globals=globals() exposes `docs` to the statement.
timeit.timeit(
    '[t for tokens in docs for t in tokens if not (t.is_punct or t.is_space or t.is_stop)]',
    globals=globals(),
    number=100,
)

In [None]:
# Benchmark 4: same filter as benchmark 3, written as a chain of `and not`
# (total seconds for 100 runs). globals=globals() exposes `docs` to the statement.
timeit.timeit(
    '[t for tokens in docs for t in tokens if not t.is_punct and not t.is_space and not t.is_stop]',
    globals=globals(),
    number=100,
)

La compréhension de liste est plus rapide.

In [None]:
# Tokenised propositions: keep the tokens that are not stop words, not punctuation
# and at least 3 characters long.
# nlp.pipe() processes the texts as a batch instead of calling nlp() once per row;
# it yields one parsed document per input text, in the same order.
start = time()
body["tokens"] = pd.Series(
    [
        [mot for mot in doc if not (mot.is_stop or mot.is_punct or len(mot) < 3)]
        for doc in nlp.pipe(body["contributions_bodyText"])
    ],
    index=body.index,
    dtype=object,
)
print(f"Time: {time()-start:.2f}s")

In [None]:
# Lemma of every kept token, one list per proposition.
body["lemmas"] = body["tokens"].map(
    lambda kept: [tok.lemma_ for tok in kept]
)

In [None]:
# (part-of-speech tag, lemma) pair for every kept token, one list per proposition.
body["tags"] = body["tokens"].map(
    lambda kept: [(tok.pos_, tok.lemma_) for tok in kept]
)

In [None]:
# Number of kept tokens per proposition.
body["propLen"] = body["tokens"].map(len)

In [None]:
# Number of sentences per proposition, as segmented by TextBlob.
# NOTE(review): TextBlob's default sentence splitter is presumably English-oriented —
# confirm it is adequate for French text.
body["nbPhrases"] = body["contributions_bodyText"].map(
    lambda texte: len(TextBlob(texte).sentences)
)

In [None]:
# Unigrams: every lemma wrapped in a 1-tuple, one list per proposition.
body["unigram"] = body["lemmas"].map(
    lambda lemmes: list(ngrams(lemmes, 1))
)

In [None]:
# Bigrams: pairs of consecutive lemmas, one list per proposition.
body["bigram"] = body["lemmas"].map(
    lambda lemmes: list(ngrams(lemmes, 2))
)

In [None]:
# Sanity check: unigrams of the proposition labelled 0.
list(ngrams(body["lemmas"].loc[0], 1))

In [None]:
# Show the enriched DataFrame (text plus the derived token / n-gram columns).
body

In [None]:
# Corpus-wide n-gram lists: concatenate the per-proposition lists.
bigrams = flatten(list(body["bigram"]))
unigram = flatten(list(body["unigram"]))

In [None]:
# Frequency tables of the n-grams over the whole corpus.
unigramCounter = Counter()
unigramCounter.update(unigram)
bigramCounter = Counter()
bigramCounter.update(bigrams)

In [None]:
# The 20 most frequent unigrams with their counts.
unigramCounter.most_common(20)

In [None]:
# The 20 most frequent bigrams with their counts.
bigramCounter.most_common(20)

In [None]:
def create_pos_dict(posList):
    """Group lemmas by part-of-speech tag.

    Args:
        posList: iterable of ``(tag, lemma)`` pairs, i.e. the order in which the
            "tags" column is built (``(token.pos_, token.lemma_)``).

    Returns:
        dict mapping each tag to the list of its lemmas, in encounter order
        (duplicates are kept so that they can be counted afterwards).
    """
    pos_dict = dict()
    # The pairs arrive as (tag, lemma); unpacking them the other way round would
    # key the dictionary by lemma instead of by part-of-speech tag.
    for tag, lemma in posList:
        pos_dict.setdefault(tag, []).append(lemma)
    return pos_dict

In [None]:
def init_counter_dict(pos_data):
    """Build one ``Counter`` per key of ``pos_data``.

    Args:
        pos_data: dict mapping a key to a list of items.

    Returns:
        dict with the same keys, each mapped to a ``Counter`` of its items.
    """
    return {pos: Counter(items) for pos, items in pos_data.items()}

In [None]:
# Flatten the per-proposition (POS tag, lemma) lists into one corpus-wide list
# and group it with create_pos_dict.
pos_data = create_pos_dict(flatten(body["tags"].tolist()))

In [None]:
# One Counter per key of pos_data; reuse the helper defined for exactly this
# purpose instead of repeating its loop inline.
pos_counters = init_counter_dict(pos_data)

In [None]:
def dsp_most_common_pos(dict_data, dict_counters, n):
    """Print the ``n`` most frequent entries for every key of ``dict_data``.

    Args:
        dict_data: mapping whose keys are iterated; each key is passed to
            ``spacy.explain`` to build the printed heading.
        dict_counters: mapping from the same keys to ``Counter`` objects.
        n: number of most common entries printed per key.
    """
    separator = "=" * 40
    for tag in dict_data:
        label = spacy.explain(tag)
        print(f'Les {n} {label} les plus fréquents :')
        for entry in dict_counters[tag].most_common(n):
            print(entry)
        print(separator)

In [None]:
# Print, for every key of pos_data, its 15 most frequent entries.
dsp_most_common_pos(pos_data, pos_counters, 15)