In [1]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/favor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [4]:
DECADE = '1902'
N_CLUSTERS = 2

# Chargement des fichiers de la décennie

In [5]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [6]:
# Exemple de fichiers
files[:1000]

['Bxl_1900_Tome_I1_Part_1.txt',
 'Bxl_1900_Tome_I1_Part_2.txt',
 'Bxl_1900_Tome_I1_Part_3.txt',
 'Bxl_1900_Tome_I1_Part_4.txt',
 'Bxl_1900_Tome_I1_Part_5.txt',
 'Bxl_1900_Tome_I1_Part_6.txt',
 'Bxl_1900_Tome_I1_Part_7.txt',
 'Bxl_1900_Tome_I1_Part_8.txt',
 'Bxl_1900_Tome_I2_Part_1.txt',
 'Bxl_1900_Tome_I2_Part_10.txt',
 'Bxl_1900_Tome_I2_Part_11.txt',
 'Bxl_1900_Tome_I2_Part_12.txt',
 'Bxl_1900_Tome_I2_Part_13.txt',
 'Bxl_1900_Tome_I2_Part_14.txt',
 'Bxl_1900_Tome_I2_Part_2.txt',
 'Bxl_1900_Tome_I2_Part_3.txt',
 'Bxl_1900_Tome_I2_Part_4.txt',
 'Bxl_1900_Tome_I2_Part_5.txt',
 'Bxl_1900_Tome_I2_Part_6.txt',
 'Bxl_1900_Tome_I2_Part_7.txt',
 'Bxl_1900_Tome_I2_Part_8.txt',
 'Bxl_1900_Tome_I2_Part_9.txt',
 'Bxl_1900_Tome_II1_Part_1.txt',
 'Bxl_1900_Tome_II1_Part_2.txt',
 'Bxl_1901_Tome_I1_Part_1.txt',
 'Bxl_1901_Tome_I1_Part_10.txt',
 'Bxl_1901_Tome_I1_Part_2.txt',
 'Bxl_1901_Tome_I1_Part_3.txt',
 'Bxl_1901_Tome_I1_Part_4.txt',
 'Bxl_1901_Tome_I1_Part_5.txt',
 'Bxl_1901_Tome_I1_Part_6.txt',


In [7]:
texts = [open(data_path + f).read() for f in files]

In [8]:
# Exemple de textes
texts[0][:1500]

"VILLE\n\nDE\n\nBULLETIN\n\nBRUXELLES.\n\nCOMMUNAL\n\nANNÉE\n\nP R E M I È R E\n\nTOME\n\nCOMPTE\n\nRENDU\n\n1900.\n\nP A R T I E .\n\nI.\n\nDES\n\nSÉANCES,\n\nBRUXELLES,\nIMPRIMERIE VEUVE JULIEN BAEllTSOEN, GRAND'PLACE, 5.\n1900\n\n\x0c\x0cN°l.\n\nCOMPTE RENDU DE LA SÉANCE DU 8 JANVIER 1900.\n\nVILLE DE B R U X E L L E S\n\nBULLETIN\n\nCOMMUNAL\n\nANNÉE\n\nCONSEIL\n\n1900\n\nC O M M U N A L .\n\nSéance du 8 Janvier 1 9 0 0 .\nPrésidence de M . EMILE D E M O T , Bourgmestre.\n\nSOMMAIRE :\n\n1.\n2.\n3.\n4.\n5.\n6.\n7.\n7A.\n8.\n\nPrestation de serment et installation des Conssi 11ers communaux\nélus le 15 octobre 1899.\nNomination et prestation de serment de quatre Echevins.\nCommunications.\nHommage à M . Charles Buis. — Proposition de MM. Leurs et\nBrabandt. — Adoption.\nPersonnel de l'Administration communale. — Délégation donnée\nau Collège pour la nomination.\nHospices. — Approbation d'actes divers d'administration.\nHospices. — Participation dans la formation du capital de la\nSo

# Vectorisation du texte

In [9]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [10]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [11]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

CPU times: user 2min 13s, sys: 378 ms, total: 2min 13s
Wall time: 4min 49s


In [12]:
tfidf_vectors

<233x6745 sparse matrix of type '<class 'numpy.float64'>'
	with 335537 stored elements in Compressed Sparse Row format>

In [28]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

cochers        0.207570
costumes       0.180229
el             0.125428
voiture        0.124241
électeurs      0.115907
                 ...   
le+premier     0.000000
le+projet      0.000000
le+quartier    0.000000
le+r           0.000000
gymnase        0.000000
Length: 6745, dtype: float64

# Comprendre les vecteurs et leurs "distances"

In [14]:
from scipy.spatial.distance import cosine

In [15]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [16]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [17]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [18]:
tfidf_array = tfidf_vectors.toarray()

In [19]:
tfidf_array[0]

array([0.00641187, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [29]:
texts[10][:200]

'(24 Décembre\n\n1900)\n\n§2. —\n\n988\n\n-\n\nPropriétés.\n\n— Les art. 12 à 14 sont adoptés.\nArt. 15. « Produit des marchés affermés et des droits de\nplace sur la voie publique » : 104,200 francs.\nM . le Bourgme'

In [31]:
tfidf_array[4]

array([0.0086317 , 0.        , 0.        , ..., 0.        , 0.        ,
       0.03168281])

In [22]:
cosine(tfidf_array[0], tfidf_array[1])

0.6012276518483897

In [32]:
cosine(tfidf_array[3], tfidf_array[9])

0.9238305439169949

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [24]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [25]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=2)

In [26]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [27]:
pprint(dict(clustering))

{0: ['Bxl_1900_Tome_II1_Part_1.txt',
     'Bxl_1900_Tome_II1_Part_2.txt',
     'Bxl_1902_Tome_II1_Part_1.txt',
     'Bxl_1902_Tome_II1_Part_2.txt',
     'Bxl_1903_Tome_II1_Part_1.txt',
     'Bxl_1903_Tome_II1_Part_2.txt',
     'Bxl_1904_Tome_II1_Part_1.txt',
     'Bxl_1904_Tome_II1_Part_2.txt',
     'Bxl_1905_Tome_II1_Part_1.txt',
     'Bxl_1905_Tome_II1_Part_2.txt',
     'Bxl_1905_Tome_II1_Part_3.txt',
     'Bxl_1906_Tome_II1_Part_1.txt',
     'Bxl_1906_Tome_II1_Part_2.txt',
     'Bxl_1906_Tome_II1_Part_3.txt',
     'Bxl_1907_Tome_II1_Part_1.txt',
     'Bxl_1907_Tome_II1_Part_2.txt',
     'Bxl_1907_Tome_II1_Part_3.txt',
     'Bxl_1908_Tome_II1_Part_1.txt',
     'Bxl_1908_Tome_II1_Part_2.txt',
     'Bxl_1908_Tome_II1_Part_3.txt',
     'Bxl_1909_Tome_II1_Part_1.txt',
     'Bxl_1909_Tome_II1_Part_2.txt'],
 1: ['Bxl_1900_Tome_I1_Part_1.txt',
     'Bxl_1900_Tome_I1_Part_2.txt',
     'Bxl_1900_Tome_I1_Part_3.txt',
     'Bxl_1900_Tome_I1_Part_4.txt',
     'Bxl_1900_Tome_I1_Part_5.txt',
     