# Preparation du corpus CAMille

Import

In [8]:
from pathlib import Path
import re
from collections import Counter

Recencement des années disponibles dans le corpus

In [9]:
data_dir = Path("../../data/txt")
all_txt = list(data_dir.rglob("*.txt"))

year_pat = re.compile(r"(18|19)\d{2}")
years = []

for p in all_txt:
    m = year_pat.search(p.name)
    if m:
        years.append(m.group(0))

year_counts = Counter(years)
print(f"Total fichiers txt: {len(all_txt)}")
print(f"Années détectées: {len(year_counts)}")

for y, c in sorted(year_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{y}: {c}")

Total fichiers txt: 51
Années détectées: 37
1950: 4
1892: 2
1903: 2
1913: 2
1894: 2
1899: 2
1933: 2
1949: 2
1939: 2
1927: 2
1860: 2
1911: 2
1884: 1
1906: 1
1926: 1
1920: 1
1846: 1
1912: 1
1947: 1
1853: 1
1922: 1
1836: 1
1940: 1
1857: 1
1924: 1
1850: 1
1902: 1
1946: 1
1895: 1
1918: 1
1930: 1
1886: 1
1925: 1
1887: 1
1943: 1
1873: 1
1885: 1


Choix et chargement de l'annee 1950

In [10]:
YEAR = "1950"
pat = re.compile(rf"{YEAR}")

files_year = [p for p in all_txt if pat.search(p.name)]
print(f"Fichiers trouvés pour {YEAR}: {len(files_year)}")

corpus_year = ""
for path in files_year:
    for enc in ("utf-8", "latin-1", "cp1252"):
        try:
            with open(path, "r", encoding=enc, errors="ignore") as f:
                corpus_year += f.read() + "\n"
            break
        except Exception:
            continue

print(f"Taille du corpus {YEAR}: {len(corpus_year):,} caractères")
print("\nExtrait:\n", corpus_year[:500])

Fichiers trouvés pour 1950: 4
Taille du corpus 1950: 133,895 caractères

Extrait:
 L'AVENIR DU LUXEMBOURG Samedi 15 avri j 350, 
MORHET 
Soirée dramatique 
1 Le cercle dramatique Sainte-Cécile 
de Morhet reprendra, ce dimanche 16 
avril ^Quasimodo), sa brillante soirée 
qui a remporté un succès si remarqua-
| bie le 10 mars dernier. 
i Rappelons ie programme : 
; 1) ouverture : « Brabançonne »,par 
• la Fantare ; 2) « La .bohème », chœur 
à 2 voix exécuté par JV^.es Renée Cara, 
j Josée Goffin, Anyse Hubermont et Hé-
f lène Bellanger ; a) La comédie en deux 
actes de Marcell* 


# Extraction de Keywords

## Imports

In [13]:
import os
import yake
import pandas as pd

## Extraire les mots clés d'un document avec Yake

https://github.com/LIAAD/yake

In [38]:
# Instancier l'extracteur
kw_extractor = yake.KeywordExtractor(lan="fr", n=3, dedupLim=0.9, dedupFunc="seqm", windowsSize=2, top=30)

In [39]:
# Lister les Fichiers de l'année 1950
YEAR = "1950"
files_YEAR = [p for p in all_txt if YEAR in p.name]

In [40]:
# Imprimer le nombre de fichiers identifiés
print(len(files_year))

4


In [41]:
# Les premiers fichiers
[f.name for f in files_year[:10]]

['KB_JB421_1950-04-15_01-00004.txt',
 'KB_JB572_1950-06-07_01-00004.txt',
 'KB_JB773_1950-07-22_01-00010.txt',
 'KB_JB837_1950-12-01_01-00007.txt']

In [42]:
# Choisir un fichier
this_file = files_year[0]
this_file

WindowsPath('../../data/txt/KB_JB421_1950-04-15_01-00004.txt')

In [43]:
# Récupérer le texte du fichier
text = open(this_file, "r", encoding="utf-8", errors="ignore").read()
len(text)

39690

In [44]:
# Extraire les mots clés de ce texte
kws = kw_extractor.extract_keywords(text)
kws[:10]

[('MORHET Soirée dramatique', np.float64(0.0030620731204945655)),
 ('Van Zeeland', np.float64(0.0034696802972468406)),
 ('Renée Cara', np.float64(0.005843820319018758)),
 ('cercle dramatique Sainte-Cécile', np.float64(0.0063214064561661)),
 ('Madeleine Van Mullem', np.float64(0.007451394848005148)),
 ('Bruxelles', np.float64(0.007964269390347075)),
 ('Madeleine Ska', np.float64(0.008697568145652755)),
 ("d'une", np.float64(0.008991250150359485)),
 ('Josée Goffin', np.float64(0.009245105952324528)),
 ('Van', np.float64(0.010757041762850186))]

Mettre en DataFrame

In [45]:
df = pd.DataFrame(kws, columns=["term", "score"])
df.head()

Unnamed: 0,term,score
0,MORHET Soirée dramatique,0.003062
1,Van Zeeland,0.00347
2,Renée Cara,0.005844
3,cercle dramatique Sainte-Cécile,0.006321
4,Madeleine Van Mullem,0.007451


Filtrer uniquement les bi-grammes

In [46]:
df2 = df[df["term"].str.split().str.len() == 2].copy()
len(df2)

15

Trions par score croissant et affichons

In [47]:
df2 = df2.sort_values("score", ascending=True).reset_index(drop=True)
df2.head(15)

Unnamed: 0,term,score
0,Van Zeeland,0.00347
1,Renée Cara,0.005844
2,Madeleine Ska,0.008698
3,Josée Goffin,0.009245
4,roi Léopold,0.011665
5,Solange Noirot,0.014375
6,Jeanine Grandjean,0.015065
7,Morhet reprendra,0.015255
8,Van Mullem,0.022524
9,Bruxelles Bruxelles,0.023435


## Faire la même opération sur tous les documents

Boucler sur tous les fichiers 

In [48]:
hasattr(kw_extractor, "extract_keywords"), type(kw)

(True, str)

In [49]:
for f in sorted(files_year)[:10]:
    text = open(f, "r", encoding="utf-8", errors="ignore").read()
    keywords = kw_extractor.extract_keywords(text)
    kept = []
    for kw_text, score in keywords:
        #words = kw_text.split()
        if len(kw_text.split()) == 2:
            kept.append(kw_text)
    print(f"{f.name} mentions these keywords: {', '.join(kept)}...")

KB_JB421_1950-04-15_01-00004.txt mentions these keywords: Van Zeeland, Renée Cara, Madeleine Ska, Josée Goffin, roi Léopold, Solange Noirot, Jeanine Grandjean, Morhet reprendra, Van Mullem, Bruxelles Bruxelles, MORHET Soirée, Solange Grandjean, cercle dramatique, dramatique Sainte-Cécile, Roi Bruxelles...
KB_JB572_1950-06-07_01-00004.txt mentions these keywords: pigeons lâchés, VAN ERCK, Van Oyen, Van Eycken, Van Binst...
KB_JB773_1950-07-22_01-00010.txt mentions these keywords: Lames Gillette, Gillette Bleues, Gillette Lames, Gillette L'homme, Garage VAN, VAN WANGH, L'AVENIR Samedi, vendre rue...
KB_JB837_1950-12-01_01-00007.txt mentions these keywords: pâte dentifrice, dentifrice PRODENT, Gala Wagner, Dieux barbares, BRUXELLES JEUDI, EST-IL POSSIBLE, PRODENT pâte...


In [50]:
rows = []
for f in sorted(files_year):
    text = open(f, "r", encoding="utf-8", errors="ignore").read()
    rows.append((f.name, kw_extractor.extract_keywords(text)))

In [51]:
out = []
for f in sorted(files_year):
    text = open(f, "r", encoding="utf-8", errors="ignore").read()
    rows.append((f.name, kw_extractor.extract_keywords(text)))

In [52]:
out = []
for fname, pairs in rows:
    for term, score in pairs:
        out.append((fname, term, score))

In [53]:
all_kw = pd.DataFrame(out, columns=["file", "term", "score"])
all_kw.head()

Unnamed: 0,file,term,score
0,KB_JB421_1950-04-15_01-00004.txt,MORHET Soirée dramatique,0.003062
1,KB_JB421_1950-04-15_01-00004.txt,Van Zeeland,0.00347
2,KB_JB421_1950-04-15_01-00004.txt,Renée Cara,0.005844
3,KB_JB421_1950-04-15_01-00004.txt,cercle dramatique Sainte-Cécile,0.006321
4,KB_JB421_1950-04-15_01-00004.txt,Madeleine Van Mullem,0.007451


In [54]:
big = all_kw[all_kw["term"].str.split().str.len() == 2].copy()
len(big)

70

In [55]:
top2 = (big.groupby("term")["score"]
        .mean()
        .sort_values()
        .head(20)
        .reset_index())
top2

Unnamed: 0,term,score
0,Van Zeeland,0.00347
1,Renée Cara,0.005844
2,Lames Gillette,0.00739
3,Madeleine Ska,0.008698
4,Josée Goffin,0.009245
5,roi Léopold,0.011665
6,pigeons lâchés,0.012354
7,Solange Noirot,0.014375
8,Gillette Bleues,0.015029
9,Jeanine Grandjean,0.015065


In [59]:
top2_by_file = (big.groupby(["file", "term"])["score"]
                .mean().sort_values()
                .groupby(level=0).head(5)
                .reset_index())
top2_by_file.head(10)

Unnamed: 0,file,term,score
0,KB_JB421_1950-04-15_01-00004.txt,Van Zeeland,0.00347
1,KB_JB421_1950-04-15_01-00004.txt,Renée Cara,0.005844
2,KB_JB773_1950-07-22_01-00010.txt,Lames Gillette,0.00739
3,KB_JB421_1950-04-15_01-00004.txt,Madeleine Ska,0.008698
4,KB_JB421_1950-04-15_01-00004.txt,Josée Goffin,0.009245
5,KB_JB421_1950-04-15_01-00004.txt,roi Léopold,0.011665
6,KB_JB572_1950-06-07_01-00004.txt,pigeons lâchés,0.012354
7,KB_JB773_1950-07-22_01-00010.txt,Gillette Bleues,0.015029
8,KB_JB773_1950-07-22_01-00010.txt,Gillette Lames,0.022169
9,KB_JB773_1950-07-22_01-00010.txt,Gillette L'homme,0.024058
