# TP3 run notebook (clean)

In [None]:
import os

# Run parameters for this notebook.
# Inclusive range of years used to select corpus files by the year in their name.
DECADE_START = 1950
DECADE_END = 1959
# Number of KMeans clusters requested.
N_CLUSTERS = 6
# Number of top terms kept per cluster. This must be a plain int because it is
# used as a slice bound; a trailing comma here would turn it into a 1-tuple.
TOP_N_TERMS = 15
# Directory that receives every generated artifact (CSVs, PNGs, model).
OUT_DIR = 'tps/tp3'
os.makedirs(OUT_DIR, exist_ok=True)
print('params set')

In [None]:
import re
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
# Fetch the Punkt data used by sent_tokenize / word_tokenize. Recent NLTK
# releases load the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so request both to keep the notebook working across versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Select the corpus files whose name carries a year inside the chosen decade.
data_txt = Path('data/txt')
year_pattern = re.compile(r'(18|19)\d{2}')
files = []
for candidate in sorted(data_txt.glob('*.txt')):
    year_match = year_pattern.search(candidate.name)
    # Names without an 18xx/19xx year are skipped; the first match is the year.
    if year_match is None:
        continue
    if DECADE_START <= int(year_match.group(0)) <= DECADE_END:
        files.append(candidate)
print('found', len(files), 'files')

# Load every selected file, collapsing each run of whitespace to one space.
docs = [re.sub(r'\s+', ' ', path.read_text(encoding='utf-8')) for path in files]
names = [path.name for path in files]
print('loaded docs', len(docs))
# Cluster the loaded documents on TF-IDF features, then export the cluster
# assignments, the top terms of each cluster, and one word cloud per cluster.
if docs:
    # KMeans raises ValueError when n_clusters exceeds the number of samples,
    # so never request more clusters than there are documents.
    n_clusters = min(N_CLUSTERS, len(docs))
    if n_clusters < N_CLUSTERS:
        print('only', len(docs), 'docs, reducing clusters to', n_clusters)
    # NOTE(review): stop_words='english' is used here while the word2vec
    # section tokenizes sentences as French -- confirm the corpus language.
    vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, ngram_range=(1, 2), stop_words='english')
    X = vectorizer.fit_transform(docs)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(X)
    labels = kmeans.labels_
    df = pd.DataFrame({'filename': names, 'cluster': labels})
    df.to_csv(os.path.join(OUT_DIR, f'clusters_{DECADE_START}_{DECADE_END}.csv'), index=False)
    print('saved clusters csv')
    # Feature indices of every centroid, sorted by descending weight.
    order = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    top_terms = {
        i: [terms[idx] for idx in order[i, :TOP_N_TERMS]]
        for i in range(n_clusters)
    }
    pd.DataFrame.from_dict(top_terms, orient='index').to_csv(os.path.join(OUT_DIR, f'cluster_top_terms_{DECADE_START}_{DECADE_END}.csv'), header=False)
    print('saved top terms csv')
    # wordclouds
    for i in range(n_clusters):
        # docs and labels are aligned by position, so pair them directly
        # instead of looking each filename up again in names.
        text = ''.join(f'{doc}\n' for doc, label in zip(docs, labels) if label == i)
        if not text.strip():
            continue
        wc = WordCloud(width=600, height=300, background_color='white').generate(text)
        out = os.path.join(OUT_DIR, f'cluster_{i}_wordcloud_{DECADE_START}_{DECADE_END}.png')
        wc.to_file(out)
        print('saved', out)
# sentences and word2vec
# Build the training corpus: one list of lower-cased tokens per sentence.
sents = []
for path in files:
    raw_text = path.read_text(encoding='utf-8')
    for sentence in sent_tokenize(raw_text, language='french'):
        # word_tokenize runs its own sentence split first and defaults to the
        # English model, so pass the same language as sent_tokenize above.
        toks = [
            w.lower()
            for w in word_tokenize(sentence, language='french')
            # Keep tokens holding at least one letter or digit. str.isalnum
            # also accepts accented letters, which an ASCII-only character
            # class would reject (dropping tokens such as "à").
            if any(ch.isalnum() for ch in w)
        ]
        # Sentences of two tokens or fewer are not used for training.
        if len(toks) > 2:
            sents.append(toks)
print('built sents', len(sents))
if sents:
    model = Word2Vec(sentences=sents, vector_size=64, window=5, min_count=5, workers=4, epochs=5)
    model.save(os.path.join(OUT_DIR, f'word2vec_{DECADE_START}_{DECADE_END}.model'))
    print('saved w2v model')