
# ETH Foundation Discursive Analysis — Master Pipeline

Ce notebook exécute l'ensemble du pipeline critique sur le corpus Ethereum Foundation (567 articles).

**Modules inclus :**
1. Extraction des fréquences
2. Visualisation des fréquences
3. Cooccurrences
4. Réseau lexical


In [None]:

import os

# Global base settings: folder names used for the corpus and for generated outputs
DATA_DIR, OUTPUT_DIR = './data', './outputs'

# Create both folders when they are missing (no-op if they already exist)
for _folder in (DATA_DIR, OUTPUT_DIR):
    os.makedirs(_folder, exist_ok=True)


In [None]:

import re
import string
import pandas as pd
from collections import Counter

# Minimal English stopword list; extend it as needed
STOPWORDS = {
    'the', 'and', 'of', 'to', 'in', 'for', 'is', 'on',
    'that', 'with', 'as', 'by', 'this', 'it', 'are', 'at',
    'from', 'an', 'be', 'or', 'we', 'can', 'not', 'have',
    'has', 'our', 'also', 'more', 'which', 'their', 'will', 'all',
    'but', 'was', 'they', 'these', 'may', 'you', 'been', 'using',
    'its',
}

def clean_text(text):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase the text, delete every punctuation and symbol
    character, delete every run of digits, then trim leading and trailing
    whitespace. Inner whitespace is left untouched, and removed characters are
    deleted (not replaced by a space), so ``"proof-of-stake"`` becomes
    ``"proofofstake"``.

    Characters are dropped according to their Unicode general category
    (``P*`` punctuation, ``S*`` symbols). For ASCII input this removes exactly
    the characters of ``string.punctuation``; in addition it removes
    typographic quotes, apostrophes, dashes, ellipses and similar non-ASCII
    marks, so that e.g. ``“ethereum”`` and ``ethereum`` end up as one token.

    Args:
        text: Raw document content as a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    # Local import so the function works even if the import cell is not updated.
    import unicodedata

    text = text.lower()
    # The first letter of the category is 'P' for punctuation and 'S' for symbols.
    text = ''.join(
        ch for ch in text
        if unicodedata.category(ch)[0] not in ('P', 'S')
    )
    text = re.sub(r'\d+', '', text)
    return text.strip()

def tokenize(text, stopwords=None, min_length=3):
    """Split cleaned text into tokens and drop uninformative ones.

    The text is split on whitespace. A token is kept only if it has at least
    ``min_length`` characters and is not a stopword.

    Args:
        text: Text to split, typically the output of ``clean_text``.
        stopwords: Collection of words to discard. Defaults to the
            module-level ``STOPWORDS`` set when ``None``.
        min_length: Minimum number of characters a token must have to be kept.
            The default of 3 discards one- and two-character tokens.

    Returns:
        List of kept tokens, in their order of appearance.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    return [
        word for word in text.split()
        if len(word) >= min_length and word not in stopwords
    ]

# Frequency extraction: count every kept token across all .txt files of DATA_DIR
all_tokens = []
# os.listdir returns entries in arbitrary order; sorting makes the order in which
# words are first seen (and therefore the order of equal-frequency rows) reproducible.
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        content = f.read()
    clean = clean_text(content)
    tokens = tokenize(clean)
    all_tokens.extend(tokens)

if not all_tokens:
    # Typical on a first run: DATA_DIR exists but holds no .txt file yet.
    print(f'Warning: no tokens found in {DATA_DIR!r}; the frequency table will be empty.')

counter = Counter(all_tokens)
total_tokens = sum(counter.values())
# One row per distinct word, most frequent first; relative frequency is per 1000 kept tokens.
data = [
    {'word': word, 'frequency': freq, 'relative_per_1000': freq / total_tokens * 1000}
    for word, freq in counter.most_common()
]
# Explicit columns so that an empty corpus still produces a CSV with a proper header.
df = pd.DataFrame(data, columns=['word', 'frequency', 'relative_per_1000'])
df.to_csv(f'{OUTPUT_DIR}/word_frequencies.csv', index=False)
df.head(15)


In [None]:

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# keep_default_na=False: by default pandas parses strings such as "null" or "nan"
# as missing values, which would turn those corpus words into float NaN.
df = pd.read_csv(f'{OUTPUT_DIR}/word_frequencies.csv', keep_default_na=False)
N = 30  # number of words shown in the bar chart
top_words = df.head(N)  # the CSV is already sorted by decreasing frequency

# Bar chart of the N most frequent words
plt.figure(figsize=(12, 6))
plt.bar(top_words['word'], top_words['frequency'], color='skyblue')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Mot')
plt.ylabel('Fréquence')
plt.title(f'Top {N} mots les plus fréquents')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/top_words_bar_chart.png')
plt.show()

# Word cloud built from the full frequency table (not only the top N)
word_freq = dict(zip(df['word'], df['frequency']))
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Nuage de mots complet')
plt.savefig(f'{OUTPUT_DIR}/wordcloud.png')
plt.show()


In [None]:

from collections import defaultdict
from itertools import combinations

# Co-occurrence counting with a sliding window of WINDOW_SIZE consecutive tokens.
# A pair is counted once per window that contains both words, so two words that
# are close together are counted in several overlapping windows.
WINDOW_SIZE = 5
cooccurrence = defaultdict(int)

# Sorted for a reproducible processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(DATA_DIR)):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        content = f.read()
    clean = clean_text(content)
    tokens = tokenize(clean)
    # max(1, ...): a document shorter than WINDOW_SIZE still forms one (shorter)
    # window instead of being silently ignored.
    n_windows = max(1, len(tokens) - WINDOW_SIZE + 1)
    for i in range(n_windows):
        window = tokens[i:i + WINDOW_SIZE]
        # sorted(set(...)) removes repeats inside the window and guarantees that
        # every emitted pair is in alphabetical order, i.e. one key per word pair.
        for pair in combinations(sorted(set(window)), 2):
            cooccurrence[pair] += 1

# Most frequent pairs first; ties are broken alphabetically so the CSV is reproducible.
data_pairs = [
    {'word1': p[0], 'word2': p[1], 'count': c}
    for p, c in sorted(cooccurrence.items(), key=lambda x: (-x[1], x[0]))
]
# Explicit columns so that an empty result still produces a CSV with a proper header.
df_pairs = pd.DataFrame(data_pairs, columns=['word1', 'word2', 'count'])
df_pairs.to_csv(f'{OUTPUT_DIR}/cooccurrence_pairs.csv', index=False)
df_pairs.head(15)


In [None]:

import networkx as nx

# keep_default_na=False: by default pandas parses strings such as "null" or "nan"
# as missing values, which would turn those words into float NaN node names.
df_pairs = pd.read_csv(f'{OUTPUT_DIR}/cooccurrence_pairs.csv', keep_default_na=False)
threshold = 5  # minimum pair count kept, to declutter the figure
df_filtered = df_pairs[df_pairs['count'] >= threshold]

# Undirected graph: one node per word, one edge per kept pair, weighted by its count.
G = nx.Graph()
G.add_weighted_edges_from(
    # .tolist() yields plain Python ints for the edge weights
    zip(df_filtered['word1'], df_filtered['word2'], df_filtered['count'].tolist())
)

plt.figure(figsize=(15, 15))
# Fixed seed: spring_layout starts from random positions, so without it the
# figure changes on every run.
pos = nx.spring_layout(G, k=0.15, iterations=50, seed=42)
edges = G.edges(data=True)
weights = [edata['weight'] for _, _, edata in edges]

# Scale edge widths into a bounded range relative to the heaviest edge; a width
# directly proportional to the raw count becomes huge for very frequent pairs.
max_weight = max(weights, default=1)
widths = [0.5 + 4.5 * w / max_weight for w in weights]

nx.draw_networkx_nodes(G, pos, node_size=300, node_color='skyblue')
nx.draw_networkx_edges(G, pos, width=widths, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

plt.title("Network of lexical co-occurrences")
plt.axis('off')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/lexical_network.png')
plt.show()

# Also export the graph itself (with raw counts as weights) in GraphML format.
nx.write_graphml(G, f"{OUTPUT_DIR}/lexical_network.graphml")
