In [1]:
# Auto-reload pour développement interactif
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
from datetime import datetime
import warnings
import sys
warnings.filterwarnings('ignore')

# Analyses avancées
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from scipy import stats
import networkx as nx
from wordcloud import WordCloud
from collections import Counter
from difflib import SequenceMatcher




In [2]:
# Configuration
BASE_DIR = Path().resolve().parent.parent
sys.path.append(str(BASE_DIR / "src"))

# Répertoires
DATA_DIR = BASE_DIR / "data"
PROCESSED_DIR = DATA_DIR / "processed"


In [3]:
# Chargement des variables du preprocessing
try:
    with open(PROCESSED_DIR / 'preprocessing_variables.pkl', 'rb') as f:
        variables = pickle.load(f)
    
    df_clean_dedup = variables['df_clean_dedup']
    calibration_corpus = variables['calibration_corpus']
    quality_metrics = variables['quality_metrics']
    
    print(f"Données chargées:")
    print(f"   Dataset principal: {len(df_clean_dedup)} articles")
    print(f"   Corpus calibration: {len(calibration_corpus)} articles")
    print(f"   Période: {quality_metrics.get('processing_timestamp', 'N/A')}")

except FileNotFoundError:
    print("Fichier preprocessing_variables.pkl non trouvé!")
    print("   → Exécutez d'abord preprocessing_advanced.ipynb")
    exit(1)



Données chargées:
   Dataset principal: 128 articles
   Corpus calibration: 186 articles
   Période: 2025-07-27T10:08:32.511773


In [4]:
def analyze_source_reliability(df):
    """Analyse la fiabilité et qualité des sources"""
    
    # Métriques par source
    source_metrics = df.groupby('source').agg({
        'quality_score_advanced': ['mean', 'std', 'count'],
        'entities_total': 'mean',
        'text_cleaned': lambda x: x.str.len().mean(),
        'readability_score': 'mean',
        'text_complexity': 'mean'
    }).round(3)
    
    # Aplatir les colonnes multi-niveaux
    source_metrics.columns = ['_'.join(col).strip() for col in source_metrics.columns]
    source_metrics = source_metrics.reset_index()
    
    # Calcul du score de fiabilité composite
    source_metrics['reliability_score'] = (
        source_metrics['quality_score_advanced_mean'] * 0.4 +
        (1 - source_metrics['text_complexity_mean']) * 0.3 +
        source_metrics['readability_score_mean'] * 0.3
    )
    
    # Filtrer les sources avec au moins 3 articles
    reliable_sources = source_metrics[source_metrics['quality_score_advanced_count'] >= 3]
    reliable_sources = reliable_sources.sort_values('reliability_score', ascending=False)
    
    return reliable_sources



In [5]:
# Application de l'analyse
source_analysis = analyze_source_reliability(df_clean_dedup)

print(f"TOP 5 SOURCES LES PLUS FIABLES:")
for idx, row in source_analysis.head().iterrows():
    source_name = row['source'].split('/')[-1][:30]
    print(f"   {source_name}")
    print(f"       Score fiabilité: {row['reliability_score']:.3f}")
    print(f"       Qualité moyenne: {row['quality_score_advanced_mean']:.3f}")
    print(f"       Articles: {row['quality_score_advanced_count']:.0f}")
    print(f"       Entités/article: {row['entities_total_mean']:.1f}")
    print()



TOP 5 SOURCES LES PLUS FIABLES:
   titres.rss
       Score fiabilité: 0.435
       Qualité moyenne: 0.728
       Articles: 13
       Entités/article: 36.2

   actualites.xml
       Score fiabilité: 0.403
       Qualité moyenne: 0.688
       Articles: 21
       Entités/article: 16.1

   
       Score fiabilité: 0.388
       Qualité moyenne: 0.679
       Articles: 8
       Entités/article: 10.6

   feed
       Score fiabilité: 0.378
       Qualité moyenne: 0.679
       Articles: 15
       Entités/article: 12.9

   
       Score fiabilité: 0.369
       Qualité moyenne: 0.671
       Articles: 13
       Entités/article: 13.1



In [6]:
print(f"SOURCES À SURVEILLER (faible fiabilité):")
for idx, row in source_analysis.tail(3).iterrows():
    source_name = row['source'].split('/')[-1][:30]
    print(f"   {source_name}: Score {row['reliability_score']:.3f}")


SOURCES À SURVEILLER (faible fiabilité):
   index.xml: Score 0.354
   rss: Score 0.326
   rss.xml: Score 0.301


In [7]:
print(f"\nINSIGHTS SOURCES:")
print(f"   {len(source_analysis)} sources actives identifiées")
print(f"   Sources premium (score >0.7): {(source_analysis['reliability_score'] > 0.7).sum()}")
print(f"   Sources à améliorer (score <0.5): {(source_analysis['reliability_score'] < 0.5).sum()}")




INSIGHTS SOURCES:
   9 sources actives identifiées
   Sources premium (score >0.7): 0
   Sources à améliorer (score <0.5): 9


In [8]:
def advanced_temporal_analysis(df):
    """Analyse temporelle poussée avec détection de patterns"""
    
    # Préparation données temporelles
    df_temp = df.copy()
    df_temp['published_clean'] = pd.to_datetime(df_temp['published'], errors='coerce')
    df_temp = df_temp.dropna(subset=['published_clean'])
    
    # Extraction composants temporels
    df_temp['hour'] = df_temp['published_clean'].dt.hour
    df_temp['day_week'] = df_temp['published_clean'].dt.day_name()
    df_temp['day_month'] = df_temp['published_clean'].dt.day
    df_temp['week_year'] = df_temp['published_clean'].dt.isocalendar().week
    
    # Détection pics d'actualité
    daily_counts = df_temp.groupby(df_temp['published_clean'].dt.date).size()
    mean_daily = daily_counts.mean()
    std_daily = daily_counts.std()
    
    # Pics = jours avec +2 écarts-types
    peak_threshold = mean_daily + 2 * std_daily
    peak_days = daily_counts[daily_counts > peak_threshold]
    
    # Patterns saisonniers (simulation si plusieurs mois)
    if len(df_temp['published_clean'].dt.month.unique()) > 1:
        monthly_pattern = df_temp.groupby(df_temp['published_clean'].dt.month).size()
        seasonal_variance = monthly_pattern.var()
    else:
        seasonal_variance = 0
    
    return {
        'df_temporal': df_temp,
        'peak_days': peak_days,
        'daily_pattern': daily_counts,
        'seasonal_variance': seasonal_variance,
        'hour_distribution': df_temp['hour'].value_counts().sort_index(),
        'day_distribution': df_temp['day_week'].value_counts()
    }



In [9]:
# Application analyse temporelle
temporal_results = advanced_temporal_analysis(df_clean_dedup)

print(f"PATTERNS HORAIRES DÉTECTÉS:")
peak_hours = temporal_results['hour_distribution'].head(3)
for hour, count in peak_hours.items():
    pct = count / len(temporal_results['df_temporal']) * 100
    print(f"   {hour:02d}h: {count} articles ({pct:.1f}%)")



PATTERNS HORAIRES DÉTECTÉS:
   01h: 1 articles (0.8%)
   02h: 3 articles (2.3%)
   03h: 1 articles (0.8%)


In [10]:
print(f"\nPICS D'ACTUALITÉ IDENTIFIÉS:")
if len(temporal_results['peak_days']) > 0:
    for date, count in temporal_results['peak_days'].items():
        print(f"   {date}: {count} articles (pic d'actualité)")
        # Analyser les sujets de ce pic
        peak_articles = temporal_results['df_temporal'][
            temporal_results['df_temporal']['published_clean'].dt.date == date
        ]
        top_entities = []
        for entities in peak_articles['entities_advanced']:
            top_entities.extend(entities.get('locations', []))
            top_entities.extend(entities.get('organizations', []))
        if top_entities:
            top_3 = Counter(top_entities).most_common(3)
            print(f"       Sujets: {', '.join([f'{e}({c})' for e, c in top_3])}")
else:
    print(f"   Aucun pic significatif détecté")




PICS D'ACTUALITÉ IDENTIFIÉS:
   2025-07-25: 54 articles (pic d'actualité)
       Sujets: État(14), Paris(13), Europe(10)


In [11]:
print(f"\nINSIGHTS TEMPORELS:")
print(f"   Période d'analyse: {temporal_results['df_temporal']['published_clean'].min().date()} → {temporal_results['df_temporal']['published_clean'].max().date()}")
print(f"   Heure de pointe: {temporal_results['hour_distribution'].idxmax()}h ({temporal_results['hour_distribution'].max()} articles)")
print(f"   Jour le plus actif: {temporal_results['day_distribution'].idxmax()} ({temporal_results['day_distribution'].max()} articles)")
print(f"   Pics d'actualité: {len(temporal_results['peak_days'])} détectés")



INSIGHTS TEMPORELS:
   Période d'analyse: 2025-07-18 → 2025-07-26
   Heure de pointe: 4h (10 articles)
   Jour le plus actif: Friday (55 articles)
   Pics d'actualité: 1 détectés


In [12]:
def semantic_topic_analysis(df, n_topics=8):
    """Analyse sémantique avec LDA et clustering"""
    
    # Préparation corpus pour LDA
    texts = df['text_cleaned'].fillna('').tolist()
    
    # Nettoyage pour LDA
    texts_clean = []
    for text in texts:
        if len(text) > 100:  # Minimum 100 caractères
            texts_clean.append(text)
    
    if len(texts_clean) < 10:
        print("   Corpus trop petit pour l'analyse sémantique")
        return None
    
    # Vectorisation TF-IDF
    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words=['le', 'de', 'et', 'à', 'un', 'il', 'être', 'et', 'en', 'avoir', 'que', 'pour'],
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95
    )
    
    try:
        tfidf_matrix = vectorizer.fit_transform(texts_clean)
        feature_names = vectorizer.get_feature_names_out()
        
        # LDA Topic Modeling
        lda = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            max_iter=20,
            doc_topic_prior=0.1,
            topic_word_prior=0.01
        )
        
        lda_topics = lda.fit_transform(tfidf_matrix)
        
        # Extraction topics
        topics = []
        for topic_idx, topic in enumerate(lda.components_):
            top_words_idx = topic.argsort()[-10:][::-1]
            top_words = [feature_names[i] for i in top_words_idx]
            topics.append({
                'id': topic_idx,
                'words': top_words,
                'weight': topic[top_words_idx]
            })
        
        # Clustering sémantique
        kmeans = KMeans(n_clusters=min(5, len(texts_clean)//10), random_state=42)
        clusters = kmeans.fit_predict(tfidf_matrix)
        
        return {
            'topics': topics,
            'lda_model': lda,
            'doc_topics': lda_topics,
            'clusters': clusters,
            'vectorizer': vectorizer,
            'texts_processed': texts_clean
        }
        
    except Exception as e:
        print(f"   Erreur analyse sémantique: {e}")
        return None



In [13]:
# Application analyse sémantique
semantic_results = semantic_topic_analysis(df_clean_dedup)

if semantic_results:
    print(f"TOPICS IDENTIFIÉS:")
    for topic in semantic_results['topics']:
        print(f"   Topic {topic['id']}: {', '.join(topic['words'][:5])}")
    
    # Analyse distribution topics
    topic_distribution = semantic_results['doc_topics'].mean(axis=0)
    dominant_topics = topic_distribution.argsort()[-3:][::-1]
    
    print(f"\nTOPICS DOMINANTS:")
    for topic_id in dominant_topics:
        topic_strength = topic_distribution[topic_id] * 100
        topic_words = semantic_results['topics'][topic_id]['words'][:3]
        print(f"   Topic {topic_id}: {topic_strength:.1f}% - {', '.join(topic_words)}")




TOPICS IDENTIFIÉS:
   Topic 0: forum, sites, messages, les données, objets
   Topic 1: lire, publicité, gaza, ministre, lire aussi
   Topic 2: pro, opération, humanitaire, outre, rapport
   Topic 3: trump, 2025, ans, sur les, 2024
   Topic 4: pas, avec, qu, vous, on
   Topic 5: vos, apple, etats, corse, etat
   Topic 6: faille, fondation, openai, désinformation, médaille
   Topic 7: paris, étude, décision, meta, mouvement

TOPICS DOMINANTS:
   Topic 4: 55.2% - pas, avec, qu
   Topic 3: 18.1% - trump, 2025, ans
   Topic 1: 11.5% - lire, publicité, gaza


In [14]:
def advanced_quality_metrics(df):
    """Calcul métriques qualité business"""
    
    # Densité informationnelle
    df['info_density'] = (
        df['entities_total'] / (df['text_cleaned'].str.len() / 1000)
    ).fillna(0)
    
    # Score de cohérence (variance entités par type)
    entity_variance = []
    for _, row in df.iterrows():
        entities = row['entities_advanced']
        if isinstance(entities, dict):
            counts = [
                len(entities.get('persons', [])),
                len(entities.get('organizations', [])),
                len(entities.get('locations', []))
            ]
            entity_variance.append(np.var(counts))
        else:
            entity_variance.append(0)
    
    df['coherence_score'] = 1 / (1 + np.array(entity_variance))
    
    # Score de nouveauté (basé sur similarité avec articles précédents)
    novelty_scores = []
    for i in range(len(df)):
        if i == 0:
            novelty_scores.append(1.0)
        else:
            # Simplicité: basé sur différence entités
            current_entities = set()
            if isinstance(df.iloc[i]['entities_advanced'], dict):
                for ent_list in df.iloc[i]['entities_advanced'].values():
                    current_entities.update(ent_list)
            
            prev_entities = set()
            if isinstance(df.iloc[i-1]['entities_advanced'], dict):
                for ent_list in df.iloc[i-1]['entities_advanced'].values():
                    prev_entities.update(ent_list)
            
            if len(current_entities) == 0:
                novelty_scores.append(0.5)
            else:
                overlap = len(current_entities & prev_entities) / len(current_entities)
                novelty_scores.append(1 - overlap)
    
    df['novelty_score'] = novelty_scores
    
    return df


In [15]:
# Application métriques avancées
df_quality = advanced_quality_metrics(df_clean_dedup.copy())

print(f"MÉTRIQUES QUALITÉ BUSINESS:")
print(f"   Densité info moyenne: {df_quality['info_density'].mean():.2f} entités/1000 chars")
print(f"   Score cohérence moyen: {df_quality['coherence_score'].mean():.3f}")
print(f"   Score nouveauté moyen: {df_quality['novelty_score'].mean():.3f}")


MÉTRIQUES QUALITÉ BUSINESS:
   Densité info moyenne: 4.15 entités/1000 chars
   Score cohérence moyen: 0.240
   Score nouveauté moyen: 0.974


In [16]:
# Identification articles premium
premium_threshold = df_quality['quality_score_advanced'].quantile(0.8)
premium_articles = df_quality[df_quality['quality_score_advanced'] > premium_threshold]


In [17]:
print(f"\nARTICLES PREMIUM (top 20%):")
print(f"   {len(premium_articles)} articles identifiés")
print(f"   Score qualité moyen: {premium_articles['quality_score_advanced'].mean():.3f}")
print(f"   Densité info moyenne: {premium_articles['info_density'].mean():.2f}")



ARTICLES PREMIUM (top 20%):
   26 articles identifiés
   Score qualité moyen: 0.762
   Densité info moyenne: 3.70


In [18]:
def detect_anomalies(df):
    """Détection d'articles outliers et contenus suspects"""
    
    anomalies = {
        'outliers': [],
        'suspects': [],
        'low_quality': [],
        'duplicates_potential': []
    }
    
    # 1. Outliers statistiques
    for metric in ['text_cleaned', 'entities_total', 'quality_score_advanced']:
        if metric == 'text_cleaned':
            values = df[metric].str.len()
        else:
            values = df[metric]
        
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        
        outliers_idx = df[
            (values < (Q1 - 1.5 * IQR)) | 
            (values > (Q3 + 1.5 * IQR))
        ].index.tolist()
        
        anomalies['outliers'].extend(outliers_idx)
    
    # 2. Contenus suspects (peu d'entités + court)
    suspects_idx = df[
        (df['entities_total'] == 0) & 
        (df['text_cleaned'].str.len() < 300)
    ].index.tolist()
    anomalies['suspects'] = suspects_idx
    
    # 3. Qualité très faible
    low_quality_threshold = df['quality_score_advanced'].quantile(0.1)
    low_quality_idx = df[
        df['quality_score_advanced'] < low_quality_threshold
    ].index.tolist()
    anomalies['low_quality'] = low_quality_idx
    
    # 4. Doublons potentiels (titres très similaires)
    potential_duplicates = []
    titles = df['title'].fillna('').tolist()
    
    for i in range(len(titles)):
        for j in range(i+1, min(i+50, len(titles))):  # Limiter pour performance
            similarity = SequenceMatcher(None, titles[i], titles[j]).ratio()
            if similarity > 0.8:
                potential_duplicates.append((i, j, similarity))
    
    anomalies['duplicates_potential'] = potential_duplicates
    
    return anomalies


In [19]:
# Application détection anomalies
anomalies = detect_anomalies(df_quality)

print(f"ANOMALIES DÉTECTÉES:")
print(f"   Outliers statistiques: {len(set(anomalies['outliers']))} articles")
print(f"   Contenus suspects: {len(anomalies['suspects'])} articles")
print(f"   Qualité très faible: {len(anomalies['low_quality'])} articles")
print(f"   Doublons potentiels: {len(anomalies['duplicates_potential'])} paires")



ANOMALIES DÉTECTÉES:
   Outliers statistiques: 13 articles
   Contenus suspects: 0 articles
   Qualité très faible: 12 articles
   Doublons potentiels: 0 paires


In [20]:
if anomalies['suspects']:
    print(f"\nEXEMPLES CONTENUS SUSPECTS:")
    for idx in anomalies['suspects'][:3]:
        article = df_quality.iloc[idx]
        print(f"   \"{article['title'][:50]}...\"")
        print(f"       Entités: {article['entities_total']}")
        print(f"       Longueur: {len(article['text_cleaned'])} chars")
        print(f"       Score: {article['quality_score_advanced']:.3f}")


In [21]:
if anomalies['duplicates_potential']:
    print(f"\nEXEMPLES DOUBLONS POTENTIELS:")
    for i, j, sim in anomalies['duplicates_potential'][:3]:
        print(f"   Similarité {sim:.3f}:")
        print(f"       A: \"{df_quality.iloc[i]['title'][:40]}...\"")
        print(f"       B: \"{df_quality.iloc[j]['title'][:40]}...\"")


In [22]:
print(f"SOURCES:")
print(f"   Sources fiables identifiées: {len(source_analysis[source_analysis['reliability_score'] > 0.6])}")
print(f"   Source top: {source_analysis.iloc[0]['source'].split('/')[-1]}")
print(f"   Score fiabilité max: {source_analysis.iloc[0]['reliability_score']:.3f}")



SOURCES:
   Sources fiables identifiées: 0
   Source top: titres.rss
   Score fiabilité max: 0.435


In [23]:
print(f"\nTEMPOREL:")
print(f"   Pics d'actualité: {len(temporal_results['peak_days'])}")
print(f"   Heure de pointe: {temporal_results['hour_distribution'].idxmax()}h")
print(f"   Distribution: {temporal_results['hour_distribution'].std():.1f} (écart-type)")




TEMPOREL:
   Pics d'actualité: 1
   Heure de pointe: 4h
   Distribution: 3.1 (écart-type)


In [24]:
if semantic_results:
    print(f"\nSÉMANTIQUE:")
    print(f"   Topics identifiés: {len(semantic_results['topics'])}")
    dominant_topic_id = semantic_results['doc_topics'].mean(axis=0).argmax()
    print(f"   Topic dominant: {', '.join(semantic_results['topics'][dominant_topic_id]['words'][:3])}")




SÉMANTIQUE:
   Topics identifiés: 8
   Topic dominant: pas, avec, qu


In [25]:
print(f"\nQUALITÉ:")
print(f"   Articles premium: {len(premium_articles)} ({len(premium_articles)/len(df_quality)*100:.1f}%)")
print(f"   Densité info moyenne: {df_quality['info_density'].mean():.2f}")
print(f"   Score cohérence: {df_quality['coherence_score'].mean():.3f}")




QUALITÉ:
   Articles premium: 26 (20.3%)
   Densité info moyenne: 4.15
   Score cohérence: 0.240


In [26]:
print(f"\nANOMALIES:")
print(f"   Contenus suspects: {len(anomalies['suspects'])}")
print(f"   Articles faible qualité: {len(anomalies['low_quality'])}")
print(f"   Doublons potentiels: {len(anomalies['duplicates_potential'])}")




ANOMALIES:
   Contenus suspects: 0
   Articles faible qualité: 12
   Doublons potentiels: 0


In [27]:
print(f"\nRECOMMANDATIONS BUSINESS:")
print(f"   1. Prioriser les {len(source_analysis.head(3))} sources les plus fiables")
print(f"   2. Collecter davantage pendant les heures de pointe identifiées")
print(f"   3. Nettoyer {len(anomalies['suspects'])} articles suspects")
print(f"   4. Exploiter les {len(premium_articles)} articles premium pour l'entraînement")
if semantic_results:
    print(f"   5. Équilibrer la couverture des {len(semantic_results['topics'])} topics identifiés")




RECOMMANDATIONS BUSINESS:
   1. Prioriser les 3 sources les plus fiables
   2. Collecter davantage pendant les heures de pointe identifiées
   3. Nettoyer 0 articles suspects
   4. Exploiter les 26 articles premium pour l'entraînement
   5. Équilibrer la couverture des 8 topics identifiés


In [28]:
# Sauvegarde des résultats EDA
eda_results = {
    'source_analysis': source_analysis,
    'temporal_results': temporal_results,
    'semantic_results': semantic_results,
    'df_quality': df_quality,
    'premium_articles': premium_articles,
    'anomalies': anomalies
}

with open(PROCESSED_DIR / 'eda_business_results.pkl', 'wb') as f:
    pickle.dump(eda_results, f)