In [2]:
import pandas as pd
import json
import os
import numpy as np
from scipy.stats import ks_2samp

# --- CONFIGURATION ---
# Every path is built from BASE_DIR, i.e. relative to the parent of the
# directory the notebook is executed from.
BASE_DIR = ".."
REF_DATA_PATH = os.path.join(BASE_DIR, "datas/02_preprocess/datas.csv")
PROD_LOG_PATH = os.path.join(BASE_DIR, "api/production_logs/prediction_log.jsonl")
REPORT_OUTPUT_PATH = os.path.join(BASE_DIR, "reports/monitoring_report_complete.html")

# Make sure the report folder exists before anything tries to write into it.
os.makedirs(os.path.dirname(REPORT_OUTPUT_PATH), exist_ok=True)

# --- 1. LOAD REFERENCE DATA ---
print("1Ô∏è‚É£ Chargement R√©f√©rence...")
# The first CSV column becomes the index. The TARGET column (presumably the
# training label - confirm) is removed when present; errors='ignore' makes the
# drop a no-op when it is absent.
df_ref = pd.read_csv(REF_DATA_PATH, index_col=0).drop(columns=['TARGET'], errors='ignore')

# --- 2. OPERATIONAL ANALYSIS (latency & errors) ---
# Reads the JSON-lines prediction log and derives:
#   total_calls   - number of well-formed log entries (JSON objects)
#   errors        - entries whose status is 'FAILURE'
#   latencies     - finite numeric 'latency_ms' values
#   prod_records  - 'input_features' payloads of 'SUCCESS' entries (used for drift)
# Malformed lines are skipped on purpose (best effort) but are now counted and
# reported instead of being silently swallowed by a bare `except`.
print("2Ô∏è‚É£ Analyse des Logs Op√©rationnels...")

total_calls = 0
errors = 0
latencies = []
prod_records = []
malformed_lines = 0

if os.path.exists(PROD_LOG_PATH):
    with open(PROD_LOG_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are neither calls nor errors.
                continue

            try:
                log = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue

            # Valid JSON that is not an object (list, string, number...) cannot
            # carry a status; treat it as malformed rather than as a call.
            if not isinstance(log, dict):
                malformed_lines += 1
                continue

            total_calls += 1

            # Keep only real, finite numbers so np.mean / np.max cannot fail
            # or return NaN because of a null or textual latency.
            latency = log.get('latency_ms')
            if (
                isinstance(latency, (int, float))
                and not isinstance(latency, bool)
                and np.isfinite(latency)
            ):
                latencies.append(latency)

            status = log.get('status')
            if status == 'FAILURE':
                errors += 1
            elif status == 'SUCCESS' and 'input_features' in log:
                # Features are kept for the drift analysis only on success.
                prod_records.append(log['input_features'])

    if malformed_lines:
        print(f"   Lignes de log invalides : {malformed_lines}")

# Operational metrics; all of them fall back to 0 when there is nothing to measure.
error_rate = (errors / total_calls) * 100 if total_calls > 0 else 0
avg_latency = np.mean(latencies) if latencies else 0
max_latency = np.max(latencies) if latencies else 0

# --- 3. DRIFT DATA PREPARATION ---
# Builds df_prod, the production-side frame compared against df_ref.
print("3Ô∏è‚É£ Pr√©paration Analyse Drift...")
if not prod_records:
    print("‚ö†Ô∏è Pas de logs succ√®s. Utilisation √©chantillon fictif.")
    # Placeholder sample taken from the reference itself. The size is capped so
    # small reference sets do not raise, and the seed makes the report reproducible.
    df_prod = df_ref.sample(n=min(50, len(df_ref)), random_state=42)
else:
    df_prod = pd.DataFrame.from_records(prod_records)
    # Align columns on the reference; features absent from the logs become NaN.
    df_prod = df_prod.reindex(columns=df_ref.columns)
    # Missing values are deliberately left as NaN: filling them with 0 would
    # inject artificial zeros into the production distribution, while NaNs are
    # dropped on the reference side before the KS test, producing spurious
    # drift alerts. Values are coerced column by column so that one bad column
    # cannot silently leave the whole frame unconverted.
    for num_col in df_ref.select_dtypes(include=[np.number]).columns:
        df_prod[num_col] = pd.to_numeric(df_prod[num_col], errors='coerce')

# --- 4. DRIFT COMPUTATION (two-sample Kolmogorov-Smirnov test) ---
# For each numeric reference column also present in production, compares the
# two distributions and records the p-value, a drift flag and both means.
# Produces: drift_results (list of dicts), alerts_drift (int), df_drift (DataFrame).

# A column is flagged when its KS p-value falls below this threshold.
# NOTE(review): no multiple-comparison correction is applied, so with many
# features some alerts are expected by chance - confirm this is acceptable.
DRIFT_P_VALUE_THRESHOLD = 0.05
# Columns with fewer production observations than this are not tested.
MIN_PROD_SAMPLES = 5

drift_results = []
alerts_drift = 0

numeric_cols = df_ref.select_dtypes(include=[np.number]).columns
common_cols = [c for c in numeric_cols if c in df_prod.columns]

for col in common_cols:
    data_ref = df_ref[col].dropna()
    # Coerce defensively: production values come from JSON logs and may have
    # been left as object dtype by the preparation step.
    data_prod = pd.to_numeric(df_prod[col], errors='coerce').dropna()

    # ks_2samp raises on empty input, and a handful of points is not
    # informative, so such columns are skipped rather than tested.
    if len(data_prod) < MIN_PROD_SAMPLES or data_ref.empty:
        continue

    p_value = ks_2samp(data_ref, data_prod).pvalue
    is_drift = bool(p_value < DRIFT_P_VALUE_THRESHOLD)

    if is_drift:
        alerts_drift += 1

    drift_results.append({
        "Feature": col,
        "P-Value": round(p_value, 4),
        "Drift": "üî¥ OUI" if is_drift else "üü¢ NON",
        "Moy Ref": round(data_ref.mean(), 2),
        "Moy Prod": round(data_prod.mean(), 2)
    })

df_drift = pd.DataFrame(drift_results)
if not df_drift.empty:
    # Most significant drifts first.
    df_drift = df_drift.sort_values(by="P-Value")

# --- 5. FULL HTML REPORT GENERATION ---
# Renders the operational metrics and the drift table into a standalone HTML
# page written to REPORT_OUTPUT_PATH, then prints a one-line console summary.
print("4Ô∏è‚É£ G√©n√©ration du Rapport...")

# Pre-compute the dynamic fragments so the template below stays readable.
error_color = 'red' if error_rate > 0 else 'green'
alert_class = 'alert' if alerts_drift > 0 else ''
# Denominator = columns that were actually tested (one row each in df_drift);
# columns skipped for lack of data must not dilute the alert ratio.
tested_cols = len(df_drift)

if df_drift.empty:
    # Avoid rendering an empty, header-less table.
    drift_table_html = '<p>Aucune colonne analysable.</p>'
else:
    # to_html escapes cell content; the drift marker is then wrapped in a
    # highlighted span.
    drift_table_html = df_drift.to_html(index=False, classes='table').replace(
        'üî¥ OUI', '<span style="color:red; font-weight:bold">üî¥ OUI</span>'
    )

# The file is written as UTF-8, so the page declares that charset explicitly;
# without it a browser may guess another encoding and garble non-ASCII text.
html_content = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Rapport de Monitoring ML</title>
    <style>
        body {{ font-family: 'Segoe UI', Arial, sans-serif; margin: 40px; background-color: #f4f6f9; }}
        h1, h2 {{ color: #2c3e50; }}
        .card {{ background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); margin-bottom: 20px; }}
        .metric-box {{ display: inline-block; width: 30%; text-align: center; }}
        .metric-val {{ font-size: 24px; font-weight: bold; color: #3498db; }}
        .metric-label {{ color: #7f8c8d; }}
        .alert {{ color: #e74c3c; }}
        table {{ border-collapse: collapse; width: 100%; font-size: 14px; }}
        th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
        th {{ background-color: #34495e; color: white; }}
        tr:nth-child(even) {{ background-color: #f9f9f9; }}
    </style>
</head>
<body>
    <h1>üõ°Ô∏è Dashboard de Monitoring API</h1>
    <p>G√©n√©r√© le : {pd.Timestamp.now()}</p>

    <div class="card">
        <h2>‚öôÔ∏è Performance Op√©rationnelle</h2>
        <div class="metric-box">
            <div class="metric-val" style="color: {error_color}">{error_rate:.1f}%</div>
            <div class="metric-label">Taux d'Erreur</div>
        </div>
        <div class="metric-box">
            <div class="metric-val">{avg_latency:.1f} ms</div>
            <div class="metric-label">Latence Moyenne</div>
        </div>
        <div class="metric-box">
            <div class="metric-val">{total_calls}</div>
            <div class="metric-label">Appels Totaux</div>
        </div>
    </div>

    <div class="card">
        <h2>üìâ Analyse de D√©rive des Donn√©es (Data Drift)</h2>
        <p>Comparaison statistique (Test KS) entre Donn√©es d'Entra√Ænement et Production.</p>
        <p><strong>Colonnes en alerte :</strong> <span class="{alert_class}" style="font-weight:bold; font-size:1.2em">{alerts_drift}</span> / {tested_cols}</p>
        
        <div style="max-height: 500px; overflow-y: auto;">
            {drift_table_html}
        </div>
    </div>
</body>
</html>
"""

with open(REPORT_OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(html_content)

print(f"\n‚úÖ RAPPORT COMPLET G√âN√âR√â : {os.path.abspath(REPORT_OUTPUT_PATH)}")
# Same one-decimal formatting as in the HTML page (avoids 33.33333333333333%).
print(f"üìä M√©triques : {error_rate:.1f}% Erreur | {avg_latency:.1f}ms Latence | {alerts_drift} Drifts")

1Ô∏è‚É£ Chargement R√©f√©rence...
2Ô∏è‚É£ Analyse des Logs Op√©rationnels...
3Ô∏è‚É£ Pr√©paration Analyse Drift...
4Ô∏è‚É£ G√©n√©ration du Rapport...

‚úÖ RAPPORT COMPLET G√âN√âR√â : /Users/chrismaker/Documents/_Formation_AI/Projet_8/pret_a_depenser/reports/monitoring_report_complete.html
üìä M√©triques : 0.0% Erreur | 5.2ms Latence | 35 Drifts


In [None]:
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # backend non interactif
import json
import os
import nannyml as nml

# --- Paths ---
# All locations are derived from BASE_DIR (parent of the working directory).
BASE_DIR = ".."
REF_DATA_PATH = os.path.join(BASE_DIR, "datas/02_preprocess/datas.csv")
PROD_LOG_PATH = os.path.join(BASE_DIR, "api/production_logs/prediction_log.jsonl")
REPORT_OUTPUT_PATH = os.path.join(BASE_DIR, "reports/drift_report_nannyml.html")

# The report folder must exist before the HTML file is written.
os.makedirs(os.path.dirname(REPORT_OUTPUT_PATH), exist_ok=True)

# --- 1. Load reference data ---
print("Chargement r√©f√©rence...")
# First CSV column is the index; TARGET is removed when present
# (errors='ignore' turns the drop into a no-op otherwise).
reference_data = pd.read_csv(REF_DATA_PATH, index_col=0).drop(columns=['TARGET'], errors='ignore')

# --- 2. Load production logs ---
# Collects the 'input_features' payload of every successful prediction from
# the JSON-lines log into production_records. Unusable lines are skipped
# (best effort), but only for the failure modes expected here - a bare
# `except` would also hide unrelated errors and keyboard interrupts.
print("Chargement logs production...")
production_records = []
try:
    with open(PROD_LOG_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                log = json.loads(line)
            except json.JSONDecodeError:
                # Blank or corrupted line.
                continue
            if (
                isinstance(log, dict)
                and log.get('status') == 'SUCCESS'
                and 'input_features' in log
            ):
                production_records.append(log['input_features'])
except FileNotFoundError:
    print("‚ö†Ô∏è Pas de logs.")
    
# Runs the NannyML univariate drift analysis when production records exist:
# aligns production on the reference schema, adds synthetic timestamps, fits
# the calculator on the reference, evaluates production as a single chunk,
# writes the HTML plot and prints the number of alerts.
if not production_records:
    print("‚ö†Ô∏è Fichier vide ou introuvable. Arr√™t.")
else:
    production_data = pd.DataFrame.from_records(production_records)
    # Same columns, same order and same dtypes as the reference.
    production_data = production_data[reference_data.columns]
    production_data = production_data.astype(reference_data.dtypes)
    print(f"‚úÖ {len(production_data)} logs charg√©s.")

    # Synthetic hourly timestamps: the calculator is given a timestamp column,
    # and production is placed after the reference period.
    reference_data['timestamp'] = pd.date_range(start='2024-01-01', periods=len(reference_data), freq='h')
    production_data['timestamp'] = pd.date_range(start='2025-01-01', periods=len(production_data), freq='h')

    # --- 3. Drift computation ---
    print("Calcul NannyML en cours...")

    features = [c for c in reference_data.columns if c != 'timestamp']

    calc = nml.UnivariateDriftCalculator(
        column_names=features,
        timestamp_column_name='timestamp',
        continuous_methods=['kolmogorov_smirnov'],
        categorical_methods=['chi2'],
        chunk_number=1  # force a single chunk (author's note: required for fewer than 50 rows)
    )

    calc.fit(reference_data)
    results = calc.calculate(production_data)

    # --- 4. Report ---
    figure = results.plot()
    figure.write_html(REPORT_OUTPUT_PATH)
    print(f"‚úÖ Rapport g√©n√©r√© : {os.path.abspath(REPORT_OUTPUT_PATH)}")

    # --- 5. Console results ---
    drift_metrics = results.to_df()
    # Select the 'alert' columns by the LAST column level. The result frame is
    # expected to be multi-level (column, method, metric); addressing 'alert'
    # at a fixed second position would fail on such a frame. Matching the last
    # level works whatever the depth.
    alert_mask = drift_metrics.columns.get_level_values(-1) == 'alert'
    # Only one chunk exists, so the first row holds every alert flag.
    alert_count = int(drift_metrics.loc[:, alert_mask].iloc[0].sum())
    print(f"\n--- üìä R√âSULTAT : {alert_count} alertes d√©tect√©es ---")

In [6]:
import matplotlib
matplotlib.use("Agg")  # IMPORTANT : emp√™che matplotlib de bloquer

import pandas as pd
import json
import os
import nannyml as nml

# Paths, all derived from BASE_DIR (parent of the working directory).
BASE_DIR = ".."
REF_DATA_PATH = os.path.join(BASE_DIR, "datas/02_preprocess/datas.csv")
PROD_LOG_PATH = os.path.join(BASE_DIR, "api/production_logs/prediction_log.jsonl")
REPORT_OUTPUT_PATH = os.path.join(BASE_DIR, "reports/drift_report_nannyml.html")

# The report folder must exist before the HTML file is written.
os.makedirs(os.path.dirname(REPORT_OUTPUT_PATH), exist_ok=True)

print("Chargement r√©f√©rence...")
reference_data = pd.read_csv(REF_DATA_PATH, index_col=0)

# Optional: remove the target column.
if 'TARGET' in reference_data.columns:
    reference_data = reference_data.drop(columns=['TARGET'])

# Down-sample the reference to keep the NannyML run tractable. The size is
# capped at the number of available rows: sampling more rows than exist
# (without replacement) raises a ValueError. The fixed seed keeps the run
# reproducible.
REFERENCE_SAMPLE_SIZE = 500
reference_data = reference_data.sample(
    n=min(REFERENCE_SAMPLE_SIZE, len(reference_data)),
    random_state=42,
)

# Collects the 'input_features' payload of every successful prediction from
# the JSON-lines log. Stops the cell when nothing usable was found.
print("Chargement production...")
production_records = []
try:
    with open(PROD_LOG_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                log = json.loads(line)
            except json.JSONDecodeError:
                # A single blank or corrupted line must not abort the load.
                continue
            if (
                isinstance(log, dict)
                and log.get("status") == "SUCCESS"
                and "input_features" in log
            ):
                production_records.append(log["input_features"])
except FileNotFoundError:
    print("‚ö†Ô∏è Pas de logs.")

if not production_records:
    print("‚ö†Ô∏è Aucun log")
    # Raise SystemExit directly instead of calling exit(): as a plain script
    # the effect is the same, while inside a Jupyter kernel exit() is
    # understood to request a kernel shutdown rather than simply halting the
    # cell - NOTE(review): confirm against the IPython version in use.
    raise SystemExit

# Production frame restricted to the reference columns, in reference order.
production_data = pd.DataFrame.from_records(production_records)[reference_data.columns]

# Synthetic hourly timestamps; production starts one year after the reference.
for frame, first_stamp in ((reference_data, '2024-01-01'), (production_data, '2025-01-01')):
    frame["timestamp"] = pd.date_range(start=first_stamp, periods=len(frame), freq='h')

# Every column except the synthetic timestamp is analysed.
features = list(reference_data.columns.drop("timestamp"))

print("Calcul du drift...")
drift_settings = dict(
    column_names=features,
    timestamp_column_name="timestamp",
    continuous_methods=["kolmogorov_smirnov"],
    categorical_methods=["chi2"],
    # Chunk size equal to the production row count -> one production chunk.
    chunk_size=len(production_data),
)
calc = nml.UnivariateDriftCalculator(**drift_settings)

# Fit on the reference, then evaluate the production data.
calc.fit(reference_data)
results = calc.calculate(production_data)

print("G√©n√©ration du rapport...")
# Plot the results and save the figure as a standalone HTML file.
fig = results.plot()
fig.write_html(REPORT_OUTPUT_PATH)

print("Fini.")



Chargement r√©f√©rence...
Chargement production...
Calcul du drift...
G√©n√©ration du rapport...


KeyboardInterrupt: 

In [4]:
# Row count of the reference frame currently held in the kernel.
# NOTE(review): the recorded output (307507) does not match the 500-row sample
# taken in the cell listed above, and this cell's execution count (4) is lower
# than that cell's (6) - presumably it ran against an earlier, un-sampled frame.
print(reference_data.shape[0])


307507
