# 07: M1/S2 - Unsupervised Anomaly Detection (Sentence Level)

**Cíl:** Detekovat subjektivitu (L1) jako anomálii vůči neutralitě (L0) na úrovni celých vět.
**Metoda:** Mahalanobisova vzdálenost (Unsupervised).
**Hypotéza:** Subjektivní věty se sémanticky liší od neutrálních, takže budou mít v embedding prostoru velkou vzdálenost od centroidu neutrality.

**Scénáře:**
* **S2a - Baseline:** Trénink pouze na Target L0 větách (Gold).
* **S2c - Robustness (Context):** Trénink na Target L0 + Context L0 větách.

**Pooling:**
* Porovnáme **Mean** vs **[CLS]**.

## 1. Setup & Imports

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import logging
from pathlib import Path
import os
from itables import show

from sklearn.metrics import classification_report


# Auto-reload modules for development.
# These are IPython magics: they are only valid inside a Jupyter/IPython
# kernel, not in a plain Python interpreter.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Add ../src (relative to the current working directory) to sys.path so the
# project modules imported below can be resolved. This must run before those
# imports. NOTE(review): assumes the notebook is launched from a directory
# that is a sibling of `src` - confirm.
current_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(current_dir, '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Project-local modules (presumably located in ../src, added to the path above)
import config
import data_splitting
import models
import evaluation
import visualization


# Logging setup: INFO level with a timestamped format; `logger` is the
# notebook-wide logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Visualization: apply the project's plotting style
visualization.setup_style()

print(f"‚úÖ Setup complete. Results dir: {config.RESULTS_DIR}")

## 2. Data Check
V Unsupervised learningu nás zajímá hlavně počet **Neutrálních (L0)** vět v tréninku. Model se učí jen z nich.

In [None]:
# Scenarios whose train/test label distribution is reported below.
SCENARIOS_TO_CHECK = ['baseline', 'robustness']


def _count_label(labels, value):
    """Return how many entries of `labels` equal `value`.

    Uses a vectorized NumPy count instead of the builtin `sum()` over a
    boolean array, and coerces `labels` with `np.asarray` so plain lists
    are handled as well.
    """
    return int(np.count_nonzero(np.asarray(labels) == value))


print(f"{'='*80}")
print("üìä DATA CHECK REPORT (M1/S2 - Unsupervised)")
print(f"{'='*80}")

for sc in SCENARIOS_TO_CHECK:
    print(f"\nüîπ SC√âN√Å≈ò: {sc.upper()}")
    try:
        # Load the splits (mean pooling)
        data = data_splitting.get_train_val_test_splits(
            scenario=sc,
            level='sentence',
            pooling='mean',
            random_state=42
        )

        # For M1 only the L0 rows of TRAIN matter (the rest is discarded)
        n_train_total = len(data['y_train'])
        n_train_L0 = _count_label(data['y_train'], 0)
        n_train_L1 = _count_label(data['y_train'], 1)
        n_test_L0 = _count_label(data['y_test'], 0)
        n_test_L1 = _count_label(data['y_test'], 1)

        print(f"   TRAIN (Total): {n_train_total}")
        print(f"   üëâ Pou≈æiteln√© pro M1 (L0): {n_train_L0} (Model se uƒç√≠ jen toto)")
        print(f"   üëâ Ignorovan√© v Train (L1): {n_train_L1}")
        print(f"   TEST (Total): {len(data['y_test'])} (L0: {n_test_L0}, L1: {n_test_L1})")

    except Exception as e:
        # Best-effort report: keep going with the next scenario, but do not
        # lose the exception type and traceback.
        print(f"   ‚ùå Chyba: {e}")
        logger.exception("Data check failed for scenario %r", sc)

## 3. Experiment Loop
Trénujeme **Mahalanobis** detektor.
Iterujeme přes:
1.  **Pooling:** `mean` vs `cls`
2.  **Scénáře:** `baseline` (málo dat) vs `robustness` (hodně L0 dat z kontextu)

In [None]:
RESULTS_PATH = config.RESULTS_DIR / "M1_S2_experiment_results_v1.csv"

SCENARIOS = [
    {'id': 'S2a', 'name': 'Baseline (Target L0)',    'scenario': 'baseline'},
    {'id': 'S2c', 'name': 'Robustness (Target+Ctx)', 'scenario': 'robustness'}
]

POOLING_METHODS = ['mean', 'cls']
MODEL_NAME = 'Mahalanobis'

# PCA configuration. PCA is requested in every run; only the `pca_components`
# value differs: 0.95 normally, 0.90 when the L0 training set has
# SMALL_TRAIN_SIZE samples or fewer.
# NOTE(review): the original comment mentioned a cut-off of 1000 samples, but
# the code has always used 50 - confirm which one is intended.
# NOTE(review): a float `pca_components` is presumably interpreted as the
# explained-variance ratio (scikit-learn PCA convention) - confirm in
# models.get_unsupervised_model.
SMALL_TRAIN_SIZE = 50
PCA_VARIANCE_DEFAULT = 0.95
PCA_VARIANCE_SMALL = 0.90


def _run_experiment(exp, pooling):
    """Fit and evaluate one detector for a (scenario, pooling) pair.

    Args:
        exp: One entry of SCENARIOS (keys 'id', 'name', 'scenario').
        pooling: Pooling method passed to the data loader ('mean' or 'cls').

    Returns:
        A tuple `(res, run)`: `res` is the flat result row written to the
        CSV; `run` holds the fitted model, the loaded data, the test scores
        and labels, the threshold and the validation AUPRC.

    Raises:
        ValueError: If the training split contains no L0 samples.
    """
    # 1. Load the data
    data = data_splitting.get_train_val_test_splits(
        scenario=exp['scenario'],
        level='sentence',
        pooling=pooling,
        random_state=42
    )

    # 2. Unsupervised filtering: the detector is fitted on L0 rows only
    X_train = data['X_train'][data['y_train'] == 0]

    # Val/Test stay complete (needed to verify detection)
    X_val, y_val = data['X_val'], data['y_val']
    X_test, y_test = data['X_test'], data['y_test']

    print(f"      üìä Train L0 Size: {X_train.shape[0]} samples")

    if X_train.shape[0] == 0:
        raise ValueError(
            f"No L0 training samples for scenario {exp['scenario']!r} "
            f"with pooling {pooling!r}"
        )

    # 3. Fit the model
    # Author's note: Mahalanobis needs more samples than dimensions (768),
    # otherwise it fails (singular matrix); PCA reduces the dimension.
    if X_train.shape[0] > SMALL_TRAIN_SIZE:
        n_components = PCA_VARIANCE_DEFAULT
    else:
        n_components = PCA_VARIANCE_SMALL

    clf = models.get_unsupervised_model(MODEL_NAME, pca_components=n_components, random_state=42)
    clf.fit(X_train)

    # 4. Scoring (anomaly score): higher score = larger anomaly
    s_val = clf.decision_function(X_val)
    s_test = clf.decision_function(X_test)

    # 5. Threshold tuning on the validation split
    threshold, _ = evaluation.find_optimal_threshold(y_val, s_val, metric='f1')

    # 6. Metrics (a score above the threshold is predicted as class 1)
    m_val = evaluation.calculate_metrics(y_val, (s_val > threshold).astype(int), s_val)
    m_test = evaluation.calculate_metrics(y_test, (s_test > threshold).astype(int), s_test)

    # Report the fitted PCA dimension when available; otherwise fall back to
    # the actual feature count instead of a hard-coded 768. This also covers
    # a model whose `pca` attribute exists but is None.
    pca = getattr(clf, 'pca', None)
    pca_dim = getattr(pca, 'n_components_', X_train.shape[1])

    res = {
        'id': exp['id'],
        'scenario': exp['scenario'],
        'scenario_name': exp['name'],
        'pooling': pooling,
        'model': MODEL_NAME,
        'pca_dim': pca_dim,
        'threshold': threshold,

        'test_f1': m_test['f1'], 'test_auprc': m_test['avg_precision'],
        'test_roc_auc': m_test['roc_auc'], 'test_prec': m_test['precision'], 'test_rec': m_test['recall']
    }
    run = {
        'info': res,
        'model': clf,
        'data': data,
        'scores_test': s_test,
        'y_test': y_test,
        'threshold': threshold,
        'val_auprc': m_val['avg_precision'],
    }
    return res, run


results = []
best_auprc = 0.0            # test AUPRC of the currently selected best run
best_val_auprc = -np.inf    # validation AUPRC used for the selection itself
best_run = None

# Make sure the output directory exists before the first checkpoint write.
RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)

print("üöÄ STARTING M1/S2 EXPERIMENTS...")
print(f"üíæ Results path: {RESULTS_PATH}")

for pooling in POOLING_METHODS:
    print(f"\n{'#'*60}")
    print(f"üåä POOLING METHOD: {pooling.upper()}")
    print(f"{'#'*60}")

    for exp in SCENARIOS:
        print(f"\n   üß™ SCENARIO: {exp['id']} - {exp['name']}")

        try:
            res, run = _run_experiment(exp, pooling)

            # Checkpoint after every run so partial results survive a crash
            results.append(res)
            pd.DataFrame(results).to_csv(RESULTS_PATH, index=False)
            print(f"      ‚úÖ Result: AUPRC={res['test_auprc']:.4f}, F1={res['test_f1']:.4f}")

            # Best-run selection uses the VALIDATION AUPRC: picking the
            # winner on test AUPRC would leak the test set into model
            # selection and bias the reported test numbers upward.
            if run['val_auprc'] > best_val_auprc:
                best_val_auprc = run['val_auprc']
                best_auprc = res['test_auprc']
                best_run = run

        except Exception as e:
            # Best-effort loop: report the failure (with traceback) and
            # continue with the next configuration.
            print(f"      ‚ùå Error: {e}")
            logger.exception(
                "Experiment %s (pooling=%s) failed", exp['id'], pooling
            )

print("\n‚úÖ All M1/S2 experiments finished.")

## 4. Results Overview

In [None]:
# Show the stored experiment results: an AUPRC pivot (pooling x scenario)
# followed by per-metric breakdown plots. Prints a notice when no results
# file has been written yet.
if not RESULTS_PATH.exists():
    print("≈Ω√°dn√© v√Ωsledky.")
else:
    df_results = pd.read_csv(RESULTS_PATH)

    # Pivot table: best test AUPRC per (pooling, scenario) pair
    print("üìä SROVN√ÅN√ç AUPRC (Pooling x Scenario):")
    pivot = pd.pivot_table(
        df_results,
        values='test_auprc',
        index='pooling',
        columns='scenario_name',
        aggfunc='max',
    )
    styled_pivot = pivot.style.background_gradient(cmap='Blues').format("{:.4f}")
    display(styled_pivot)

    # Plots
    print("\nüìä GRAFY V√ùSLEDK≈Æ:")
    for metric_name in ('auprc', 'f1'):
        visualization.plot_pooling_breakdown(df_results, metric=metric_name)

## 5. Deep Dive: Winner Analysis
Detailní pohled na nejlepší model (pravděpodobně Robustness).

In [None]:
# Deep dive into the selected best run: summary, diagnostic plots and a
# per-sentence qualitative table (TP/TN/FP/FN) saved to CSV.
if best_run is not None:
    info = best_run['info']
    print(f"üèÜ WINNER: {info['model']} ({info['scenario_name']})")
    print(f"üåä Pooling: {info['pooling'].upper()}")
    print(f"üìâ PCA Dimensions: {info['pca_dim']}")
    print(f"üìä Test AUPRC: {info['test_auprc']:.4f}")

    # Predictions: a score above the tuned threshold is predicted as class 1
    y_test = best_run['y_test']
    scores = best_run['scores_test']
    thresh = info['threshold']
    y_pred = (scores > thresh).astype(int)

    # Visualization
    visualization.plot_confusion_matrix_heatmap(y_test, y_pred, normalize=True, title="Confusion Matrix")
    visualization.plot_pr_curve(y_test, scores, title="PR Curve")
    visualization.plot_anomaly_histogram(y_test, scores, threshold=thresh, title="Anomaly Score Dist.")

    # Qualitative analysis.
    # Assign plain arrays so pandas writes the values positionally; assigning
    # a Series would align on index labels and could silently misorder rows
    # or introduce NaN if the indexes differ.
    df_qual = best_run['data']['meta_test'].copy()
    df_qual['true'] = np.asarray(y_test)
    df_qual['pred'] = np.asarray(y_pred)
    df_qual['score'] = np.asarray(scores)

    is_pos_true = df_qual['true'] == 1
    is_neg_true = df_qual['true'] == 0
    is_pos_pred = df_qual['pred'] == 1
    is_neg_pred = df_qual['pred'] == 0
    conds = [
        is_pos_true & is_pos_pred, is_neg_true & is_neg_pred,
        is_neg_true & is_pos_pred, is_pos_true & is_neg_pred
    ]
    df_qual['category'] = np.select(conds, ['TP', 'TN', 'FP', 'FN'], default='UNKNOWN')

    # Highest-scoring false positives (false alarms)
    print("\n‚ùå TOP 5 FP (Fale≈°n√Ω poplach):")
    display(df_qual[df_qual['category'] == 'FP'].sort_values('score', ascending=False).head(5))

    df_qual.to_csv(config.RESULTS_DIR / "M1_S2_Qualitative.csv", index=False)
else:
    print("Spus≈• experimenty.")