# M3: LLM Benchmark (Zero-Shot Inference)

V tomto notebooku porovnáme výkon našich natrénovaných modelů (Logistic Regression, Mahalanobis) s moderními Large Language Models (LLM) v režimu Zero-Shot.

## 1. Setup & Imports

In [2]:
import sys
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Auto-reload modules for development
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Add src to path
current_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(current_dir, '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules
import config
import data_splitting
import models
import visualization
import experiments
import evaluation
from llm_client import LLMClassifier

# Setup visualization style
visualization.setup_style()

print(f"‚úÖ Setup complete. Data dir: {config.DATA_DIR}")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2026-02-09 15:56:27,407 - INFO - üé® Visualization style set: whitegrid


‚úÖ Setup complete. Data dir: C:\Users\dobes\Documents\UniversityCodingProject\ThesisCoding\data


# 1. Načtení Testovacích Dat
Musíme použít **stejnou testovací sadu** jako v M2/S2 (Sentence Supervised), aby bylo srovnání fér.

In [None]:
# Load the baseline scenario (S2a) at sentence level.
split_kwargs = {
    'scenario': 'baseline',
    'level': 'sentence',
    'pooling': 'mean',  # pooling is irrelevant here; only the text is needed
    'random_state': 42,
}
data = data_splitting.get_train_val_test_splits(**split_kwargs)

# Only the TEST split is needed: its metadata frame plus the gold labels,
# attached as a ``true_label`` column on a copy of the metadata.
df_test = data['meta_test'].assign(true_label=data['y_test'])

print(f"üìä Test Set Size: {len(df_test)} sentences")
display(df_test.head(3))

## 2. Definice LLM Modelů
Vybereme modely, které chceme testovat.
- **Gemini:** Vyžaduje GOOGLE_API_KEY v .env
- **Mistral:** Vyžaduje HF_TOKEN v .env (zdarma přes Inference API, ale může mít rate limit)

In [None]:
# Zero-shot LLMs to benchmark. Each entry carries:
#   id           - short key, used to name the prediction column
#   provider     - backend passed to LLMClassifier
#   model_name   - provider-specific model identifier
#   display_name - human-readable label for reports
MODELS_TO_TEST = [
    dict(
        id='gemini-flash',
        provider='gemini',
        model_name='gemini-1.5-flash',
        display_name='Gemini 1.5 Flash',
    ),
    dict(
        id='mistral-7b',
        provider='huggingface',
        # v0.3 is used because it is newer and better at following instructions.
        model_name='mistralai/Mistral-7B-Instruct-v0.3',
        display_name='Mistral 7B (HF)',
    ),
]

# Where the per-sentence predictions are persisted.
RESULTS_FILE = config.RESULTS_DIR / "M3_LLM_predictions.csv"

## 3. Inference Loop (Smyčka dotazování)
Iterujeme přes věty a modely. Průběžně ukládáme výsledky, abychom o ně nepřišli.

In [None]:
# Resume from a previous run when the results file already exists, so that
# sentences which were already answered are not sent to the APIs again
# (avoids needless cost and waiting).
if RESULTS_FILE.exists():
    df_results = pd.read_csv(RESULTS_FILE)
    print(f"‚ôªÔ∏è Naƒçteny p≈ôedchoz√≠ v√Ωsledky: {len(df_results)} ≈ô√°dk≈Ø.")
else:
    df_results = pd.DataFrame()

# Seed the results table from the test set BEFORE any prediction column is
# added. Doing it afterwards would replace the frame and silently drop the
# freshly created column, causing a KeyError on the first run.
if df_results.empty:
    df_results = df_test.copy()

# Make sure the target directory exists so checkpoint writes cannot fail on
# a fresh checkout.
RESULTS_FILE.parent.mkdir(parents=True, exist_ok=True)

# Number of processed sentences between two checkpoint writes.
SAVE_EVERY = 10

# Run every configured model over the sentences that still lack a prediction.
for model_cfg in MODELS_TO_TEST:
    model_id = model_cfg['id']
    print(f"\nüöÄ Starting Inference for: {model_cfg['display_name']}...")

    # Client initialisation; a model whose client cannot be built (for
    # example because of a missing API key) is skipped, not fatal.
    try:
        clf = LLMClassifier(
            provider=model_cfg['provider'],
            model_name=model_cfg['model_name']
        )
    except Exception as e:
        print(f"‚ùå Skipper model {model_id}: {e}")
        continue

    # Create the prediction column for this model if it does not exist yet.
    col_name = f"pred_{model_id}"
    if col_name not in df_results.columns:
        df_results[col_name] = np.nan

    # Only rows without a stored prediction (NaN) are queried.
    indices_to_process = df_results.index[df_results[col_name].isna()]

    print(f"   -> Processing {len(indices_to_process)} sentences...")

    try:
        progress = tqdm(indices_to_process, desc=f"Asking {model_id}")
        for n_done, idx in enumerate(progress, start=1):
            text = df_results.loc[idx, 'text']

            # API call; ``None`` is treated as a failed prediction and the
            # cell stays NaN so it is retried on the next run.
            prediction = clf.predict(text)

            if prediction is not None:
                df_results.loc[idx, col_name] = prediction
            else:
                print(f"   ‚ö†Ô∏è Failed prediction for index {idx}")

            # Checkpoint by number of processed rows, not by index label:
            # labels may be non-contiguous or non-integer.
            if n_done % SAVE_EVERY == 0:
                df_results.to_csv(RESULTS_FILE, index=False)
    finally:
        # Always persist what has been collected, including when the loop is
        # interrupted or the client raises; the exception still propagates.
        df_results.to_csv(RESULTS_FILE, index=False)

    print(f"‚úÖ Model {model_id} finished.")

print("\nüèÅ All benchmarks completed.")
display(df_results.head())

## 4. Vyhodnocení a Srovnání
Spočítáme metriky pro LLM a přidáme "ručně" nejlepší výsledek z M2 (LogReg), abychom měli kontext.

In [None]:
# 1. Baseline definition (the best model from M2/S2).
# TODO: take the best model(s) from the M2/S2 result CSVs according to the
# chosen metric and store them in BASELINE_PERFORMANCE, e.g.:
#           M2_S2_path =  config.RESULTS_DIR / ".csv"
# NOTE(review): BASELINE_PERFORMANCE is presumably a dict with the same keys
# as the LLM entries built below ('Model', 'F1 Score', 'Accuracy', 'Type') -
# confirm once it is defined.

# 2. Metric computation for the LLMs.
# The baseline row is optional: until BASELINE_PERFORMANCE is defined in an
# earlier cell, the comparison proceeds with the LLMs only instead of
# stopping with a NameError.
try:
    metrics_list = [BASELINE_PERFORMANCE]
except NameError:
    print("BASELINE_PERFORMANCE is not defined; continuing without a baseline row.")
    metrics_list = []

for model_cfg in MODELS_TO_TEST:
    col_name = f"pred_{model_cfg['id']}"

    # A model that was skipped during inference has no prediction column.
    if col_name not in df_results.columns:
        print(f"Skipping {model_cfg['display_name']}: no '{col_name}' column in results.")
        continue

    # Leave out rows where the API call failed (NaN).
    valid_rows = df_results.dropna(subset=[col_name])

    if valid_rows.empty:
        continue

    y_true = valid_rows['true_label']
    y_pred = valid_rows[col_name].astype(int)

    f1 = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    metrics_list.append({
        'Model': model_cfg['display_name'],
        'F1 Score': f1,
        'Accuracy': acc,
        'Type': 'LLM (Zero-Shot)'
    })

    print(f"\nüìä Report: {model_cfg['display_name']}")
    # Metrics cover only the rows with a prediction; show the coverage so
    # that models evaluated on different subsets are not compared blindly.
    print(f"   Evaluated on {len(valid_rows)}/{len(df_results)} sentences")
    print(classification_report(y_true, y_pred, target_names=['Neutral', 'Bias']))

## 5. Vizualizace Srovnání
Kdo vyhrál? David (LogReg) nebo Goliáš (Gemini)?