## 1. Setup and Imports
Import necessary libraries and project modules. Set up paths to access `src` code.

In [10]:
# Standard-library and third-party dependencies used throughout the notebook.
import os, json
from pathlib import Path
import pandas as pd

# Make the project's `src` package importable.
# Both the parent directory and its `src` subdirectory are appended to sys.path
# as absolute paths, so the `src.*` imports below resolve when the notebook is
# executed from a subdirectory of the repository.
import sys
sys.path.append(str(Path("..").resolve()))  # parent directory, for a notebook living inside /notebooks
sys.path.append(str((Path("..") / "src").resolve()))

# Project modules: data loaders, the three baselines, and the evaluation helper.
# These must come after the sys.path changes above.
from src.common.io import load_docs, load_rules, load_gold
from src.baselines.tfidf_ir import tfidf_ir_predict
from src.baselines.logistic_baseline import logistic_predict
from src.baselines.fasttext_05a import fasttext_predict_05A
from src.eval.metrics import evaluate

## 2. Load Data
Load documents, compliance rules, and gold standard labels from the `data` directory.

In [11]:
# Input locations, given relative to the notebook's working directory.
DOCS_DIR = "../data/docs"
RULES_PATH = "../data/rules.json"
GOLD_PATH = "../data/gold_labels.json"

# Read documents, rules, and gold labels with the project loaders.
docs = load_docs(DOCS_DIR)
rules = load_rules(RULES_PATH)
gold = load_gold(GOLD_PATH)

# Echo the loaded identifiers as a quick sanity check.
print("Loaded docs:", [doc.doc_id for doc in docs])
print("Loaded rules:", [rule.id for rule in rules])

Loaded docs: ['doc_001', 'doc_002', 'doc_003', 'DOC_004', 'DOC_005', 'DOC_006', 'DOC_007', 'DOC_008', 'DOC_009', 'DOC_010', 'DOC_011', 'DOC_012', 'DOC_013', 'DOC_014', 'DOC_015']
Loaded rules: ['01', '02', '03', '05A', '08', '10A', '12', '13']


## 3. Baseline: TF-IDF Information Retrieval
Run a simple vector-space-model baseline. It computes the cosine similarity between the document text and the rule description; if the similarity falls below the threshold, the rule is predicted as VIOLATED.

In [12]:
# Run the TF-IDF IR baseline over the loaded documents and rules.
preds_tfidf = tfidf_ir_predict(
    docs=docs, rules=rules, model_name="TFIDF-IR",
    threshold=0.10,  # similarity cut-off; tune as needed
)

# Score the predictions against the gold labels, then preview the overall
# metrics together with the first two per-rule entries.
eval_tfidf = evaluate(preds_tfidf, gold)
(eval_tfidf["overall"], list(eval_tfidf["per_rule"].items())[:2])

({'macro_precision': 0.23958333333333334,
  'macro_recall': 0.75,
  'macro_f1': 0.3374295467174724},
 [('01',
   {'tp': 3,
    'fp': 9,
    'fn': 0,
    'precision': 0.25,
    'recall': 1.0,
    'f1': 0.4,
    'coverage': 1.0}),
  ('02',
   {'tp': 0,
    'fp': 12,
    'fn': 0,
    'precision': 0.0,
    'recall': 0.0,
    'f1': 0.0,
    'coverage': 1.0})])

## 4. Baseline: Logistic Regression
Train a Logistic Regression classifier on TF-IDF features. Note: This is purely illustrative due to the very small dataset size.

In [13]:
# Run the logistic-regression baseline; it receives the gold labels as well
# as the documents and rules.
preds_logreg = logistic_predict(
    docs=docs, rules=rules, gold=gold, model_name="TFIDF+LogReg",
)

# Score the predictions against the gold labels, then preview the overall
# metrics together with the first two per-rule entries.
eval_logreg = evaluate(preds_logreg, gold)
(eval_logreg["overall"], list(eval_logreg["per_rule"].items())[:2])

({'macro_precision': 0.25, 'macro_recall': 0.25, 'macro_f1': 0.25},
 [('01',
   {'tp': 0,
    'fp': 0,
    'fn': 3,
    'precision': 0.0,
    'recall': 0.0,
    'f1': 0.0,
    'coverage': 1.0}),
  ('02',
   {'tp': 0,
    'fp': 0,
    'fn': 0,
    'precision': 0.0,
    'recall': 0.0,
    'f1': 0.0,
    'coverage': 1.0})])

## 5. Compare Baseline Results
Aggregate evaluation metrics (Precision, Recall, F1) for both baselines into a Pandas DataFrame for easy comparison.

In [14]:
def eval_to_df(eval_obj, model_name):
    """Flatten the per-rule metrics of one evaluation into a DataFrame.

    Parameters
    ----------
    eval_obj : mapping
        Evaluation result whose ``"per_rule"`` entry maps each rule id to a
        dict containing at least ``precision``, ``recall``, ``f1`` and
        ``coverage``.
    model_name : str
        Label written to the ``model`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per rule with the columns ``model``, ``rule``, ``precision``,
        ``recall``, ``f1`` and ``coverage`` (in that order). The columns are
        present even when ``per_rule`` is empty, so downstream sorting and
        concatenation do not fail on a missing column.

    Raises
    ------
    KeyError
        If ``"per_rule"`` or one of the metric keys is missing.
    """
    metric_keys = ("precision", "recall", "f1", "coverage")
    rows = [
        {"model": model_name, "rule": rid, **{key: m[key] for key in metric_keys}}
        for rid, m in eval_obj["per_rule"].items()
    ]
    # Pin the schema explicitly: pd.DataFrame([]) would otherwise have no columns.
    return pd.DataFrame(rows, columns=["model", "rule", *metric_keys])

# Stack the per-rule tables of both baselines into one frame.
df = pd.concat(
    [
        eval_to_df(evaluation, label)
        for evaluation, label in (
            (eval_tfidf, "TFIDF-IR"),
            (eval_logreg, "TFIDF+LogReg"),
        )
    ],
    ignore_index=True,
)

# Display grouped by rule so the two models sit side by side (df itself stays unsorted).
df.sort_values(by=["rule", "model"])

## 6. Save Baseline Results
Save the summary table to a CSV file in the `results` directory.

In [15]:
# Output directory for result tables; created on demand.
OUT_DIR = Path("../results")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Persist the baseline comparison table without the index column.
summary_csv = OUT_DIR / "summary.csv"
df.to_csv(summary_csv, index=False)
print("Saved:", summary_csv)

Saved: ..\results\summary.csv


## 7. Baseline: FastText Embeddings (Rule 05A)
Run a semantic similarity baseline specifically for Rule 05A using pre-trained Italian FastText embeddings. This approach matches keywords like 'resistente' or 'refrattario', but may fail on negations.

In [16]:
from pathlib import Path

# Sanity-check the FastText binary on disk before it is loaded.
p = Path("../models/cc.it.300.bin")
print("exists:", p.exists())
print("absolute:", p.resolve())
print("suffix:", p.suffix)

# File size in MiB, or None when the file is absent.
if p.exists():
    size_mb = round(p.stat().st_size / (1024 * 1024), 2)
else:
    size_mb = None
print("size MB:", size_mb)

exists: True
absolute: C:\Users\loren\Desktop\Uni\Statale\NLP\Exam_Rule2LLM\models\cc.it.300.bin
suffix: .bin
size MB: 6903.07


In [17]:
# Run the FastText similarity baseline (Rule 05A) using the local model file.
ft_preds = fasttext_predict_05A(
    docs=docs, model_path="../models/cc.it.300.bin", threshold=0.60,
)

# Score against the gold labels; show the overall metrics and the 05A entry
# (None if that rule is absent from the per-rule results).
eval_ft = evaluate(ft_preds, gold)
(eval_ft["overall"], eval_ft["per_rule"].get("05A"))

Loading vectors from ../models/cc.it.300.bin...


({'macro_precision': 0.75,
  'macro_recall': 0.6,
  'macro_f1': 0.6666666666666665},
 {'tp': 3,
  'fp': 1,
  'fn': 2,
  'precision': 0.75,
  'recall': 0.6,
  'f1': 0.6666666666666665,
  'coverage': 1.0})

## 8. FastText Results
View the performance of the FastText baseline on Rule 05A.

In [18]:
# Tabulate the FastText per-rule metrics in the same layout as the other baselines.
df_ft = eval_to_df(eval_obj=eval_ft, model_name="fastText-sim-05A")
df_ft

---
## 9. LLM Evaluation Results

Load pre-computed predictions produced by `src/llm/runner.py` and evaluated by `src/eval/eval_llm.py`.

To reproduce (PowerShell):
```powershell
# OpenAI
python -m src.llm.runner --provider openai --model gpt-4.1 --docs data/docs --rules data/rules.json --out results/predictions/openai_gpt41.jsonl
python -m src.eval.eval_llm --predictions results/predictions/openai_gpt41.jsonl --gold data/gold_labels.json --out-dir results/metrics --model-tag openai_gpt41

# Ollama
python -m src.llm.runner --provider ollama --model mistral:latest --docs data/docs --rules data/rules.json --out results/predictions/ollama_mistral.jsonl
python -m src.eval.eval_llm --predictions results/predictions/ollama_mistral.jsonl --gold data/gold_labels.json --out-dir results/metrics --model-tag ollama_mistral
```

In [19]:
import json
import warnings
from pathlib import Path
import pandas as pd

METRICS_DIR = Path("../results/metrics")

# Model tags whose `<tag>_overall.csv` files are expected in METRICS_DIR.
llm_models = ["openai_gpt41", "ollama_mistral"]

# Load the overall CSV of every model that has one; remember the ones that do not.
overall_frames = []
missing_paths = []
for tag in llm_models:
    overall_path = METRICS_DIR / f"{tag}_overall.csv"
    if overall_path.exists():
        overall_frames.append(pd.read_csv(overall_path))
    else:
        missing_paths.append(overall_path)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that says what is missing.
if not overall_frames:
    raise FileNotFoundError(
        f"No overall metrics CSV found in {METRICS_DIR.resolve()} for models {llm_models}; "
        "run src.llm.runner and src.eval.eval_llm first."
    )
# A partially missing set is tolerated, but no longer silently.
if missing_paths:
    warnings.warn(
        "Skipping models without overall metrics: "
        + ", ".join(str(path) for path in missing_paths)
    )

df_llm_overall = pd.concat(overall_frames, ignore_index=True)

# One summary line per model.
for _, row in df_llm_overall.iterrows():
    print(f"{row['model']:<20}: macro_f1={row['macro_f1']:.4f}  "
          f"precision={row['macro_precision']:.4f}  recall={row['macro_recall']:.4f}  "
          f"abstention={row['abstention_rate']:.2%}  n={int(row['n_predictions'])}")

openai_gpt41 : macro_f1=0.6332  precision=0.6125  recall=0.7292  abstention=0.00%  n=120
ollama_mistral: macro_f1=0.0369  precision=0.0645  recall=0.0296  abstention=8.94%  n=123


## 10. LLM Per-Rule Breakdown

Load the per-rule CSV for the best-performing LLM (OpenAI gpt-4.1).

In [20]:
# Per-rule metrics for the OpenAI gpt-4.1 run, read from the metrics directory.
gpt41_per_rule_csv = METRICS_DIR / "openai_gpt41_per_rule.csv"
df_gpt41_per_rule = pd.read_csv(gpt41_per_rule_csv)
df_gpt41_per_rule

   model rule_id  tp  fp  fn  precision    recall        f1  coverage
0  openai_gpt41      01   3   0   0       1.00    1.0000    1.0000       1.0
1  openai_gpt41      02   0   2   0       0.00    0.0000    0.0000       1.0
2  openai_gpt41      03   1   4   0       0.20    1.0000    0.3333       1.0
3  openai_gpt41     05A   7   3   0       0.70    1.0000    0.8235       1.0
4  openai_gpt41      08   2   0   0       1.00    1.0000    1.0000       1.0
5  openai_gpt41     10A   8   0   0       1.00    1.0000    1.0000       1.0
6  openai_gpt41      12   5   0   1       1.00    0.8333    0.9091       1.0
7  openai_gpt41      13   0   3   0       0.00    0.0000    0.0000       1.0


## 11. Full Comparison — All Models

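Combine baselines and LLM results into a single summary table. Note that `fastText-sim-05A` is evaluated on Rule 05A only, so its macro scores reflect a single rule and are not directly comparable with those of the models scored across all rules.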

> **Note on Ollama/Mistral:** Mistral returned hallucinated rule IDs (`R1`–`R11`, `rule_1`–`rule_12`) instead of the canonical IDs specified in the prompt. This instruction-following failure drove its F1 to near zero and is discussed in the paper as an error-analysis finding.

In [21]:
# Load master results CSV
df_all = pd.read_csv(Path("../results/run_results.csv"))

# Display clean summary
df_summary = df_all[[
    "model", "macro_precision", "macro_recall", "macro_f1", "abstention_rate"
]].copy()
df_summary = df_summary.sort_values("macro_f1", ascending=False).reset_index(drop=True)
df_summary

                     model  macro_precision  macro_recall  macro_f1  abstention_rate
0                 TFIDF-IR           0.2396        0.7500    0.3374              0.0
1             TFIDF+LogReg           0.2500        0.2500    0.2500              0.0
2         fastText-sim-05A           0.7500        0.6000    0.6667              0.0
3             openai_gpt41           0.6125        0.7292    0.6332              0.0
4           ollama_mistral            0.0645        0.0296    0.0369             0.0894


In [22]:
# Save full combined summary
df_summary.to_csv(Path("../results/metrics/summary.csv"), index=False)
print("Updated summary.csv saved.")
df_summary