In [5]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# All locations hang off the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
MODEL_DIR = PROJECT_ROOT.joinpath("saved_models", "distilgpt2-npc")

# Inputs read by this notebook (the trainer state is optional) and the
# directory that receives the generated figures and exports.
EVAL_REPORT_PATH = MODEL_DIR.joinpath("eval_report.json")
TRAINER_STATE_PATH = MODEL_DIR.joinpath("trainer_state.json")  # optional
OUTPUT_DIR = MODEL_DIR.joinpath("viz")

# Create the output directory (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print("Report:", EVAL_REPORT_PATH)
print("Trainer state (optional):", TRAINER_STATE_PATH)
print("Out:", OUTPUT_DIR)

Report: D:\Game\Backend\saved_models\distilgpt2-npc\eval_report.json
Trainer state (optional): D:\Game\Backend\saved_models\distilgpt2-npc\trainer_state.json
Out: D:\Game\Backend\saved_models\distilgpt2-npc\viz


In [3]:
def load_json(path: Path):
    """Parse the UTF-8 JSON file at ``path``.

    Returns the decoded JSON value. When ``path`` does not exist, a
    ``[warn]`` line is printed and ``None`` is returned instead.
    """
    if path.exists():
        with path.open(encoding="utf-8") as handle:
            return json.load(handle)

    print(f"[warn] {path} not found.")
    return None

# Load the evaluation report first, then the optional trainer state; either
# comes back as None (with a warning printed) when its file is absent.
eval_report, trainer_state = map(load_json, (EVAL_REPORT_PATH, TRAINER_STATE_PATH))

# Everything below reads from the report, so stop here if it is missing.
assert eval_report is not None, "eval_report.json not found—run training/evaluation first."
eval_report.keys()

[warn] D:\Game\Backend\saved_models\distilgpt2-npc\trainer_state.json not found.


dict_keys(['eval_loss', 'perplexity', 'val_generation_metrics', 'test_generation_metrics', 'config'])

In [6]:
# Per-split generation metrics taken from the evaluation report.
val = eval_report["val_generation_metrics"]
test = eval_report["test_generation_metrics"]

# Display label -> key in the per-split metric dicts.
core_keys = {"ROUGE-1": "rouge1", "ROUGE-2": "rouge2", "ROUGE-L": "rougeL", "BLEU": "bleu"}
div_keys = {
    "Distinct-1": "distinct1",
    "Distinct-2": "distinct2",
    "Repetition rate": "repetition_rate",
}

# Long-form tables: one row per metric, one column per split.
df_core = pd.DataFrame({
    "metric": list(core_keys),
    "val": [val[key] for key in core_keys.values()],
    "test": [test[key] for key in core_keys.values()],
})

df_div = pd.DataFrame({
    "metric": list(div_keys),
    "val": [val[key] for key in div_keys.values()],
    "test": [test[key] for key in div_keys.values()],
})

df_len = pd.DataFrame({
    "split": ["Validation", "Test"],
    "avg_gen_len": [split["avg_gen_len"] for split in (val, test)],
})

# Wide summary: one row per split. Perplexity and eval loss are top-level
# report entries, so they fill only the validation row (None for test).
summary_columns = {"split": ["Validation", "Test"]}
for label, key in {**core_keys, **div_keys}.items():
    summary_columns[label] = [val[key], test[key]]
summary_columns["Avg gen len"] = [val["avg_gen_len"], test["avg_gen_len"]]
summary_columns["Samples"] = [val["samples_eval"], test["samples_eval"]]
summary_columns["Perplexity (val)"] = [eval_report.get("perplexity"), None]
summary_columns["Eval loss (val)"] = [eval_report.get("eval_loss"), None]
summary_df = pd.DataFrame(summary_columns)

df_core, df_div, df_len.head(), summary_df.head()

(    metric       val      test
 0  ROUGE-1  0.189928  0.195643
 1  ROUGE-2  0.129748  0.140858
 2  ROUGE-L  0.176881  0.184545
 3     BLEU  0.085142  0.092637,
             metric       val      test
 0       Distinct-1  0.091697  0.089613
 1       Distinct-2  0.226830  0.222140
 2  Repetition rate  0.041159  0.039660,
         split  avg_gen_len
 0  Validation    42.509804
 1        Test    43.192157,
         split   ROUGE-1   ROUGE-2   ROUGE-L      BLEU  Distinct-1  Distinct-2  \
 0  Validation  0.189928  0.129748  0.176881  0.085142    0.091697     0.22683   
 1        Test  0.195643  0.140858  0.184545  0.092637    0.089613     0.22214   
 
    Repetition rate  Avg gen len  Samples  Perplexity (val)  Eval loss (val)  
 0         0.041159    42.509804      255          1.342301         0.294385  
 1         0.039660    43.192157      255               NaN              NaN  )

In [7]:
# Single-bar chart of the report's perplexity value.
# The value is read with .get(), so it may be absent from the report; in that
# case skip the chart rather than passing None to matplotlib as a bar height.
perplexity = eval_report.get("perplexity")
if perplexity is None:
    print("[skip] perplexity missing from eval_report; no perplexity chart.")
else:
    plt.figure()
    plt.bar(["Validation"], [perplexity])
    plt.ylabel("Perplexity")
    plt.title("Perplexity")
    out_path = OUTPUT_DIR / "perplexity.png"
    plt.savefig(out_path, bbox_inches="tight")
    # Close the figure once it has been written to disk.
    plt.close()
    print("Saved:", out_path)


Saved: D:\Game\Backend\saved_models\distilgpt2-npc\viz\perplexity.png


In [8]:
def plot_train_val_loss(trainer_state_json, out_dir: Path):
    """Plot training and validation loss against step and save the figure.

    Args:
        trainer_state_json: Parsed trainer state; its ``log_history`` entry is
            scanned for records carrying ``step`` plus ``loss`` and/or
            ``eval_loss``. A falsy value means the file was not available.
        out_dir: Directory that receives ``loss_training_vs_validation.png``.

    Returns:
        The path of the saved image, or ``None`` when there was nothing to
        plot (a ``[skip]`` message is printed in that case).
    """
    if not trainer_state_json:
        print("[skip] trainer_state.json missing; no loss curve.")
        return None

    history = trainer_state_json.get("log_history", [])
    if not history:
        print("[skip] trainer_state.log_history empty; no loss curve.")
        return None

    def _series(key):
        # Collect (step, value) for every record that has both `key` and "step".
        pairs = [
            (entry["step"], entry[key])
            for entry in history
            if key in entry and "step" in entry
        ]
        return [step for step, _ in pairs], [value for _, value in pairs]

    train_steps, train_loss = _series("loss")
    eval_steps, eval_loss = _series("eval_loss")

    if not (train_steps or eval_steps):
        print("[skip] no (eval_)loss entries in log_history.")
        return None

    fig, ax = plt.subplots()
    curves = (
        (train_steps, train_loss, "Training loss"),
        (eval_steps, eval_loss, "Validation loss"),
    )
    for steps, values, label in curves:
        # Only draw a curve that actually has points.
        if steps:
            ax.plot(steps, values, label=label)
    ax.set_xlabel("Step")
    ax.set_ylabel("Loss")
    ax.set_title("Training vs Validation Loss")
    ax.legend()

    destination = out_dir / "loss_training_vs_validation.png"
    fig.savefig(destination, bbox_inches="tight")
    plt.close(fig)
    print("Saved:", destination)
    return destination

# Draw the loss curves; the returned path is bound to `_` so the cell shows no output value.
_ = plot_train_val_loss(trainer_state_json=trainer_state, out_dir=OUTPUT_DIR)

[skip] trainer_state.json missing; no loss curve.


In [9]:
# Grouped bars: for each metric tick, validation sits half a bar-width to the
# left and test half a bar-width to the right.
x = range(len(df_core))
width = 0.35

fig, ax = plt.subplots()
ax.bar([pos - width / 2 for pos in x], df_core["val"], width, label="Validation")
ax.bar([pos + width / 2 for pos in x], df_core["test"], width, label="Test")
ax.set_xticks(list(x))
ax.set_xticklabels(df_core["metric"])
ax.set_ylabel("Score")
ax.set_title("Generation Quality — ROUGE/BLEU (Val vs Test)")
ax.legend()

out_path = OUTPUT_DIR / "quality_rouge_bleu.png"
fig.savefig(out_path, bbox_inches="tight")
plt.close(fig)
print("Saved:", out_path)

Saved: D:\Game\Backend\saved_models\distilgpt2-npc\viz\quality_rouge_bleu.png


In [10]:
# Grouped bars: for each metric tick, validation sits half a bar-width to the
# left and test half a bar-width to the right.
x = range(len(df_div))
width = 0.35

fig, ax = plt.subplots()
ax.bar([pos - width / 2 for pos in x], df_div["val"], width, label="Validation")
ax.bar([pos + width / 2 for pos in x], df_div["test"], width, label="Test")
ax.set_xticks(list(x))
ax.set_xticklabels(df_div["metric"])
ax.set_ylabel("Score")
ax.set_title("Diversity & Repetition (Val vs Test)")
ax.legend()

out_path = OUTPUT_DIR / "diversity_repetition.png"
fig.savefig(out_path, bbox_inches="tight")
plt.close(fig)
print("Saved:", out_path)

Saved: D:\Game\Backend\saved_models\distilgpt2-npc\viz\diversity_repetition.png


In [11]:
# One bar per split showing the average generated length from df_len.
fig, ax = plt.subplots()
ax.bar(df_len["split"], df_len["avg_gen_len"])
ax.set_ylabel("Average generated length (words)")
ax.set_title("Average Generated Length")

out_path = OUTPUT_DIR / "avg_generated_length.png"
fig.savefig(out_path, bbox_inches="tight")
plt.close(fig)
print("Saved:", out_path)

Saved: D:\Game\Backend\saved_models\distilgpt2-npc\viz\avg_generated_length.png


In [12]:
# Persist the wide summary table (without the index column) next to the figures.
csv_path = OUTPUT_DIR / "metrics_summary.csv"
summary_df.to_csv(csv_path, index=False)

# Keep a pretty-printed copy of the evaluation report alongside the exports.
copy_path = OUTPUT_DIR / "eval_report.copy.json"
with copy_path.open("w", encoding="utf-8") as handle:
    json.dump(eval_report, handle, indent=2)

print("Saved summary CSV:", csv_path)
print("Saved report copy:", copy_path)

Saved summary CSV: D:\Game\Backend\saved_models\distilgpt2-npc\viz\metrics_summary.csv
Saved report copy: D:\Game\Backend\saved_models\distilgpt2-npc\viz\eval_report.copy.json
