
# The Structural Factor Analysis of benchmark for Over-Refusal Behavior Based on Varies LLMS

## Model 1: Gemma3-4b

## Model 2: Llama3.1-8b

## Model 3: Qwen3-4b

## Model 4: Gemini-2.5-flash

## Model 5: Deepseek-V3.2

In [1]:
# -*- coding: utf-8 -*-
"""
多模型 + 多语言 依存树深度 / 词汇信息熵 分析
- 特征缓存：./cache/<model_slug>_features_cache.csv
- 图表保存：./images/dep_chart.json, ./images/ent_chart.json
- 最终只画 2 张图（信息密度高）：
    1) Dependency Tree Depth by Model & Language (refuse only)
    2) Lexical Information Entropy by Model & Language (refuse only)
"""

# ========== Imports ==========
import importlib
import re
import math
import os
import gc
from functools import reduce
from pathlib import Path
from statistics import mean
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from collections import Counter
from tqdm import tqdm
import altair as alt
from IPython.display import display

torch.set_grad_enabled(False)

# ========== Global Config ==========

# 模型 slug -> 展示名
MODEL_MAPPING: Dict[str, str] = {
    "deepseekv32":   "deepseek-v3.2",
    "llama318b":     "llama3-8b",
    "qwen34b":       "qwen3-4b",
    "gemini25flash": "gemini-2.5-flash",
    "gemma34b":      "gemma3-4b",
}

# 你的数据目录（CSV 所在目录）
# 会去找：test_<slug>_on_local_data_results_labeled.csv
BASE_DATA_DIR = "../data/label_fusion"   # ← 按需要改

# 路径：当前目录下保存缓存和图表
ROOT_DIR   = Path.cwd()
CACHE_DIR  = ROOT_DIR / "cache"
IMAGES_DIR = ROOT_DIR / "images"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

print(f"[INFO] Cache dir : {CACHE_DIR.resolve()}")
print(f"[INFO] Images dir: {IMAGES_DIR.resolve()}")

# ========== Altair & Device ==========

alt.data_transformers.disable_max_rows()
try:
    alt.renderers.enable("default")
except Exception:
    pass

USE_GPU = torch.cuda.is_available()
DEVICE = "cuda" if USE_GPU else "cpu"
print(f"[INFO] GPU Acceleration: {'Enabled' if USE_GPU else 'Disabled'} (Device: {DEVICE})")


def empty_cuda_cache():
    """小工具：安全地清空 CUDA 缓存。"""
    if torch.cuda.is_available():
        try:
            torch.cuda.synchronize()
        except Exception:
            pass
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass


# ========== NLP (Stanza) ==========

try:
    stanza = importlib.import_module("stanza")
except Exception as e:
    raise ImportError(
        "stanza 未安装，请先 `pip install stanza`，"
        "并在首次使用某个语言时允许自动下载模型。"
    ) from e

_NLP_CACHE: Dict[str, "stanza.Pipeline"] = {}


def get_nlp(lang_code: str):
    """获取/构建 Stanza pipeline（内存缓存 + GPU/CPU 配置）"""
    if lang_code not in _NLP_CACHE:
        print(f"[INFO] Loading Stanza Pipeline: '{lang_code}' ...")
        try:
            _NLP_CACHE[lang_code] = stanza.Pipeline(
                lang_code,
                processors="tokenize,pos,lemma,depparse",
                tokenize_no_ssplit=False,
                use_gpu=USE_GPU,
                device=DEVICE,
            )
        except Exception:
            print(f"[INFO] Downloading language model for '{lang_code}' ...")
            stanza.download(lang_code)
            _NLP_CACHE[lang_code] = stanza.Pipeline(
                lang_code,
                processors="tokenize,pos,lemma,depparse",
                tokenize_no_ssplit=False,
                use_gpu=USE_GPU,
                device=DEVICE,
            )
    return _NLP_CACHE[lang_code]


def release_nlp(lang_code: str):
    """释放 Stanza pipeline & CUDA 缓存"""
    if lang_code in _NLP_CACHE:
        print(f"[INFO] Releasing Stanza Pipeline: '{lang_code}'")
        _NLP_CACHE.pop(lang_code, None)
        gc.collect()
        empty_cuda_cache()


# ========== 语言学特征 ==========

CN_COMPLEX_PUNCT = re.compile(r"[；：——…—]")
SUBORDINATE_TAGS = {"mark", "advcl", "acl", "ccomp", "xcomp", "dep", "parataxis"}


def count_complex_punct(text: str) -> int:
    return len(CN_COMPLEX_PUNCT.findall(str(text)))


def unigram_entropy(tokens):
    if not tokens:
        return 0.0
    cnt = Counter(tokens)
    n = len(tokens)
    ent = 0.0
    for c in cnt.values():
        p = c / n
        ent -= p * math.log(p + 1e-12)
    return float(ent)


def type_token_ratio(tokens):
    return (len(set(tokens)) / len(tokens)) if tokens else 0.0


def compute_dep_tree_depth(sent):
    children = {}
    for w in sent.words:
        try:
            head_id = int(w.head)
        except (ValueError, TypeError):
            continue
        children.setdefault(head_id, []).append(w.id)

    def dfs(node_id, depth):
        if node_id not in children:
            return depth
        return max(dfs(ch, depth + 1) for ch in children[node_id])

    depths = [dfs(ch, 1) for ch in children.get(0, [])] or [1]
    return max(depths)


def compute_dep_distance_mean(sent):
    if not sent.words:
        return 0.0
    dists = []
    for w in sent.words:
        if w.head is not None and w.id is not None:
            try:
                head_id = int(w.head)
                word_id = int(w.id)
                if head_id != 0:
                    dists.append(abs(word_id - head_id))
            except (ValueError, TypeError):
                continue
    return mean(dists) if dists else 0.0


def compute_sub_clause_count(sent):
    return sum(1 for w in sent.words if (w.deprel or "").lower() in SUBORDINATE_TAGS)


def stanza_features_for_text(text: str, nlp):
    text = str(text or "").strip()
    if not text:
        return {
            "character_len": 0,
            "sentence_count": 0,
            "token_len": 0,
            "dep_depth_mean": 0.0,
            "dep_distance_mean": 0.0,
            "sub_clause_count": 0,
            "punct_complex_count": 0,
            "type_token_ratio": 0.0,
            "lexical_information_entropy": 0.0,
        }

    doc = nlp(text)
    sents = doc.sentences
    sent_count = len(sents)
    tok_len = sum(len(s.words) for s in sents)

    dep_depths = [compute_dep_tree_depth(s) for s in sents] if sents else [0]
    dep_depth_mean = float(mean(dep_depths)) if dep_depths else 0.0

    dep_distance_means = [compute_dep_distance_mean(s) for s in sents] if sents else [0.0]
    dep_distance_mean = float(mean(dep_distance_means)) if dep_distance_means else 0.0

    sub_clause_total = int(sum(compute_sub_clause_count(s) for s in sents))
    tokens = [w.text for s in sents for w in s.words]

    return {
        "character_len": len(text),
        "sentence_count": sent_count,
        "token_len": tok_len,
        "dep_depth_mean": dep_depth_mean,
        "dep_distance_mean": dep_distance_mean,
        "sub_clause_count": sub_clause_total,
        "punct_complex_count": int(count_complex_punct(text)),
        "type_token_ratio": float(type_token_ratio(tokens)),
        "lexical_information_entropy": float(unigram_entropy(tokens)),
    }


# ========== 特征转为 long-format ==========

def to_long_features(
    df_feat: pd.DataFrame,
    model_slug: str,
    model_name: str,
    label_en: str,
    label_cn: str,
    label_mix: str,
) -> pd.DataFrame:
    """
    把单模型特征表 df_feat 展开为统一格式：
    id, model_slug, model_name, language, label,
    dep_depth_mean, lexical_information_entropy
    """
    records = []

    config = [
        ("EN", "dep_depth_mean_EN",  "lexical_information_entropy_EN",  label_en),
        ("CN", "dep_depth_mean_CN",  "lexical_information_entropy_CN",  label_cn),
        ("MIX","dep_depth_mean_MIX", "lexical_information_entropy_MIX", label_mix),
    ]

    for lang, dep_col, ent_col, lab_col in config:
        if dep_col in df_feat.columns and ent_col in df_feat.columns and lab_col in df_feat.columns:
            tmp = df_feat[["id", dep_col, ent_col, lab_col]].copy()
            tmp.rename(
                columns={
                    dep_col: "dep_depth_mean",
                    ent_col: "lexical_information_entropy",
                    lab_col: "label",
                },
                inplace=True,
            )
            tmp["language"]   = lang
            tmp["model_slug"] = model_slug
            tmp["model_name"] = model_name
            records.append(tmp)

    if not records:
        return pd.DataFrame(
            columns=[
                "id", "model_slug", "model_name", "language",
                "label", "dep_depth_mean", "lexical_information_entropy",
            ]
        )

    return pd.concat(records, ignore_index=True)


# ========== 单模型处理：读取 CSV + 特征提取 + 缓存 + 输出 long format ==========

def process_single_model(model_slug: str, model_name: str, base_dir: Path) -> pd.DataFrame:
    """
    读取单个模型的 CSV，提取 EN/CN/MIX 的依存深度 & 词汇熵特征，
    使用 cache 加速，并返回 long-format DataFrame。
    """
    print(f"\n==================== Analyzing: {model_name} (slug: {model_slug}) ====================")
    base_dir = Path(base_dir)
    csv_path   = base_dir / f"test_{model_slug}_on_local_data_results_labeled.csv"
    cache_path = CACHE_DIR / f"{model_slug}_features_cache.csv"

    if not csv_path.exists():
        print(f"[ERROR] Data file not found: {csv_path}")
        return pd.DataFrame()

    # --- 读数据 & 列名预处理 ---
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]

    def find_col(exact_name_pattern: str):
        for c in df.columns:
            if re.fullmatch(exact_name_pattern, c, flags=re.I):
                return c
        return None

    TEXT_EN  = find_col(r"English")
    TEXT_CN  = find_col(r"Chinese")
    TEXT_MIX = find_col(r"Mixed")

    LABEL_EN = "Final_Label_EN"
    LABEL_CN = "Final_Label_CN"
    LABEL_MIX = "Final_Label_MIX"

    if not any([TEXT_EN, TEXT_CN, TEXT_MIX]):
        print(f"[ERROR] No text columns (English/Chinese/Mixed) found. Skip {model_name}.")
        return pd.DataFrame()

    # label 统一小写
    for lab in [LABEL_EN, LABEL_CN, LABEL_MIX]:
        if lab in df.columns:
            df[lab] = df[lab].astype(str).str.lower().str.strip()

    # id 列
    if "id" not in df.columns:
        df = df.reset_index().rename(columns={"index": "id"})
    df = df.rename(columns={"Rewrite Method": "method", "Category": "category"}, errors="ignore")
    df_cn = df.copy()

    # 语言变体配置
    variants: List[Tuple[str, str, str, str]] = []
    if TEXT_EN in df_cn.columns:
        variants.append(("EN", TEXT_EN, LABEL_EN, "en"))
    if TEXT_CN in df_cn.columns:
        variants.append(("CN", TEXT_CN, LABEL_CN, "zh"))
    if TEXT_MIX in df_cn.columns:
        variants.append(("MIX", TEXT_MIX, LABEL_MIX, "zh"))

    # --- 读取/生成 特征缓存 ---
    df_feat: pd.DataFrame | None = None
    if cache_path.exists():
        try:
            cached = pd.read_csv(cache_path)
            print(f"[INFO] Loaded features from cache: {cache_path}")
            req = [f"dep_depth_mean_{v[0]}" for v in variants] + \
                  [f"lexical_information_entropy_{v[0]}" for v in variants]
            if all(c in cached.columns for c in req) and len(cached) == len(df_cn):
                cached["id"] = pd.to_numeric(cached["id"], errors="coerce").astype("Int64")
                df_cn["id"] = pd.to_numeric(df_cn["id"], errors="coerce").astype("Int64")
                df_feat = cached
            else:
                print("[WARN] Cache incomplete/outdated. Recompute features.")
        except Exception as e:
            print(f"[WARN] Failed reading cache ({cache_path}): {e}")

    if df_feat is None:
        feature_frames: List[pd.DataFrame] = []
        df_cn["id"] = pd.to_numeric(df_cn["id"], errors="coerce").astype("Int64")

        for name, text_col, label_col, lang_code in variants:
            print(f">> Computing features for {name} from '{text_col}' ...")
            nlp = get_nlp(lang_code)
            try:
                rows = []
                for _id, text in tqdm(
                    df_cn[["id", text_col]].itertuples(index=False, name=None),
                    total=len(df_cn),
                ):
                    feats = stanza_features_for_text(text, nlp)
                    rows.append({f"{k}_{name}": v for k, v in feats.items()})
                df_f = pd.DataFrame(rows)
                df_f["id"] = df_cn["id"]
                df_f.drop(columns=[c for c in df_f.columns if c.startswith("id_")],
                          inplace=True, errors="ignore")
                if label_col in df_cn.columns:
                    df_f = df_f.merge(df_cn[["id", label_col]], on="id", how="left")
                feature_frames.append(df_f)
            finally:
                release_nlp(lang_code)
                empty_cuda_cache()

        if len(feature_frames) > 1:
            df_feat = reduce(lambda l, r: pd.merge(l, r, on="id", how="outer"), feature_frames)
        else:
            df_feat = feature_frames[0].copy()

        try:
            df_feat.to_csv(cache_path, index=False)
            print(f"[INFO] Features cached -> {cache_path}")
        except Exception as e:
            print(f"[WARN] Fail to save cache: {e}")

    # --- 构造 long-format 特征表 ---
    df_long = to_long_features(
        df_feat=df_feat,
        model_slug=model_slug,
        model_name=model_name,
        label_en=LABEL_EN,
        label_cn=LABEL_CN,
        label_mix=LABEL_MIX,
    )

    # 模型级清理
    try:
        del df, df_cn, df_feat
        gc.collect()
        empty_cuda_cache()
    except Exception:
        pass

    print(f"[INFO] Long-format rows for {model_name}: {len(df_long)}")
    print(f"==================== Done: {model_name} ====================")
    return df_long


# ========== 主流程：跑所有模型，合并特征，只画 2 张图 ==========

print(f"[INFO] Models configured: {len(MODEL_MAPPING)}")
print("-" * 60)

all_features: List[pd.DataFrame] = []
for slug, name in MODEL_MAPPING.items():
    df_long = process_single_model(slug, name, Path(BASE_DATA_DIR))
    if df_long is not None and not df_long.empty:
        all_features.append(df_long)

if not all_features:
    print("[ERROR] No features collected from any model. Abort plotting.")
else:
    df_all = pd.concat(all_features, ignore_index=True)
    print("[INFO] Combined feature shape:", df_all.shape)

    # 只保留拒绝样本
    df_all["label"] = df_all["label"].astype(str).str.lower().str.strip()
    df_refuse = df_all[df_all["label"] == "refuse"].copy()
    print("[INFO] Refuse-only subset shape:", df_refuse.shape)

    if df_refuse.empty:
        print("[ERROR] No refuse samples found in combined data. Abort plotting.")
    else:
        # 依存树深度：分组箱型图（模型 × 语言）
        dep_chart = (
            alt.Chart(df_refuse)
            .mark_boxplot(size=18)
            .encode(
                x=alt.X("model_name:N", title="Model"),
                xOffset=alt.XOffset("language:N"),
                y=alt.Y("dep_depth_mean:Q", title="Dependency Tree Depth"),
                color=alt.Color("language:N", title="Language"),
                tooltip=[
                    "model_name:N",
                    "language:N",
                    "dep_depth_mean:Q",
                ],
            )
            .properties(
                width=60 * df_refuse["model_name"].nunique(),
                height=300,
                title="Dependency Tree Depth by Model & Language (refuse only)",
            )
        )

        # 词汇信息熵：分组箱型图（模型 × 语言）
        ent_chart = (
            alt.Chart(df_refuse)
            .mark_boxplot(size=18)
            .encode(
                x=alt.X("model_name:N", title="Model"),
                xOffset=alt.XOffset("language:N"),
                y=alt.Y("lexical_information_entropy:Q", title="Lexical Information Entropy"),
                color=alt.Color("language:N", title="Language"),
                tooltip=[
                    "model_name:N",
                    "language:N",
                    "lexical_information_entropy:Q",
                ],
            )
            .properties(
                width=60 * df_refuse["model_name"].nunique(),
                height=300,
                title="Lexical Information Entropy by Model & Language (refuse only)",
            )
        )

        # 保存 JSON（当前目录下的 ./images）
        dep_json_path = IMAGES_DIR / "dep_chart.json"
        ent_json_path = IMAGES_DIR / "ent_chart.json"
        try:
            dep_chart.save(dep_json_path)
            print(f"[INFO] Dependency chart spec saved -> {dep_json_path}")
        except Exception as e:
            print(f"[WARN] Fail to save dep_chart JSON: {e}")
        try:
            ent_chart.save(ent_json_path)
            print(f"[INFO] Entropy chart spec saved -> {ent_json_path}")
        except Exception as e:
            print(f"[WARN] Fail to save ent_chart JSON: {e}")

        # 在 Notebook 中显示两张图
        display(dep_chart)
        display(ent_chart)


[INFO] Cache dir : D:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache
[INFO] Images dir: D:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\images
[INFO] GPU Acceleration: Enabled (Device: cuda)


  from .autonotebook import tqdm as notebook_tqdm
2025-11-10 15:53:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


[INFO] Models configured: 5
------------------------------------------------------------

>> Computing features for EN from 'English' ...
[INFO] Loading Stanza Pipeline: 'en' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 25.7MB/s]                    
2025-11-10 15:53:31 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:53:32 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-10 15:53:32 INFO: Using device: cuda
2025-11-10 15:53:32 INFO: Loading: tokenize
2025-11-10 15:53:33 INFO: Loading: mwt
2025-11-10 15:53:33 INFO: Loading: pos
2025-11-10 15:53:35 INFO: Loading: lemma
2025-11-10 15:53:36 INFO: Loading: depparse
2025-11-10 15:53:36 INFO: Done loading processors!
100%|██████████| 600/600 [01:07<00:00,  8.85it/s]
2025-11-10 15:54:44 INFO: Checking for updates to resources.json in case models have been 

[INFO] Releasing Stanza Pipeline: 'en'
>> Computing features for CN from 'Chinese' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 25.7MB/s]                    
2025-11-10 15:54:44 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:54:44 INFO: "zh" is an alias for "zh-hans"
2025-11-10 15:54:45 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 15:54:45 INFO: Using device: cuda
2025-11-10 15:54:45 INFO: Loading: tokenize
2025-11-10 15:54:45 INFO: Loading: pos
2025-11-10 15:54:47 INFO: Loading: lemma
2025-11-10 15:54:48 INFO: Loading: depparse
2025-11-10 15:54:48 INFO: Done loading processors!
100%|██████████| 600/600 [00:33<00:00, 18.16it/s]
2025-11-10 15:55:22 INFO: Checking for updates to resources.json in case models have been update

[INFO] Releasing Stanza Pipeline: 'zh'
>> Computing features for MIX from 'Mixed' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 42.5MB/s]                    
2025-11-10 15:55:22 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:55:22 INFO: "zh" is an alias for "zh-hans"
2025-11-10 15:55:22 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 15:55:22 INFO: Using device: cuda
2025-11-10 15:55:22 INFO: Loading: tokenize
2025-11-10 15:55:22 INFO: Loading: pos
2025-11-10 15:55:25 INFO: Loading: lemma
2025-11-10 15:55:25 INFO: Loading: depparse
2025-11-10 15:55:26 INFO: Done loading processors!
100%|██████████| 600/600 [00:45<00:00, 13.19it/s]


[INFO] Releasing Stanza Pipeline: 'zh'
[INFO] Features cached -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache\deepseekv32_features_cache.csv
[INFO] Long-format rows for deepseek-v3.2: 1800



2025-11-10 15:56:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


>> Computing features for EN from 'English' ...
[INFO] Loading Stanza Pipeline: 'en' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 43.5MB/s]                    
2025-11-10 15:56:12 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:56:12 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-10 15:56:12 INFO: Using device: cuda
2025-11-10 15:56:12 INFO: Loading: tokenize
2025-11-10 15:56:12 INFO: Loading: mwt
2025-11-10 15:56:12 INFO: Loading: pos
2025-11-10 15:56:14 INFO: Loading: lemma
2025-11-10 15:56:15 INFO: Loading: depparse
2025-11-10 15:56:15 INFO: Done loading processors!
100%|██████████| 600/600 [01:07<00:00,  8.88it/s]
2025-11-10 15:57:23 INFO: Checking for updates to resources.json in case models have been 

[INFO] Releasing Stanza Pipeline: 'en'
>> Computing features for CN from 'Chinese' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 38.6MB/s]                    
2025-11-10 15:57:23 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:57:23 INFO: "zh" is an alias for "zh-hans"
2025-11-10 15:57:24 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 15:57:24 INFO: Using device: cuda
2025-11-10 15:57:24 INFO: Loading: tokenize
2025-11-10 15:57:24 INFO: Loading: pos
2025-11-10 15:57:27 INFO: Loading: lemma
2025-11-10 15:57:27 INFO: Loading: depparse
2025-11-10 15:57:27 INFO: Done loading processors!
100%|██████████| 600/600 [00:32<00:00, 18.27it/s]
2025-11-10 15:58:00 INFO: Checking for updates to resources.json in case models have been update

[INFO] Releasing Stanza Pipeline: 'zh'
>> Computing features for MIX from 'Mixed' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 25.0MB/s]                    
2025-11-10 15:58:00 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:58:00 INFO: "zh" is an alias for "zh-hans"
2025-11-10 15:58:01 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 15:58:01 INFO: Using device: cuda
2025-11-10 15:58:01 INFO: Loading: tokenize
2025-11-10 15:58:01 INFO: Loading: pos
2025-11-10 15:58:04 INFO: Loading: lemma
2025-11-10 15:58:04 INFO: Loading: depparse
2025-11-10 15:58:04 INFO: Done loading processors!
100%|██████████| 600/600 [00:43<00:00, 13.75it/s]


[INFO] Releasing Stanza Pipeline: 'zh'
[INFO] Features cached -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache\llama318b_features_cache.csv
[INFO] Long-format rows for llama3-8b: 1800



2025-11-10 15:58:49 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


>> Computing features for EN from 'English' ...
[INFO] Loading Stanza Pipeline: 'en' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 69.5MB/s]                    
2025-11-10 15:58:49 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:58:49 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-10 15:58:49 INFO: Using device: cuda
2025-11-10 15:58:49 INFO: Loading: tokenize
2025-11-10 15:58:49 INFO: Loading: mwt
2025-11-10 15:58:49 INFO: Loading: pos
2025-11-10 15:58:51 INFO: Loading: lemma
2025-11-10 15:58:52 INFO: Loading: depparse
2025-11-10 15:58:52 INFO: Done loading processors!
100%|██████████| 600/600 [01:04<00:00,  9.23it/s]
2025-11-10 15:59:57 INFO: Checking for updates to resources.json in case models have been 

[INFO] Releasing Stanza Pipeline: 'en'
>> Computing features for CN from 'Chinese' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 29.1MB/s]                    
2025-11-10 15:59:57 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 15:59:57 INFO: "zh" is an alias for "zh-hans"
2025-11-10 15:59:58 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 15:59:58 INFO: Using device: cuda
2025-11-10 15:59:58 INFO: Loading: tokenize
2025-11-10 15:59:58 INFO: Loading: pos
2025-11-10 16:00:01 INFO: Loading: lemma
2025-11-10 16:00:01 INFO: Loading: depparse
2025-11-10 16:00:01 INFO: Done loading processors!
100%|██████████| 600/600 [00:33<00:00, 17.83it/s]
2025-11-10 16:00:35 INFO: Checking for updates to resources.json in case models have been update

[INFO] Releasing Stanza Pipeline: 'zh'
>> Computing features for MIX from 'Mixed' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 36.5MB/s]                    
2025-11-10 16:00:35 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:00:35 INFO: "zh" is an alias for "zh-hans"
2025-11-10 16:00:36 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 16:00:36 INFO: Using device: cuda
2025-11-10 16:00:36 INFO: Loading: tokenize
2025-11-10 16:00:36 INFO: Loading: pos
2025-11-10 16:00:38 INFO: Loading: lemma
2025-11-10 16:00:39 INFO: Loading: depparse
2025-11-10 16:00:39 INFO: Done loading processors!
100%|██████████| 600/600 [00:44<00:00, 13.44it/s]


[INFO] Releasing Stanza Pipeline: 'zh'
[INFO] Features cached -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache\qwen34b_features_cache.csv
[INFO] Long-format rows for qwen3-4b: 1800



2025-11-10 16:01:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


>> Computing features for EN from 'English' ...
[INFO] Loading Stanza Pipeline: 'en' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 44.0MB/s]                    
2025-11-10 16:01:24 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:01:25 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-10 16:01:25 INFO: Using device: cuda
2025-11-10 16:01:25 INFO: Loading: tokenize
2025-11-10 16:01:25 INFO: Loading: mwt
2025-11-10 16:01:25 INFO: Loading: pos
2025-11-10 16:01:27 INFO: Loading: lemma
2025-11-10 16:01:28 INFO: Loading: depparse
2025-11-10 16:01:28 INFO: Done loading processors!
100%|██████████| 600/600 [01:06<00:00,  9.03it/s]
2025-11-10 16:02:35 INFO: Checking for updates to resources.json in case models have been 

[INFO] Releasing Stanza Pipeline: 'en'
>> Computing features for CN from 'Chinese' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 82.3MB/s]                    
2025-11-10 16:02:35 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:02:35 INFO: "zh" is an alias for "zh-hans"
2025-11-10 16:02:36 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 16:02:36 INFO: Using device: cuda
2025-11-10 16:02:36 INFO: Loading: tokenize
2025-11-10 16:02:36 INFO: Loading: pos
2025-11-10 16:02:39 INFO: Loading: lemma
2025-11-10 16:02:39 INFO: Loading: depparse
2025-11-10 16:02:39 INFO: Done loading processors!
100%|██████████| 600/600 [00:32<00:00, 18.59it/s]
2025-11-10 16:03:12 INFO: Checking for updates to resources.json in case models have been update

[INFO] Releasing Stanza Pipeline: 'zh'
>> Computing features for MIX from 'Mixed' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 28.8MB/s]                    
2025-11-10 16:03:12 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:03:12 INFO: "zh" is an alias for "zh-hans"
2025-11-10 16:03:13 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 16:03:13 INFO: Using device: cuda
2025-11-10 16:03:13 INFO: Loading: tokenize
2025-11-10 16:03:13 INFO: Loading: pos
2025-11-10 16:03:15 INFO: Loading: lemma
2025-11-10 16:03:15 INFO: Loading: depparse
2025-11-10 16:03:16 INFO: Done loading processors!
100%|██████████| 600/600 [00:43<00:00, 13.65it/s]


[INFO] Releasing Stanza Pipeline: 'zh'
[INFO] Features cached -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache\gemini25flash_features_cache.csv
[INFO] Long-format rows for gemini-2.5-flash: 1800



2025-11-10 16:04:00 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


>> Computing features for EN from 'English' ...
[INFO] Loading Stanza Pipeline: 'en' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 39.1MB/s]                    
2025-11-10 16:04:00 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:04:01 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-10 16:04:01 INFO: Using device: cuda
2025-11-10 16:04:01 INFO: Loading: tokenize
2025-11-10 16:04:01 INFO: Loading: mwt
2025-11-10 16:04:01 INFO: Loading: pos
2025-11-10 16:04:03 INFO: Loading: lemma
2025-11-10 16:04:03 INFO: Loading: depparse
2025-11-10 16:04:04 INFO: Done loading processors!
100%|██████████| 600/600 [01:07<00:00,  8.93it/s]
2025-11-10 16:05:11 INFO: Checking for updates to resources.json in case models have been 

[INFO] Releasing Stanza Pipeline: 'en'
>> Computing features for CN from 'Chinese' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 38.1MB/s]                    
2025-11-10 16:05:11 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:05:11 INFO: "zh" is an alias for "zh-hans"
2025-11-10 16:05:12 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 16:05:12 INFO: Using device: cuda
2025-11-10 16:05:12 INFO: Loading: tokenize
2025-11-10 16:05:12 INFO: Loading: pos
2025-11-10 16:05:14 INFO: Loading: lemma
2025-11-10 16:05:15 INFO: Loading: depparse
2025-11-10 16:05:15 INFO: Done loading processors!
100%|██████████| 600/600 [00:33<00:00, 18.14it/s]
2025-11-10 16:05:48 INFO: Checking for updates to resources.json in case models have been update

[INFO] Releasing Stanza Pipeline: 'zh'
>> Computing features for MIX from 'Mixed' ...
[INFO] Loading Stanza Pipeline: 'zh' ...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 55.0MB/s]                    
2025-11-10 16:05:49 INFO: Downloaded file to C:\Users\precision\stanza_resources\resources.json
2025-11-10 16:05:49 INFO: "zh" is an alias for "zh-hans"
2025-11-10 16:05:49 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-11-10 16:05:49 INFO: Using device: cuda
2025-11-10 16:05:49 INFO: Loading: tokenize
2025-11-10 16:05:49 INFO: Loading: pos
2025-11-10 16:05:52 INFO: Loading: lemma
2025-11-10 16:05:52 INFO: Loading: depparse
2025-11-10 16:05:52 INFO: Done loading processors!
100%|██████████| 600/600 [00:43<00:00, 13.69it/s]


[INFO] Releasing Stanza Pipeline: 'zh'
[INFO] Features cached -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\cache\gemma34b_features_cache.csv
[INFO] Long-format rows for gemma3-4b: 1800
[INFO] Combined feature shape: (9000, 7)
[INFO] Refuse-only subset shape: (1685, 7)
[INFO] Dependency chart spec saved -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\images\dep_chart.json
[INFO] Entropy chart spec saved -> d:\Workspae\usyd\capstone\USYD-25S2-Capstone-CS62-2\evaluation\images\ent_chart.json
