In [24]:
import pandas as pd
import re
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from sklearn.metrics import classification_report, accuracy_score

In [2]:
def preprocess_function(example):
    system_label = (
        "You are a lead qualification assistant.\n"
        "You will be given call transcript excerpts (tool output).\n"
        "Return exactly ONE Action command of the form:\n"
        "<respond> LABELS </respond>\n"
        "where LABELS is either:\n"
        "- None\n"
        "- or a comma-separated subset of: Authority, Budget, Timeline, Need\n"
        "No other text."
    )
    return {
        "prompt": [
            {"role": "system", "content": system_label},
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]},
            {"role": "tool", "content": example["query_results"]},
            {"role": "user", "content": (
                "Now provide the final qualification result.\n"
                "Return exactly one action command:\n"
                "<respond> LABELS </respond>\n"
                "LABELS must be None or a comma-separated subset of: Authority, Budget, Timeline, Need."
            )},
        ],
        "ground_truth": example["answers"]
    }

In [3]:
def create_dataset(training_file_path):
    df = pd.read_csv(training_file_path)
    df = df.fillna('')

    new_answers = [f"<respond> {a} </respond>" if a else f"<respond> None </respond>" for a in df['answers']]
    df['answers'] = new_answers

    df_train = df.sample(frac=0.9, random_state=42)
    df_test = df.drop(df_train.index).reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)

    hf_dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(df_train),
        "test": Dataset.from_pandas(df_test)
    })

    dset = hf_dataset_dict.map(preprocess_function, remove_columns=["instruction", "input", "output", "query_results", "answers"])

    return dset

In [4]:
dataset = create_dataset("data/SLM_Data_LQ_Augmented_397.csv")
test_message_lists = [d['prompt'] for d in dataset['test']]
gt_responses = [d['ground_truth'] for d in dataset['test']]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [5]:
model_name = "Qwen/Qwen3-4B-Instruct-2507"
fine_tune_path = '/Users/micksmith/Library/CloudStorage/GoogleDrive-csmith715@gmail.com/My Drive/Neuromatic/SLM-Training/qwen3-4b-grpo-lora-adapter'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [6]:
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    type=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    load_in_4bit=True,
)

ft = PeftModel.from_pretrained(base, fine_tune_path)
ft.eval()

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 2560)
        (layers): ModuleList(
          (0-35): 36 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [7]:
def generate_test_response(message_list):
    prompt = tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(ft.device)

    with torch.no_grad():
        out = ft.generate(
            **inputs,
            max_new_tokens=48,
            do_sample=False
        )

    gen_tokens = out[0, inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    return answer

In [10]:
RESP_RE = re.compile(r"^\s*<\s*respond\s*>\s*(.*?)\s*<\s*/\s*respond\s*>\s*$", re.I | re.S)
ALLOWED = {"Authority", "Budget", "Timeline", "Need"}
CANON_ORDER = ["Authority", "Budget", "Timeline", "Need"]

def normalize_answer(text: str) -> str:
    t = text.strip()
    t = re.sub(r'^[`"\']+|[`"\']+$', "", t).strip()
    t = re.sub(r"\s+", " ", t)
    return t

def parse_label_set(t: str):
    t = normalize_answer(t)

    # None is valid and exclusive
    if t.lower() == "none":
        return "None"

    parts = [p.strip() for p in t.split(",") if p.strip()]
    if not parts:
        return "None"

    if any(p not in ALLOWED for p in parts):
        return None

    uniq = sorted(set(parts), key=lambda x: CANON_ORDER.index(x))
    return ", ".join(uniq)

def parse_action(text: str):
    """
    Enforce <respond> ... </respond>. Return canonical label string or None if invalid.
    """
    m = RESP_RE.match(text.strip())
    if not m:
        return None
    inner = m.group(1)
    return parse_label_set(inner)

In [8]:
test_results = [generate_test_response(t) for t in test_message_lists]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [30]:
parsed_results = [parse_action(t) for t in test_results]
parsed_gt = [parse_action(g) for g in gt_responses]

matched = [1 if pred == actual else 0 for pred, actual in zip(parsed_results, parsed_gt)]
sum(matched)

29

In [26]:
label_list = list(set(parsed_gt+parsed_results))

In [25]:
accuracy_score(y_true=parsed_gt, y_pred=parsed_results)

0.725

In [28]:
cr = classification_report(y_true=parsed_gt, y_pred=parsed_results, labels=label_list)
print(cr)

                precision    recall  f1-score   support

Timeline, Need       0.00      0.00      0.00         1
          Need       0.86      0.86      0.86         7
          None       0.44      1.00      0.62         8
     Authority       1.00      0.50      0.67         8
        Budget       1.00      0.89      0.94         9
      Timeline       1.00      0.43      0.60         7

      accuracy                           0.72        40
     macro avg       0.72      0.61      0.61        40
  weighted avg       0.84      0.72      0.72        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [31]:
pd.DataFrame({'prediction': parsed_results, 'actual': parsed_gt, 'match': matched})

Unnamed: 0,prediction,actual,match
0,,,1
1,,Need,0
2,Authority,Authority,1
3,,Budget,0
4,,,1
5,Authority,Authority,1
6,,Authority,0
7,Authority,Authority,1
8,Budget,Budget,1
9,Budget,Budget,1
