In [None]:
import os
import re
from getpass import getpass

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import GRPOConfig, GRPOTrainer

# Initial Setup

Log in to Hugging Face if not already done globally.

In [None]:
# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable and fall back to an interactive prompt that does not echo.
# `my_token` keeps its name because the tokenizer/model loading cells pass it as `token=`.
my_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(my_token)


The goal has been to stick with "small" LLMs, where the base model has 4 billion parameters or fewer. So far, `Qwen/Qwen3-4B-Instruct-2507` has performed the best.

It's also worth noting that there is a lot of potential in the Gemma models, but their configuration and setup are different, almost to the point that they should probably have their own separate training script.

In [None]:
# Active base checkpoint for this run; the commented-out names are the other
# candidates and can be swapped in by changing which line is uncommented.
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
# model_name = "google/gemma-3-1b-it"
# model_name = "Qwen/Qwen3-4B-Instruct-2507"
# Load the tokenizer that belongs to `model_name`, authenticating with the
# token from the login cell and requesting the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=my_token,
    use_fast=True
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Load the base model in 4-bit. Quantization is requested through a
# BitsAndBytesConfig because the bare `load_in_4bit=True` keyword is deprecated
# (the previous run of this cell printed that deprecation warning).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=my_token,
    device_map="auto",
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# If tokenizer doesn't have pad token set, do it (common for some LMs)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

# Data Configuration

In [None]:
# Read the labelled examples; missing cells become empty strings.
df = pd.read_csv('data/SLM_Data_LQ_Augmented_397.csv').fillna('')

# Wrap every answer in the <respond> envelope; an empty answer is labelled None.
new_answers = []
for answer in df['answers']:
    label_text = answer if answer else "None"
    new_answers.append(f"<respond> {label_text} </respond>")
df['answers'] = new_answers

# 90/10 split with a fixed seed: the test rows are the ones the sample left behind.
df_train = df.sample(frac=0.9, random_state=42)
df_test = df.drop(index=df_train.index).reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

hf_dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (("train", df_train), ("test", df_test))
})

# System prompt shared by every training/test example: it fixes the output
# contract to a single <respond> ... </respond> command holding either None or
# a comma-separated subset of the four qualification labels.
LABEL_SYS = (
    "You are a lead qualification assistant.\n"
    "You will be given call transcript excerpts (tool output).\n"
    "Return exactly ONE Action command of the form:\n"
    "<respond> LABELS </respond>\n"
    "where LABELS is either:\n"
    "- None\n"
    "- or a comma-separated subset of: Authority, Budget, Timeline, Need\n"
    "No other text."
)

def preprocess_function(example):
    """Convert one raw dataset row into a prompt/ground-truth record.

    The prompt is a five-message conversation: the shared system prompt, the
    row's ``input`` as the user turn, its ``output`` as the assistant turn, its
    ``query_results`` as a tool turn, and a closing user turn that restates the
    required answer format. The row's ``answers`` value is passed through as
    ``ground_truth``.
    """
    closing_instruction = (
        "Now provide the final qualification result.\n"
        "Return exactly one action command:\n"
        "<respond> LABELS </respond>\n"
        "LABELS must be None or a comma-separated subset of: Authority, Budget, Timeline, Need."
    )
    # The row's own "instruction" column is deliberately not used; LABEL_SYS
    # serves as the system message for every example.
    turns = (
        ("system", LABEL_SYS),
        ("user", example["input"]),
        ("assistant", example["output"]),
        ("tool", example["query_results"]),
        ("user", closing_instruction),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    return {"prompt": conversation, "ground_truth": example["answers"]}


# For Gemma
# def preprocess_function(example):
#     return {
#         "prompt": [
#             {
#                 "role": "user",
#                 "content": f"{LABEL_SYS}\n\n{example['input']}"
#             },
#             {
#                 "role": "assistant",
#                 "content": example["output"]
#             },
#             {
#                 "role": "user",
#                 "content": (
#                     f"Tool Results: {example['query_results']}\n\n"
#                     "Now provide the final qualification result.\n"
#                     "Return exactly one action command:\n"
#                     "<respond> LABELS </respond>\n"
#                     "LABELS must be None or a comma-separated subset of: Authority, Budget, Timeline, Need."
#                 )
#             },
#         ],
#         "ground_truth": example["answers"]
#     }

In [None]:
# Replace the raw CSV columns with the prompt/ground_truth pair built by preprocess_function.
raw_columns = ["instruction", "input", "output", "query_results", "answers"]
dataset = hf_dataset_dict.map(preprocess_function, remove_columns=raw_columns)

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [None]:
# Spot-check one preprocessed test record (its prompt messages and ground truth).
dataset['test'][10]

{'prompt': [{'content': 'You are a lead qualification assistant.\nYou will be given call transcript excerpts (tool output).\nReturn exactly ONE Action command of the form:\n<respond> LABELS </respond>\nwhere LABELS is either:\n- None\n- or a comma-separated subset of: Authority, Budget, Timeline, Need\nNo other text.',
   'role': 'system'},
  {'content': 'Review the latest transcript and determine whether this lead should be qualified.',
   'role': 'user'},
  {'content': "<execute> SELECT Id, Body__c, CreatedDate, LeadId__c FROM VoiceCallTranscript__c WHERE LeadId__c = '00QWtRGHOY32nfr5py' </execute>",
   'role': 'assistant'},
  {'content': 'Salesforce instance output: [{\'Id\': \'a05WtBTW5ZE9LFaez7\', \'Body__c\': "[2023-10-09T10:00:00] Amir Brown: Hi Sam, thanks for taking the call. How are things going?\\\\n[2023-10-09T10:00:25] Sam Garcia: Doing well—happy to chat.\\\\n[2023-10-09T10:00:50] Amir Brown: We\'re in financial services and evaluating a lead scoring platform, but it’s ea

# Reward Functions

At the moment, the reward functions are rather generic. There is likely a lot more intuition that could go into them. But for now, the reward function behavior is as follows:

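* **−2.0** → Invalid or malformed output (not a single well-formed `<respond> … </respond>` containing only allowed labels)
* **+0.5** → Valid `<respond>` structure, but the labels do not match the ground truth
* **+1.0** → Valid `<respond>` structure when no ground truth is supplied
* **+2.0** → Exact label-set match with the ground truth example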

In [None]:
# Labels that may appear inside a <respond> ... </respond> answer.
ALLOWED = {"Authority", "Budget", "Timeline", "Need"}
# Order in which parsed labels are emitted, so equal label sets yield equal strings.
CANON_ORDER = ["Authority", "Budget", "Timeline", "Need"]

# Matches text that consists solely of one <respond>...</respond> element
# (case-insensitive, tolerant of whitespace around and inside the tags, inner
# text may span lines); group 1 captures the inner text.
RESP_RE = re.compile(r"^\s*<\s*respond\s*>\s*(.*?)\s*<\s*/\s*respond\s*>\s*$", re.I | re.S)

def normalize_answer(text: str) -> str:
    """Return ``text`` trimmed, unquoted and with whitespace runs collapsed.

    Leading/trailing whitespace is removed, then any run of backticks or
    single/double quotes at either end, then whitespace again; finally every
    internal run of whitespace becomes a single space.
    """
    unquoted = re.sub(r'^[`"\']+|[`"\']+$', "", text.strip())
    return re.sub(r"\s+", " ", unquoted.strip())

def parse_label_set(t: str):
    """Canonicalise the inner text of a <respond> element.

    Returns the string "None" when the normalised text is "none" (any casing)
    or contains no labels at all, Python ``None`` when any comma-separated
    piece is not one of ALLOWED (the comparison is case-sensitive), and
    otherwise the distinct labels joined with ", " in CANON_ORDER.
    """
    cleaned = normalize_answer(t)

    # "None" is a valid answer on its own and cannot be combined with labels.
    if cleaned.lower() == "none":
        return "None"

    # Distinct non-empty pieces between commas.
    labels = {piece.strip() for piece in cleaned.split(",")} - {""}
    if not labels:
        return "None"

    # A single unknown label invalidates the whole answer.
    if not labels <= ALLOWED:
        return None

    return ", ".join(name for name in CANON_ORDER if name in labels)

# def parse_action(text: str):
#     m = RESP_RE.search(text.strip())
#     if not m:
#         return None
#     inner = m.group(1)
#     return parse_label_set(inner)

def parse_action(text: str):
    """Parse a completion that must be exactly one <respond> ... </respond>.

    Returns the canonical label string produced by parse_label_set for the
    inner text, or None when the stripped text does not match RESP_RE or the
    labels are invalid.
    """
    found = RESP_RE.match(text.strip())
    return parse_label_set(found.group(1)) if found else None

def reward_fn(prompts, completions, **kwargs):
    """Score each completion on format validity and label correctness.

    Args:
        prompts: accepted for the reward-function signature; not read here.
        completions: one entry per sample, either a plain string or a list of
            message dicts whose last element carries the text under "content";
            anything else is converted with ``str``.
        **kwargs: ``ground_truth`` (optional) holds one reference answer per
            completion, in the same order.

    Returns:
        A list of floats, one per completion: -2.0 when the completion does not
        parse, 2.0 when it parses and equals the parsed ground truth, 0.5 when
        it parses but does not match, and 1.0 when it parses and no ground
        truth was supplied.
    """
    def _completion_text(completion):
        # Conversational completions are lists of messages; take the last one's content.
        if isinstance(completion, str):
            return completion
        is_chat = (
            isinstance(completion, list)
            and len(completion)
            and isinstance(completion[-1], dict)
            and "content" in completion[-1]
        )
        return completion[-1]["content"] if is_chat else str(completion)

    golds = kwargs.get("ground_truth", None)

    def _score(position, generated):
        pred = parse_action(generated)
        if pred is None:
            return -2.0
        if golds is None:
            return 1.0
        gold = parse_action(golds[position])
        matched = gold is not None and pred == gold
        return 2.0 if matched else 0.5

    texts = [_completion_text(c) for c in completions]
    return [_score(i, out) for i, out in enumerate(texts)]


These are the reward functions I used for Gemma

In [None]:
# def reward_fn(prompts, completions, **kwargs):
#     texts = []
#     for c in completions:
#         if isinstance(c, str):
#             texts.append(c)
#         elif isinstance(c, list) and len(c) and isinstance(c[-1], dict) and "content" in c[-1]:
#             texts.append(c[-1]["content"])
#         else:
#             texts.append(str(c))

#     golds = kwargs.get("ground_truth", None)

#     rewards = []
#     for i, out in enumerate(texts):
#         score = 0.0

#         if "<respond>" in out.lower():
#             score += 0.2
#         if "</respond>" in out.lower():
#             score += 0.2

#         # 2. Strict Parse
#         pred = parse_action(out)
#         if pred is not None:
#             score += 0.5

#             # 3. Accuracy Reward
#             if golds is not None:
#                 gold = parse_action(golds[i])
#                 if pred == gold:
#                     score += 1.1
#                 else:
#                     score -= 0.5
#         else:
#             if score == 0:
#                 score = -0.5

#         rewards.append(score)
#     return rewards

I was having trouble authenticating the base model and tokenizer through Hugging Face, so I forced it through below. This only seemed to be an issue with Gemma, though.

In [None]:
# Ensure token is set
# os.environ["HF_TOKEN"] = "hf_..."
# token = os.environ["HF_TOKEN"]
#
# # Load ONLY the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(
#     "google/gemma-3-1b-it",
#     token=token
# )
#
# # Set padding side to left (required for GRPO generation)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"

# Model Training

In [None]:
# LoRA adapter settings; adapters are attached to the four projection layers named below.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# GRPO hyperparameters for the LoRA run; trainer output is written under "grpo-lora".
# NOTE(review): the recorded run of this cell logged a training loss of 0.0 at
# every logged step and in the final TrainOutput — confirm that rewards actually
# differ between the generations sampled for a prompt before trusting the adapter.
config = GRPOConfig(
    output_dir="grpo-lora",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    max_prompt_length=2048,
    max_completion_length=32,
    num_generations=6,
    generation_batch_size=6,
    temperature=0.9, # I need to adjust future trainings with a lower temp to nudge towards more strict generations
    bf16=torch.cuda.is_available(),
    logging_steps=10,
    save_steps=100,
    report_to=[]
)

# Wire the quantized base model, the LoRA config, the reward function and the
# preprocessed train/test splits into the trainer.
trainer = GRPOTrainer(
    model=model,
    args=config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    reward_funcs=reward_fn,
    peft_config=peft_config,
    processing_class=tokenizer
)

# Long-running step: the recorded run reports a train_runtime of about 2314 seconds.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 32014}.


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


TrainOutput(global_step=268, training_loss=0.0, metrics={'train_runtime': 2313.7939, 'train_samples_per_second': 0.154, 'train_steps_per_second': 0.116, 'total_flos': 0.0, 'train_loss': 0.0})

In [None]:
# Persist the trained adapter and the tokenizer side by side in one directory.
adapter_dir = "/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter/tokenizer_config.json',
 '/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter/special_tokens_map.json',
 '/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter/chat_template.jinja',
 '/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter/tokenizer.json')

# Evaluation

In [None]:
# Directory the training section saved the LoRA adapter (and tokenizer) to.
# The evaluation previously loaded "output/ft-grpo-lora-adapter", a path this
# notebook never writes, so a fresh Restart & Run All could not evaluate the
# adapter that was just trained.
adapter_dir = "/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek-coder-1-3b-grpo-lora-adapter"

# Reload a clean copy of the base model with the same arguments as the training
# load: `dtype` instead of the deprecated `torch_dtype`, a BitsAndBytesConfig
# instead of the deprecated `load_in_4bit` keyword, and the access token so
# gated checkpoints load as well.
base = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=my_token,
    device_map="auto",
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Attach the trained adapter and switch to inference mode (disables dropout).
ft = PeftModel.from_pretrained(base, adapter_dir)
ft.eval()


`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [None]:
# Pull the chat prompts and their reference answers out of the held-out split,
# keeping both lists in the same row order.
test_split = dataset['test']
test_message_lists = [row['prompt'] for row in test_split]
gt_responses = [row['ground_truth'] for row in test_split]

# qdf = pd.read_json('/content/Lead_Qualification_Qwen_test.jsonl', lines=True, orient='records')

# test_message_lists = qdf['message']
# gt_responses = qdf['answer']

In [None]:
def generate_test_response(message_list, max_new_tokens=48):
    """Greedy-decode the fine-tuned model's reply to one chat prompt.

    Args:
        message_list: chat messages (dicts with "role" and "content") to render
            with the tokenizer's chat template.
        max_new_tokens: upper bound on generated tokens; defaults to the value
            this notebook has always used (48).

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens skipped and surrounding whitespace
        stripped.
    """
    prompt = tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True)
    # The rendered template already contains the special tokens it needs, so the
    # tokenizer must not add its own again (that would duplicate e.g. a BOS token).
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(ft.device)

    # Pass the pad id explicitly so generate() does not print a
    # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    with torch.no_grad():
        out = ft.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=pad_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    gen_tokens = out[0, prompt_length:]
    return tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()


In [None]:
# Smoke-test generation on the first held-out prompt before running the whole split.
generate_test_response(test_message_lists[0])

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


"Based on the transcript, the lead does not meet all the necessary qualification factors for a qualified lead. The lead does not meet the 'Authority' factor as the lead is not a member of the authority group. The lead does"

In [None]:
# Generate one answer per held-out prompt, in dataset order.
deepseek_tests = list(map(generate_test_response, test_message_lists))

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end gene

In [None]:
# Pair each generated answer with its reference answer, save the table, then display it.
ds_pred_df = pd.DataFrame(dict(Predicted=deepseek_tests, Actual=gt_responses))
ds_pred_df.to_csv(
    '/content/drive/MyDrive/Neuromatic/SLM-Training/deepseek1_tests.csv',
    index=False,
)
ds_pred_df

Unnamed: 0,Predicted,Actual
0,"Based on the transcript, the lead does not mee...",<respond> None </respond>
1,"Based on the transcript, the lead does not mee...",<respond> Need </respond>
2,"Based on the transcript, the lead does not mee...",<respond> Authority </respond>
3,"Based on the transcript, the lead does not mee...",<respond> Budget </respond>
4,"Based on the transcript, the lead does not mee...",<respond> None </respond>
5,"Based on the transcript, the lead does not mee...",<respond> Authority </respond>
6,"Based on the transcript, the lead does not mee...",<respond> Authority </respond>
7,"Based on the provided call transcript, the lea...",<respond> Authority </respond>
8,"Based on the provided transcripts, the lead is...",<respond> Budget </respond>
9,"Based on the transcripts, the lead does not me...",<respond> Budget </respond>


In [None]:
# Predictions-vs-reference table for a Gemma run, saved to CSV and displayed.
# NOTE(review): `gemma_base10_tests` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — presumably it was
# left over from an earlier session; define it or remove the cell. TODO confirm.
gemma_pred_df = pd.DataFrame({
    'Predicted': gemma_base10_tests,
    'Actual': gt_responses
})
gemma_pred_df.to_csv('/content/drive/MyDrive/Neuromatic/SLM-Training/gemma3-1b-grpo_tests.csv', index=False)
gemma_pred_df

Unnamed: 0,Predicted,Actual
0,Prepare proposal,<respond> None </respond>
1,Budget,<respond> Need </respond>
2,Provide a detailed analysis of the lead qualif...,<respond> Authority </respond>
3,Evaluate the lead qualification based on the p...,<respond> Budget </respond>
4,Budget,<respond> None </respond>
5,Timeline,<respond> Authority </respond>
6,Analyze the conversation and determine if the ...,<respond> Authority </respond>
7,Provide a detailed analysis of the lead qualif...,<respond> Authority </respond>
8,"<execute> SELECT Body__c, CreatedDate FROM Voi...",<respond> Budget </respond>
9,"<execute> SELECT Id, Body__c, CreatedDate FROM...",<respond> Budget </respond>


In [None]:
# Rebuilds `gemma_pred_df` from the same two names and displays it without saving.
# NOTE(review): `gemma_base10_tests` is not defined in any cell visible in this
# notebook (NameError on a fresh kernel), and the recorded output differs from
# the other Gemma table although the code is the same — presumably the variable
# held different results when each cell was run. TODO confirm which run to keep.
gemma_pred_df = pd.DataFrame({
    'Predicted': gemma_base10_tests,
    'Actual': gt_responses
})

gemma_pred_df

Unnamed: 0,Predicted,Actual
0,Prepare proposal</respond>,<respond> None </respond>
1,Analyze upcoming needs</respond>,<respond> Need </respond>
2,Provide a detailed analysis of the lead qualif...,<respond> Authority </respond>
3,Analyze proposal</respond>,<respond> Budget </respond>
4,Proceed with the order to proceed with the ins...,<respond> None </respond>
5,Prepare Proposal</respond>,<respond> Authority </respond>
6,Schedule a call with Ravi Sharma to discuss th...,<respond> Authority </respond>
7,Schedule</respond>,<respond> Authority </respond>
8,Analyze upcoming sales engagement suite implem...,<respond> Budget </respond>
9,Analyze lead qualification status.,<respond> Budget </respond>


In [None]:
# Predictions-vs-reference table for a Qwen run, displayed only.
# NOTE(review): `qwen_base10_tests` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. The recorded
# "Actual" values are bracketed lists rather than <respond> strings, so
# `gt_responses` presumably came from a different source at the time. TODO confirm.
qwen_pred_df = pd.DataFrame({
    'Predicted': qwen_base10_tests,
    'Actual': gt_responses
})

qwen_pred_df

Unnamed: 0,Predicted,Actual
0,"<execute> SELECT Id, Title, FAQ_Answer__c FROM...",[Authority]
1,<respond> Authority </respond>,[Authority]
2,<respond> None </respond>,[Timeline]
3,<respond> None </respond>,[Authority]
4,<respond> None </respond>,[Budget]
5,<respond> Budget </respond>,[Authority]
6,<respond> Budget </respond>,[Need]
7,<respond> None </respond>,[None]
8,<respond> None </respond>,[None]
9,<respond> None </respond>,[Need]


In [None]:
# Show the first prediction in full (the table view truncates long strings).
qwen_pred_df.loc[0, 'Predicted']

'<execute> SELECT Id, Title, FAQ_Answer__c FROM Knowledge__'

In [None]:
# Generate one answer per held-out prompt with the loaded fine-tuned model.
# This cell used to call `generate_qwen_response`, which no cell in the notebook
# defines (NameError on a fresh kernel); `generate_test_response` is the
# generation helper that the notebook actually provides.
predicted_tests = [generate_test_response(t) for t in test_message_lists]

In [None]:
# Side-by-side view of generated answers and reference answers.
pred_df = pd.DataFrame(dict(Predicted=predicted_tests, Actual=gt_responses))

pred_df

Unnamed: 0,Predicted,Actual
0,<respond> None </respond>,<respond> None </respond>
1,<respond> Budget </respond>,<respond> Need </respond>
2,<respond> Authority </respond>,<respond> Authority </respond>
3,<respond>Budget</respond>,<respond> Budget </respond>
4,<respond> None </respond>,<respond> None </respond>
5,<respond> Authority </respond>,<respond> Authority </respond>
6,<respond>None</respond>,<respond> Authority </respond>
7,<respond> Authority </respond>,<respond> Authority </respond>
8,<respond> Budget </respond>,<respond> Budget </respond>
9,<respond> Budget </respond>,<respond> Budget </respond>


In [None]:
# Count correct predictions by comparing canonical label strings rather than raw
# text, so formatting-only differences such as "<respond>Budget</respond>" vs
# "<respond> Budget </respond>" are not counted as errors. A prediction that
# does not parse (parse_action returns None) never counts as correct.
sum(
    parse_action(predicted) is not None
    and parse_action(predicted) == parse_action(actual)
    for actual, predicted in zip(pred_df['Actual'], pred_df['Predicted'])
)

30

In [None]:
# Export the full test records together with the model's predictions.
tested_df = pd.DataFrame(dataset['test']).assign(predictions=predicted_tests)

tested_df.to_csv(
    '/content/drive/MyDrive/Neuromatic/SLM-Training/qwen3-4b-grpo_tests.csv',
    index=False,
)

In [None]:
# Prior
# Earlier predictions-vs-reference table, displayed only (not assigned or saved).
pd.DataFrame(dict(Predicted=predicted_tests, Actual=gt_responses))

Unnamed: 0,Predicted,Actual
0,,
1,,Need
2,,Authority
3,,Budget
4,,
5,,Authority
6,,Authority
7,,Authority
8,,Budget
9,,Budget
