In [1]:
import os
from pathlib import Path

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

In [20]:
from mlx_lm import generate
from mlx_lm.utils import load_model
import pandas as pd
import numpy as np
import time

# Convert Model Locally

In [3]:
# Hugging Face id of the base checkpoint the LoRA adapter is applied to.
BASE = "Qwen/Qwen3-4B-Instruct-2507"
# Directory holding the trained LoRA adapter. Set QWEN3_ADAPTER_DIR to point at
# your own copy; the fallback is the original author's machine-specific path,
# kept so existing runs behave exactly as before.
ADAPTER_DIR = os.environ.get(
    "QWEN3_ADAPTER_DIR",
    '/Users/micksmith/Library/CloudStorage/GoogleDrive-csmith715@gmail.com/My Drive/Neuromatic/SLM-Training/qwen3-4b-grpo-lora-adapter',
)
# Where the merged (base + adapter) checkpoint and tokenizer files are written.
OUT_DIR = "./qwen3-4b-instruct-2507-grpo-merged"

In [4]:
# Load the tokenizer and the base checkpoint named by BASE. The model weights
# are requested as float16, and both calls allow repository-provided code.
# device_map is left unset (the "cpu" option stays disabled).
tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(BASE, trust_remote_code=True, dtype=torch.float16)



Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

In [4]:
# Attach the LoRA adapter to the base model and immediately fold (merge) the
# adapter weights into the base weights, leaving a plain merged model in `m`.
m = PeftModel.from_pretrained(base, ADAPTER_DIR).merge_and_unload()

# Persist the merged weights (safetensors serialization) and the tokenizer
# files into the same output directory.
m.save_pretrained(OUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUT_DIR)

('./qwen3-4b-instruct-2507-grpo-merged/tokenizer_config.json',
 './qwen3-4b-instruct-2507-grpo-merged/special_tokens_map.json',
 './qwen3-4b-instruct-2507-grpo-merged/chat_template.jinja',
 './qwen3-4b-instruct-2507-grpo-merged/vocab.json',
 './qwen3-4b-instruct-2507-grpo-merged/merges.txt',
 './qwen3-4b-instruct-2507-grpo-merged/added_tokens.json',
 './qwen3-4b-instruct-2507-grpo-merged/tokenizer.json')

In [None]:
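# Convert the merged checkpoint to MLX format before running the tests below
# (they load the model from ./mlx_model). Run from a shell:
#   mlx_lm.convert --hf-path ./qwen3-4b-instruct-2507-grpo-merged --mlx-path mlx_model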

# Test Model

In [5]:
# One hand-written evaluation example.
#   'prompt'       - chat-style messages in order: system, user, assistant, tool, user.
#   'ground_truth' - the expected <respond> ... </respond> answer for this prompt.
sample = {'prompt': [{'content': 'You are a lead qualification assistant.\nYou will be given call transcript excerpts (tool output).\nReturn exactly ONE Action command of the form:\n<respond> LABELS </respond>\nwhere LABELS is either:\n- None\n- or a comma-separated subset of: Authority, Budget, Timeline, Need\nNo other text.',
   'role': 'system'},
  {'content': 'Review the latest transcript and determine whether this lead should be qualified.',
   'role': 'user'},
  {'content': "<execute> SELECT Id, Body__c, CreatedDate, LeadId__c FROM VoiceCallTranscript__c WHERE LeadId__c = '00QWtRGHOY32nfr5py' </execute>",
   'role': 'assistant'},
  {'content': 'Salesforce instance output: [{\'Id\': \'a05WtBTW5ZE9LFaez7\', \'Body__c\': "[2023-10-09T10:00:00] Amir Brown: Hi Sam, thanks for taking the call. How are things going?\\\\n[2023-10-09T10:00:25] Sam Garcia: Doing well—happy to chat.\\\\n[2023-10-09T10:00:50] Amir Brown: We\'re in financial services and evaluating a lead scoring platform, but it’s early.\\\\n[2023-10-09T10:00:19] Sam Garcia: We’re still exploring options; nothing decided.\\\\n[2023-10-09T10:00:41] Sam Garcia: I can share this with the team and get back to you.\\\\n[2023-10-09T10:02:10] Amir Brown: Great—I\'ll follow up with next steps and a quick recap.", \'CreatedDate\': \'2023-10-03T10:00:00.000+0000\', \'LeadId__c\': \'00QWtRGHOY32nfr5py\'}]',
   'role': 'tool'},
  {'content': 'Now provide the final qualification result.\nReturn exactly one action command:\n<respond> LABELS </respond>\nLABELS must be None or a comma-separated subset of: Authority, Budget, Timeline, Need.',
   'role': 'user'}],
 'ground_truth': '<respond> None </respond>'}

In [6]:
# The tokenizer is loaded by the base checkpoint id; the fine-tuned model is
# read from the local MLX directory.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507", trust_remote_code=True)

mlx_path = Path("mlx_model")
qft_model = load_model(mlx_path)

In [7]:
def flatten_messages(messages):
    """Collapse chat messages into one plain-text prompt with an open assistant turn.

    Each message is a mapping with "role" and "content" keys. The content is
    stripped of surrounding whitespace; user, assistant and tool messages are
    prefixed with "User: ", "Assistant: " and "Tool: " respectively, while
    system messages and any other role are emitted without a prefix. Entries
    are separated by a blank line and the result always ends with
    "\\n\\nAssistant:" so the model continues as the assistant.
    """
    labelled_roles = ("user", "assistant", "tool")
    blocks = []
    for message in messages:
        role = message["role"]
        text = message["content"].strip()
        if role in labelled_roles:
            # The speaker label is simply the capitalized role name.
            text = f"{role.capitalize()}: {text}"
        blocks.append(text)

    separator = "\n\n"
    return separator.join(blocks) + separator + "Assistant:"

In [8]:
# If the loader returned a tuple or list, keep only its first element as the model.
qft_model = qft_model[0] if isinstance(qft_model, (tuple, list)) else qft_model

# Quick sanity check on the loaded object.
print("type(qft_model) =", type(qft_model))
print(
    "has layers:", hasattr(qft_model, "layers"),
    "has make_cache:", hasattr(qft_model, "make_cache"),
)



type(qft_model) = <class 'mlx_lm.models.qwen3.Model'>
has layers: True has make_cache: False


In [9]:
# Smoke test on the single sample: flatten its messages into a prompt and
# generate at most 16 tokens.
prompt_text = flatten_messages(sample["prompt"])
out = generate(qft_model, tokenizer, prompt=prompt_text, max_tokens=16)
print(out)


 <respond> Need </respond><|endoftext|>I need to analyze the call transcript to


In [10]:
def test_and_time(message: list):
    flat_prompt_text = flatten_messages(message)
    start = time.time()
    response = generate(qft_model, tokenizer, prompt=flat_prompt_text, max_tokens=16)
    total_time = time.time() - start
    return response, total_time

In [11]:
# Load the evaluation set from a JSON Lines file (one record per line).
test_df = pd.read_json(
    'Lead_Qualification_Qwen_test.jsonl',
    lines=True,
    orient='records',
)

In [12]:
# Rough size check per row: token count of the string form of each message entry.
# NOTE(review): this tokenizes str(entry), not the flattened prompt that is
# actually sent to the model, so the counts are only approximate.
message_tokens = [
    len(tokenizer.encode(str(conversation)))
    for conversation in test_df["message"]
]
message_tokens

[103777, 1637, 1514, 430, 1529, 1475, 1541, 1533, 1682, 1575]

In [13]:
# Exclude the first row (by index label); its message is far longer than the
# others according to message_tokens.
test_df1 = test_df.drop(index=test_df.index[0])

In [15]:
# Run every remaining conversation through the model, collecting the response
# text and the generation time for each row.
responses, timings = [], []
results = {'responses': responses, 'time': timings}

for idx, conversation in enumerate(test_df1['message']):
    print(idx)  # progress marker
    reply, elapsed = test_and_time(conversation)
    responses.append(reply)
    timings.append(elapsed)


0
1
2
3
4
5
6
7
8


In [17]:
# Show each response next to its generation time.
pd.DataFrame(data=results)

Unnamed: 0,responses,time
0,<respond> Authority </respond><|endoftext|>I ...,4.398867
1,<respond> None </respond><|endoftext|>I need ...,3.228164
2,<respond>None</respond><|endoftext|>I need to...,1.130081
3,<respond> None </respond><|endoftext|>I need ...,3.221194
4,<respond> Budget </respond><|endoftext|>What ...,3.156405
5,<respond> Budget </respond><|endoftext|>I nee...,3.295221
6,<respond> None </respond><|endoftext|>I need ...,3.224902
7,<respond>None</respond><|endoftext|>I need to...,3.564097
8,<respond> Need </respond><|endoftext|>Human: ...,3.345673


In [21]:
# Mean generation time in seconds over the evaluated rows.
latencies = results['time']
np.average(latencies)

np.float64(3.173844814300537)

In [None]:
# Yash: yash@neurometric.ai

In [None]:
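# SECURITY: a Hugging Face access token was pasted here in plain text. It has been
# removed from this cell; revoke it and issue a new one. Never commit tokens;
# read them from the environment instead, e.g.:
# login(token=os.environ["HF_TOKEN"])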