In [1]:
!pip install -q transformers accelerate sentencepiece peft datasets


In [2]:
!pip install -U evaluate datasets sacrebleu rouge-score numpy

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.1/62.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_

In [1]:
import os, json, random, ast, math
from pathlib import Path
from typing import List, Dict, Any

from tqdm import tqdm
import evaluate
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM




In [2]:
# Colab-only step: mount Google Drive at /content/drive; the dataset paths
# configured in the next cell live under this mount point.
from google.colab import drive
import shutil
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Dataset folder on the mounted Drive and the combined JSONL file inside it.
DATASET_DIR = "/content/drive/MyDrive/API-Pack-Dataset"
IN_JSONL = f"{DATASET_DIR}/api_pack_training.jsonl"

# Seed Python's `random` and torch's global generators with the same value.
RNG_SEED = 42
random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

# Fail early, with the offending path in the message, if the input is missing.
assert os.path.exists(IN_JSONL), f"Combined jsonl not found at {IN_JSONL}"

In [4]:
def read_jsonl(path: str) -> List[Dict[str, Any]]:
    """Parse a JSON Lines file into a list of decoded records.

    The file is read as UTF-8. Every line that contains something other
    than whitespace is decoded with ``json.loads``; blank lines are skipped.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The decoded records, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

all_rows = read_jsonl(IN_JSONL)


# Keep only Python examples that actually carry a non-empty reference call.
python_rows = [
    r for r in all_rows
    if r.get("input", {}).get("lang", "").strip().lower() == "python"
    and r.get("output", {}).get("api_call", "").strip() != ""
]

print(f"Total rows: {len(all_rows)} | Python rows: {len(python_rows)}")

# Number of evaluation samples to draw.
K = 10
if len(python_rows) < K:
    print(f"Warning: only {len(python_rows)} Python samples available; using all of them.")

# Sample from a dedicated generator seeded with RNG_SEED instead of the global
# `random` state. Re-running this cell then always yields the same evaluation
# set, whereas the global generator would have advanced after the first draw.
# On a fresh top-to-bottom run the selection matches sampling from the global
# generator right after `random.seed(RNG_SEED)`.
eval_samples = random.Random(RNG_SEED).sample(python_rows, k=min(K, len(python_rows)))


Total rows: 1014093 | Python rows: 100860


In [5]:

def clean_text(t):
    """Strip surrounding whitespace from ``t``; ``None`` maps to ``""``."""
    return "" if t is None else t.strip()

def build_prompt(sample: Dict[str, Any]) -> str:
    """Render the generation prompt for one dataset record.

    Reads ``lang``, ``instruction``, ``path``, ``description`` and
    ``api_arguments`` from ``sample["input"]`` and lays them out as a
    system-style instruction followed by ``###``-headed sections, ending
    with the line that asks for the code.

    Args:
        sample: Dataset record with an ``"input"`` mapping holding the
            fields listed above.

    Returns:
        The complete prompt text, terminated by a newline.
    """
    fields = sample["input"]
    language = fields["lang"]
    pieces = [
        "You are an API client code generator. ",
        f"Your job is to output ONLY a valid {language} code snippet that calls the given API. ",
        "No explanations. No comments. No markdown. No extra text. Only raw code.\n\n",
        f"### USER REQUEST:\n{fields['instruction']}\n\n",
        f"### ENDPOINT:\n{fields['path']}\n\n",
        f"### DESCRIPTION:\n{fields['description']}\n\n",
        f"### PARAMETERS:\n{fields['api_arguments']}\n\n",
        f"### Generate ONLY the {language} API call code:\n",
    ]
    return "".join(pieces)


In [6]:

def exact_match(pred: str, gold: str) -> int:
    """Return 1 when ``pred`` and ``gold`` are equal after stripping, else 0."""
    identical = pred.strip() == gold.strip()
    return 1 if identical else 0


def python_is_valid(code: str) -> bool:
    """Report whether ``code`` parses as Python source.

    The text is handed to ``ast.parse``; any exception raised while parsing
    is treated as "not valid". Nothing is executed.
    """
    try:
        ast.parse(code)
    except Exception:
        return False
    return True

@torch.no_grad()
def compute_perplexity(text: str, tokenizer, model, stride: int = 1024) -> float:
    """Compute the perplexity of ``text`` under a causal language model.

    The token sequence is processed in windows of at most ``stride`` tokens.
    Consecutive windows overlap by one token, so the first token of a window
    only serves as context and every token after the very first one is scored
    exactly once. The per-window mean loss returned by the model is weighted
    by the number of tokens it was averaged over before the final
    exponentiation.

    This replaces a masking scheme that set every label except the last one
    in each window to -100, which scored a single token per window.

    Args:
        text: Text to score (a single sequence).
        tokenizer: Callable returning a mapping with ``"input_ids"`` when
            invoked as ``tokenizer(text, return_tensors="pt")``.
        model: Causal LM called as ``model(input_ids, labels=...)``; it must
            expose ``.device`` and return an object with a ``.loss`` that is
            the mean next-token loss (labels shifted inside the model, as
            Hugging Face causal LMs do).
        stride: Maximum window length in tokens; must be at least 2.

    Returns:
        ``exp(mean negative log-likelihood)`` over the scored tokens, or NaN
        when the text encodes to fewer than two tokens (nothing to predict).

    Raises:
        ValueError: If ``stride`` is smaller than 2.
    """
    if stride < 2:
        raise ValueError("stride must be at least 2")

    enc = tokenizer(text, return_tensors="pt")
    input_ids = enc["input_ids"].to(model.device)
    seq_len = input_ids.size(1)
    if seq_len < 2:
        return float("nan")

    total_nll = 0.0
    total_scored = 0
    # Step by stride - 1 so each window starts on the last token of the
    # previous one; that token is then context only, never scored twice.
    for begin in range(0, seq_len - 1, stride - 1):
        end = min(begin + stride, seq_len)
        window = input_ids[:, begin:end]
        out = model(window, labels=window)
        # The model predicts tokens 1..len-1 of the window from their prefixes.
        n_scored = end - begin - 1
        total_nll += float(out.loss) * n_scored
        total_scored += n_scored

    mean_nll = total_nll / total_scored
    # torch.exp saturates to inf on overflow instead of raising.
    return float(torch.exp(torch.tensor(mean_nll, dtype=torch.float64)))


# Load the text-overlap metrics once at module level; eval_model reads these
# three objects for every sample it scores.
bleu_metric = evaluate.load("bleu")
sacrebleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [7]:
# Use CUDA with bfloat16 weights when a GPU is available, otherwise CPU with
# float32. NOTE(review): DEVICE is not referenced by the other visible cells;
# load_model relies on DTYPE and device_map instead.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

def load_model(model_id: str):
    """Load the tokenizer and causal LM for ``model_id``.

    The fast tokenizer is requested. If the tokenizer has no pad token id,
    the end-of-sequence id is reused so ``generate`` can be given a
    ``pad_token_id``. Weights are loaded in ``DTYPE``; with CUDA available
    the model is placed via ``device_map="auto"``, otherwise no device map
    is passed.

    Args:
        model_id: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``.
    """
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    if tok.pad_token_id is None:
        tok.pad_token_id = tok.eos_token_id

    # `torch_dtype` triggers a deprecation warning in the transformers release
    # this notebook installs ("Use `dtype` instead"), so pass `dtype`.
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=DTYPE,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    return tok, mdl

# Model identifier passed to load_model / eval_model for the baseline run.
QWEN_ID   = "Qwen/Qwen2.5-1.5B"



In [8]:

@torch.no_grad()
def generate_code(sample, tokenizer, model, max_tokens=256):
    """Generate a completion for one dataset record.

    The prompt is built with ``build_prompt``, tokenized, and passed to
    ``model.generate`` with sampling enabled (temperature 0.2, top-p 0.95).

    Only the newly generated tokens are decoded. The earlier approach decoded
    the whole sequence and removed the prompt with ``startswith``; whenever
    the decoded text did not reproduce the prompt character for character,
    the prompt itself ended up inside the returned "completion" and therefore
    inside every metric.

    Args:
        sample: Dataset record understood by ``build_prompt``.
        tokenizer: Tokenizer matching ``model``.
        model: Causal LM exposing ``generate`` and ``device``.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        ``(completion, prompt)`` where ``completion`` is the stripped text of
        the generated tokens and ``prompt`` is the exact prompt string used.
    """
    prompt = build_prompt(sample)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.2,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # For a decoder-only model the returned sequence is the prompt tokens
    # followed by the continuation, so drop the prompt by token count.
    prompt_len = inputs["input_ids"].shape[-1]
    new_tokens = outputs[0][prompt_len:]
    completion = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return completion, prompt


In [9]:
def eval_model(model_id: str, samples: List[Dict[str, Any]], compute_ppl=False):
    """Generate a completion for every sample with ``model_id`` and score it.

    For each sample the reference is ``sample["output"]["api_call"]``
    (stripped). The prediction from ``generate_code`` is scored with exact
    match, BLEU, SacreBLEU, ROUGE-L and a Python syntax check; when
    ``compute_ppl`` is true, the perplexity of ``prompt + prediction`` is
    computed with ``compute_perplexity`` as well.

    Args:
        model_id: Identifier handed to ``load_model``.
        samples: Dataset records to evaluate.
        compute_ppl: Whether to compute perplexity (off by default).

    Returns:
        A dict with ``"per_sample"`` (one dict per sample holding the prompt,
        gold, prediction and each metric) and ``"means"`` (the average of
        each metric plus ``"n"``, the number of samples). Perplexity entries
        are ``None`` when ``compute_ppl`` is false.
    """
    print(f"\n=== Loading {model_id} ===")
    tokenizer, model = load_model(model_id)

    def mean(xs):
        """Average the non-None entries of ``xs``; None if there are none."""
        xs = [x for x in xs if x is not None]
        return float(sum(xs) / len(xs)) if xs else None

    per_sample = []

    for ex in tqdm(samples, desc=f"Evaluating {model_id}"):
        gold = ex["output"]["api_call"].strip()

        pred, prompt = generate_code(ex, tokenizer, model)
        pred_str = pred.strip()

        em = exact_match(pred_str, gold)

        # An empty hypothesis has zero length, which can make the `bleu`
        # metric's brevity penalty divide by zero; score it as 0 instead of
        # letting one empty completion abort the whole evaluation.
        if pred_str:
            bleu = bleu_metric.compute(predictions=[pred_str], references=[[gold]])["bleu"]
        else:
            bleu = 0.0

        sacre = sacrebleu_metric.compute(predictions=[pred_str], references=[[gold]])["score"]

        rouge = rouge_metric.compute(predictions=[pred_str], references=[gold])
        rougeL = rouge.get("rougeL", 0.0)

        syntax_ok = int(python_is_valid(pred_str))

        ppl_val = None
        if compute_ppl:
            ppl_val = compute_perplexity(prompt + pred_str, tokenizer, model)

        per_sample.append({
            "prompt": prompt,
            "gold": gold,
            "pred": pred_str,
            "exact_match": em,
            "bleu": bleu,
            "sacrebleu": sacre,
            "rougeL": rougeL,
            "syntax": syntax_ok,
            "perplexity": ppl_val
        })

    def column(key):
        """Collect one metric across all evaluated samples."""
        return [row[key] for row in per_sample]

    means = {
        "exact_match": mean(column("exact_match")),
        "bleu": mean(column("bleu")),
        "sacrebleu": mean(column("sacrebleu")),
        "rougeL": mean(column("rougeL")),
        "syntax": mean(column("syntax")),
        "perplexity": mean(column("perplexity")) if compute_ppl else None,
        "n": len(samples)
    }

    return {"per_sample": per_sample, "means": means}


In [10]:

# Interactive Hugging Face Hub login; in a notebook this renders a widget
# asking for an access token.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [11]:
# Output folder for baseline evaluation artifacts, created next to the dataset
# (parents included; no error if it already exists).
BASE_OUT_DIR = Path(DATASET_DIR) / "baseline_eval"
BASE_OUT_DIR.mkdir(parents=True, exist_ok=True)

In [12]:

# Evaluate the baseline model on the sampled rows (compute_ppl keeps its
# default of False, so perplexity is not computed).
qwen_results = eval_model(QWEN_ID, eval_samples)
# Persist per-sample results and means as JSON in a per-model subfolder.
QWEN_DIR = BASE_OUT_DIR / "qwen2.5_1.5b"
QWEN_DIR.mkdir(exist_ok=True)
with open(QWEN_DIR / "results.json", "w", encoding="utf-8") as f:
    json.dump(qwen_results, f, ensure_ascii=False, indent=2)
print("Qwen results saved:", QWEN_DIR)


=== Loading Qwen/Qwen2.5-1.5B ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Evaluating Qwen/Qwen2.5-1.5B: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [03:24<00:00, 20.42s/it]


Qwen results saved: /content/drive/MyDrive/API-Pack-Dataset/baseline_eval/qwen2.5_1.5b


 "means": {
    "exact_match": 0.0,
    "bleu": 0.00717109668761752,
    "sacrebleu": 1.0018421409383476,
    "rougeL": 0.08370810548093814,
    "syntax": 0.2,
    "perplexity": null,
    "n": 10

In [None]:
# RESULTS
#
# QWEN (Qwen/Qwen2.5-1.5B), means over the 10 sampled Python examples:
#
#   "means": {
#     "exact_match": 0.0,
#     "bleu": 0.00717109668761752,
#     "sacrebleu": 1.0018421409383476,
#     "rougeL": 0.08370810548093814,
#     "syntax": 0.2,
#     "perplexity": null,
#     "n": 10
#   }

In [13]:
# Derive the output folder from the configured dataset root instead of
# repeating the absolute Drive path, so the two cannot drift apart.
SAVE_DIR = os.path.join(DATASET_DIR, "baseline_eval")
os.makedirs(SAVE_DIR, exist_ok=True)

# Name the file after the number of samples actually drawn; a hard-coded "10"
# would be wrong if K changes or fewer Python rows were available.
eval_path = os.path.join(SAVE_DIR, f"eval_samples_python_{len(eval_samples)}.json")

# Store the exact evaluation set so later runs can be scored on the same rows.
with open(eval_path, "w", encoding="utf-8") as f:
    json.dump(eval_samples, f, indent=2, ensure_ascii=False)

print("Saved eval sample set:", eval_path)


Saved eval sample set: /content/drive/MyDrive/API-Pack-Dataset/baseline_eval/eval_samples_python_10.json


In [14]:
from IPython.display import Markdown, display

def show_examples(tag: str, results_obj: Dict[str, Any], k=3):
    """Render the first ``k`` evaluated samples as Markdown.

    Each example shows the input prompt, the reference API call, the model
    output and a metrics line; perplexity is appended only when it is not
    ``None``.

    Code fences are sized per field: a fence is three backticks unless the
    content itself contains a run of three or more, in which case it is one
    backtick longer than the longest run. Model outputs that include their
    own ``` fences would otherwise close the block early and garble the
    rest of the rendering.

    Args:
        tag: Label placed in each example heading.
        results_obj: Result dict with a ``"per_sample"`` list as produced by
            ``eval_model``.
        k: Number of leading examples to display.
    """
    def fence_for(text: str) -> str:
        """Return a backtick fence that cannot be closed by ``text``."""
        longest = run = 0
        for ch in text:
            run = run + 1 if ch == "`" else 0
            longest = max(longest, run)
        return "`" * max(3, longest + 1)

    for i, row in enumerate(results_obj["per_sample"][:k], start=1):
        prompt_fence = fence_for(row["prompt"])
        gold_fence = fence_for(row["gold"])
        pred_fence = fence_for(row["pred"])

        md = (
            f"# üîπ {tag} Example {i}\n\n"
            f"## INPUT PROMPT\n"
            + prompt_fence + "\n" + row["prompt"] + "\n" + prompt_fence + "\n\n"
            + f"## Actual (API Call)\n"
            + gold_fence + "python\n" + row["gold"] + "\n" + gold_fence + "\n\n"
            + f"## MODEL OUTPUT\n"
            + pred_fence + "python\n" + row["pred"] + "\n" + pred_fence + "\n"
            + f"**Metrics:** EM={row['exact_match']} | BLEU={row['bleu']:.4f} | SacreBLEU={row['sacrebleu']:.2f} | "
            + f"ROUGE-L={row['rougeL']:.4f} | Syntax={row['syntax']} "
            + (f"| PPL={row['perplexity']:.2f}" if row["perplexity"] is not None else "")
        )
        display(Markdown(md))

# Display the first three evaluated samples for a qualitative look at the outputs.
show_examples("Qwen2.5-1.5B", qwen_results, k=3)



# üîπ Qwen2.5-1.5B Example 1

## INPUT PROMPT
```
You are an API client code generator. Your job is to output ONLY a valid Python code snippet that calls the given API. No explanations. No comments. No markdown. No extra text. Only raw code.

### USER REQUEST:
I'd like to ensure that the vehicle I'm about to purchase has a valid insurance certificate. Could you please help me verify this information using the Ministry of Road Transport and Highways' Vehicle Insurance Certificate API (vhinsc)?

### ENDPOINT:
/vhinsc/certificate

### DESCRIPTION:
API to verify Vehicle Insurance Certificate.

### PARAMETERS:
{}

### Generate ONLY the Python API call code:

```

## Actual (API Call)
```python
import http.client

conn = http.client.HTTPSConnection("apisetu.gov.in")

payload = "{\"certificateParameters\":{\"FullName\":\"Sunil Kumar\",\"UID\":\"123412341234\",\"chasis_no\":\"MA3EJKD1S00A06535\",\"reg_no\":\"DL01AA0101\",\"swd_name\":\"Veer Pratap Singh\"},\"consentArtifact\":{\"consent\":{\"consentId\":\"ea9c43aa-7f5a-4bf3-a0be-e1caa24737ba\",\"data\":{\"id\":\"string\"},\"dataConsumer\":{\"id\":\"string\"},\"dataProvider\":{\"id\":\"string\"},\"permission\":{\"access\":\"string\",\"dateRange\":{\"from\":\"2019-08-24T14:15:22Z\",\"to\":\"2019-08-24T14:15:22Z\"},\"frequency\":{\"repeats\":0,\"unit\":\"string\",\"value\":0}},\"purpose\":{\"description\":\"string\"},\"timestamp\":\"2019-08-24T14:15:22Z\",\"user\":{\"email\":\"string\",\"idNumber\":\"string\",\"idType\":\"string\",\"mobile\":\"string\"}},\"signature\":{\"signature\":\"string\"}},\"format\":\"pdf\",\"txnId\":\"f7f1469c-29b0-4325-9dfc-c567200a70f7\"}"

headers = {
    'X-APISETU-APIKEY': "REPLACE_KEY_VALUE",
    'content-type': "application/json"
    }

conn.request("POST", "/transport/v3/vhinsc/certificate", payload, headers)

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))
```

## MODEL OUTPUT
```python
vhinsc.verify_certificate()
```
**Metrics:** EM=0 | BLEU=0.0000 | SacreBLEU=0.00 | ROUGE-L=0.0305 | Syntax=1 

# üîπ Qwen2.5-1.5B Example 2

## INPUT PROMPT
```
You are an API client code generator. Your job is to output ONLY a valid Python code snippet that calls the given API. No explanations. No comments. No markdown. No extra text. Only raw code.

### USER REQUEST:
Whenever I need to access metadata on different editorial programme versions for a specific broadcast, I can use the BBC Nitro API with the `listVersions` endpoint. By using this endpoint, I can easily retrieve information related to original, signed, audio-described, and other versions of a programme. This functionality is crucial when working with various programme presentations, whether it's for designing a linear broadcast or setting up an on-demand service.

### ENDPOINT:
/versions

### DESCRIPTION:
The versions feed exposes editorial "Versions" of programmes. These are concepts used to capture different presentations of an overall programme: for example, versions of a programme may include one with sign language, one with audio description, one edited for content and more. Versions are also important to understand for broadcasts: a linear broadcast or an ondemand is always of a specific version, not merely of a programme.

### PARAMETERS:
{}

### Generate ONLY the Python API call code:

```

## Actual (API Call)
```python
import http.client

conn = http.client.HTTPSConnection("programmes.api.bbc.com")

conn.request("GET", "/versions?availability=SOME_ARRAY_VALUE&descendants_of=SOME_ARRAY_VALUE&media_set=SOME_ARRAY_VALUE&page=SOME_INTEGER_VALUE&page_size=SOME_INTEGER_VALUE&partner_id=SOME_ARRAY_VALUE&partner_pid=SOME_ARRAY_VALUE&payment_type=SOME_ARRAY_VALUE&pid=SOME_ARRAY_VALUE&embargoed=SOME_STRING_VALUE")

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))
```

## MODEL OUTPUT
```python
```python
response = requests.get('https://api.bbc.co.uk/nitro/v1/versions')
```
```
**Metrics:** EM=0 | BLEU=0.0019 | SacreBLEU=0.19 | ROUGE-L=0.0714 | Syntax=0 

# üîπ Qwen2.5-1.5B Example 3

## INPUT PROMPT
```
You are an API client code generator. Your job is to output ONLY a valid Python code snippet that calls the given API. No explanations. No comments. No markdown. No extra text. Only raw code.

### USER REQUEST:
I want to create or update the launch configuration for my application using AWS Server Migration Service. Can you please help me craft a query to use the PutAppLaunchConfiguration endpoint effectively?

### ENDPOINT:
/#X-Amz-Target=AWSServerMigrationService_V2016_10_24.PutAppLaunchConfiguration

### DESCRIPTION:
Creates or updates the launch configuration for the specified application.

### PARAMETERS:
{}

### Generate ONLY the Python API call code:

```

## Actual (API Call)
```python
import http.client

conn = http.client.HTTPConnection("sms.")

payload = "{\"appId\":\"string\",\"roleName\":\"string\",\"autoLaunch\":true,\"serverGroupLaunchConfigurations\":[]}"

headers = {
    'X-Amz-Content-Sha256': "SOME_STRING_VALUE",
    'X-Amz-Date': "SOME_STRING_VALUE",
    'X-Amz-Algorithm': "SOME_STRING_VALUE",
    'X-Amz-Credential': "SOME_STRING_VALUE",
    'X-Amz-Security-Token': "SOME_STRING_VALUE",
    'X-Amz-Signature': "SOME_STRING_VALUE",
    'X-Amz-SignedHeaders': "SOME_STRING_VALUE",
    'X-Amz-Target': "SOME_STRING_VALUE",
    'Authorization': "REPLACE_KEY_VALUE",
    'content-type': "application/json"
    }

conn.request("POST", "%7Bregion%7D.amazonaws.com/", payload, headers)

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))
```

## MODEL OUTPUT
```python
```python
import boto3

def put_app_launch_configuration(app_id, launch_configuration):
    client = boto3.client('servermigrationservice')
    response = client.put_app_launch_configuration(
        AppId=app_id,
        LaunchConfiguration=launch_configuration
    )
    return response
```

### END OF CODE
This code snippet uses the `boto3` library to make a request to the `PutAppLaunchConfiguration` endpoint of the AWS Server Migration Service API. It takes the `app_id` and `launch_configuration` as input parameters and returns the response from the API call.
```
**Metrics:** EM=0 | BLEU=0.0000 | SacreBLEU=0.66 | ROUGE-L=0.0592 | Syntax=0 