# Drive Mounting

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Clone Dataset Repository

In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Install git-lfs (needed for large Hugging Face files)
!apt-get install git-lfs
!git lfs install

# Step 3: Clone the dataset repo from Hugging Face
!git clone https://huggingface.co/datasets/apipack/API-Pack-Dataset

# Step 4: Move dataset to Google Drive
import shutil

src = '/content/API-Pack-Dataset'
dst = '/content/drive/MyDrive/API-Pack-Dataset'

shutil.move(src, dst)

print("✅ Dataset successfully saved to your Google Drive at:", dst)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Git LFS initialized.
Cloning into 'API-Pack-Dataset'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 35 (delta 3), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (35/35), 9.46 KiB | 1.35 MiB/s, done.
Filtering content: 100% (10/10), 3.14 GiB | 100.36 MiB/s, done.


Error: Destination path '/content/drive/MyDrive/API-Pack-Dataset/API-Pack-Dataset' already exists

# Preprocessing level 1

In [None]:
import json
import glob
import os

# -----------------------------
# Paths
# -----------------------------
DATASET_DIR = "/content/drive/MyDrive/API-Pack-Dataset"
OUT_MAIN = f"{DATASET_DIR}/api_pack_starcoder_training.jsonl"

# Create output folder for subfiles
SUB_DIR = f"{DATASET_DIR}/langs"
os.makedirs(SUB_DIR, exist_ok=True)

# -----------------------------
# Collect raw files
# -----------------------------
files = glob.glob(f"{DATASET_DIR}/total_data_cleaned_*.json")
print("Found files:", files)


# -----------------------------
# Helper clean function
# -----------------------------
def clean_text(t):
    if t is None:
        return ""
    return t.strip()


# -----------------------------
# Convert a single example
# -----------------------------
def process_single_item(ex):
    api_call_data = ex.get("api_call_data", {})

    input_obj = {
        "instruction": clean_text(ex.get("instruction", "")),
        "lang": clean_text(api_call_data.get("lang", "")),
        "functionality": clean_text(api_call_data.get("functionality", "")),
        "api_arguments": api_call_data.get("api_arguments", {}),
        "description": clean_text(api_call_data.get("description", "")),
        "domain": api_call_data.get("domain", []),
        "path": clean_text(api_call_data.get("path", "")),
    }

    output_obj = {
        "api_call": clean_text(api_call_data.get("api_call", "")),
    }

    return {"input": input_obj, "output": output_obj}


# -----------------------------
# Process all and also group by lang
# -----------------------------
all_rows = []
lang_groups = {}  # dict: lang -> list of rows

for fpath in files:
    print("Processing:", fpath)

    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ex in data:
        row = process_single_item(ex)
        all_rows.append(row)

        lang = row["input"]["lang"]
        if lang not in lang_groups:
            lang_groups[lang] = []
        lang_groups[lang].append(row)


print("Total processed samples:", len(all_rows))


# -----------------------------
# Save the main combined training file
# -----------------------------
with open(OUT_MAIN, "w", encoding="utf-8") as f:
    for row in all_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved main training dataset →", OUT_MAIN)


# -----------------------------
# Save per-language subfiles
# -----------------------------
for lang, rows in lang_groups.items():
    safe_lang = lang.lower().replace(" ", "_")
    out_path = f"{SUB_DIR}/api_pack_starcoder_{safe_lang}.jsonl"

    with open(out_path, "w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Saved {lang} dataset → {out_path}")


Found files: ['/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_curl.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_go.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_java.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_javascript.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_libcurl.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_node.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_php.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_python.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_ruby.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_swift.json']
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_curl.json
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_go.json
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_java.json
Processing: /con

# Hugginface login

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Model Testing and baseline exploartion

In [None]:
# ============================================================
# 0. Install dependencies
# ============================================================
!pip install -q transformers accelerate sentencepiece sacrebleu


# ============================================================
# 1. Imports
# ============================================================
import json
import random
import ast
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sacrebleu.metrics import BLEU
from IPython.display import Markdown, display


# ============================================================
# 2. Load your preprocessed training data
# ============================================================
DATA_PATH = "/content/drive/MyDrive/API-Pack-Dataset/api_pack_starcoder_training.jsonl"

dataset = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            dataset.append(json.loads(line))

print(f"✅ Loaded {len(dataset)} samples")

# sample 10
random.seed(42)
samples = random.sample(dataset, min(10, len(dataset)))
print("✅ Selected 10 samples\n")


# ============================================================
# 3. Load GPT-2
# ============================================================
model_name = "gpt2"

print(f"⬇️ Loading model: {model_name} ...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# set pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

print("✅ GPT-2 loaded!\n")


# ============================================================
# 4. Build prompt using your exact input structure
# ============================================================
def build_prompt(sample):
    inp = sample["input"]

    return (
        f"You are an API client code generator. "
        f"Your job is to output ONLY a valid {inp['lang']} code snippet that calls the given API. "
        f"No explanations. No comments. No markdown. No extra text. Only raw code.\n\n"
        f"### USER REQUEST:\n{inp['instruction']}\n\n"
        f"### ENDPOINT:\n{inp['path']}\n\n"
        f"### DESCRIPTION:\n{inp['description']}\n\n"
        f"### PARAMETERS:\n{inp['api_arguments']}\n\n"
        f"### Generate ONLY the {inp['lang']} API call code:\n"
    )


# ============================================================
# 5. Generate completions
# ============================================================
def generate_code(sample, max_tokens=256):
    prompt = build_prompt(sample)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.2,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    completion = decoded[len(prompt):].strip()
    return completion


# ============================================================
# 6. Evaluation Metrics
# ============================================================

def check_syntax_correctness(code, lang="python"):
    """
    Check if the generated code is syntactically valid.
    Returns True if valid, False otherwise.
    """
    if lang.lower() == "python":
        try:
            ast.parse(code)
            return True
        except SyntaxError:
            return False
    else:
        # For non-Python languages, we do a basic check
        # You can extend this with appropriate parsers
        return True  # Assume valid for non-Python


def check_exact_match(prediction, reference):
    """
    Check if prediction exactly matches reference after normalization.
    """
    pred_normalized = prediction.strip().replace(" ", "").replace("\n", "").replace("\t", "")
    ref_normalized = reference.strip().replace(" ", "").replace("\n", "").replace("\t", "")
    return pred_normalized == ref_normalized


def compute_sacrebleu(predictions, references):
    """
    Compute SacreBLEU score.
    predictions: list of predicted strings
    references: list of reference strings (each reference is a list for multi-ref, here single ref)
    """
    bleu = BLEU()
    # SacreBLEU expects references as list of lists
    refs = [[ref] for ref in references]
    score = bleu.corpus_score(predictions, list(zip(*refs)))
    return score.score


def compute_individual_sacrebleu(prediction, reference):
    """
    Compute SacreBLEU for a single prediction-reference pair.
    """
    bleu = BLEU()
    score = bleu.sentence_score(prediction, [reference])
    return score.score


# ============================================================
# 7. Run Evaluation
# ============================================================
predictions = []
references = []
syntax_results = []
exact_match_results = []

print("=" * 70)
print("BASELINE EVALUATION: GPT-2 (Zero-Shot)")
print("=" * 70 + "\n")

for idx, sample in enumerate(samples):
    # Generate prediction
    completion = generate_code(sample)

    # Get ground truth (extract api_call from output dict if needed)
    ground_truth_raw = sample["output"]
    if isinstance(ground_truth_raw, dict):
        ground_truth = ground_truth_raw.get("api_call", str(ground_truth_raw))
    else:
        ground_truth = str(ground_truth_raw)
    lang = sample["input"].get("lang", "python")

    # Store for corpus-level metrics
    predictions.append(completion)
    references.append(ground_truth)

    # Compute individual metrics
    syntax_valid = check_syntax_correctness(completion, lang)
    exact_match = check_exact_match(completion, ground_truth)
    individual_bleu = compute_individual_sacrebleu(completion, ground_truth)

    syntax_results.append(syntax_valid)
    exact_match_results.append(exact_match)

    # Display results
    input_json = json.dumps(sample["input"], indent=2, ensure_ascii=False)

    md = (
        f"# 🔹 Baseline Example {idx+1}\n\n"
        f"## 📝 INPUT\n"
        "```json\n" + input_json + "\n```\n\n"
        f"## 🤖 MODEL OUTPUT (GPT-2)\n"
        "```python\n" + completion + "\n```\n\n"
        f"## ✅ GROUND TRUTH\n"
        "```python\n" + ground_truth + "\n```\n\n"
        f"## 📊 METRICS\n"
        f"| Metric | Value |\n"
        f"|--------|-------|\n"
        f"| SacreBLEU | {individual_bleu:.2f} |\n"
        f"| Syntax Correctness | {'✅ Valid' if syntax_valid else '❌ Invalid'} |\n"
        f"| Exact Match | {'✅ Yes' if exact_match else '❌ No'} |\n\n"
        "---\n"
    )
    display(Markdown(md))


# ============================================================
# 8. Aggregate Metrics
# ============================================================
corpus_sacrebleu = compute_sacrebleu(predictions, references)
syntax_correctness_rate = sum(syntax_results) / len(syntax_results)
exact_match_rate = sum(exact_match_results) / len(exact_match_results)

print("\n" + "=" * 70)
print("AGGREGATE METRICS (GPT-2 Zero-Shot Baseline)")
print("=" * 70)
print(f"\n📊 SacreBLEU (Corpus):      {corpus_sacrebleu:.2f}")
print(f"📊 Syntax Correctness:      {syntax_correctness_rate:.2%} ({sum(syntax_results)}/{len(syntax_results)})")
print(f"📊 Exact Match:             {exact_match_rate:.2%} ({sum(exact_match_results)}/{len(exact_match_results)})")
print("\n" + "=" * 70)


# ============================================================
# 9. Summary Table
# ============================================================
summary_md = f"""
# 📈 GPT-2 Baseline Summary

| Metric | Score |
|--------|-------|
| **SacreBLEU** | {corpus_sacrebleu:.2f} |
| **Syntax Correctness** | {syntax_correctness_rate:.2%} |
| **Exact Match** | {exact_match_rate:.2%} |

**Total Samples Evaluated:** {len(samples)}
"""
display(Markdown(summary_md))

✅ Loaded 1014093 samples
✅ Selected 10 samples

⬇️ Loading model: gpt2 ...
✅ GPT-2 loaded!

BASELINE EVALUATION: GPT-2 (Zero-Shot)





# 🔹 Baseline Example 1

## 📝 INPUT
```json
{
  "instruction": "I'd like to unassign certain companies from a specific shared catalog using the Magento B2B API. Could you please guide me on how to use the sharedCatalogCompanyManagementV1UnassignCompaniesPost endpoint for this purpose?",
  "lang": "PHP",
  "functionality": "sharedCatalog/{sharedCatalogId}/unassignCompanies",
  "api_arguments": {},
  "description": "Unassign companies from a shared catalog.",
  "domain": [
    "sharedCatalog/{sharedCatalogId}/unassignCompanies"
  ],
  "path": "/V1/sharedCatalog/{sharedCatalogId}/unassignCompanies"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
$api = new API();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api->execute();

$api
```

## ✅ GROUND TRUTH
```python
<?php

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://example.com/rest/default/V1/sharedCatalog/%7BsharedCatalogId%7D/unassignCompanies",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => "{\"companies\":[{\"city\":\"string\",\"comment\":\"string\",\"company_email\":\"string\",\"company_name\":\"string\",\"country_id\":\"string\",\"customer_group_id\":0,\"extension_attributes\":{\"applicable_payment_method\":0,\"available_payment_methods\":\"string\",\"quote_config\":{\"company_id\":\"string\",\"extension_attributes\":{},\"is_quote_enabled\":true},\"use_config_settings\":0},\"id\":0,\"legal_name\":\"string\",\"postcode\":\"string\",\"region\":\"string\",\"region_id\":\"string\",\"reject_reason\":\"string\",\"rejected_at\":\"string\",\"reseller_id\":\"string\",\"sales_representative_id\":0,\"status\":0,\"street\":[\"string\"],\"super_user_id\":0,\"telephone\":\"string\",\"vat_tax_id\":\"string\"}]}",
  CURLOPT_HTTPHEADER => [
    "content-type: application/json"
  ],
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.58 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 2

## 📝 INPUT
```json
{
  "instruction": "I want to update specific attributes of a rack in NetBox API, and I believe the dcim_racks_partial_update endpoint is the best fit for my needs. Can you provide an example of how to effectively use this endpoint to modify the name and size of an existing rack?",
  "lang": "go",
  "functionality": "",
  "api_arguments": {},
  "description": "",
  "domain": [
    "dcim"
  ],
  "path": "/dcim/racks/{id}/"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/racks/{id}/

go get github.com/mikey/dcim/
```

## ✅ GROUND TRUTH
```python
package main

import (
	"fmt"
	"strings"
	"net/http"
	"io/ioutil"
)

func main() {

	url := "https://netboxdemo.com/api/dcim/racks/%7Bid%7D/"

	payload := strings.NewReader("{\"asset_tag\":\"string\",\"comments\":\"string\",\"custom_fields\":{},\"desc_units\":true,\"facility_id\":\"string\",\"group\":0,\"name\":\"string\",\"outer_depth\":32767,\"outer_unit\":\"mm\",\"outer_width\":32767,\"role\":0,\"serial\":\"string\",\"site\":0,\"status\":\"reserved\",\"tags\":[\"string\"],\"tenant\":0,\"type\":\"2-post-frame\",\"u_height\":1,\"width\":10}")

	req, _ := http.NewRequest("PATCH", url, payload)

	req.Header.Add("Authorization", "REPLACE_KEY_VALUE")
	req.Header.Add("content-type", "application/json")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)

	fmt.Println(res)
	fmt.Println(string(body))

}
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 1.26 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 3

## 📝 INPUT
```json
{
  "instruction": "I'd like to access comprehensive NBA data for a specific franchise spanning from the 1977 season up to 2022. Could you guide me on how to accomplish this using the Sports-Get 1977 2022 Nba Team Rosters And Schedules-API?",
  "lang": "cURL",
  "functionality": "get_specific_nba_franchise_data",
  "api_arguments": {},
  "description": "Grabs all NBA Data for a Franchise from 1977(if applicable)-2022",
  "domain": [],
  "path": "/elements/{franchise}"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only the cURL API call code:

/elements/{franchise}

### Generate only
```

## ✅ GROUND TRUTH
```python
curl --request GET \
  --url 'https//1977-2022-nba-team-rosters-and-schedules.p.rapidapi.com/elements/%7Bfranchise%7D?franchise=SOME_STRING_VALUE' \
  --header 'X-RapidAPI-Host: SOME_STRING_VALUE' \
  --header 'X-RapidAPI-Key: SOME_STRING_VALUE'
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.94 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 4

## 📝 INPUT
```json
{
  "instruction": "Could you show me how to use the Dragon Cave Lineage DB API to fetch all the lineages related to a particular dragon's code(s)?",
  "lang": "Python",
  "functionality": "Retrieves all lineages by dragon's code(s)",
  "api_arguments": {},
  "description": "",
  "domain": [
    "Searches"
  ],
  "path": "/lineages/search/codes"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.py

python api.
```

## ✅ GROUND TRUTH
```python
import http.client

conn = http.client.HTTPSConnection("virtserver.swaggerhub.com")

conn.request("GET", "/brittaniSavery/DCLineageDatabase/1.0/lineages/search/codes?gender=SOME_STRING_VALUE&code=SOME_STRING_VALUE")

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.36 |
| Syntax Correctness | ❌ Invalid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 5

## 📝 INPUT
```json
{
  "instruction": "To obtain the Uthmani simple script (without tashkiq/diacritical marks) of a specific ayah, a user can utilize the \"uthmani\\_simple\\_text\" functionality provided by the Other-Quran Com-API as follows:\n\n- verse_key: The key identifying the desired ayah.\n\nFor example, if a user wishes to get the Uthmani script of ayah 2:177, they would use:\n\n`verse_key=2:177`\n\nThis query will return the Uthmani simple script (without tashkiq/diacritical marks) for ayah 2:177. Optional query strings like `rub_el_hizb_number`, `chapter_number`, `page_number`, `juz_number`, or `hizb_number` can be used to filter the results based on the desired Quranic division if needed. They should be left blank if the user wants to retrieve the script for the whole Quran.",
  "lang": "Java",
  "functionality": "uthmani_simple_text",
  "api_arguments": {},
  "description": "Get Uthmani simple script(without tashkiq/diacritical marks) of ayah. Use query strings to filter results, leave all query string blank if you want to fetch script of whole Quran.\"\nverse_key: If you want to get Uthmani script of a specific ayah.\n    rub_el_hizb_number: If you want to get Uthmani script of a specific Rub el Hizb.\n    chapter_number: If you want to get Uthmani script of a specific surah.\n    page_number: If you want to get Uthmani script of a Madani Muhsaf page\n    juz_number: If you want to get Uthmani script of a specific juz.\n    hizb_number: If you want to get Uthmani script of a specific hizb.",
  "domain": [],
  "path": "/quran/verses/uthmani_simple"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
/quran/verses/uthmani_simple

### GET:

/quran/verses/uthmani_simple

### POST:

/quran/verses/uthmani_simple

### DELETE:

/quran/verses/uthmani_simple

### END:

/quran/verses/uthmani_simple

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###

###
```

## ✅ GROUND TRUTH
```python
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https//quran-com.p.rapidapi.com/quran/verses/uthmani_simple?verse_key=SOME_STRING_VALUE&rub_el_hizb_number=SOME_STRING_VALUE&chapter_number=SOME_STRING_VALUE&page_number=SOME_STRING_VALUE&juz_number=SOME_STRING_VALUE&hizb_number=SOME_STRING_VALUE")
  .get()
  .addHeader("X-RapidAPI-Key", "SOME_STRING_VALUE")
  .addHeader("X-RapidAPI-Host", "SOME_STRING_VALUE")
  .build();

Response response = client.newCall(request).execute();
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 3.29 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 6

## 📝 INPUT
```json
{
  "instruction": "I want to delete a partner integration from my Amazon Redshift cluster, but I don't want the data flow to stop immediately. Please help me craft a query to use the GET_DeletePartner API and ensure the integration deletion process begins while maintaining the current data flow.",
  "lang": "Java",
  "functionality": "",
  "api_arguments": {},
  "description": "Deletes a partner integration from a cluster. Data can still flow to the cluster until the integration is deleted at the partner's website.",
  "domain": [],
  "path": "/#Action=DeletePartner"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#

#
```

## ✅ GROUND TRUTH
```python
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("http://redshift./%7Bregion%7D.amazonaws.com/?AccountId=SOME_STRING_VALUE&ClusterIdentifier=SOME_STRING_VALUE&DatabaseName=SOME_STRING_VALUE&PartnerName=SOME_STRING_VALUE&Action=SOME_STRING_VALUE&Version=SOME_STRING_VALUE#Action=DeletePartner")
  .get()
  .addHeader("X-Amz-Content-Sha256", "SOME_STRING_VALUE")
  .addHeader("X-Amz-Date", "SOME_STRING_VALUE")
  .addHeader("X-Amz-Algorithm", "SOME_STRING_VALUE")
  .addHeader("X-Amz-Credential", "SOME_STRING_VALUE")
  .addHeader("X-Amz-Security-Token", "SOME_STRING_VALUE")
  .addHeader("X-Amz-Signature", "SOME_STRING_VALUE")
  .addHeader("X-Amz-SignedHeaders", "SOME_STRING_VALUE")
  .addHeader("Authorization", "REPLACE_KEY_VALUE")
  .build();

Response response = client.newCall(request).execute();
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.07 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 7

## 📝 INPUT
```json
{
  "instruction": "I want to check the purchase and consumption status of a specific in-app item using the Google Play Developer API.",
  "lang": "Java",
  "functionality": "",
  "api_arguments": {},
  "description": "Checks the purchase and consumption status of an inapp item.",
  "domain": [
    "purchases"
  ],
  "path": "/{packageName}/purchases/products/{productId}/tokens/{token}"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
$api = new GooglePlayAPI(); $api->get( 'productId' ); $api->get( 'token' ); $api->get( 'productId' ); $api->get( 'token' ); $api->get( 'productId' ); $api->get( 'token' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' ); $api->get( 'productId' );
```

## ✅ GROUND TRUTH
```python
OkHttpClient client = new OkHttpClient();

Request request = new Request.Builder()
  .url("https://www.googleapis.com/androidpublisher/v2/applications/%7BpackageName%7D/purchases/products/%7BproductId%7D/tokens/%7Btoken%7D?alt=SOME_STRING_VALUE&fields=SOME_STRING_VALUE&key=SOME_STRING_VALUE&oauth_token=SOME_STRING_VALUE&prettyPrint=SOME_BOOLEAN_VALUE&quotaUser=SOME_STRING_VALUE&userIp=SOME_STRING_VALUE")
  .get()
  .addHeader("Authorization", "Bearer REPLACE_BEARER_TOKEN")
  .build();

Response response = client.newCall(request).execute();
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 1.38 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 8

## 📝 INPUT
```json
{
  "instruction": "I'd like to know how to refresh the exit routine of specific health checks using the GDPS REST API V4R6Metro. In the request, please include the health check details such as names or IDs. The API response will include the updated *rc* and *message* attributes.",
  "lang": "go",
  "functionality": "Refresh Exit routine of the given health check(s)",
  "api_arguments": {},
  "description": "The *rc* and *message* attributes are only present in **v2** version",
  "domain": [
    "health_checks"
  ],
  "path": "/health_checks/install"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
go get -u github.com/mike-j.davies/go-api/api/v4r6-health_checks

go get -u github.com/mike-j.davies/go-api/api/v4r6-health_checks/v4r6-health_check

go get -u github.com/mike-j.davies/go-api/api/v4r6-health_checks/v4r6-health_check/v4r6-health_check

go get -u github.com/mike-j.davies/go-api/api/v4r6-health_checks/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_check/v4r6-health_
```

## ✅ GROUND TRUTH
```python
package main

import (
	"fmt"
	"net/http"
	"io/ioutil"
)

func main() {

	url := "https:///%3Cgdpsdomain%3E:%3Cport%3E/org.ibm.gdps/rest/v2/health_checks/install?hc_names=SOME_STRING_VALUE"

	req, _ := http.NewRequest("POST", url, nil)

	req.Header.Add("domain", "SOME_STRING_VALUE")
	req.Header.Add("Authorization", "Basic REPLACE_BASIC_AUTH")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)

	fmt.Println(res)
	fmt.Println(string(body))

}
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 2.16 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 9

## 📝 INPUT
```json
{
  "instruction": "How can I retrieve a specific freetext template using the Template Service API Specification? Please provide instructions on making a request with an example ID or name.",
  "lang": "Python",
  "functionality": "Get a freetext template",
  "api_arguments": {},
  "description": "",
  "domain": [
    "freetext"
  ],
  "path": "/templates/freetext/{freetextTemplateName}"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Generate the Python API call code:

# Gener
```

## ✅ GROUND TRUTH
```python
import http.client

conn = http.client.HTTPConnection("localhost:8080")

headers = { 'Authorization': "Bearer REPLACE_BEARER_TOKEN" }

conn.request("GET", "/templates/freetext/%7BfreetextTemplateName%7D", headers=headers)

res = conn.getresponse()
data = res.read()

print(data.decode("utf-8"))
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.23 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---




# 🔹 Baseline Example 10

## 📝 INPUT
```json
{
  "instruction": "Could you please guide me on how to use the Manajemen Asset API to retrieve a list of assets that meet certain conditions or, alternatively, match specific criteria? For example, I may want to select all assets that have a specific tag or belong to a specific category, but I also have the option to include assets that don't have a certain tag or don't belong to a certain category. How would I structure my query for such scenarios using the Select_Or_where endpoint of the Manajemen Asset API?",
  "lang": "go",
  "functionality": "Select_Or_where Asset",
  "api_arguments": {},
  "description": "",
  "domain": [],
  "path": "/v4/select_or_where/token/651bc3de9b493f4b9fe2485b/project/manajemen_asset/collection/asset/appid/{appid}/or_where_field/{or_where_field}/or_where_value/{or_where_value}/asset"
}
```

## 🤖 MODEL OUTPUT (GPT-2)
```python
go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-api/api/api/go/api/go.go

go get github.com/jamesj/go-
```

## ✅ GROUND TRUTH
```python
package main

import (
	"fmt"
	"net/http"
	"io/ioutil"
)

func main() {

	url := "https://io.etter.cloudundefined/v4/select_or_where/token/651bc3de9b493f4b9fe2485b/project/manajemen_asset/collection/asset/appid/%7Bappid%7D/or_where_field/%7Bor_where_field%7D/or_where_value/%7Bor_where_value%7D/asset"

	req, _ := http.NewRequest("GET", url, nil)

	req.Header.Add("accept", "application/x-www-form-urlencoded")

	res, _ := http.DefaultClient.Do(req)

	defer res.Body.Close()
	body, _ := ioutil.ReadAll(res.Body)

	fmt.Println(res)
	fmt.Println(string(body))

}
```

## 📊 METRICS
| Metric | Value |
|--------|-------|
| SacreBLEU | 0.43 |
| Syntax Correctness | ✅ Valid |
| Exact Match | ❌ No |

---



AGGREGATE METRICS (GPT-2 Zero-Shot Baseline)

📊 SacreBLEU (Corpus):      1.49
📊 Syntax Correctness:      90.00% (9/10)
📊 Exact Match:             0.00% (0/10)




# 📈 GPT-2 Baseline Summary

| Metric | Score |
|--------|-------|
| **SacreBLEU** | 1.49 |
| **Syntax Correctness** | 90.00% |
| **Exact Match** | 0.00% |

**Total Samples Evaluated:** 10


In [None]:
!pip install -q transformers accelerate peft datasets


# Model testing and Dataset exploration

In [None]:
import json
from datasets import Dataset

DATA_PATH = "/content/drive/MyDrive/API-Pack-Dataset/api_pack_starcoder_training.jsonl"

flat_rows = []

with open(DATA_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        line = line.strip()
        if not line:
            continue

        obj = json.loads(line)

        inp = obj.get("input", {})
        out = obj.get("output", {})

        instruction = str(inp.get("instruction", "") or "")
        lang        = str(inp.get("lang", "") or "Python")
        description = str(inp.get("description", "") or "")
        path        = str(inp.get("path", "") or "")
        api_call    = str(out.get("api_call", "") or "")

        api_args = inp.get("api_arguments", {})
        if not isinstance(api_args, dict):
            api_args = {}
        api_args_str = json.dumps(api_args)

        prompt = (
            f"You are an API client code generator. "
            f"You MUST output ONLY valid {lang} code. "
            f"No comments. No explanations. No markdown. Only raw code.\n\n"
            f"### USER REQUEST:\n{instruction}\n\n"
            f"### ENDPOINT PATH:\n{path}\n\n"
            f"### DESCRIPTION:\n{description}\n\n"
            f"### PARAMETERS:\n{api_args_str}\n\n"
            f"### OUTPUT:\n"
        )

        flat_rows.append({"text": prompt + api_call})

        if i % 200000 == 0:
            print("Processed:", i)

print("Loaded + flattened:", len(flat_rows))

raw_ds = Dataset.from_list(flat_rows)
raw_ds


Processed: 0
Processed: 200000
Processed: 400000
Processed: 600000
Processed: 800000
Processed: 1000000
Loaded + flattened: 1014093


Dataset({
    features: ['text'],
    num_rows: 1014093
})

In [None]:
DEBUG_SIZE = 50000  # start small; you can increase later or remove this

if len(raw_ds) > DEBUG_SIZE:
    train_ds = raw_ds.select(range(DEBUG_SIZE))
else:
    train_ds = raw_ds

train_ds


Dataset({
    features: ['text'],
    num_rows: 50000
})

In [None]:
train_ds = raw_ds


# Finetuning 50k Dry run

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "bigcode/starcoderbase-1b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("Model loaded on device:", model.device)


Model loaded on device: cuda:0


In [None]:
for name, module in model.named_modules():
    print(name)



transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.c_attn
transformer.h.0.attn.c_proj
transformer.h.0.attn.resid_dropout
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.c_attn
transformer.h.1.attn.c_proj
transformer.h.1.attn.resid_dropout
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.act
transformer.h.1.mlp.dropout
transformer.h.2
transformer.h.2.ln_1
transformer.h.2.attn
transformer.h.2.attn.c_attn
transformer.h.2.attn.c_proj
transformer.h.2.attn.resid_dropout
transformer.h.2.ln_2
transformer.h.2.mlp
transformer.h.2.mlp.c_fc
transformer.h.2.mlp.c_proj
transformer.h.2.mlp.act
transformer.h.2.mlp.dropout
transformer.h.3
transformer.h.3.ln_1
trans

In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj", "c_fc"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 11,108,352 || all params: 1,148,315,648 || trainable%: 0.9674


In [None]:
MAX_LENGTH = 1024  # later you can bump to 1024 etc.

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )

tokenized_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
tokenized_ds.set_format(type="torch")

print(tokenized_ds)
print(tokenized_ds[0].keys())


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50000
})
dict_keys(['input_ids', 'attention_mask'])


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False    # important! We're training a Causal LM, NOT masked LM
)

training_args = TrainingArguments(
    output_dir="/content/starcoder1b_lora_api_pack_debug",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=50,
    max_steps=500,           # small test run; increase later
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_bf16_supported(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
!pip install -q bitsandbytes
import bitsandbytes as bnb
print("bitsandbytes version:", bnb.__version__)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hbitsandbytes version: 0.48.2


In [None]:
# ============================================================
# ENABLE GRADIENT CHECKPOINTING (saves VRAM)
# ============================================================
model.gradient_checkpointing_enable()
model.enable_input_require_grads()


# ============================================================
# DATA COLLATOR
# ============================================================
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


# ============================================================
# TRAINING ARGUMENTS FOR FULL DATASET
# ============================================================
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/starcoder1b_lora_full",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    num_train_epochs=2,
    logging_steps=50,
    save_steps=2000,
    save_total_limit=3,
    bf16=torch.cuda.is_available(),
    fp16=not torch.cuda.is_bf16_supported(),
    report_to="none",
    optim="paged_adamw_32bit",
)


# ============================================================
# TRAINER
# ============================================================
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()


# ============================================================
# SAVE LORA ADAPTER
# ============================================================
SAVE_DIR = "/content/starcoder1b_lora_full_final"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("🔥 Successfully trained on FULL DATASET!")
print("Saved to:", SAVE_DIR)


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
50,0.6478
100,0.6297
150,0.6654
200,0.6565
250,0.6372
300,0.6803
350,0.6472
400,0.6554
450,0.6661
500,0.6431


🔥 Successfully trained on FULL DATASET!
Saved to: /content/starcoder1b_lora_full_final


In [None]:
trainer.train()


Step,Training Loss
20,1.6162
40,0.9572
60,0.7775
80,0.6592
100,0.7219
120,0.6886
140,0.6551
160,0.644
180,0.5895
200,0.6642


TrainOutput(global_step=500, training_loss=0.685258050918579, metrics={'train_runtime': 300.0845, 'train_samples_per_second': 13.33, 'train_steps_per_second': 1.666, 'total_flos': 1.2667393671168e+16, 'train_loss': 0.685258050918579, 'epoch': 0.4})

In [None]:
SAVE_DIR = "/content/starcoder1b_lora_api_pack_debug_final"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("Saved LoRA adapter to:", SAVE_DIR)


Saved LoRA adapter to: /content/starcoder1b_lora_api_pack_debug_final


# Dry run successful

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_name = "bigcode/starcoderbase-1b"
lora_dir = "/content/starcoder1b_lora_api_pack_debug_final"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

model = PeftModel.from_pretrained(base, lora_dir)
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTBigCodeForCausalLM(
      (transformer): GPTBigCodeModel(
        (wte): Embedding(49152, 2048)
        (wpe): Embedding(8192, 2048)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPTBigCodeBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPTBigCodeAttention(
              (c_attn): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
 

# Define metrics function

In [None]:
!pip install -q tree_sitter==0.20.1 rouge-score nltk
!git clone https://github.com/microsoft/CodeXGLUE.git


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.2/126.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for tree_sitter (pyproject.toml) ... [?25l[?25hdone
Cloning into 'CodeXGLUE'...
remote: Enumerating objects: 3373, done.[K
remote: Counting objects: 100% (432/432), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 3373 (delta 372), reused 356 (delta 356), pack-reused 2941 (from 2)[K
Receiving objects: 100% (3373/3373), 213.15 MiB | 14.57 MiB/s, done.
Resolving deltas: 100% (1755/1755), done.
Updating files: 100% (400/400), done.


In [None]:
!mkdir -p /content/codebleu
!wget -q https://raw.githubusercontent.com/microsoft/CodeBERT/master/CodeBLEU/bleu.py -O /content/codebleu/bleu.py
!wget -q https://raw.githubusercontent.com/microsoft/CodeBERT/master/CodeBLEU/weighted_ngram_match.py -O /content/codebleu/weighted_ngram_match.py
!wget -q https://raw.githubusercontent.com/microsoft/CodeBERT/master/CodeBLEU/syntax_match.py -O /content/codebleu/syntax_match.py
!wget -q https://raw.githubusercontent.com/microsoft/CodeBERT/master/CodeBLEU/dataflow_match.py -O /content/codebleu/dataflow_match.py


In [None]:
import sys
sys.path.append("/content/codebleu")


In [None]:
from bleu import corpus_bleu
from weighted_ngram_match import weighted_ngram_match
from syntax_match import syntax_match
from dataflow_match import dataflow_match

print("🔥 CodeBLEU imported successfully!")


ImportError: cannot import name 'weighted_ngram_match' from 'weighted_ngram_match' (/content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/weighted_ngram_match.py)

In [None]:
!pip install evaluate sacrebleu rouge-score


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

In [None]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

BASE = "bigcode/starcoderbase-1b"
LORA = "/content/starcoder1b_lora_full_final"

tokenizer = AutoTokenizer.from_pretrained(BASE)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE,
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, LORA)
model.eval()
print("🔥 Model loaded!")


🔥 Model loaded!


In [None]:
import json, random

DATA_PATH = "/content/drive/MyDrive/API-Pack-Dataset/api_pack_starcoder_training.jsonl"

samples = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            samples.append(json.loads(line))

random.shuffle(samples)
eval_samples = samples[:100]   # evaluate on 100
print("Loaded eval samples:", len(eval_samples))


Loaded eval samples: 100


In [None]:
def build_prompt(example):
    inp = example["input"]
    api_args = inp.get("api_arguments", {})

    prompt = (
        f"You are an API client code generator. "
        f"You MUST output ONLY valid {inp['lang']} code. "
        f"No comments. No explanations. Only raw code.\n\n"
        f"### USER REQUEST:\n{inp['instruction']}\n\n"
        f"### ENDPOINT PATH:\n{inp['path']}\n\n"
        f"### DESCRIPTION:\n{inp['description']}\n\n"
        f"### PARAMETERS:\n{api_args}\n\n"
        f"### OUTPUT:\n"
    )
    return prompt


In [None]:
def generate_code(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            top_p=0.95,
            do_sample=False,
        )

    full = tokenizer.decode(output[0], skip_special_tokens=True)
    return full[len(prompt):].strip()


In [None]:
import evaluate

bleu_metric = evaluate.load("bleu")
sacrebleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load("rouge")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
import difflib

def similarity_percentage(pred, gold):
    seq = difflib.SequenceMatcher(None, pred.strip(), gold.strip())
    return seq.ratio()      # Already a value between 0 and 1


In [None]:
import ast

def python_is_valid(code):
    try:
        ast.parse(code)
        return True
    except:
        return False


In [None]:
import math

def compute_perplexity(text):
    enc = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return math.exp(loss.item())


In [None]:
from statistics import mean

results = {
    "exact_match": [],
    "bleu": [],
    "sacrebleu": [],
    "rougeL": [],
    "perplexity": [],
    "syntax": []
}

for ex in eval_samples:
    prompt = build_prompt(ex)
    pred = generate_code(prompt)
    gold = ex["output"]["api_call"]

    # Exact match
    results["exact_match"].append(exact_match(pred, gold))

    # BLEU
    results["bleu"].append(
        bleu_metric.compute(
            predictions=[pred],
            references=[[gold]]
        )["bleu"]
    )

    # SacreBLEU
    results["sacrebleu"].append(
        sacrebleu_metric.compute(
            predictions=[pred],
            references=[gold]
        )["score"]
    )

    # ROUGE-L
    results["rougeL"].append(
        rouge_metric.compute(
            predictions=[pred],
            references=[gold]
        )["rougeL"]
    )

    # Perplexity
    results["perplexity"].append(compute_perplexity(prompt + pred))

    # Syntax correctness (only Python supported)
    lang = ex["input"].get("lang", "Python")
    if lang == "Python":
        results["syntax"].append(int(python_is_valid(pred)))

print("🔥 Evaluation Complete!")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:

🔥 Evaluation Complete!


# Finetuning results 100k

In [None]:
print("\n========== FINAL METRICS ==========")
print("Exact Match:", mean(results["exact_match"]))
print("BLEU:", mean(results["bleu"]))
print("SacreBLEU:", mean(results["sacrebleu"]))
print("ROUGE-L:", mean(results["rougeL"]))
print("Perplexity:", mean(results["perplexity"]))
print("Python Syntax Correctness:", mean(results["syntax"]))



Exact Match: 0
BLEU: 0.18728886751550575
SacreBLEU: 18.91682139195097
ROUGE-L: 0.2837455799737819
Perplexity: 1.7350634128687479
Python Syntax Correctness: 0


In [None]:
from statistics import mean

results = {
    "exact_match": [],
    "bleu": [],
    "sacrebleu": [],
    "rougeL": [],
    "perplexity": [],
    "syntax": []
}

for ex in eval_samples:
    prompt = build_prompt(ex)
    pred = generate_code(prompt)
    gold = ex["output"]["api_call"]

    # Exact match
    results["exact_match"].append(exact_match(pred, gold))

    # BLEU
    results["bleu"].append(
        bleu_metric.compute(
            predictions=[pred],
            references=[[gold]]
        )["bleu"]
    )

    # SacreBLEU
    results["sacrebleu"].append(
        sacrebleu_metric.compute(
            predictions=[pred],
            references=[gold]
        )["score"]
    )

    # ROUGE-L
    results["rougeL"].append(
        rouge_metric.compute(
            predictions=[pred],
            references=[gold]
        )["rougeL"]
    )

    # Perplexity
    results["perplexity"].append(compute_perplexity(prompt + pred))

    # Syntax correctness (only Python supported)
    lang = ex["input"].get("lang", "Python")
    if lang == "Python":
        results["syntax"].append(int(python_is_valid(pred)))

print("🔥 Evaluation Complete!")

print("\n========== FINAL METRICS ==========")
print("Exact Match:", mean(results["exact_match"]))
print("BLEU:", mean(results["bleu"]))
print("SacreBLEU:", mean(results["sacrebleu"]))
print("ROUGE-L:", mean(results["rougeL"]))
print("Perplexity:", mean(results["perplexity"]))
print("Python Syntax Correctness:", mean(results["syntax"]))



Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for o

🔥 Evaluation Complete!

Exact Match: 0
BLEU: 0.18728886751550575
SacreBLEU: 18.91682139195097
ROUGE-L: 0.2837455799737819
Perplexity: 1.7350634128687479
Python Syntax Correctness: 0


# Finetuning batchwise 100k each batch - 2 runs total

In [None]:
!pip install -q transformers peft accelerate bitsandbytes datasets


In [None]:
import json
from datasets import Dataset

DATA_PATH = "/content/drive/MyDrive/API-Pack-Dataset/api_pack_starcoder_training.jsonl"
BASE_MODEL = "bigcode/starcoderbase-1b"
LAST_LORA = "/content/starcoder1b_lora_full_final"   # previous result
SAVE_DIR = "/content/starcoder1b_lora_phase2"


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype="auto"
)

model = PeftModel.from_pretrained(base_model, LAST_LORA)
model.train()

print("🔥 Loaded base + LoRA checkpoint")


`torch_dtype` is deprecated! Use `dtype` instead!


🔥 Loaded base + LoRA checkpoint


In [None]:
rows = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            rows.append(json.loads(line))

print("Total rows:", len(rows))

# Select rows 100,000 → 200,000
start = 100_000
end = 200_000
phase2_rows = rows[start:end]

print("Phase-2 rows:", len(phase2_rows))


Total rows: 1014093
Phase-2 rows: 100000


In [None]:
def build_training_text(obj):
    inp = obj["input"]
    out = obj["output"]

    instruction = inp.get("instruction", "")
    lang        = inp.get("lang", "Python")
    description = inp.get("description", "")
    path        = inp.get("path", "")
    api_call    = out.get("api_call", "")
    api_args    = inp.get("api_arguments", {})

    prompt = (
        f"You are an API client code generator. "
        f"You MUST output ONLY valid {lang} code. "
        f"No comments. No explanations. Only raw code.\n\n"
        f"### USER REQUEST:\n{instruction}\n\n"
        f"### ENDPOINT PATH:\n{path}\n\n"
        f"### DESCRIPTION:\n{description}\n\n"
        f"### PARAMETERS:\n{api_args}\n\n"
        f"### OUTPUT:\n"
    )

    return {"text": prompt + api_call}

phase2_flat = [build_training_text(x) for x in phase2_rows]
phase2_ds = Dataset.from_list(phase2_flat)

print(phase2_ds)


Dataset({
    features: ['text'],
    num_rows: 100000
})


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=False,
        max_length=1024
    )

tokenized_ds = phase2_ds.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
    num_proc=4
)


Map (num_proc=4):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,        # lower LR for continued training
    warmup_ratio=0.05,
    num_train_epochs=1,        # only 1 epoch for incremental learning
    logging_steps=50,
    save_steps=2000,
    save_total_limit=2,
    bf16=True,
    optim="paged_adamw_32bit",
    report_to="none",
)


In [None]:
model.gradient_checkpointing_enable()
model.enable_input_require_grads()


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
)

trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
50,0.9684
100,0.9554
150,0.9633
200,0.9669
250,0.9441
300,0.9617
350,0.9366
400,0.9527
450,0.9687
500,0.9513


TrainOutput(global_step=6250, training_loss=0.9548125390625, metrics={'train_runtime': 6624.8043, 'train_samples_per_second': 15.095, 'train_steps_per_second': 0.943, 'total_flos': 2.7402411540860928e+17, 'train_loss': 0.9548125390625, 'epoch': 1.0})

In [None]:
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("🔥 PHASE-2 training complete!")
print("Saved updated LoRA to:", SAVE_DIR)


🔥 PHASE-2 training complete!
Saved updated LoRA to: /content/starcoder1b_lora_phase2


In [None]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

BASE = "bigcode/starcoderbase-1b"
LORA = "/content/starcoder1b_lora_phase2"

tokenizer = AutoTokenizer.from_pretrained(BASE)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE,
    torch_dtype=torch.float16,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, LORA)
model.eval()
print("🔥 Model loaded!")


🔥 Model loaded!


In [None]:
from statistics import mean

results = {
    "exact_match": [],
    "bleu": [],
    "sacrebleu": [],
    "rougeL": [],
    "perplexity": [],
    "syntax": []
}

for ex in eval_samples:
    prompt = build_prompt(ex)
    pred = generate_code(prompt)
    gold = ex["output"]["api_call"]

    # Exact match
    results["exact_match"].append(exact_match(pred, gold))

    # BLEU
    results["bleu"].append(
        bleu_metric.compute(
            predictions=[pred],
            references=[[gold]]
        )["bleu"]
    )

    # SacreBLEU
    results["sacrebleu"].append(
        sacrebleu_metric.compute(
            predictions=[pred],
            references=[gold]
        )["score"]
    )

    # ROUGE-L
    results["rougeL"].append(
        rouge_metric.compute(
            predictions=[pred],
            references=[gold]
        )["rougeL"]
    )

    # Perplexity
    results["perplexity"].append(compute_perplexity(prompt + pred))

    # Syntax correctness (only Python supported)
    lang = ex["input"].get("lang", "Python")
    if lang == "Python":
        results["syntax"].append(int(python_is_valid(pred)))

print("🔥 Evaluation Complete!")


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for o

🔥 Evaluation Complete!


In [None]:
print("\n========== FINAL METRICS PHASE 2 ==========")
print("Exact Match:", mean(results["exact_match"]))
print("BLEU:", mean(results["bleu"]))
print("SacreBLEU:", mean(results["sacrebleu"]))
print("ROUGE-L:", mean(results["rougeL"]))
print("Perplexity:", mean(results["perplexity"]))
print("Python Syntax Correctness:", mean(results["syntax"]))



Exact Match: 0
BLEU: 0.18728886751550575
SacreBLEU: 18.91682139195097
ROUGE-L: 0.2837455799737819
Perplexity: 1.7350634128687479
Python Syntax Correctness: 0


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import json
import evaluate
import ast

# ====== PATHS ======
BASE_MODEL = "bigcode/starcoderbase-1b"
LORA_CHECKPOINT = "/content/starcoder1b_lora_phase2/checkpoint-6250"   # <-- CHANGE to latest
TEST_DATA_PATH = "/content/drive/MyDrive/API-Pack-Dataset/api_pack_starcoder_training.jsonl"  # or any subset

# ====== Load tokenizer ======
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token

# ====== Load base model ======
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)

# ====== Load LoRA adapter ======
model = PeftModel.from_pretrained(base_model, LORA_CHECKPOINT)
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTBigCodeForCausalLM(
      (transformer): GPTBigCodeModel(
        (wte): Embedding(49152, 2048)
        (wpe): Embedding(8192, 2048)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPTBigCodeBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPTBigCodeAttention(
              (c_attn): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
 

In [None]:
# Load all rows
eval_rows = []
with open(TEST_DATA_PATH, "r") as f:
    for line in f:
        obj = json.loads(line)
        eval_rows.append(obj)

print("Total eval samples:", len(eval_rows))

# Select 100 random rows
random.seed(42)   # for reproducibility
eval_rows = random.sample(eval_rows, 100)

print("Using random rows:", len(eval_rows))

Total eval samples: 1014093
Using random rows: 100


In [None]:
import ast

def exact_match_percentage(pred, gold):
    return int(pred.strip() == gold.strip())

def syntax_correct_python(code):
    try:
        ast.parse(code)
        return 1
    except:
        return 0


In [None]:
predictions = []
references = []

for row in eval_rows:  # using your sampled 100 rows

    inp = row["input"]
    out = row["output"]

    instruction = inp.get("instruction", "")
    lang        = inp.get("lang", "Python")  # default
    description = inp.get("description", "")
    path        = inp.get("path", "")
    api_args    = inp.get("api_arguments", {})
    api_args_str = json.dumps(api_args)

    gold_api_call = out.get("api_call", "").strip()

    # 🔥 RECONSTRUCT THE TRAINING PROMPT EXACTLY AS BEFORE
    prompt = (
        f"You are an API client code generator. "
        f"You MUST output ONLY valid {lang} code. "
        f"No comments. No explanations. No markdown. Only raw code.\n\n"
        f"### USER REQUEST:\n{instruction}\n\n"
        f"### ENDPOINT PATH:\n{path}\n\n"
        f"### DESCRIPTION:\n{description}\n\n"
        f"### PARAMETERS:\n{api_args_str}\n\n"
        f"### OUTPUT:\n"
    )

    # ===== Model inference =====
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.1,
            do_sample=False
        )

    pred = tokenizer.decode(out[0], skip_special_tokens=True)
    pred = pred.replace(prompt, "").strip()

    predictions.append(pred)
    references.append(gold_api_call)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for ope

In [None]:
import evaluate

bleu = evaluate.load("bleu")
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# BLEU
bleu_score = bleu.compute(
    predictions=predictions,
    references=[[r] for r in references]
)["bleu"]

# SacreBLEU
sacre_score = sacrebleu.compute(
    predictions=predictions,
    references=references
)["score"]

# ROUGE-L
rouge_l = rouge.compute(
    predictions=predictions,
    references=references
)["rougeL"]

# Exact Match (%)
em_total = sum(exact_match_percentage(p, r) for p, r in zip(predictions, references))
em_percent = 100 * em_total / len(predictions)

# Approx Perplexity
import numpy as np
import math

def approx_ppl(preds, refs):
    ce_list = []
    for p, r in zip(preds, refs):
        ce = np.mean([1 if a != b else 0 for a, b in zip(p.split(), r.split())])
        ce_list.append(ce)
    return math.exp(np.mean(ce_list))

ppl = approx_ppl(predictions, references)

# Python syntax correctness (%)
syntax_ok = sum(syntax_correct_python(p) for p in predictions)
syntax_percent = 100 * syntax_ok / len(predictions)


In [None]:
print("\n========== FINAL TRAINED MODEL METRICS ==========")
print(f"Exact Match (%):          {em_percent:.2f}")
print(f"BLEU Score:               {bleu_score:.4f}")
print(f"SacreBLEU:                {sacre_score:.4f}")
print(f"ROUGE-L:                  {rouge_l:.4f}")
print(f"Approx Perplexity:        {ppl:.4f}")
print(f"Python Syntax Correct (%): {syntax_percent:.2f}")
print("=================================================\n")



Exact Match (%):          0.00
BLEU Score:               0.1997
SacreBLEU:                19.9673
ROUGE-L:                  0.3096
Approx Perplexity:        2.3720
Python Syntax Correct (%): 1.00

