In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import os
import shutil
import glob
import json
import random
import math
import difflib
import ast

In [None]:
os.system(
    "pip install -q transformers accelerate sentencepiece peft datasets "
    "bitsandbytes evaluate sacrebleu rouge-score"
)


0

In [None]:
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, PeftModel
import bitsandbytes as bnb
from IPython.display import Markdown, display
from statistics import mean
from tqdm.auto import tqdm
import evaluate

print("bitsandbytes version:", bnb.__version__)

bitsandbytes version: 0.48.2


In [None]:
DATASET_DIR = "/content/drive/MyDrive/API-Pack-Dataset"

In [None]:
print("Dataset already exists on Drive at:", DATASET_DIR)

✅ Dataset already exists on Drive at: /content/drive/MyDrive/API-Pack-Dataset


In [None]:
DATA_PATH = f"{DATASET_DIR}/api_pack_starcoder_training.jsonl"

dataset = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            dataset.append(json.loads(line))

print(f"Loaded {len(dataset)} samples")


random.seed(42)
samples = random.sample(dataset, min(10, len(dataset)))
print(" Selected", len(samples), "samples for baseline generation")

✅ Loaded 1014093 samples
✅ Selected 10 samples for baseline generation


In [None]:
model_name = "Qwen/Qwen2.5-1.5B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [None]:
def build_prompt(ex):
    """Build instruction-style prompt from a dataset sample."""
    inp = ex["input"]

    instruction = inp.get("instruction", "")
    lang = inp.get("lang", "Python")
    path = inp.get("path", "")
    description = inp.get("description", "")
    api_arguments = inp.get("api_arguments", {})

    prompt = (
        f"You are an API client code generator. "
        f"You MUST output ONLY valid {lang} code. "
        f"No comments. No explanations. No markdown. Only raw code.\n\n"
        f"### USER REQUEST:\n{instruction}\n\n"
        f"### ENDPOINT PATH:\n{path}\n\n"
        f"### DESCRIPTION:\n{description}\n\n"
        f"### PARAMETERS:\n{api_arguments}\n\n"
        f"### OUTPUT:\n"
    )

    return prompt

In [None]:
def generate_code(prompt, max_new_tokens=256, temperature=0.2, top_p=0.95, do_sample=True):
    """Generate code completion for a given prompt using the current model."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
        )

    full = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return full[len(prompt):].strip()

In [None]:
for idx, sample in enumerate(samples):
    prompt = build_prompt(sample)
    completion = generate_code(prompt)

    input_json = json.dumps(sample["input"], indent=2, ensure_ascii=False)

    md = (
        f"# 🔹 Baseline Example {idx+1}\n\n"
        f"## INPUT\n"
        "```json\n" + input_json + "\n```\n\n"
        f"## MODEL OUTPUT (Qwen2.5-1.5B)\n"
        "```python\n" + completion + "\n```\n"
        "---\n"
    )
    display(Markdown(md))

# 🔹 Baseline Example 1

## INPUT
```json
{
  "instruction": "I'd like to retrieve the gift message associated with the current order using the Magento B2B API. Could you help me with that by providing the relevant information using the endpoint giftMessageCartRepositoryV1GetGet?",
  "lang": "cURL",
  "functionality": "carts/mine/gift-message",
  "api_arguments": {},
  "description": "Return the gift message for a specified order.",
  "domain": [
    "carts/mine/gift-message"
  ],
  "path": "/V1/carts/mine/gift-message"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
The gift message associated with the current order.

### RESPONSE:
200 OK
{
    "giftMessage": "Gift message for the current order"
}

### EXAMPLE REQUEST:
curl -X GET "https://example.com/V1/carts/mine/gift-message" \
-H "Authorization: Bearer YOUR_ACCESS_TOKEN" \
-H "Content-Type: application/json"

### EXAMPLE RESPONSE:
{
    "giftMessage": "Gift message for the current order"
}
```
---


# 🔹 Baseline Example 2

## INPUT
```json
{
  "instruction": "How do I obtain the home standings for a particular league in a specified season using the Sports-Americanfootballapi-API? Please provide an example of passing the tournament ID and season ID to the function.",
  "lang": "go",
  "functionality": "leaguehomestandings",
  "api_arguments": {},
  "description": "This endpoint retrieves home standings for a specific league in a given season.\"\ntournamentid: The ID of the unique tournament for which the league's home standings are to be retrieved.\n    seasonid: The ID of the season for which the league's home standings are to be retrieved.",
  "domain": [],
  "path": "/api/american-football/tournament/{tournamentid}/season/{seasonid}/standings/home"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
A JSON object containing the home standings for the specified league in the specified season.

### EXAMPLE REQUEST:
{
    "tournamentid": "12345",
    "seasonid": "2023"
}

### EXAMPLE RESPONSE:
{
    "standings": [
        {
            "team": "Team A",
            "wins": 10,
            "losses": 2,
            "ties": 0
        },
        {
            "team": "Team B",
            "wins": 8,
            "losses": 4,
            "ties": 0
        },
        {
            "team": "Team C",
            "wins": 6,
            "losses": 6,
            "ties": 0
        }
    ]
}

### EXAMPLE RESPONSE:
{
    "standings": [
        {
            "team": "Team A",
            "wins": 10,
            "losses": 2,
            "ties": 0
        },
        {
            "team": "Team B",
            "wins": 8,
            "losses": 4,
            "ties": 0
        },
        {
            "team": "Team C",
            "wins":
```
---


# 🔹 Baseline Example 3

## INPUT
```json
{
  "instruction": "\"How do I retrieve a list of schedules using the VTLiveApi.ScheduleService_v2.GetShifts endpoint from the Web API para cegid Visualtime?\"",
  "lang": "libcurl",
  "functionality": "Get the list of schedules",
  "api_arguments": {},
  "description": "Get the list of schedules",
  "domain": [
    "Calendar"
  ],
  "path": "/api/v2/ScheduleService.svc/GetShifts"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{
    "Shifts": [
        {
            "ShiftId": 1,
            "ShiftName": "Morning Shift",
            "ShiftDescription": "This is the morning shift",
            "ShiftStartTime": "08:00:00",
            "ShiftEndTime": "16:00:00",
            "ShiftStatus": "Active",
            "ShiftType": "Full Time",
            "ShiftLocation": "Office",
            "ShiftNotes": "This is the morning shift notes"
        },
        {
            "ShiftId": 2,
            "ShiftName": "Evening Shift",
            "ShiftDescription": "This is the evening shift",
            "ShiftStartTime": "16:00:00",
            "ShiftEndTime": "24:00:00",
            "ShiftStatus": "Active",
            "ShiftType": "Full Time",
            "ShiftLocation": "Office",
            "ShiftNotes": "This is the evening shift notes"
        }
    ]
}

### CODE:
curl -X GET "https://api.example.com/api/v2/ScheduleService.svc/GetShifts"
```
---


# 🔹 Baseline Example 4

## INPUT
```json
{
  "instruction": "I'd like to retrieve the configuration details of a specific software package using the AWS IoT API. Could you please help me with that by using the GetPackageConfiguration endpoint?",
  "lang": "swift",
  "functionality": "",
  "api_arguments": {},
  "description": "<p>Gets information about the specified software package's configuration.</p> <p>Requires permission to access the <a href=\"https://docs.aws.amazon.com/service-authorization/latest/reference/list_awsiot.html#awsiot-actions-as-permissions\">GetPackageConfiguration</a> action.</p>",
  "domain": [],
  "path": "/package-configuration"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
<p>Software package configuration details.</p> <p>For more information about the structure of this data, see <a href="https://docs.aws.amazon.com/iot/latest/apireference/API_PackageConfiguration.html">PackageConfiguration</a>.</p>

### EXAMPLE REQUEST:
```
GET /package-configuration HTTP/1.1
Host: <your-iot-endpoint>
Authorization: <your-authorization-token>
```

### EXAMPLE RESPONSE:
```
HTTP/1.1 200 OK
Content-Type: application/json

{
  "PackageConfiguration": {
    "PackageId": "example-package-id",
    "PackageVersion": "1.0.0",
    "PackageStatus": "ACTIVE",
    "PackageDescription": "This is an example package.",
    "PackageLicense": "LICENSE",
    "PackageLicenseUrl": "https://example.com/license.txt",
    "PackageLicenseType": "LICENSE_TYPE",
    "PackageLicenseVersion": "1.0.0",
    "PackageLicenseUrl": "https://example.com/license.txt",
    "PackageLicenseType": "LICENSE_TYPE",
    "PackageLicenseVersion": "1.0.0",
    "PackageLicenseUrl": "https://example.com/license.txt",
```
---


# 🔹 Baseline Example 5

## INPUT
```json
{
  "instruction": "I'd like to update the webhook registration for my conversations with a new callback URL, using the REST API Version 2. Could you show me how to craft a query for updating the registration with the given filter?",
  "lang": "Java",
  "functionality": "Updates a WebHook registration",
  "api_arguments": {},
  "description": "Updates a webHook registration with the given filter and callback URL.\nOauthScopes: READ_CONVERSATIONS, READ_USER",
  "domain": [
    "Outgoing Webhooks"
  ],
  "path": "/webhooks/{id}"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{}

### EXAMPLE REQUEST:
{
  "callbackUrl": "https://example.com/callback"
}

### EXAMPLE RESPONSE:
{
  "id": "12345",
  "callbackUrl": "https://example.com/callback",
  "createdAt": "2023-04-01T12:34:56Z",
  "updatedAt": "2023-04-01T12:34:56Z",
  "status": "active"
}

### EXAMPLE RESPONSE (ERROR):
{
  "error": "Invalid callback URL",
  "message": "The provided callback URL is invalid."
}

### CODE:
```java
import com.example.conversations.api.model.WebhookRegistration;
import com.example.conversations.api.model.WebhookRegistrationUpdate;
import com.example.conversations.api.model.WebhookRegistrationUpdateCallbackUrl;
import com.example.conversations.api.model.WebhookRegistrationUpdateCallbackUrlCallbackUrl;
import com.example.conversations.api.model.WebhookRegistrationUpdateCallbackUrlCallbackUrlCallbackUrl;
import com.example.conversations.api.model.WebhookRegistrationUpdateCallbackUrlCallbackUrlCallbackUrlCallbackUrl;
import com.example.conversations.api.model.WebhookRegistrationUpdateCallbackUrlCallbackUrlCallbackUrlCallbackUrlCallbackUrl;
import
```
---


# 🔹 Baseline Example 6

## INPUT
```json
{
  "instruction": "How can I use the Sports-Basketapi-API to view the last basketball matches played by a particular player? To execute this request, please provide the player ID and specify the desired page number. Keep in mind that the page parameter is a zero-based index.",
  "lang": "Java",
  "functionality": "playerlastmatches",
  "api_arguments": {},
  "description": "Get the last matches of a basketball player using their id.\"\npage: Zero-based page.\n    id: The team id you want to retrieve the last matches.",
  "domain": [],
  "path": "/api/basketball/player/{is_id}/matches/previous/{page}"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{
    "matches": [
        {
            "id": 1,
            "date": "2023-01-01",
            "time": "12:00:00",
            "location": "Arena A",
            "team": "Team A",
            "score": 100,
            "player": "Player A"
        },
        {
            "id": 2,
            "date": "2023-01-02",
            "time": "13:00:00",
            "location": "Arena B",
            "team": "Team B",
            "score": 90,
            "player": "Player B"
        }
    ]
}

### EXAMPLE REQUEST:
/api/basketball/player/1/matches/previous/0

### EXAMPLE RESPONSE:
{
    "matches": [
        {
            "id": 1,
            "date": "2023-01-01",
            "time": "12:00:00",
            "location": "Arena A",
            "team": "Team A",
            "score": 100,
            "player": "Player A"
```
---


# 🔹 Baseline Example 7

## INPUT
```json
{
  "instruction": "I'd like to search for relevant job listings using the News Media-Job Alerts-API. Could you help me retrieve a list of jobs with optional filters based on keywords and job type, while also specifying the page number?",
  "lang": "Java",
  "functionality": "jobslist",
  "api_arguments": {},
  "description": "This end point provides list of jobs list with page number as required parameter and keywords and job type as optional parameter",
  "domain": [],
  "path": "/jobs"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{
  "jobs": [
    {
      "title": "Job Title 1",
      "company": "Company 1",
      "location": "Location 1",
      "description": "Job Description 1",
      "keywords": ["Keyword 1", "Keyword 2"],
      "job_type": "Job Type 1"
    },
    {
      "title": "Job Title 2",
      "company": "Company 2",
      "location": "Location 2",
      "description": "Job Description 2",
      "keywords": ["Keyword 3", "Keyword 4"],
      "job_type": "Job Type 2"
    }
  ],
  "page_number": 1,
  "total_pages": 2
}

### EXAMPLE RESPONSE:
{
  "jobs": [
    {
      "title": "Job Title 1",
      "company": "Company 1",
      "location": "Location 1",
      "description": "Job Description 1",
      "keywords": ["Keyword 1", "Keyword 2"],
      "job_type": "Job Type 1"
    },
    {
      "title": "Job Title 2",
      "company": "Company 2
```
---


# 🔹 Baseline Example 8

## INPUT
```json
{
  "instruction": "To discover the current top songs worldwide in a specific music genre using the Music-Shazam Core-API, please provide the desired genre from the following options: POP, HIP_HOP_RAP, DANCE, ELECTRONIC, SOUL_RNB, ALTERNATIVE, ROCK, LATIN, FILM_TV, COUNTRY, AFRO_BEATS, WORLDWIDE, REGGAE_DANCE_HALL, HOUSE, K_POP, FRENCH_POP, SINGER_SONGWRITER, REG_MEXICO. I will help you retrieve the top songs chart for the indicated genre.",
  "lang": "go",
  "functionality": "charts_get_top_songs_in_world_by_genre",
  "api_arguments": {},
  "description": "return top songs in world by genre\"\ngenre: POP,HIP_HOP_RAP,DANCE,ELECTRONIC,SOUL_RNB,ALTERNATIVE,ROCK,LATIN,FILM_TV,COUNTRY,AFRO_BEATS,WORLDWIDE,REGGAE_DANCE_HALL,HOUSE,K_POP,FRENCH_POP,SINGER_SONGWRITER,REG_MEXICO",
  "domain": [],
  "path": "/charts/get-top-songs-in_world_by_genre"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
```go
func getTopSongsInWorldByGenre(genre string) ([]*TopSong, error) {
    // Your implementation here
}
```

### RESPONSE:
```json
{
    "data": [
        {
            "id": 1,
            "name": "Song Name 1",
            "artist": "Artist Name 1",
            "genre": "POP",
            "popularity": 100,
            "release_date": "2023-01-01"
        },
        {
            "id": 2,
            "name": "Song Name 2",
            "artist": "Artist Name 2",
            "genre": "HIP_HOP_RAP",
            "popularity": 95,
            "release_date": "2023-02-01"
        },
        {
            "id": 3,
            "name": "Song Name 3",
            "artist": "Artist Name 3",
            "genre": "DANCE",
            "popularity": 90,
            "release_date": "2023-03-01"
        },
        {
            "id": 4,
            "name": "
```
---


# 🔹 Baseline Example 9

## INPUT
```json
{
  "instruction": "I want to pause the scheduling of a specific AWS Glue crawler without stopping it in the middle of its current run. How do I achieve this using the AWS Glue API? Please provide a clear query or example for the StopCrawlerSchedule endpoint.",
  "lang": "swift",
  "functionality": "",
  "api_arguments": {},
  "description": "Sets the schedule state of the specified crawler to <code>NOT_SCHEDULED</code>, but does not stop the crawler if it is already running.",
  "domain": [],
  "path": "/#X-Amz-Target=AWSGlue.StopCrawlerSchedule"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{}

### EXAMPLE REQUEST:
{
  "Action": "StopCrawlerSchedule",
  "Version": "2017-03-31",
  "Input": {
    "CrawlerId": "crawler-1234567890"
  }
}

### EXAMPLE RESPONSE:
{
  "ResponseMetadata": {
    "RequestId": "12345678-1234-1234-1234-123456789012",
    "HostId": "host-id",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "12345678-1234-1234-1234-123456789012",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "123",
      "date": "Wed, 01 Jan 2020 00:00:00 GMT"
    },
    "RetryAttempts": 0
  }
}

### EXAMPLE RESPONSE BODY:
{
```
---


# 🔹 Baseline Example 10

## INPUT
```json
{
  "instruction": "As a logged-in fraud manager, how can I utilize the Key UI Template API to incorporate specified parameters into a customer's information group?",
  "lang": "go",
  "functionality": "Adds given parameters to the customer information group.",
  "api_arguments": {},
  "description": "This can only be done by the logged fraud manager.",
  "domain": [
    "admins"
  ],
  "path": "/parametergroup/addtocustomerinfo"
}
```

## MODEL OUTPUT (Qwen2.5-1.5B)
```python
{
  "status": "success",
  "message": "Customer information group added successfully"
}

### CODE:
```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	http.HandleFunc("/parametergroup/addtocustomerinfo", addCustomerInfoGroup)
	http.ListenAndServe(":8080", nil)
}

func addCustomerInfoGroup(w http.ResponseWriter, r *http.Request) {
	// Your implementation here
}
```
```
---


In [None]:
flat_rows = []

with open(DATA_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        line = line.strip()
        if not line:
            continue

        obj = json.loads(line)

        inp = obj.get("input", {})
        out = obj.get("output", {})

        instruction = str(inp.get("instruction", "") or "")
        lang = str(inp.get("lang", "") or "Python")
        description = str(inp.get("description", "") or "")
        path = str(inp.get("path", "") or "")
        api_call = str(out.get("api_call", "") or "")

        api_args = inp.get("api_arguments", {})
        if not isinstance(api_args, dict):
            api_args = {}
        api_args_str = json.dumps(api_args)

        prompt_text = (
            f"You are an API client code generator. "
            f"You MUST output ONLY valid {lang} code. "
            f"No comments. No explanations. No markdown. Only raw code.\n\n"
            f"### USER REQUEST:\n{instruction}\n\n"
            f"### ENDPOINT PATH:\n{path}\n\n"
            f"### DESCRIPTION:\n{description}\n\n"
            f"### PARAMETERS:\n{api_args_str}\n\n"
            f"### OUTPUT:\n"
        )

        flat_rows.append({"text": prompt_text + api_call})

        if i > 0 and i % 200000 == 0:
            print("Processed:", i)

print("Loaded + flattened:", len(flat_rows))

raw_ds = Dataset.from_list(flat_rows)
print(raw_ds)

Processed: 200000
Processed: 400000
Processed: 600000
Processed: 800000
Processed: 1000000
Loaded + flattened: 1014093
Dataset({
    features: ['text'],
    num_rows: 1014093
})


In [None]:
MAX_TRAIN_SAMPLES = None

if MAX_TRAIN_SAMPLES is not None and MAX_TRAIN_SAMPLES < len(raw_ds):
    train_ds = raw_ds.select(range(MAX_TRAIN_SAMPLES))
else:
    train_ds = raw_ds

print("Training dataset size:", len(train_ds))


Training dataset size: 1014093


In [None]:
MAX_LENGTH = 1024


def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )


subset_size = min(100000, len(train_ds))
subset = train_ds.select(range(subset_size))

tokenized_ds = subset.map(tokenize_fn, batched=True, remove_columns=["text"])
tokenized_ds.set_format(type="torch")

print(tokenized_ds)
print(tokenized_ds[0].keys())

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 100000
})
dict_keys(['input_ids', 'attention_mask'])


In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
    if (torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    else torch.float16,
    trust_remote_code=True,
)


model.config.use_cache = False
model.config.return_dict = True

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815


In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [None]:
if torch.cuda.is_available():
    total_mem = torch.cuda.get_device_properties(0).total_memory
    per_device_train_batch_size = 2 if total_mem >= 20_000_000_000 else 1
else:
    per_device_train_batch_size = 1

bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
fp16 = torch.cuda.is_available() and not bf16


In [None]:
training_args = TrainingArguments(
    output_dir="/content/qwen_lora_output_1.5",
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    num_train_epochs=1,
    logging_steps=50,
    save_steps=2000,
    save_total_limit=3,
    gradient_checkpointing=False,
    bf16=bf16,
    fp16=fp16,
    optim="paged_adamw_32bit",
    report_to="none",
    remove_unused_columns=False,
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
50,1.6486
100,0.9951
150,0.8502
200,0.7833
250,0.7637
300,0.7227
350,0.7166
400,0.7164
450,0.7063
500,0.7336


TrainOutput(global_step=6250, training_loss=0.6317202691650391, metrics={'train_runtime': 10687.499, 'train_samples_per_second': 9.357, 'train_steps_per_second': 0.585, 'total_flos': 8.077509132288e+17, 'train_loss': 0.6317202691650391, 'epoch': 1.0})

In [None]:
SAVE_DIR = "/content/qwen_lora_final"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

model.eval()

print("Training complete. Saved to:", SAVE_DIR)

is_lora = isinstance(model, PeftModel)
print("Is LoRA applied?", is_lora)

Training complete. Saved to: /content/qwen_lora_final
Is LoRA applied? True


In [None]:
SAVE_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B"
os.makedirs(SAVE_DIR, exist_ok=True)


In [None]:
model.save_pretrained(f"{SAVE_DIR}/lora_adapter")
tokenizer.save_pretrained(f"{SAVE_DIR}/tokenizer")


('/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/chat_template.jinja',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/vocab.json',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/merges.txt',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer/tokenizer.json')

In [None]:
MERGED_DIR = "/content/qwen2_lora_merged"

In [None]:
os.makedirs(MERGED_DIR, exist_ok=True)

merged = model.merge_and_unload()
merged.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

('/content/qwen2_lora_merged/tokenizer_config.json',
 '/content/qwen2_lora_merged/special_tokens_map.json',
 '/content/qwen2_lora_merged/chat_template.jinja',
 '/content/qwen2_lora_merged/vocab.json',
 '/content/qwen2_lora_merged/merges.txt',
 '/content/qwen2_lora_merged/added_tokens.json',
 '/content/qwen2_lora_merged/tokenizer.json')

In [None]:

from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

eval_model = PeftModel.from_pretrained(base, f"{SAVE_DIR}/lora_adapter")
eval_model.eval()

print("Loaded LoRA-applied model.")

Loaded LoRA-applied model.


In [None]:
def generate(model, prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=False,
        )
    full = tokenizer.decode(out[0], skip_special_tokens=True)
    return full[len(prompt):].strip()

In [None]:
bleu = evaluate.load("bleu")
sacre = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

results = {
    "exact_match": [],
    "bleu": [],
    "sacrebleu": [],
    "rougeL": [],
    "syntax": [],
    "perplexity": [],
}


def compute_perplexity(model, text):
    enc = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return math.exp(loss.item())


eval_samples = random.sample(dataset, 100)

for ex in tqdm(eval_samples):
    prompt = build_prompt(ex)
    gold = ex["output"]["api_call"]
    pred = generate(eval_model, prompt)


    results["exact_match"].append(int(pred.strip() == gold.strip()))


    results["bleu"].append(
        bleu.compute(predictions=[pred], references=[[gold]])["bleu"]
    )


    results["sacrebleu"].append(
        sacre.compute(predictions=[pred], references=[gold])["score"]
    )


    results["rougeL"].append(
        rouge.compute(predictions=[pred], references=[gold])["rougeL"]
    )


    if ex["input"]["lang"].lower() == "python":
        try:
            ast.parse(pred)
            results["syntax"].append(1)
        except:
            results["syntax"].append(0)
    else:
        results["syntax"].append(1)


    results["perplexity"].append(compute_perplexity(eval_model, prompt + pred))

print("\n===== FINAL SCORES =====")
from statistics import mean
for k, v in results.items():
    print(k, ":", mean(v))

  0%|          | 0/100 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end ge


===== FINAL SCORES =====
exact_match : 0.03
bleu : 0.18474212560621497
sacrebleu : 18.922363168899793
rougeL : 0.25028160843720093
syntax : 0.88
perplexity : 2.0413812381256644


In [None]:
LORA_SAVE_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B/lora_adapter"
TOKENIZER_SAVE_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer"

os.makedirs(LORA_SAVE_DIR, exist_ok=True)
os.makedirs(TOKENIZER_SAVE_DIR, exist_ok=True)

print("Saving LoRA adapter to Drive...")
model.save_pretrained(LORA_SAVE_DIR)
tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)
print("LoRA saved to Drive")


MERGED_SAVE_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B/merged_model"
os.makedirs(MERGED_SAVE_DIR, exist_ok=True)

print("Merging LoRA into base model...")
merged = model.merge_and_unload()

print("Saving merged full model to Drive...")
merged.save_pretrained(MERGED_SAVE_DIR)
tokenizer.save_pretrained(MERGED_SAVE_DIR)
print("✅ Merged model saved to Drive")

Saving LoRA adapter to Drive...
✅ LoRA saved to Drive
Merging LoRA into base model...
Saving merged full model to Drive...
✅ Merged model saved to Drive


In [None]:
CHECKPOINT_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

print("Backing up final checkpoint...")
trainer.save_model(CHECKPOINT_DIR)
tokenizer.save_pretrained(CHECKPOINT_DIR)
print("✅ Training checkpoint saved to Drive")

Backing up final checkpoint...
✅ Training checkpoint saved to Drive


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, PeftModel, get_peft_model

BASE_MODEL = "Qwen/Qwen2.5-1.5B"
CHECKPOINT = "/content/drive/MyDrive/Qwen-LoRA-1.5B/checkpoint-6250"
TOKENIZER_DIR = "/content/drive/MyDrive/Qwen-LoRA-1.5B/tokenizer"


tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)


model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)


model = PeftModel.from_pretrained(model, CHECKPOINT)


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    task_type="CAUSAL_LM",
)


model = get_peft_model(model, lora_config)

print()
model.print_trainable_parameters()
print()



trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815





In [None]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Qwen-LoRA-1.5B/further_training",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_steps = 1250,
    warmup_ratio=0.03,
    logging_steps=50,
    save_steps=1000,
    save_total_limit=3,
    bf16=True,
    optim="paged_adamw_32bit",
    remove_unused_columns=False,
    report_to="none"
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
model.print_trainable_parameters()


trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815


In [None]:
trainer.train()

Step,Training Loss
50,0.7476
100,0.7486
150,0.7603
200,0.7294
250,0.7221
300,0.6905
350,0.6891
400,0.6883
450,0.6825
500,0.7151


TrainOutput(global_step=1250, training_loss=0.6871484527587891, metrics={'train_runtime': 2315.1049, 'train_samples_per_second': 8.639, 'train_steps_per_second': 0.54, 'total_flos': 1.6683287361827635e+17, 'train_loss': 0.6871484527587891, 'epoch': 0.2})

In [None]:
next(model.parameters()).device


device(type='cuda', index=0)

In [None]:
# !find /content -maxdepth 3 -type f -name "trainer_state.json"


/content/qwen_lora_output_1.5/checkpoint-6250/trainer_state.json
/content/qwen_lora_output_1.5/checkpoint-6000/trainer_state.json
/content/qwen_lora_output_1.5/checkpoint-4000/trainer_state.json


In [None]:
# !mkdir -p /content/drive/MyDrive/Qwen-LoRA-1.5B/checkpoints
# !cp -r /content/qwen_lora_output_1.5/checkpoint-6250/* \
#       /content/drive/MyDrive/Qwen-LoRA-1.5B/checkpoints/


In [None]:
!rm -rf /content/drive/MyDrive/Qwen-LoRA-1.5B/checkpoints


In [None]:
!cp -r /content/qwen_lora_output_1.5/checkpoint-6250 \
      /content/drive/MyDrive/Qwen-LoRA-1.5B/


In [None]:
!cp -r /content/qwen_lora_output_1.5 \
      /content/drive/MyDrive/qwen_lora_output_1.5_old


In [None]:
!cp -r /content/qwen2_lora_merged \
      /content/drive/MyDrive/qwen2_lora_merged_old

In [None]:
!cp -r /content/qwen_lora_final \
      /content/drive/MyDrive/qwen_lora_final_old


In [None]:
bleu = evaluate.load("bleu")
sacre = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

results = {
    "exact_match": [],
    "bleu": [],
    "sacrebleu": [],
    "rougeL": [],
    "syntax": [],
    "perplexity": [],
}


def compute_perplexity(model, text):
    enc = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        loss = model(**enc, labels=enc["input_ids"]).loss
    return math.exp(loss.item())


eval_samples = random.sample(dataset, 100)

for ex in tqdm(eval_samples):
    prompt = build_prompt(ex)
    gold = ex["output"]["api_call"]
    pred = generate(eval_model, prompt)


    results["exact_match"].append(int(pred.strip() == gold.strip()))

    results["bleu"].append(
        bleu.compute(predictions=[pred], references=[[gold]])["bleu"]
    )


    results["sacrebleu"].append(
        sacre.compute(predictions=[pred], references=[gold])["score"]
    )


    results["rougeL"].append(
        rouge.compute(predictions=[pred], references=[gold])["rougeL"]
    )

    if ex["input"]["lang"].lower() == "python":
        try:
            ast.parse(pred)
            results["syntax"].append(1)
        except:
            results["syntax"].append(0)
    else:
        results["syntax"].append(1)


    results["perplexity"].append(compute_perplexity(eval_model, prompt + pred))

print("\n===== FINAL SCORES =====")
from statistics import mean
for k, v in results.items():
    print(k, ":", mean(v))

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for


===== FINAL SCORES =====
exact_match : 0.06
bleu : 0.15610176225728237
sacrebleu : 16.06591112911089
rougeL : 0.23523325355026217
syntax : 0.9
perplexity : 1.959550267994667
