In [None]:
# Cloning Dataset  - API Pack and Storing it in google drive
from google.colab import drive
import shutil
drive.mount('/content/drive')


!apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/datasets/apipack/API-Pack-Dataset


src = '/content/API-Pack-Dataset'
dst = '/content/drive/MyDrive/API-Pack-Dataset'

shutil.move(src, dst)

print("Dataset successfully saved to your Google Drive at:", dst)


Mounted at /content/drive
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Git LFS initialized.
Cloning into 'API-Pack-Dataset'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 35 (delta 3), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (35/35), 9.46 KiB | 1.58 MiB/s, done.
Filtering content: 100% (10/10), 3.14 GiB | 255.43 MiB/s, done.
✅ Dataset successfully saved to your Google Drive at: /content/drive/MyDrive/API-Pack-Dataset


In [5]:
# API Pack dataset has APIs in different languages such as python, javascript, java, ruby, go, swift and so on. Our team intends to work on the python data primarily for this task
import json
import glob
import os


DATASET_DIR = "/content/drive/MyDrive/API-Pack-Dataset"
OUT_MAIN = f"{DATASET_DIR}/api_pack_training.jsonl"


SUB_DIR = f"{DATASET_DIR}/langs"
os.makedirs(SUB_DIR, exist_ok=True)

files = glob.glob(f"{DATASET_DIR}/total_data_cleaned_*.json")
print("Found files:", files)

def clean_text(t):
    if t is None:
        return ""
    return t.strip()

# Creating input and output pairs needed for finetuning. For this we are using concatenating instruction, language of api call,, functionality of the api, api arguments, description, domain and path available
# in the dataset as prompt to the large language model.

def process_single_item(ex):
    api_call_data = ex.get("api_call_data", {})

    input_obj = {
        "instruction": clean_text(ex.get("instruction", "")),
        "lang": clean_text(api_call_data.get("lang", "")),
        "functionality": clean_text(api_call_data.get("functionality", "")),
        "api_arguments": api_call_data.get("api_arguments", {}),
        "description": clean_text(api_call_data.get("description", "")),
        "domain": api_call_data.get("domain", []),
        "path": clean_text(api_call_data.get("path", "")),
    }

    output_obj = {
        "api_call": clean_text(api_call_data.get("api_call", "")),
    }

    return {"input": input_obj, "output": output_obj}


all_rows = []
lang_groups = {}
for fpath in files:
    print("Processing:", fpath)

    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for ex in data:
        row = process_single_item(ex)
        all_rows.append(row)

        lang = row["input"]["lang"]
        if lang not in lang_groups:
            lang_groups[lang] = []
        lang_groups[lang].append(row)


print("Total processed samples:", len(all_rows))


with open(OUT_MAIN, "w", encoding="utf-8") as f:
    for row in all_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Saved main training dataset →", OUT_MAIN)


for lang, rows in lang_groups.items():
    safe_lang = lang.lower().replace(" ", "_")
    out_path = f"{SUB_DIR}/api_pack_{safe_lang}.jsonl"

    with open(out_path, "w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"Saved {lang} dataset → {out_path}")


Found files: ['/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_java.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_libcurl.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_node.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_ruby.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_go.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_javascript.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_php.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_swift.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_curl.json', '/content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_python.json']
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_java.json
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_libcurl.json
Processing: /content/drive/MyDrive/API-Pack-Dataset/total_data_cleaned_node.json
Processing: