In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import json

**Data Preparation**

In [2]:
# Training data is the MBPP "train" split; evaluation data is the HumanEval "test" split.
train_dataset = load_dataset(
    "google-research-datasets/mbpp",
    split="train",
)
eval_dataset = load_dataset(
    "openai/openai_humaneval",
    split="test",
)

In [3]:
# Show the dataset summary, then the first training record as pretty-printed JSON.
print(train_dataset)
print(json.dumps(train_dataset[0], indent=2))

Dataset({
    features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
    num_rows: 374
})
{
  "task_id": 601,
  "text": "Write a function to find the longest chain which can be formed from the given set of pairs.",
  "code": "class Pair(object): \r\n\tdef __init__(self, a, b): \r\n\t\tself.a = a \r\n\t\tself.b = b \r\ndef max_chain_length(arr, n): \r\n\tmax = 0\r\n\tmcl = [1 for i in range(n)] \r\n\tfor i in range(1, n): \r\n\t\tfor j in range(0, i): \r\n\t\t\tif (arr[i].a > arr[j].b and\r\n\t\t\t\tmcl[i] < mcl[j] + 1): \r\n\t\t\t\tmcl[i] = mcl[j] + 1\r\n\tfor i in range(n): \r\n\t\tif (max < mcl[i]): \r\n\t\t\tmax = mcl[i] \r\n\treturn max",
  "test_list": [
    "assert max_chain_length([Pair(5, 24), Pair(15, 25),Pair(27, 40), Pair(50, 60)], 4) == 3",
    "assert max_chain_length([Pair(1, 2), Pair(3, 4),Pair(5, 6), Pair(7, 8)], 4) == 4",
    "assert max_chain_length([Pair(19, 10), Pair(11, 12),Pair(13, 14), Pair(15, 16), Pair(31, 54)], 5) == 5"
 

In [4]:
# Show the dataset summary, then the first evaluation record as pretty-printed JSON.
print(eval_dataset)
print(json.dumps(eval_dataset[0], indent=2))

Dataset({
    features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
    num_rows: 164
})
{
  "task_id": "HumanEval/0",
  "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n",
  "canonical_solution": "    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n",
  "test": "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candida

In [5]:
def prepocess_training_data_function(examples, tokenizer, max_input_length=1024, max_target_length=512):
    """Tokenize a batch of training examples into prompt inputs and label ids.

    Each prompt is built from the problem statement and its test list; the
    reference solution is tokenized separately and stored as the labels.

    Args:
        examples: Batch mapping with "text", "test_list" and "code" entries,
            each holding one item per example.
        tokenizer: Callable invoked with a list of strings plus `max_length`,
            `truncation` and `padding` keywords. Its result must support item
            assignment and expose an `input_ids` attribute.
        max_input_length: Length the prompts are truncated/padded to.
        max_target_length: Length the solutions are truncated/padded to.

    Returns:
        The tokenizer output for the prompts, with the tokenized solution ids
        added under the "labels" key.
    """
    # Each test entry is interpolated as-is, so a list of assertions is
    # rendered through its str() form inside the prompt.
    prompts = [
        f"Instruction:\n{problem}\n\nTest list:\n{test}\n\nResponse:\n"
        for problem, test in zip(examples["text"], examples["test_list"])
    ]
    model_inputs = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): prompts and labels are padded to different lengths, and the
    # label padding is presumably left as pad-token ids (not masked) — confirm
    # the downstream loss expects this layout.
    target_encoding = tokenizer(
        examples["code"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = target_encoding.input_ids
    return model_inputs

In [6]:
def prepocess_evaluation_data_function(examples, tokenizer, max_input_length=1024, max_target_length=512):
    """Tokenize a batch of evaluation examples into prompt inputs and label ids.

    Each prompt is built from the problem prompt and its test code; the
    canonical solution is tokenized separately and stored as the labels.

    Args:
        examples: Batch mapping with "prompt", "test" and "canonical_solution"
            entries, each holding one item per example.
        tokenizer: Callable invoked with a list of strings plus `max_length`,
            `truncation` and `padding` keywords. Its result must support item
            assignment and expose an `input_ids` attribute.
        max_input_length: Length the prompts are truncated/padded to.
        max_target_length: Length the solutions are truncated/padded to.

    Returns:
        The tokenizer output for the prompts, with the tokenized solution ids
        added under the "labels" key.
    """
    # Same prompt template as the training preprocessing, fed from the
    # evaluation dataset's column names.
    prompts = [
        f"Instruction:\n{problem}\n\nTest list:\n{test}\n\nResponse:\n"
        for problem, test in zip(examples["prompt"], examples["test"])
    ]
    model_inputs = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): prompts and labels are padded to different lengths, and the
    # label padding is presumably left as pad-token ids (not masked) — confirm
    # the downstream evaluation expects this layout.
    target_encoding = tokenizer(
        examples["canonical_solution"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = target_encoding.input_ids
    return model_inputs

In [7]:
# The tokenizer comes from the teacher checkpoint so tokenization stays consistent with the teacher model.
checkpoint = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Tokenize both splits in batches; the original columns are dropped so only
# the fields returned by the preprocessing functions remain.
tokenized_train = train_dataset.map(
    lambda batch: prepocess_training_data_function(batch, tokenizer),
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_eval = eval_dataset.map(
    lambda batch: prepocess_evaluation_data_function(batch, tokenizer),
    batched=True,
    remove_columns=eval_dataset.column_names,
)

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

**Build the teacher model**

Load a large pre-trained model to serve as the teacher; it will be used to generate the soft labels for distillation.

In [None]:
# Load the teacher weights from the same checkpoint that supplied the tokenizer.
# NOTE(review): the `dtype` keyword is assumed to be supported by the installed
# transformers release (older releases spell it `torch_dtype`) — confirm.
teacher_model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    dtype="auto",
)
teacher_model.eval()  # Switch the teacher to evaluation mode

model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

**Build the student model from scratch**