In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

In [None]:
# --- Notebook configuration -------------------------------------------------
# Hub id of the base model; only its tokenizer is loaded in this notebook.
base_model_id = "mistralai/Mistral-7B-v0.1"
# Token budget shared by tokenizer truncation, packing and padding.
model_max_length = 512
# Base name of the JSON file under ./data and of the output folder.
dataset_name = "alpaca_code"
# Destination folder for the processed DatasetDict.
output_dir = f"./datasets/{dataset_name}"
# Literal end-of-sequence marker appended to every generated prompt.
eos_token = "</s>"

In [None]:
# Load the raw records from ./data/<dataset_name>.json into a DataFrame.
# Later cells read its 'instruction', 'input' and 'output' columns.
dataset = pd.read_json(f'./data/{dataset_name}.json')

In [None]:
def gen_dataset_splits(ds, perc: list, verbose: bool = False, seed=None):
    """Randomly assign every row of ``ds`` to the train, validation or test split.

    Args:
        ds: Any sized collection; only ``len(ds)`` is used.
        perc: Two fractions ``[train, validation]``. The test split receives
            all rows that are left over after the first two are sized.
        verbose: When true, print the size of each split.
        seed: Optional seed. When given, a dedicated
            ``np.random.default_rng(seed)`` performs the shuffle so the
            assignment is reproducible. When ``None`` the global NumPy RNG is
            used, so ``np.random.seed`` still controls the result.

    Returns:
        A floating-point NumPy array of length ``len(ds)`` whose entries are
        ``0`` (train), ``1`` (validation) or ``2`` (test), in shuffled order.

    Raises:
        ValueError: If the fractions yield a negative size for any split
            (for example when they sum to more than 1).
    """
    ds_len = len(ds)
    train_len = int(ds_len * perc[0])
    eval_len = int(ds_len * perc[1])
    # Whatever is not claimed by train/validation goes to test.
    test_len = ds_len - train_len - eval_len
    if min(train_len, eval_len, test_len) < 0:
        raise ValueError(
            f"invalid split fractions {list(perc)!r}: they produce a negative split size"
        )
    if verbose:
        print(f"train size: {train_len}, validation size:{eval_len}, test size:{test_len} - total size: {ds_len}")
    splits = np.concatenate([
        np.zeros(train_len),
        np.ones(eval_len),
        np.full(test_len, 2)
    ])
    if seed is None:
        # Keep using the global RNG so existing callers behave as before.
        np.random.shuffle(splits)
    else:
        np.random.default_rng(seed).shuffle(splits)
    return splits

In [None]:
# Seed NumPy's global RNG before shuffling so that train/validation/test membership
# is identical on every Restart & Run All (an unseeded shuffle silently moves rows
# between splits from one run to the next). 42 is an arbitrary fixed value.
np.random.seed(42)
# 90% train, 1% validation, remaining rows go to test.
dataset['split'] = gen_dataset_splits(dataset, [.9, .01], verbose=True)

In [None]:
def alpaca_prompt(row):
    """Render the Alpaca prompt header for a record without an input field.

    ``row`` is any mapping-like object exposing an ``instruction`` key; it is
    handed to ``str.format_map``. The returned text ends right after the
    ``### Response:`` marker and its newline, so the answer can be appended.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:\n"
    )
    return template.format_map(row)


def alpaca_prompt_input(row):
    """Render the Alpaca prompt header for a record that carries extra input.

    ``row`` is any mapping-like object exposing ``instruction`` and ``input``
    keys; it is handed to ``str.format_map``. The returned text ends right
    after the ``### Response:`` marker and its newline.
    """
    template = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )
    return template.format_map(row)


def gen_prompt(row):
    """Build the full training text for one record.

    The text is the prompt header, followed by the record's ``output`` and the
    module-level ``eos_token``. The header without an ``### Input:`` section is
    used only when ``row['input']`` is exactly the empty string.
    """
    # NOTE(review): a missing/NaN input is not "" and would be rendered into the
    # input template - confirm the data never contains such rows.
    if row['input'] == "":
        header = alpaca_prompt(row)
    else:
        header = alpaca_prompt_input(row)
    return header + row['output'] + eos_token

In [None]:
# Build one training text per row (axis=1 hands each row to gen_prompt).
dataset['prompt'] = dataset.apply(gen_prompt, axis=1)

In [None]:
# Load the tokenizer that belongs to the base model. model_max_length is passed along;
# presumably it is the limit that the later truncation=True calls fall back on
# (confirm against the transformers tokenizer documentation).
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# split dataset
def get_split(ds, split_id=0):
    """Return the rows of ``ds`` whose ``split`` column equals ``split_id``.

    The ``split`` column itself is removed from the result; ``ds`` is left
    untouched and the original row index is preserved.
    """
    in_split = ds['split'] == split_id
    return ds[in_split].drop(columns='split')


# Split ids: 0 = train, 1 = validation, 2 = test.
train_dataset, eval_dataset, test_dataset = (
    get_split(dataset, split_id) for split_id in (0, 1, 2)
)

# Pull the prompt text of every split out as a plain Python list.
train_prompts, eval_prompts, test_prompts = (
    frame['prompt'].to_list()
    for frame in (train_dataset, eval_dataset, test_dataset)
)

In [None]:
# tokenize
def tokenize(prompts, tokenizer):
    """Run ``tokenizer`` over ``prompts`` with truncation enabled.

    Only the ``'input_ids'`` entry of the tokenizer's result is returned.
    """
    encoding = tokenizer(prompts, truncation=True)
    return encoding['input_ids']


# Tokenize the prompts of each split (train, validation, test - in that order).
tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = (
    tokenize(prompts, tokenizer)
    for prompts in (train_prompts, eval_prompts, test_prompts)
)

In [None]:
# packing
def pack(tokens, max_model_size):
    """Greedily concatenate token sequences into packs of bounded length.

    Sequences are consumed in order. Each one is appended to the current pack
    while the pack stays within ``max_model_size`` tokens; otherwise the
    current pack is closed and the sequence starts a new one.

    Args:
        tokens: Iterable of token-id lists.
        max_model_size: Maximum number of tokens allowed in one pack.

    Returns:
        A list of newly created lists. The input sequences are never modified
        and never shared with the result. A single sequence longer than
        ``max_model_size`` is kept whole in a pack of its own.
    """
    packed_tokens = []
    current = []
    for seq in tokens:
        # Close the running pack only if it holds something; this avoids
        # emitting an empty pack when an oversized sequence comes first.
        if current and len(current) + len(seq) > max_model_size:
            packed_tokens.append(current)
            current = []
        # extend() copies the ids, so the caller's list is never aliased.
        current.extend(seq)
    if current:
        packed_tokens.append(current)
    return packed_tokens

# Pack every split's token sequences up to the model's maximum length.
packed_train_data, packed_eval_data, packed_test_data = (
    pack(sequences, model_max_length)
    for sequences in (tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset)
)

In [None]:
# padding
def pad(tokens, max_model_size, pad_token_id=2):
    """Right-pad every token sequence to ``max_model_size`` entries.

    Args:
        tokens: Iterable of token-id lists.
        max_model_size: Target length of each returned sequence.
        pad_token_id: Id appended to sequences that are too short.

    Returns:
        A list of new lists. Sequences shorter than ``max_model_size`` are
        extended with ``pad_token_id``; sequences that are already at least
        that long are copied unchanged (they are not truncated). The input
        lists are never modified, so re-running the cell is safe.
    """
    padded_tokens = []
    for seq in tokens:
        missing = max(max_model_size - len(seq), 0)
        # Plain list arithmetic keeps the padding the same type as
        # pad_token_id instead of mixing in NumPy scalars.
        padded_tokens.append(list(seq) + [pad_token_id] * missing)
    return padded_tokens

# Pad every pack to the model's maximum length using the tokenizer's EOS id.
packed_padded_train_data, packed_padded_eval_data, packed_padded_test_data = (
    pad(packs, model_max_length, tokenizer.eos_token_id)
    for packs in (packed_train_data, packed_eval_data, packed_test_data)
)

In [None]:
# save to disk
# Each split becomes a Dataset whose 'labels' column is the same data as 'input_ids'.
tds, eds, teds = (
    Dataset.from_dict({'input_ids': rows, 'labels': rows})
    for rows in (packed_padded_train_data, packed_padded_eval_data, packed_padded_test_data)
)

# Bundle the three splits under their names and persist them to output_dir.
ds = DatasetDict({'train': tds, "eval": eds, 'test': teds})
ds.save_to_disk(output_dir)