In [20]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

In [23]:
# --- Run configuration -------------------------------------------------------
# Base checkpoint and the sequence-length setting handed to the tokenizer.
base_model_id = "mistralai/Mistral-7B-v0.1"
model_max_length = 512

# Short labels combined into the run name below.
base_model_name = "mistral"
project = "alpaca-finetune"

# Derived identifiers: "<model>-<project>" and a directory of the same name
# relative to the current working directory.
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

# End-of-sequence marker string.
eos_token = "</s>"

In [8]:
# Read the instruction records from the local JSON file into a DataFrame.
dataset = pd.read_json("./data/alpaca_gpt4_data.json")

In [9]:
def gen_dataset_splits(ds, perc: list, seed=None):
    """Assign every element of ``ds`` a random train/eval/test label.

    Parameters
    ----------
    ds : sized
        Any object supporting ``len()``; only its length is used.
    perc : sequence of numbers
        ``perc[0]`` is the train fraction and ``perc[1]`` the eval fraction.
        Each split size is ``int(len(ds) * fraction)`` (truncated); whatever
        is left over becomes the test split. Additional elements are ignored.
    seed : int or None, optional
        When given, the shuffle is driven by ``np.random.default_rng(seed)``
        so the same seed always yields the same assignment. When ``None``
        (the default) NumPy's global RNG is used, so ``np.random.seed`` set
        by the caller is still honoured.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(ds)`` containing 0 (train),
        1 (eval) and 2 (test) in shuffled order.

    Raises
    ------
    ValueError
        If the fractions yield a negative size for any of the three splits
        (e.g. train + eval fractions exceed the dataset).
    """
    ds_len = len(ds)
    train_len = int(ds_len * perc[0])
    eval_len = int(ds_len * perc[1])
    # The test split absorbs the remainder, including rounding leftovers.
    test_len = ds_len - train_len - eval_len

    # Fail early with a readable message instead of NumPy's
    # "negative dimensions are not allowed".
    if min(train_len, eval_len, test_len) < 0:
        raise ValueError(
            "split fractions {!r} give invalid sizes for {} rows: "
            "train={}, eval={}, test={}".format(
                list(perc[:2]), ds_len, train_len, eval_len, test_len
            )
        )

    splits = np.concatenate([
        np.zeros(train_len),
        np.ones(eval_len),
        np.full(test_len, 2)
    ])

    if seed is None:
        np.random.shuffle(splits)
    else:
        # Dedicated generator: reproducible and leaves the global RNG alone.
        np.random.default_rng(seed).shuffle(splits)
    return splits

In [11]:
# Label each row 0/1/2: 90% train, 1% eval, and the remainder as test.
dataset['split'] = gen_dataset_splits(ds=dataset, perc=[0.9, 0.01])

In [16]:
def alpaca_prompt(row):
    """Render the no-input prompt for one record.

    ``row`` is any mapping-like object whose ``'instruction'`` entry is
    substituted into the template; the text ends right after the
    ``### Response:`` header so the answer can be appended.
    """
    template = "".join((
        "Below is an instruction that describes a task. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n### Response:\n",
    ))
    return template.format_map(row)


def alpaca_prompt_input(row):
    """Render the prompt variant that carries an extra input section.

    ``row`` is any mapping-like object; its ``'instruction'`` and ``'input'``
    entries are substituted into the template, which ends right after the
    ``### Response:`` header so the answer can be appended.
    """
    template = "".join((
        "Below is an instruction that describes a task, paired with an input that provides further context. ",
        "Write a response that appropriately completes the request.\n\n",
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    ))
    return template.format_map(row)


def gen_prompt(row):
    """Build the complete text for one record: prompt followed by the answer.

    The no-input template is used when ``row['input']`` is exactly the empty
    string; otherwise the template with an input section is used. In both
    cases ``row['output']`` is appended directly after the prompt.
    """
    if row['input'] == "":
        header = alpaca_prompt(row)
    else:
        header = alpaca_prompt_input(row)
    return header + row['output']

In [17]:
# Row-wise: build the full prompt + answer text for every record.
dataset['prompt'] = dataset.apply(gen_prompt, axis=1)

In [26]:
# Load the tokenizer that matches the base checkpoint, passing along the
# configured model_max_length.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=model_max_length,
)