In [2]:
import datasets

In [3]:
# load in dataset
# Reads the dataset stored under the relative path "dataset.hf" and prints its
# summary (splits, feature names and row counts) for a quick inspection.
raw_dataset = datasets.load_from_disk("dataset.hf")
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 117
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 14
    })
})


In [4]:
# Take the first record of the train split and display its prompt as a sanity check.
example = raw_dataset["train"][0]
example["prompt"]

'How do I debrief during deliberations?\n'

In [5]:
# Print every turn of the example conversation, one line per message,
# with the role left-aligned in a 20-character column.
messages = example["messages"]
for message in messages:
    role, content = message["role"], message["content"]
    print(f'{role:20}:  {content}')

user                :  How do I debrief during deliberations?

assistant           :  Deliberations can get pretty heated. It’s important to focus on the applicants and their many amazing qualities and not attack your subteam members. We all have the same goal: to accept the best candidates possible. No matter how heated the deliberations get, stay respectful and stay objective. \n Afterwards, the team should debrief. It’s always good to get feedback on our recruitment process, both internally and externally. Recruitment is vital to the culture of CDS, so we need to make sure that we critically evaluate our internal processes each semester. 



In [6]:
from transformers import AutoTokenizer

model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# A maximum length above 100,000 is treated as "not really set"; cap it at 2048 tokens.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

# Jinja chat template: each message is rendered as a role tag line followed by its
# content and the EOS token; when `add_generation_prompt` is set, an assistant tag
# is appended after the last message. The adjacent literals below concatenate into
# one template string.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}\n"
    "{% if message['role'] == 'user' %}\n"
    "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'system' %}\n"
    "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'assistant' %}\n"
    "{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n"
    "{% endif %}\n"
    "{% if loop.last and add_generation_prompt %}\n"
    "{{ '<|assistant|>' }}\n"
    "{% endif %}\n"
    "{% endfor %}"
)
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

In [7]:
import re
import random
from multiprocessing import cpu_count

def apply_chat_template(example, tokenizer):
    """Render one example's conversation into a single training string.

    Args:
        example: Mapping with a "messages" entry holding a list of message
            dicts; each message must provide at least a "role" key.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` mapping, with the rendered conversation stored
        under the "text" key.
    """
    # Work on a shallow copy so the caller's message list is never modified.
    messages = list(example["messages"])
    # Prepend an empty system message when the conversation is empty or does
    # not already start with one.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# Render every split with the chat template. `remove_columns` drops the original
# columns, so each resulting row only carries the rendered "text" field.
column_names = list(raw_dataset["train"].features)
raw_datasets = raw_dataset.map(apply_chat_template,
                                num_proc=cpu_count(),
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template",)

# create the splits
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["test"]

# Preview a few rendered training samples. A local, seeded generator keeps the
# preview reproducible across runs without touching the global `random` state,
# and the sample size is capped so a split with fewer than 3 rows does not
# raise ValueError.
preview_rng = random.Random(42)
preview_count = min(3, len(train_dataset))
for index in preview_rng.sample(range(len(train_dataset)), preview_count):
  print(f"Sample {index} of the processed training set:\n\n{train_dataset[index]['text']}")

Applying chat template (num_proc=12):   0%|          | 0/117 [00:00<?, ? examples/s]

Applying chat template (num_proc=12):   0%|          | 0/14 [00:00<?, ? examples/s]

Sample 45 of the processed training set:

<|system|>
</s>
<|user|>
What are the responsibilities of officers on CDS?
</s>
<|assistant|>
a. Each officer, both elected and appointed, shall be responsible for their duties as listed in this constitution. \n b. Each officer shall also be responsible for establishing and maintaining a working relationship with the General Membership, Executive Board, and all other officers, advisors, chairs, committees, and staff necessary to perform their duties.
</s>

Sample 39 of the processed training set:

<|system|>
</s>
<|user|>
How do I measure teamwork for written applications?
</s>
<|assistant|>
0 - No mention of working on a team or with others. Applicant is very self-centered or does not consider teamwork. \n 1 - Mentions working with others on some project. Calls out maintaining positive relationships or specifically how they resolved a conflict. Has experience working collaboratively with others, or mentoring others. \n 2 - Working with others 

In [9]:

from transformers import BitsAndBytesConfig
import torch

# Quantization is currently disabled; the 4-bit configuration is kept for reference.
# NOTE(review): if this is re-enabled, `bnb_4bit_compute_dtype` presumably needs the
# dtype object `torch.bfloat16` rather than the string "torch.bfloat16" — confirm
# against the BitsAndBytesConfig documentation.
# quantization_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype="torch.bfloat16",
# )

# Map the whole model (the "" key) onto the current CUDA device when one is
# available; otherwise leave the device map unset.
if torch.cuda.is_available():
    device_map = {"": torch.cuda.current_device()}
else:
    device_map = None

# Keyword arguments used when the model is loaded.
model_kwargs = {
    # Attention backend is selected by name; this one requires Flash Attention 2
    # support on the GPU.
    "attn_implementation": "flash_attention_2",
    "torch_dtype": "auto",
    # Disabled because training uses gradient checkpointing.
    "use_cache": False,
    "device_map": device_map,
}
    

In [11]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

# Directory that receives the Trainer's checkpoints and logs.
output_dir = 'model/zephyr-7b-sft-lora'

# Training hyper-parameters (based on config), grouped by concern.
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_strategy="no",
    save_total_limit=None,
    seed=42,
    # push_to_hub=True,
    # hub_model_id="zephyr-7b-sft-lora",
    # hub_strategy="every_save",
    # report_to="tensorboard",

    # --- optimisation schedule ---
    num_train_epochs=1,
    max_steps=-1,
    learning_rate=2.0e-05,
    lr_scheduler_type="cosine",

    # --- batching / memory ---
    per_device_train_batch_size=1, # originally set to 8
    per_device_eval_batch_size=1, # originally set to 8
    # NOTE(review): the train split has 117 rows and the per-device batch size is 1,
    # so 128 accumulation steps presumably leaves at most one optimizer update per
    # epoch — confirm this is intended.
    gradient_accumulation_steps=128,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # fp16=True, # specify bf16=True instead when training on GPUs that support bf16

    # --- evaluation ---
    do_eval=True,
    evaluation_strategy="epoch",

    # --- logging ---
    log_level="info",
    logging_strategy="steps",
    logging_steps=5,
)

# LoRA adapter settings (based on config); adapters are attached to the four
# attention projection modules listed in `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Build the supervised fine-tuning trainer. `model` is given as a model id string
# together with `model_init_kwargs`, so the trainer loads the model itself.
trainer = SFTTrainer(
    # model and adapters
    model=model_id,
    model_init_kwargs=model_kwargs,
    peft_config=peft_config,
    tokenizer=tokenizer,
    # data: the rendered chat strings live in the "text" column
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    packing=True,
    max_seq_length=tokenizer.model_max_length,
    # training hyper-parameters
    args=training_args,
)



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]