In [1]:
!pip install datasets accelerate bitsandbytes trl peft deepspeed -q

In [2]:
import numpy as np
import pandas as pd

import datasets
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import accelerate
import bitsandbytes as bnb
import torch
#from trl import SFTTrainer
#from trl.trainer import ConstantLengthDataset
from peft import LoraConfig, get_peft_model
#import deepspeed

In [41]:
# Fetch the 'train' split of the instruction/code dataset used for fine-tuning.
dataset = datasets.load_dataset(
    path='flytech/llama-python-codes-30k',
    split='train',
)

In [42]:
# Show the dataset summary (column names and row count).
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 27332
})

In [43]:
def formatting_func(example):
    """Build the training prompt for one dataset row.

    Args:
        example: mapping that provides at least the 'instruction' and
            'output' keys.

    Returns:
        A one-key dict whose 'text' value is the instruction prefixed with
        "Question: " followed, on a new line, by the output prefixed with
        "Answer: ".
    """
    question = example['instruction']
    answer = example['output']
    prompt = "Question: {}\nAnswer: {}".format(question, answer)
    return {'text': prompt}

In [44]:
# Rewrite every row's 'text' column with the Question/Answer prompt built by
# formatting_func (the function returns a dict keyed by 'text').
dataset = dataset.map(function=formatting_func)

In [46]:
# Inspect the first formatted prompt. Index the row first so only that one
# example is read, instead of materialising the whole 'text' column.
dataset[0]['text']

"Question: [INST]Help me set up my daily to-do list![/INST]\nAnswer: ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"

In [47]:
# bitsandbytes quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    # Storage format: 4-bit weights using the NF4 quantization type,
    # without the extra double-quantization pass.
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    # Dtype used for computation on the quantized weights.
    bnb_4bit_compute_dtype='float16',
)

In [150]:
# Single source of truth for the checkpoint so that the model and the
# tokenizer can never drift apart.
MODEL_ID = 'NousResearch/Llama-2-7b-chat-hf'

# Load the base model quantized according to bnb_config; the device map
# {"": 0} places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# trust_remote_code is deliberately not enabled: it allows Python code from
# the hub repository to be executed locally, and the Llama tokenizer classes
# ship with transformers, so it should not be needed here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The tokenization cell pads with padding='max_length', which requires the
# tokenizer to define a pad token. Fall back to EOS only when none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [137]:
def tokenization(data, tok=None, max_length=512):
  """Tokenize a batch of prompts and attach causal-LM labels.

  The labels are a copy of ``input_ids`` and are NOT shifted here: Hugging
  Face causal-LM models shift the labels by one position internally when
  computing the loss, so shifting them manually as well would train the
  model to predict the token two steps ahead. Padding positions
  (``attention_mask == 0``) are set to -100, the index ignored by the loss,
  so the model is not trained on padding.

  Args:
    data: batch mapping with a 'text' entry holding a list of strings
      (the shape ``Dataset.map(..., batched=True)`` passes in).
    tok: tokenizer-like callable; defaults to the notebook-level
      ``tokenizer``.
    max_length: length every sequence is padded/truncated to.

  Returns:
    The tokenizer output with an added 'labels' entry, one list per example
    and aligned position-by-position with 'input_ids'.
  """
  tok = tokenizer if tok is None else tok
  model_inputs = tok(data['text'], max_length=max_length, padding='max_length', truncation=True)

  # Keep real tokens as targets; mask padded positions out of the loss.
  model_inputs['labels'] = [
      [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
      for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask'])
  ]
  return model_inputs

In [138]:
# Tokenize the whole dataset in batches and drop the raw string columns so
# that only the fields produced by `tokenization` remain.
tokenized_dataset = dataset.map(
    function=tokenization,
    batched=True,
    remove_columns=['instruction', 'input', 'output', 'text'],
)

In [139]:
# Show the tokenized dataset summary (remaining columns and row count).
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 27332
})

In [140]:
# Hold out 20% of the rows for evaluation. The split shuffles the rows, so a
# fixed seed keeps the train/test partition identical across kernel restarts.
new_tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [141]:
# Pull the two splits out under shorter names for the Trainer.
train_tok_dataset = new_tokenized_dataset['train']
test_tok_dataset = new_tokenized_dataset['test']

In [142]:
# Show both splits side by side to check their columns and sizes.
train_tok_dataset, test_tok_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 21865
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 5467
 }))

In [151]:
# LoRA adapter settings. No target_modules are listed, so peft's default
# choice of modules for this architecture applies.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank and the alpha scaling value applied to the adapter update.
    r=64,
    lora_alpha=16,
    # Dropout applied inside the adapter layers.
    lora_dropout=0.1,
    # Leave all bias terms untrained.
    bias="none",
)

In [152]:
# Make the input embeddings' outputs require gradients.
# NOTE(review): presumably needed because gradient_checkpointing=True is set
# in TrainingArguments while the base weights are frozen — confirm.
model.enable_input_require_grads()

In [153]:
# Wrap the base model with the LoRA adapters described by peft_config ...
model = get_peft_model(model, peft_config)
# ... make sure the wrapped model lives on the GPU ...
model = model.to('cuda')
# ... and report how many parameters will actually be trained.
model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.4955


In [154]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./output",

    # Run length: one epoch; max_steps=-1 leaves the step count uncapped.
    num_train_epochs=1,
    max_steps=-1,

    # Batching and memory.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    group_by_length=True,

    # Mixed precision is disabled for both half-precision formats.
    fp16=False,
    bf16=False,

    # Optimizer, regularisation and gradient clipping.
    optim="paged_adamw_32bit",
    learning_rate=3e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,

    # Learning-rate schedule: cosine with a short warm-up.
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
)

In [155]:
# Collator that pads every batch to a fixed length of 512 tokens and returns
# PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
)

In [156]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the two tokenized splits and the padding collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok_dataset,
    eval_dataset=test_tok_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Launch fine-tuning with the Trainer configured in the previous cell.
trainer.train()