In [None]:
# Copies the model weights from Google Drive onto the Colab runtime's local disk.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount call is visible in this notebook — confirm.
%cp -r "/content/drive/MyDrive/hf_llama_2_7b" "/content/"

In [None]:
# Installs required python packages.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases;
# pin them (pkg==x.y.z) once a known-good combination has been identified.
%pip install torch accelerate bitsandbytes datasets transformers peft trl scipy astrapy llama_cpp_python

In [None]:
# Import required python packages
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling
import bitsandbytes as bnb
from torch import cuda, bfloat16
import transformers
import torch
import torch.nn as nn
from google.colab import userdata
from astrapy.db import AstraDBCollection, AstraDB
from datasets import Dataset

In [None]:
# Uses GPU for processing if a CUDA device is available
# The result is a string: 'cuda:<index of the current device>' or 'cpu'.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [None]:
# Set quantization settings
model_id = "hf_llama_2_7b"

# 4-bit NF4 weights with double quantization; bfloat16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Model configuration read from `model_id`.
model_config = AutoConfig.from_pretrained(model_id)

In [None]:
# Loads model weights
load_kwargs = {
    "config": model_config,
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

# Put the freshly loaded model into evaluation mode.
model.eval()
print(f"Model loaded on {device}")

# Report the footprint as returned by `get_memory_footprint()`.
mem = model.get_memory_footprint()
print("Memory footprint: {} ".format(mem))

In [None]:
# Loads model's associated tokenizer from the same `model_id` as the weights.
# `use_fast=True` requests the fast tokenizer implementation.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_fast=True)

In [None]:
# Pulls all our instruction data from Astra
# Credentials come from Colab's `userdata` store instead of being hard-coded in the notebook.
token = userdata.get('astra_token')
endpoint = userdata.get('astra_endpoint')

collection_name = "instructions"
astra_db = AstraDB(token=token, api_endpoint=endpoint)
collection = AstraDBCollection(collection_name=collection_name, astra_db=astra_db)

# "" means "no page requested yet"; the paging loop below stops once this becomes None.
nextPageState = ""
# Accumulates every fetched document that passes `check_expected_columns`.
raw_dataset = []
# Keys a document must contain to be kept.
expected_columns = ['instruction', 'input', 'output']

def check_expected_columns(raw_instruction):
  """Return True when `raw_instruction` contains every key in `expected_columns`, else False."""
  return all(column in raw_instruction for column in expected_columns)

# Page through the collection until the API stops handing back a page state.
while nextPageState is not None:
  if nextPageState == "":
    # First request: there is no page state to send yet.
    data = collection.find()
  else:
    data = collection.find(options={"pageState": nextPageState}, sort=None)

  page = data['data']
  # Keep only the documents that carry every expected column.
  raw_instructions = [instruction for instruction in page['documents'] if check_expected_columns(instruction)]
  raw_dataset.extend(raw_instructions)

  # A missing or empty page state is treated as "no more pages". An empty string
  # would otherwise be mistaken for the "first request" sentinel and loop forever.
  nextPageState = page.get('nextPageState') or None

# Fail with a clear message instead of an IndexError when nothing usable came back.
if not raw_dataset:
  raise ValueError(f"No documents containing all of {expected_columns} were returned from Astra")

print(raw_dataset[0])
print(len(raw_dataset))

In [None]:
# Turns separated instruction dicts from Astra into a dataset of combined instructions
def create_prompt(record):
  """Build the training prompt for one instruction record and store it under "text".

  The prompt is made of a fixed intro line, an "### INSTRUCTION:" section, an
  optional "### Context:" section, a "### Response:" section and a closing
  "### End" marker, joined by blank lines.

  Args:
    record: dict with "instruction" and "output" keys, plus an optional
      "input" key holding extra context for the instruction.

  Returns:
    The same `record` dict, updated in place with the prompt under "text".
  """
  start = "Read the Instruction below and provide an answer."
  question = f"### INSTRUCTION:\n{record['instruction']}\n\n"
  # Only emit the context section when there is context to show. Because the
  # section header makes the string non-empty, the `if part` filter below could
  # never drop it, so records without input used to get an empty "### Context:".
  context_text = record.get('input')
  context = f"### Context:\n{context_text}\n" if context_text else ""
  answer = f"### Response:\n {record['output']}\n\n"
  end = "### End"

  parts = [part for part in [start, question, context, answer, end] if part]

  formatted_prompt = "\n\n".join(parts)
  # Turn literal backslash-n sequences into real line breaks.
  formatted_prompt = formatted_prompt.replace('\\n', '\n')

  record["text"] = formatted_prompt

  return record

# Preview the formatted prompt of the first record.
p = create_prompt(raw_dataset[0])
print(p["text"])

# Format every record.
combined_dataset = [create_prompt(record) for record in raw_dataset]
print(combined_dataset[1]['text'])

# Only the combined "text" field goes into the dataset.
text_rows = [{"text" : record["text"]} for record in combined_dataset]
dataset = Dataset.from_list(text_rows)
print(dataset[2])

In [None]:
#max length of the model
def get_max_length(model, default=1024):
    """Return the maximum sequence length declared on `model.config`.

    The attributes "n_positions", "max_position_embeddings" and "seq_length"
    are checked in that order; the first one holding a truthy value wins.

    Args:
        model: object exposing a `config` attribute.
        default: value returned when none of the attributes is set.

    Returns:
        The configured maximum length, or `default` when none is found.
    """
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length

    print(f"Using default max length: {default}")
    return default
# Maximum sequence length of the loaded model; the bare `mx` shows it as the cell output.
mx = get_max_length(model)
mx

In [None]:
#tokenize dataset
def _tokenize_batch(samples):
    """Tokenize the "text" column of a batch of samples."""
    return tokenizer(samples['text'])


def _fits_context(sample):
    """Keep only samples whose token count is strictly below the model's max length."""
    return len(sample["input_ids"]) < mx


dataset = dataset.map(_tokenize_batch, batched=True).filter(_fits_context)

# Seed the global RNGs, then shuffle with the same seed.
seed = 402
set_seed(seed)
dataset = dataset.shuffle(seed=seed)

In [None]:
# Freeze every existing parameter of the model.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
# NOTE(review): presumably needed so gradients can still flow from the inputs while the
# base weights are frozen and checkpointing is on — confirm against the transformers docs.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """`nn.Sequential` container whose output is converted to float32."""

  def forward(self, x):
    """Run the wrapped modules on `x` and return the result as float32."""
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the output head so that whatever it produces is converted to float32.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
def find_all_linear_names(model, linear_cls=None):
    """Collect the names of the linear layers that LoRA adapters should target.

    Args:
        model: module exposing `named_modules()`.
        linear_cls: layer class (or tuple of classes) to look for. Defaults to
            `bnb.nn.Linear4bit`; pass e.g. `bnb.nn.Linear8bitLt` or
            `torch.nn.Linear` for 8-bit or unquantized models.

    Returns:
        Sorted list of the unique last path components (e.g. "q_proj") of all
        matching modules, excluding anything that belongs to `lm_head`.
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    lora_module_names = set()
    for name, module in model.named_modules():
        if not isinstance(module, linear_cls):
            continue
        path = name.split('.')
        # Never target the output head, even when it sits inside a wrapper
        # module (where its path is e.g. "lm_head.0" rather than "lm_head").
        if 'lm_head' in path:
            continue
        lora_module_names.add(path[-1])

    # Sorted so the result is identical on every run (set order is not).
    return sorted(lora_module_names)

In [None]:
# Layer names that are passed to LoRA as `target_modules` in the next cell.
modules = find_all_linear_names(model)
print(modules)

# Example of the printed list, presumably from an earlier run (element order may differ):
#['v_proj', 'up_proj', 'down_proj', 'k_proj', 'o_proj', 'q_proj', 'gate_proj']

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,  # LoRA rank (size of the low-rank update matrices), not the number of attention heads
    lora_alpha=64,  # alpha scaling
    target_modules=modules,  # adapt every layer name found by find_all_linear_names
    lora_dropout=0.1,  # dropout probability for the LoRA layers
    bias="none",
    task_type="CAUSAL_LM", # decoder-only models such as GPT; encoder-decoder models such as T5 use a Seq2Seq task type
)
# Wrap the loaded model with the LoRA config to obtain the PEFT model
model = get_peft_model(model, config)

In [None]:
# Print Trainable parameters
# One (element count, is-trainable) pair per parameter tensor.
param_counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
all_param = sum(count for count, _ in param_counts)
trainable_params = sum(count for count, is_trainable in param_counts if is_trainable)
print(
    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
)

In [None]:
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=4000,  # 20 was used previously, presumably for a quick trial run
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    optim="paged_adamw_8bit",
)
# mlm=False: no masked-language-modelling objective.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
)

model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

trainer.train()

In [None]:
# Back up the training outputs directory to Google Drive.
# NOTE(review): the adapter directory written by the later `model.save_pretrained(...)` cell is
# created after this copy runs, so it is presumably not included unless this cell is re-run afterwards.
%cp -r "/content/outputs" "/content/drive/MyDrive"

In [None]:
import datetime

# Current timestamp with "_" between the date and time parts, used to give each save its own directory.
now = datetime.datetime.now().isoformat(sep="_")
model.save_pretrained(save_directory=f"outputs/lora_adapter_{now}")

In [None]:
import glob
import os

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

#The model loaded here is a separate object from the model loaded above. Unless your runtime has ~ 16GB of VRAM, you may need to restart to purge the old model from memory

# The save cell writes adapters to "outputs/lora_adapter_<timestamp>", so a hard-coded
# timestamped path goes stale after every training run. Load the most recently modified
# adapter directory instead; assign a specific path to `peft_model_id` to pin a run.
adapter_dirs = [path for path in glob.glob("/content/outputs/lora_adapter*") if os.path.isdir(path)]
if not adapter_dirs:
    raise FileNotFoundError(
        "No adapter directory matching /content/outputs/lora_adapter* was found; "
        "run the cell that calls model.save_pretrained first."
    )
peft_model_id = max(adapter_dirs, key=os.path.getmtime)
print(f"Loading LoRA adapter from {peft_model_id}")

config = PeftConfig.from_pretrained(peft_model_id)
# Quantization is requested through `quantization_config`; passing `load_in_8bit`
# directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
trained_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# NOTE(review): the training template built by `create_prompt` uses "### Context:" and
# "### Response:" headings, while this prompt uses "### RESPONSE:" and has no context
# section — confirm it matches the format the adapter was actually trained on.
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task, you are given an input list A. You need to find all the elements of the list that are numbers and calculate their sum.

['i', '33', 'h', '849', '77']



### RESPONSE:"""
# The tokenizer returns CPU tensors; move them to the model's device before generating.
batch = tokenizer(tst, return_tensors='pt').to(trained_model.device)
# `torch.autocast` replaces the deprecated `torch.cuda.amp.autocast`.
with torch.autocast(device_type="cuda"):
  output_tokens = trained_model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))