<a href="https://colab.research.google.com/github/ArchanaAhlawat7/llm_experiments/blob/main/Mistral7b_finetune_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell commands: install the packages this notebook depends on.
!pip install transformers
!pip install datasets
!pip install peft
!pip install accelerate
!pip install bitsandbytes
# Optional metric packages; only the BLEU/ROUGE cell at the end loads them.
# !pip install evaluate # if calculating bleu and rouge
# !pip install rouge_score

# Data set-up

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Fetch Dolly-15k from the Hub and keep the first 10,000 rows of its train split.
dolly_ds = load_dataset("databricks/databricks-dolly-15k")
dolly_subset = dolly_ds['train'].select(range(10000))
dolly_df = pd.DataFrame(dolly_subset)

# Append instructions + context (if exists) + response.
def create_input(row):
    """Format one Dolly row as a single training string.

    The result is "Instruction: ... Response: ..." with a "Context: ..."
    section in between only when the row has a non-missing, non-blank context.
    """
    context = row['context']
    has_context = pd.notna(context) and context.strip()
    if not has_context:
        return f"Instruction: {row['instruction']} \n\nResponse: {row['response']}"
    return f"Instruction: {row['instruction']} \n\nContext: {row['context']} \n\nResponse: {row['response']}"

# Build the full "Instruction / Context / Response" training string for every row.
dolly_df['input'] = dolly_df.apply(create_input, axis=1)

# 80/20 split into (train+val)/test, then 80/20 again into train/val:
# 64% train, 16% validation, 20% test of the 10,000 rows.
train_df, test_df = train_test_split(dolly_df, train_size=0.8, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, train_size=0.8, test_size=0.2, random_state=42)

# Train/val only need the formatted 'input' text; the test set keeps the raw
# fields so prompts and reference responses can be rebuilt at evaluation time.
# Reassign instead of drop(inplace=True): the frames returned by the split are
# derived from dolly_df, and mutating them in place can raise SettingWithCopyWarning.
raw_columns = ['instruction', 'context', 'response', 'category']
train_df = train_df.drop(columns=raw_columns)
val_df = val_df.drop(columns=raw_columns)
test_df = test_df.drop(columns=['input', 'category'])

# preserve_index=False: after the shuffled split the pandas index is no longer a
# plain range, and from_pandas would otherwise store it as an extra
# '__index_level_0__' column in every dataset.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)

# Check on structure of datasets
print(train_ds.shape)
print(test_ds.shape)
print(val_ds.shape)

print(train_ds[:2])
print(test_ds[:2])
print(val_ds[:2])

# Training

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model_id = "mistralai/Mistral-7B-v0.1"

# Select one quantization for finetuning. Previously both variants were loaded
# back to back, so the 8-bit model was downloaded/materialized only to be
# replaced by the 4-bit one. Flip USE_8BIT to load the 8-bit variant instead.
USE_8BIT = False
quantization_config = BitsAndBytesConfig(load_in_8bit=True) if USE_8BIT else bnb_config
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=quantization_config, trust_remote_code=True)


# Prepare the quantized model for training before LoRA adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projections, the MLP projections and lm_head.
config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules= ["q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head"
      ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

# Mistral Finetune w just attention layers: trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940
# Mistral Finetune w all linear layers: trainable params: 42,520,576 || all params: 7,284,252,672 || trainable%: 0.5837

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from transformers import AutoTokenizer

# Set up tokenizer and tokenize train and val sets, to be used in training.
base_model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained( # eos and bos during training, but not during inference.
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Pad with <unk> instead of EOS. The Trainer in this notebook uses
# DataCollatorForLanguageModeling(mlm=False), which rebuilds the labels from
# input_ids and sets every position equal to pad_token_id to -100. With
# pad == EOS, the genuine end-of-sequence token appended by add_eos_token was
# masked out of the loss as well, so the model got no signal for when to stop.
tokenizer.pad_token = tokenizer.unk_token

def tokenize_input(prompt):
    """Tokenize one example's 'input' text to a fixed length of 512 tokens.

    The text is truncated or padded to 512 tokens with the module-level
    tokenizer, and a copy of the token ids is stored under "labels".
    """
    encoded = tokenizer(
        prompt['input'],
        padding="max_length",
        max_length=512,
        truncation=True,
    )
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize the training and validation splits one example at a time.
tokenized_train, tokenized_val = (
    split.map(tokenize_input) for split in (train_ds, val_ds)
)

In [None]:
# !pip install trl
import transformers

# Set up training configs

# Hyperparameters: 8 examples per device x 4 accumulation steps per optimizer
# update; evaluate every 25 steps and checkpoint every 50.
training_args = transformers.TrainingArguments(
    output_dir='./mistral_four',
    num_train_epochs=4,
    learning_rate=1e-4,
    warmup_steps=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    fp16=True,
    fp16_full_eval=True,
    logging_dir="./logs_debugging",
    logging_steps=25,
    save_strategy="steps",
    save_steps=50,
    eval_strategy="steps",
    eval_steps=25,
    do_eval=True,
)

# Causal-LM collator (mlm=False).
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=lm_collator,
)

# Switch the model's cache off before training starts.
model.config.use_cache = False
trainer.train()

# Comparing the base model with finetuned models.

In [None]:
import torch
from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load finetune #1: read the adapter config to find its base model, load that
# base in 4-bit on GPU 0, then attach the adapter weights on top.
peft_model_id = "Archanaa7/ft_mistral_dolly" # finetune #1
config = PeftConfig.from_pretrained(peft_model_id)

ft_one = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map={"":0},
    trust_remote_code=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

ft_one = PeftModel.from_pretrained(ft_one, peft_model_id)
ft_one.to("cuda")

# Separate, adapter-free copy of the base model used as the comparison baseline.
base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    load_in_4bit=True,
    trust_remote_code=True,
)

# Alternative way of loading PEFT model
ft_overfit = AutoPeftModelForCausalLM.from_pretrained(pretrained_model_name_or_path="Archanaa7/ft_mistral_dolly_secondtry", load_in_4bit=True) # finetune #2 (overfit)

# A bare expression in the middle of a cell is not displayed by the notebook,
# so the adapter status was silently discarded; print it explicitly.
print(ft_overfit.get_model_status())
ft_overfit.to("cuda")


In [None]:
# Qualitative tests. Sanity check.

tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    padding_side="left",
    truncation=True,
    max_length=512,
)
tokenizer.pad_token = tokenizer.eos_token

# Named prompt_inputs rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
sanity_prompt = "Instruction: What type of effect does Coffee have on humans? \n\nContext: 'Coffee is a beverage prepared from roasted coffee beans. Darkly colored, bitter, and slightly acidic, coffee has a stimulating effect on humans, primarily due to its caffeine content. It has the highest sales in the world market for hot drinks.\n\nSeeds of the Coffee plant\'s fruits are separated to produce un-roasted green coffee beans. The beans are roasted and then ground into fine particles that are typically steeped in hot water before being filtered out, producing a cup of coffee. It is usually served hot, although chilled or iced coffee is common. Coffee can be prepared and presented in a variety of ways (e.g., espresso, French press, caffè latte, or already-brewed canned coffee). Sugar, sugar substitutes, milk, and cream are often used to mask the bitter taste or enhance the flavor.'\n\n Response: "
prompt_inputs = tokenizer(sanity_prompt, return_tensors="pt").to("cuda")

# Same prompt and generation settings for every model, printed in the order
# base, finetune #1, finetune #2.
for model_under_test in (base, ft_one, ft_overfit):
    model_under_test.eval()
    with torch.no_grad():
        gen_output = model_under_test.generate(**prompt_inputs, max_new_tokens=100, repetition_penalty=1.2)[0]
    decoded_output = tokenizer.decode(gen_output, skip_special_tokens=True)
    print(decoded_output)


In [None]:
import numpy as np
import evaluate
from transformers import GenerationConfig
import csv

# Now, let's test the pretrained model's performance vs our finetuned model's performance on our held out dataset (test_ds)

# NOTE(review): truncation/max_length are normally per-call tokenizer options;
# confirm whether passing them to from_pretrained has any effect. The encode()
# calls below do not request truncation.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    padding_side="left",
    truncation=True,
    max_length=512,
)
tokenizer.pad_token = tokenizer.eos_token


def _build_prompt(item):
    """Return the prompt for one test row, up to and including "Response: ".

    Uses the same layout as create_input, which formatted the fine-tuning
    data: "Instruction: {i} \\n\\nContext: {c} \\n\\nResponse: ", with the
    Context section only for non-blank contexts. The previous hand-built
    string used different spacing around the section breaks and did not
    strip whitespace-only contexts, so the models were scored on a prompt
    format they were not trained on.
    """
    prompt = f"Instruction: {item['instruction']} "
    context = item['context']
    if context and context.strip():
        prompt += f"\n\nContext: {context} "
    return prompt + "\n\nResponse: "


eval_models = {"base": base, "ft_one": ft_one, "ft_overfit": ft_overfit}
total_losses = {name: 0.0 for name in eval_models}

target_responses = []
tokenized_inputs = []

for eval_model in eval_models.values():
    eval_model.eval()

with torch.no_grad():
    for item in test_ds:
        prompt = _build_prompt(item)  # not named `input`: keeps the builtin usable
        target_response = item['response']

        tokenized_input = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        tokenized_target = tokenizer.encode(prompt + target_response, return_tensors="pt").to("cuda")

        # prep for future generation step
        tokenized_inputs.append(tokenized_input)
        target_responses.append(target_response)

        # Compute loss. input = target, output = target
        for name, eval_model in eval_models.items():
            total_losses[name] += eval_model(tokenized_target, labels=tokenized_target).loss.item()

# Keep the original result names available for later cells.
total_base_loss = total_losses["base"]
total_ft_one_loss = total_losses["ft_one"]
total_ft_overfit_loss = total_losses["ft_overfit"]

num_examples = len(test_ds)
average_base_loss = total_base_loss / num_examples
average_ft_one_loss = total_ft_one_loss / num_examples
average_ft_overfit_loss = total_ft_overfit_loss / num_examples

# Perplexity here is exp(mean of the per-example losses).
base_perplexity = np.exp(average_base_loss)
ft_perplexity = np.exp(average_ft_one_loss)
ft_overfit_perplexity = np.exp(average_ft_overfit_loss)

print(f"Average Base Model Loss: {average_base_loss}")
print(f"Average FT Loss: {average_ft_one_loss}" )
print(f"Average FT Overfit Loss: {average_ft_overfit_loss}")
print(f"Base Model Perplexity: {base_perplexity}")
print(f"FT Perplexity: {ft_perplexity}")
print(f"FT Overfit Perplexity: {ft_overfit_perplexity}")

- Average Base Model Loss: 2.0458536679148676
- Average FT Loss: 1.5839086458086968
- Average FT Overfit Loss: 1.4659164778441192
- Base Model Perplexity: 7.735759489672113
- FT Perplexity: 4.873969248109109
- FT Overfit Perplexity: 4.331511155775555

In [None]:
# Save CPU copies: a file of CUDA tensors cannot be reloaded on a machine
# without a GPU unless map_location is given. The generation cell moves each
# tensor back to "cuda" before use, so nothing downstream changes.
torch.save([t.cpu() for t in tokenized_inputs], 'tokenized_inputs.pt')

In [None]:
# tokenized_inputs list already filled above.

# Optionally calculate bleu and rouge scores!

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

tokenized_inputs = torch.load('tokenized_inputs.pt')

generation_config = GenerationConfig(
    max_new_tokens=256,
    num_return_sequences=1,
    repetition_penalty=1.2
)

eval_models = {"base": base, "ft_one": ft_one, "ft_overfit": ft_overfit}
responses = {name: [] for name in eval_models}

for eval_model in eval_models.values():
    eval_model.eval()

with torch.no_grad():
    for tokenized_input in tokenized_inputs:
        tokenized_input = tokenized_input.to("cuda")
        prompt_length = tokenized_input.shape[-1]
        for name, eval_model in eval_models.items():
            generated = eval_model.generate(tokenized_input, generation_config=generation_config)[0]
            # generate() returns the prompt tokens followed by the new tokens.
            # Drop the prompt so only the model's answer is scored against the
            # reference response; previously the decoded text still contained
            # the whole instruction/context prompt.
            completion = generated[prompt_length:]
            responses[name].append(tokenizer.decode(completion, skip_special_tokens=True))

# Keep the original result names available for later cells.
base_model_responses = responses["base"]
ft_one_responses = responses["ft_one"]
ft_overfit_responses = responses["ft_overfit"]

# BLEU takes a list of reference lists per prediction; ROUGE takes the plain list.
bleu_references = [[r] for r in target_responses]
labelled_predictions = (
    ("Base Model", base_model_responses),
    ("FT One", ft_one_responses),
    ("FT Overfit", ft_overfit_responses),
)
for label, predictions in labelled_predictions:
    print(f"{label} Bleu Score: {bleu.compute(predictions=predictions, references=bleu_references)['bleu']}")
    print(f"{label} Rouge Score: {rouge.compute(predictions=predictions, references=target_responses)['rouge1']}")
