In [1]:
%%capture
#Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

!pip install --pre -U xformers -q

#Import all Libraries.
!pip install -r "requirements.txt" -q

In [2]:
!export CUDA_VISIBLE_DEVICES=0,1

In [27]:
import csv
import torch
from torch.utils.data import DataLoader
import os
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_model, TaskType, PeftType, LoraConfig

In [4]:
# Please use your huggingface credentials
!huggingface-cli login --token ""

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jupyter/.cache/huggingface/token
Login successful


**Load Datasets**

In [18]:
train_file = "mmlu_01/trainset_with_rationales_1.csv"
eval_file = "mmlu_01/valset.csv"
test_file = "mmlu_01/varying_option/testset.csv"

train_dataset =  load_dataset('csv', data_files=train_file, split='train')
eval_dataset = load_dataset('csv', data_files=eval_file, split='train')
test_dataset = load_dataset('csv', data_files=test_file, split='train')

**Model**

In [6]:
device="cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
torch.cuda.empty_cache()

cuda:0


In [7]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.float16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_name_or_path = "unsloth/gemma-2b-it-bnb-4bit"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side='right', padding=True, 
                                         truncation=True, max_length=max_seq_length, low_cpu_mem_usage=True, device_map="auto")
model.enable_input_require_grads()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


In [8]:
hyperparam_config = {
    'lr': 1e-4,
    'nepochs': 3,
    'batch_size':4,
    'wd': 1e-7,
    'eps': 0.1,
    'warmup_steps': 0,
}

In [9]:
# LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16, 
    lora_alpha=16, 
    use_rslora=True, 
    lora_dropout=0.1,
    init_lora_weights="gaussian")

model = get_peft_model(model, peft_config)
model.print_trainable_parameters() #Ensure only prompt tuning params are trainable

trainable params: 1,843,200 || all params: 2,508,015,616 || trainable%: 0.073492365368111


In [10]:
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr=hyperparam_config['lr'], 
                              weight_decay=hyperparam_config['wd'], 
                              eps=hyperparam_config['eps'])
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=hyperparam_config['warmup_steps'],
    num_training_steps=(len(train_dataset) * hyperparam_config['nepochs']),
)

In [11]:
next(model.parameters()).is_cuda

True

**Training**

In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="train_logs/llama-2-7b",
    learning_rate=hyperparam_config['lr'],
    per_device_train_batch_size=hyperparam_config['batch_size'],
    per_device_eval_batch_size=hyperparam_config['batch_size'],
    num_train_epochs=hyperparam_config['nepochs'],
    weight_decay=hyperparam_config['wd'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    group_by_length=True,
    dataloader_pin_memory=True,                           
    dataloader_num_workers=4,
    dataloader_prefetch_factor=1,
)

In [19]:
from trl import DataCollatorForCompletionOnlyLM

context_prompt = '''### Instruction: Let's think step-by-step and come up with an answer from the options A/B/C/D only.\n\n'''

def formatting_prompts_func_train(example):
    question = example['question'][:-10]
    text = f"{context_prompt} ### {question}\n\n  ### Answer: {example['gold_answer']}\n\n ### Explanation: {example['rationale']}"
    return {'prompt': text}

response_template = "### Answer:"

train_dataset = train_dataset.map(formatting_prompts_func_train)
train_dataset.to_csv('train_dataset.csv')


def formatting_prompts_func_val(example):
    question = example['question'][:-10]
    text = f"{context_prompt} ### {question}\n\n  ### Answer: {example['gold_answer']}\n\n"
    return {'prompt': text}

eval_dataset = eval_dataset.map(formatting_prompts_func_val)
eval_dataset.to_csv('eval_dataset.csv')

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1305120

In [20]:
from trl import SFTTrainer

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    data_collator=collator,
)

trainer.train()

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,No log,1.667514
2,No log,1.095317
3,No log,0.901841


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=300, training_loss=1.288029276529948, metrics={'train_runtime': 636.4653, 'train_samples_per_second': 1.881, 'train_steps_per_second': 0.471, 'total_flos': 2531014260523008.0, 'train_loss': 1.288029276529948, 'epoch': 3.0})

In [21]:
model = trainer.model
model.push_to_hub("gemma-2b-lora-CoT-tuned")



adapter_model.safetensors:   0%|          | 0.00/7.38M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/roshinishetty333/gemma-2b-lora-CoT-tuned/commit/c765a8ffcdb3e423c223d857938d5b1609724821', commit_message='Upload model', commit_description='', oid='c765a8ffcdb3e423c223d857938d5b1609724821', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
model.save_pretrained("out_dir/gemma-2b-lora-CoT-tuned")

In [31]:
# Function to generate answers to questions
def generate_answer(question, vocab_id_A, vocab_id_B, vocab_id_C, vocab_id_D):
  max_new_tokens = 300

  context_prompt = f'''### Instruction: Let's think step-by-step and come up with an answer from the options A/B/C/D only.

### Question: In British currency how many pence make a pound?
Options: 
A. 10
B. 100
C. 500
D. 1000

### Answer: 

B

Explanation: In British currency, there are 100 pence in a pound. So, option B is the correct answer.
'''
  question = question[:-10]
  question = f"{context_prompt} ### {question}\n\n ### Answer: \n\n"

  input_ids = tokenizer.encode(question, return_tensors="pt").cuda()
  output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens,return_dict=True,return_dict_in_generate=True,output_scores=True,do_sample=False)

  #Scores is a tuple : (tensor of scores, empty) and tensor is of shape (1,vocab size of model)

  #Finding Probability of generating current predicted token output
  gen = tokenizer.decode(output_ids.sequences[0,:], skip_special_tokens=True)

  gen2 = gen[len(question):]
  counter= 0
  while counter < max_new_tokens:
    if gen2[counter] != ' ':
      break
    counter+=1
  answer = gen2[counter]
  rationale = gen2[counter+1:]
  probs = output_ids.scores[counter].softmax(-1).squeeze().cuda()
  print(f"answer: {answer}, rationale: {rationale}, probs: {probs}, counter: {counter}")

  #Finding Probability of generating other tokens as answers ('A','B','C','D')
  #Assert that probability of prediction of the correct option is the same as above.
  vocab_id_A_probs = probs[vocab_id_A].item()
  vocab_id_B_probs = probs[vocab_id_B].item()
  vocab_id_C_probs = probs[vocab_id_C].item()
  vocab_id_D_probs = probs[vocab_id_D].item()
  total = vocab_id_A_probs + vocab_id_B_probs + vocab_id_C_probs + vocab_id_D_probs
  vocab_id_A_probs = vocab_id_A_probs/total
  vocab_id_B_probs = vocab_id_B_probs/total
  vocab_id_C_probs = vocab_id_C_probs/total
  vocab_id_D_probs = vocab_id_D_probs/total

  return answer, rationale, vocab_id_A_probs, vocab_id_B_probs, vocab_id_C_probs, vocab_id_D_probs

# Function to process CSV file containing questions
def process_csv(input_file, output_file):
  index = 0
  with open(input_csv_file, "r") as input_file, open(output_csv_file, "w", newline="") as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)

    # Read and write headers
    header = next(reader)
    writer.writerow(header + ["Predicted_token_ID", "Rationale", "Normalized_A_probs", "Normalized_B_probs", "Normalized_C_probs", "Normalized_D_probs"])

    # Process each row in the CSV file
    for row in reader:
        question = row[0]
        if question == '':
          break

        #get vocab keys for each of the option ID's to index the scores tensor based on these vocab id's
        out = tokenizer.get_vocab()
        vocab_id_A = out['A']
        vocab_id_B = out['B']
        vocab_id_C = out['C']
        vocab_id_D = out['D']

        answer, rationale, A_norm_prob, B_norm_prob, C_norm_prob, D_norm_prob = generate_answer(question,
                                                                                     vocab_id_A, vocab_id_B, vocab_id_C, vocab_id_D)
        print(f'{index}: {answer}, {rationale}')
        writer.writerow(row + [answer, rationale, A_norm_prob, B_norm_prob, C_norm_prob, D_norm_prob])
        index += 1

In [32]:
data_tables = [
  "testset",
  # "professional_law",
  # "prehistory",
  # "philosophy",
  # "high_school_mathematics",
  # "conceptual_physics",
  # "college_medicine",
  # "abstract_algebra"
]

In [None]:
# Input and output file paths
model_name = "gemma-2b-it"
mmlu_01 = "mmlu_01/"
mmlu_02 = "CoT_LoRA/"

for is_varying_option in [True, False]:
    sub_folder = "varying_option" if is_varying_option else "varying_position"
    for file_name in data_tables:
      input_csv_file = mmlu_01 + sub_folder + "/" + file_name + ".csv"
      output_csv_file = mmlu_02 + model_name + "/" + sub_folder + "/" + file_name + ".csv"

      # Process CSV file containing questions and generate answers
      process_csv(input_csv_file, output_csv_file)

      print("Answers generated and saved to:", output_csv_file)

answer: A, rationale: 

Explanation: The characteristic roots of the MA process are 1 and 2., probs: tensor([6.7979e-18, 6.1617e-09, 2.8016e-16,  ..., 4.4515e-15, 8.2220e-13,
        1.4848e-17], device='cuda:0'), counter: 0
0: A, 

Explanation: The characteristic roots of the MA process are 1 and 2.
answer: A, rationale: 

Explanation: Astronomers think Jupiter generates its internal heat through exothermic chemical reactions converting chemical potential energy into thermal energy., probs: tensor([7.5809e-17, 1.0765e-09, 7.7003e-17,  ..., 5.8037e-14, 6.6560e-12,
        1.7626e-16], device='cuda:0'), counter: 0
1: A, 

Explanation: Astronomers think Jupiter generates its internal heat through exothermic chemical reactions converting chemical potential energy into thermal energy.
answer: C, rationale: 

Explanation: Scenario 1 is wrong because cooking a potato is not morally wrong. Scenario 2 is not wrong because forcing a rat to go outside is not morally wrong. Therefore, option C is