In [1]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.3.1


In [2]:
!pip install -q torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops

In [3]:
import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from trl import SFTTrainer

In [4]:
dataset = load_dataset("GBaker/MedQA-USMLE-4-options")
train_samples = dataset['train']
test_samples = dataset['test']
train_samples = train_samples.shuffle(seed=42).select(range(7000))


train_validation_split = train_samples.train_test_split(test_size=0.2, seed=42)
train_samples = train_validation_split['train']
val_samples = train_validation_split['test']
val_samples = val_samples.select(range(100))

print(len(train_samples))
print(len(val_samples))

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

5600
100


In [5]:
def generate_prompt(example):
    idx_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
    
    question, options, answer_idx = example['question'], example['options'], example['answer_idx']
    
    answer = idx_mapping[answer_idx]
    
    option_list = [f"{value}" for key, value in options.items()]
    
    return (
        f"Answer the following question by returning one correct numerical value:\n"
        f"Question : {question}\n"
        f"Option 1: {option_list[0]}\n"
        f"Option 2: {option_list[1]}\n"
        f"Option 3: {option_list[2]}\n"
        f"Option 4: {option_list[3]}\n"
        f"Answer only one numerical value. \n"
        f"Answer:"
    )

In [6]:
answer_mapping = {"A": 1, "B": 2, "C": 3, "D": 4}

def generate_prompt_and_label(sample):
    prompt = generate_prompt(sample)
    
    label = answer_mapping.get(sample['answer_idx'], 0)  
    
    label_tensor = torch.tensor(label)
    
    return {"text": prompt, "labels": label_tensor}

train_samples = train_samples.map(generate_prompt_and_label)
val_samples = val_samples.map(generate_prompt_and_label)
test_samples = test_samples.map(generate_prompt_and_label)
print(train_samples[0])

Map:   0%|          | 0/5600 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

{'question': 'A 53-year-old man is being evaluated for a 3-week history of fatigue, difficulty to concentrate, dyspnea with exertion, dizziness, and digital pain that improves with cold. He has smoked half a pack of cigarettes a day since he was 20. His current medical history involves hypertension. He takes enalapril daily. The vital signs include a blood pressure of 131/82 mm Hg, a heart rate of 95/min, and a temperature of 36.9°C (98.4°F). On physical examination, splenomegaly is found. A complete blood count reveals thrombocytosis of 700,000 cells/m3. Lab work further shows decreased serum iron, iron saturation, and serum ferritin and increased total iron binding capacity. A blood smear reveals an increased number of abnormal platelets, and a bone marrow aspirate confirmed the presence of dysplastic megakaryocytes. A mutation on his chromosome 9 confirms the physician’s suspicion of a certain clonal myeloproliferative disease. The patient is started on hydroxyurea. What is the most

In [17]:
print(len(test_samples))

1273


In [7]:
base_model = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

def tokenize_function(example, label):
    encodings = tokenizer(example, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    encodings['labels'] = torch.tensor(label)
    return encodings


train_prompts = [example['text'] for example in train_samples]
train_labels = [example['labels'] for example in train_samples]
val_prompts = [example['text'] for example in val_samples]
val_labels = [example['labels'] for example in val_samples]
test_prompts = [example['text'] for example in test_samples]
test_labels = [example['labels'] for example in test_samples]
train_dataset = tokenize_function(train_prompts, train_labels)
val_dataset = tokenize_function(val_prompts, val_labels)
test_dataset = tokenize_function(test_prompts, test_labels)
print(train_dataset)

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

{'input_ids': tensor([[33706,   262,  1708,  ..., 19722,   352,    25],
        [33706,   262,  1708,  ...,   306, 25921, 20461],
        [33706,   262,  1708,  ..., 50256, 50256, 50256],
        ...,
        [33706,   262,  1708,  ..., 50256, 50256, 50256],
        [33706,   262,  1708,  ...,   604,    25,  5869],
        [33706,   262,  1708,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([2, 1, 1,  ..., 2, 2, 1])}


In [8]:
print(train_dataset['input_ids'][0])

tensor([33706,   262,  1708,  1808,   416,  8024,   530,  3376, 29052,  1988,
           25,   198, 24361,  1058,   317,  7192,    12,  1941,    12,   727,
          582,   318,   852, 16726,   329,   257,   513,    12, 10464,  2106,
          286, 18787,    11,  8722,   284, 20062,    11, 13147,    79, 39718,
          351, 17596,   295,    11, 29527,  1272,    11,   290,  4875,  2356,
          326, 19575,   351,  4692,    13,   679,   468, 21603,  2063,   257,
         2353,   286, 17626,   257,  1110,  1201,   339,   373,  1160,    13,
         2399,  1459,  3315,  2106,  9018, 37454,    13,   679,  2753,   551,
          282,   499, 22379,  4445,    13,   383,  9204,  5895,  2291,   257,
         2910,  3833,   286, 23134,    14,  6469,  8085,   367,    70,    11,
          257,  2612,  2494,   286,  6957,    14,  1084,    11,   290,   257,
         5951,   286,  4570,    13,    24,  7200,    34,   357,  4089,    13,
           19,  7200,    37,   737,  1550,  3518, 12452,    11, 

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0}
)

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

In [19]:
print(test_dataset)

{'input_ids': tensor([[33706,   262,  1708,  ..., 50256, 50256, 50256],
        [33706,   262,  1708,  ..., 50256, 50256, 50256],
        [33706,   262,  1708,  ...,  2523,   493,  4108],
        ...,
        [33706,   262,  1708,  ...,  6309,  6672,   299],
        [33706,   262,  1708,  ..., 50256, 50256, 50256],
        [33706,   262,  1708,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([2, 4, 2,  ..., 2, 3, 3])}


In [10]:
model.config.use_cache = False
model.config.pretraining_tp = 1
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [11]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset_custom = CustomDataset(train_dataset)
val_dataset_custom = CustomDataset(val_dataset)
test_dataset_custom = CustomDataset(test_dataset)

In [12]:
peft_config = LoraConfig(
    r=32,                   # default=8, higher value for more representational capacity
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
        "dense"
    ]
)

model = PeftModel(model, peft_config)



In [13]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    fp16=False,
    bf16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-5,
    weight_decay=0.001,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    logging_strategy="epoch",
    evaluation_strategy="epoch",  # Evaluate during training at each eval_step
)



In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_custom,
    eval_dataset=val_dataset_custom,  
    peft_config=peft_config,  
    tokenizer=tokenizer,
    max_seq_length= 200,
    args=training_arguments,
)

trainer.can_return_loss=True


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [15]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111290593333365, max=1.0)…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,1.5473,1.42322
2,1.4419,1.407719


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=1400, training_loss=1.494586181640625, metrics={'train_runtime': 14433.2618, 'train_samples_per_second': 0.776, 'train_steps_per_second': 0.097, 'total_flos': 4.36071101104128e+16, 'train_loss': 1.494586181640625, 'epoch': 2.0})

In [16]:
trainer.save_model("./gptNEO_finetuned_mcq/final_model")



In [20]:
from peft import PeftModel
f_model = PeftModel.from_pretrained(model,'/kaggle/working/gptNEO_finetuned_mcq/final_model')
f_model = f_model.merge_and_unload()



In [27]:
from tqdm import tqdm
import torch

f_model.eval()  # Set the model to evaluation mode
predictions = []
device = next(f_model.parameters()).device  # Ensure device compatibility
batch_size = 1  # Adjust batch size as needed

# Use DataLoader for batching if batch_size > 1
test_dataloader = torch.utils.data.DataLoader(test_dataset_custom, batch_size=batch_size)

# Iterate through the test_dataset
for batch in tqdm(test_dataloader, desc="Generating predictions", unit="batch"):
    # Move input tensors to the same device as the model
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Generate outputs from the model
    outputs = f_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens = 10)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(response)


Generating predictions:   0%|          | 0/1273 [00:00<?, ?batch/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 1/1273 [00:01<24:54,  1.18s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 2/1273 [00:02<24:16,  1.15s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 3/1273 [00:03<23:48,  1.12s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 4/1273 [00:04<23:42,  1.12s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 5/1273 [00:05<23:31,  1.11s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating predictions:   0%|          | 6/1273 [00:06<23:24,  1.11s/batch]Setting `pad_token_id` to `eos_token_id`:None for open-en

In [31]:
import csv

# Open the CSV file in write mode
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    
    # Optional: Write a header if needed
    writer.writerow(["Prediction"])

    # Write each item from the predictions list
    for item in predictions:
        writer.writerow([item])
