In [1]:
# !pip install datasets==2.19.2 > /dev/null 2>&1
# !pip install peft > /dev/null 2>&1
# !pip install wandb > /dev/null 2>&1

In [1]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries imported in the following cells — consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
import time
import torch
import wandb
import psutil
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from datasets import load_dataset, load_metric, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorWithPadding

In [4]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of premise/hypothesis pairs for ``Dataset.map``.

    Uses the module-level ``tokenizer``. Every pair is truncated and padded to
    exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping with ``'premise'`` and ``'hypothesis'`` columns.
        max_length: Fixed sequence length after truncation/padding
            (defaults to 128, the value previously hard-coded).

    Returns:
        The tokenizer's encoding for the batch.
    """
    # No `return_tensors` here: the mapped columns are read back as plain lists
    # (the evaluation loop wraps them in `torch.tensor(...)` itself), so building
    # PyTorch tensors at this point would only add a wasted conversion.
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

def count_parameters(model):
    """Count the parameters of ``model``.

    Returns:
        A ``(total, trainable)`` tuple: the number of elements across all
        parameters, and the number across parameters with ``requires_grad`` set.
    """
    total_params = 0
    fine_tuned_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            fine_tuned_params += size
    return total_params, fine_tuned_params

def compute_metrics(pred):
    """Compute accuracy and cross-entropy loss for a ``Trainer`` evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (class logits, or a tuple whose
            first element is the logits) and ``label_ids`` (integer class ids).

    Returns:
        ``{"accuracy": float, "eval_loss": float}`` with plain Python floats.
    """
    logits = pred.predictions
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Normalise dtypes: cross_entropy needs floating-point logits and int64
    # targets, whatever precision the evaluation loop produced.
    preds = torch.as_tensor(logits).float()
    labels = torch.as_tensor(pred.label_ids).long()

    # Exact fraction of matching predictions, computed in Python floats.
    correct = (preds.argmax(dim=-1) == labels).sum().item()
    accuracy = correct / labels.numel()
    loss = F.cross_entropy(preds, labels).item()

    # NOTE(review): the Trainer also reports its own loss under the `eval_loss`
    # key — confirm which value ends up in the final metrics dict.
    return {"accuracy": accuracy, "eval_loss": loss}


In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [6]:
model_name = "microsoft/phi-2"

# 4-bit NF4 quantisation, float16 compute, double quantisation disabled.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Quantised checkpoint with a 3-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=3, quantization_config=bnb_config
)

# Slow tokenizer, left-padded, with BOS/EOS insertion requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
snli_dataset = load_dataset('stanfordnlp/snli')

# Evenly strided subsamples of each split. The trailing slice materialises the
# selected rows; the result is what the next cell passes to `Dataset.from_dict`.
train_subset = snli_dataset["train"].select(range(0, 550000, 550))
train_data = train_subset[:1000]

test_subset = snli_dataset["test"].select(range(0, 10000, 100))
test_data = test_subset[:100]

validation_subset = snli_dataset["validation"].select(range(0, 10000, 100))
validation_data = validation_subset[:100]

In [8]:
def has_gold_label(example):
    """Return True when the example's label is a real class id (not -1)."""
    return example['label'] != -1


# Rebuild Dataset objects from the sampled columns and drop rows labelled -1 from
# EVERY split, not only validation: -1 is not a valid class index for the 3-label
# classifier, so such a row would break the loss during training or evaluation.
# NOTE(review): -1 is presumably SNLI's marker for "no gold label" — confirm
# against the dataset card.
train_dataset = Dataset.from_dict(train_data).filter(has_gold_label)
val_dataset = Dataset.from_dict(validation_data).filter(has_gold_label)
test_dataset = Dataset.from_dict(test_data).filter(has_gold_label)

Filter: 100%|██████████| 100/100 [00:00<00:00, 25799.99 examples/s]


In [9]:
# Batch collator that pads with `tokenizer`. It is built before the pad token is
# assigned in the following cell.
# NOTE(review): presumably the collator keeps a reference to the same tokenizer
# object, so the later pad-token assignment is still seen at batch time — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Reuse the end-of-sequence token for padding and tell the model about it.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the three splits in batches (train, then test, then validation).
train_data, test_data, validation_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4721.60 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3115.99 examples/s]
Map: 100%|██████████| 99/99 [00:00<00:00, 2927.93 examples/s]


## Base Model

In [11]:
# Accuracy of the quantised base model (untrained classification head) on the
# test subsample, one example at a time.
# Plain counting replaces `datasets.load_metric`, which is deprecated.
model.eval()

num_correct = 0
num_seen = 0
for example in tqdm(test_data, desc="Evaluating"):
    inputs = {
        'input_ids': torch.tensor(example['input_ids']).unsqueeze(0).to(device),
        'attention_mask': torch.tensor(example['attention_mask']).unsqueeze(0).to(device)
    }

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Batch of one: compare the single predicted class id with the gold label.
    num_correct += int(predictions.item() == example['label'])
    num_seen += 1

# Same shape as before: a dict holding the score under the "accuracy" key.
accuracy = {"accuracy": num_correct / num_seen if num_seen else float("nan")}
print("Accuracy before Fine-Tuning:", accuracy["accuracy"])

Evaluating: 100%|██████████| 100/100 [00:06<00:00, 14.96it/s]


Accuracy before Fine-Tuning: 0.35


## Fine-Tuning

In [12]:
# LoRA adapter settings for sequence classification: rank 8, alpha 32, 1% dropout.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.01,
)

# Prepare the quantised model for k-bit training, then wrap it with the adapters.
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, config)

In [13]:
# Report how many of the wrapped model's parameters are trainable.
param_counts = count_parameters(peft_model)
total_params, fine_tuned_params = param_counts
print(f"Total parameters: {total_params}, Fine-tuned parameters: {fine_tuned_params}")

Total parameters: 1399459840, Fine-tuned parameters: 9182720


In [14]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=4,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=1e-3,
    weight_decay=0.01,
    fp16=True,
    # 8 examples per device x 8 accumulation steps = 64 examples per optimizer step.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
)

# Trainer over the LoRA-wrapped model; validation metrics come from `compute_metrics`.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
)


In [15]:
import os

# SECURITY: never commit a W&B API key to the notebook. The key that used to be
# hard-coded here must be treated as leaked and revoked in the W&B settings page.
# Export WANDB_API_KEY in the environment instead; when it is unset, `key=None`
# lets `wandb.login` fall back to its own credential lookup / interactive prompt.
api_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=api_key)

wandb.init(project="LLM Assigment 3", name="Fine-Tuning using QLoRA")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mabhijeetanand23[0m ([33mabhijeetanand23-indraprastha-institute-of-information-te[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/arunb/.netrc


In [16]:
# Ask PyTorch to release its cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [17]:
# Run fine-tuning with the `Trainer` configured above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Accuracy
1,17.1136,1.758603,0.383838
2,11.781,1.171289,0.393939
3,8.2236,0.997068,0.535354
4,6.7152,0.83644,0.606061
5,5.3803,0.717982,0.707071


TrainOutput(global_step=20, training_loss=9.842738246917724, metrics={'train_runtime': 1612.3344, 'train_samples_per_second': 3.101, 'train_steps_per_second': 0.012, 'total_flos': 9702447513600000.0, 'train_loss': 9.842738246917724, 'epoch': 5.0})

In [20]:
# Evaluate the fine-tuned model on the held-out test subsample.
test_results = trainer.evaluate(test_data)

# (label, metrics key, trailing text) for every reported line, in print order.
report_fields = [
    ("Evaluation Loss", 'eval_loss', ""),
    ("Evaluation Accuracy", 'eval_accuracy', ""),
    ("Evaluation Runtime", 'eval_runtime', " seconds"),
    ("Samples Per Second", 'eval_samples_per_second', ""),
    ("Steps Per Second", 'eval_steps_per_second', ""),
    ("Epoch", 'epoch', ""),
]

print("Evaluation Results:")
for label, key, suffix in report_fields:
    print(f"{label}: {test_results[key]}{suffix}")

Evaluation Results:
Evaluation Loss: 0.8540907502174377
Evaluation Accuracy: 0.61
Evaluation Runtime: 10.8695 seconds
Samples Per Second: 9.2
Steps Per Second: 0.368
Epoch: 5.0
