In [1]:
!pip install transformers accelerate bitsandbytes datasets evaluate peft adapter-transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting adapter-transformers
  Downloading adapter_transformers-4.0.0.tar.gz (2.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting adapters (from adapter-transformers)
  Downloading adapters-1.0.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84

In [3]:
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/snli")

train_data = dataset['train'].select(range(0, len(dataset['train']), 550))[:1000]
test_data = dataset['test'].select(range(0, len(dataset['test']), 100))[:100]
val_data = dataset['validation'].select(range(0, len(dataset['validation']), 100))[:100]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig, 
    AutoModelForSequenceClassification
)
import torch
import time
import pandas as pd
import numpy as np

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
    )

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", device_map='auto', quantization_config=bnb_config)
# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", device_map={"": 0}, quantization_config=bnb_config).to("cuda")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
train_data.keys()

dict_keys(['premise', 'hypothesis', 'label'])

In [6]:
import re

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id

def classify_nli(premise, hypothesis, max_length=10):
    """
    Given a premise and hypothesis, uses the model to classify the relationship as entailment (0), neutral (1), or contradiction (2).
    """
    # Adjusted prompt with explicit instructions
    prompt = (
        f"Premise: \"{premise}\"\n"
        f"Hypothesis: \"{hypothesis}\"\n"
        "What is the relationship?\n"
        "0 - Entailment\n"
        "1 - Neutral\n"
        "2 - Contradiction\n"
        "The answer is number ..."
    )
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate output with controlled max length to restrict unnecessary text
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.1,  # Low temperature for more deterministic output
        num_return_sequences=1, 
        pad_token_id=pad_token_id
    )
    
    # Decode the generated answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # print(answer)
    
    match = re.search(r"The answer is number?\s*([0-2])", answer)
    if match:
        return int(match.group(1))
    
    # Return None if classification failed
    return 0


In [7]:
print(classify_nli("A person on a horse jumps over a broken down airplane.", "A person is at a diner, ordering an omelette."))

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


2


In [9]:
correct_predictions = 0
total_predictions = len(test_data['premise'])  # Use the length of any of the lists, as they should be the same length

# Loop through each example in train_data
for i in range(total_predictions):
    premise = test_data['premise'][i]
    hypothesis = test_data['hypothesis'][i]
    true_label = test_data['label'][i]

    # Get the predicted label using the classify_nli function
    predicted_label = classify_nli(premise, hypothesis)

    # Check if the prediction matches the true label
    if predicted_label == true_label:
        correct_predictions += 1

    # Print progress every 100 samples
    if (i + 1) % 25 == 0 or (i + 1) == total_predictions:
        print(f"Processed {i + 1}/{total_predictions} samples...")

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy on the training data: {accuracy * 100:.2f}%")


Processed 25/100 samples...
Processed 50/100 samples...
Processed 75/100 samples...
Processed 100/100 samples...
Accuracy on the training data: 44.00%


In [23]:
from datasets import Dataset

# Create a new list to hold the formatted examples
formatted_train_data = []

tokenizer.pad_token = tokenizer.eos_token
for i in range(len(train_data['premise'])):
    premise = train_data['premise'][i]
    hypothesis = train_data['hypothesis'][i]
    label = train_data['label'][i]
    
    # Create the prompt combining premise and hypothesis
    prompt = f"Premise: {premise}\nHypothesis: {hypothesis}\nWhat is the relationship?\n0 - Entailment\n1 - Neutral\n2 - Contradiction\nThe answer is number ..."
    output_string = f"{label}"

    # Tokenizing the input
    tokenized_inputs = tokenizer(prompt, padding="max_length", truncation=True, max_length=512)
    # Tokenizing the output
    tokenized_outputs = tokenizer(output_string, padding="max_length", truncation=True, max_length=512)
    
    
    # Append the new format to the list
    formatted_train_data.append({
        "input_ids": tokenized_inputs['input_ids'],  # Extract only the input_ids tensor
        "labels": tokenized_outputs['input_ids']
    })

formatted_train_dataset = Dataset.from_list(formatted_train_data)

In [24]:
from peft import get_peft_model, LoraConfig, TaskType
import torch
import time
import psutil
import os

# Define QLoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)

# Wrap the model with QLoRA
model_qlora = get_peft_model(model, lora_config)

In [25]:
model_qlora.print_trainable_parameters()

trainable params: 18,350,080 || all params: 2,798,033,920 || trainable%: 0.6558


In [28]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./models",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1,  # Save model after each epoch
    save_total_limit=5,
    fp16=True,
)

# Define Trainer
trainer = Trainer(
    model=model_qlora,
    args=training_args,
    train_dataset=formatted_train_dataset,
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
torch.cuda.empty_cache() 
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Timing the fine-tuning process
start_time = time.time()
trainer.train()
end_time = time.time()

# 978226dac872d3d7058236614a51baf131fa2dae
# Save the model after training
trainer.save_model()

# Print the results
print(f"Training completed in {end_time - start_time:.2f} seconds.")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112993733334204, max=1.0…

Step,Training Loss
10,7.2812
20,1.7687
30,0.1766
40,0.1014
50,0.0633
60,0.0365
70,0.0175
80,0.0097
90,0.0062
100,0.0043


Training completed in 3360.78 seconds.


In [33]:
def classify_nli_finetuned(premise, hypothesis, max_length=10):
    """
    Given a premise and hypothesis, uses the model to classify the relationship as entailment (0), neutral (1), or contradiction (2).
    """
    # Adjusted prompt with explicit instructions
    prompt = (
        f"Premise: \"{premise}\"\n"
        f"Hypothesis: \"{hypothesis}\"\n"
        "What is the relationship?\n"
        "0 - Entailment\n"
        "1 - Neutral\n"
        "2 - Contradiction\n"
        "The answer is number ..."
    )
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate output with controlled max length to restrict unnecessary text
    outputs = model_qlora.generate(
        **inputs,
        max_new_tokens=max_length,
        temperature=0.1,  # Low temperature for more deterministic output
        num_return_sequences=1, 
        pad_token_id=pad_token_id
    )
    
    # Decode the generated answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    match = re.search(r"The answer is number?\s*([0-2])", answer)
    if match:
        return int(match.group(1))
    
    # Return None if classification failed
    return 0


correct_predictions = 0
total_predictions = len(test_data['premise'])  # Use the length of any of the lists, as they should be the same length

# Loop through each example in train_data
for i in range(total_predictions):
    premise = test_data['premise'][i]
    hypothesis = test_data['hypothesis'][i]
    true_label = test_data['label'][i]

    # Get the predicted label using the classify_nli function
    predicted_label = classify_nli_finetuned(premise, hypothesis)

    # Check if the prediction matches the true label
    if predicted_label == true_label:
        correct_predictions += 1

    # Print progress every 100 samples
    if (i + 1) % 25 == 0 or (i + 1) == total_predictions:
        print(f"Processed {i + 1}/{total_predictions} samples...")

Processed 25/100 samples...
Processed 50/100 samples...
Processed 75/100 samples...
Processed 100/100 samples...


In [2]:
# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy on the training data: {accuracy * 100:.2f}%")

model.save_adapter("./finetuned_model", "my_adapter")

Accuracy on the training data: 64%


In [20]:
!zip -r checkpoint-309.zip models/checkpoint-309
!zip -r checkpoint-306.zip models/checkpoint-306
!zip -r checkpoint-308.zip models/checkpoint-308

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: models/checkpoint-309/ (stored 0%)
  adding: models/checkpoint-309/adapter_model.safetensors (deflated 8%)
  adding: models/checkpoint-309/adapter_config.json (deflated 52%)
  adding: models/checkpoint-309/trainer_state.json (deflated 77%)
  adding: models/checkpoint-309/rng_state.pth (deflated 25%)
  adding: models/checkpoint-309/optimizer.pt (deflated 8%)
  adding: models/checkpoint-309/training_args.bin (deflated 52%)
  adding: models/checkpoint-309/README.md (deflated 66%)
  adding: models/checkpoint-309/scheduler.pt (deflated 56%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: models/checkpoint-306/ (stored 0%)
  adding: models/checkpoint-306/adapter_model.safetensors (deflated 8%)
  adding: models/checkpoint-306/adapter_config.json (deflated 52%)
  adding: models/checkpoint-306/trainer_state.json (deflated 77%)
  adding: models/checkpoint-306/rng_state.pth (deflated 25%)
  adding: models/checkpoint-306/optimizer.pt (deflated 8%)
  adding: models/checkpoint-306/training_args.bin (deflated 52%)
  adding: models/checkpoint-306/README.md (deflated 66%)
  adding: models/checkpoint-306/scheduler.pt (deflated 56%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: models/checkpoint-308/ (stored 0%)
  adding: models/checkpoint-308/adapter_model.safetensors (deflated 8%)
  adding: models/checkpoint-308/adapter_config.json (deflated 52%)
  adding: models/checkpoint-308/trainer_state.json (deflated 77%)
  adding: models/checkpoint-308/rng_state.pth (deflated 25%)
  adding: models/checkpoint-308/optimizer.pt (deflated 8%)
  adding: models/checkpoint-308/training_args.bin (deflated 52%)
  adding: models/checkpoint-308/README.md (deflated 66%)
  adding: models/checkpoint-308/scheduler.pt (deflated 55%)
