## Dependencies and Dataset

### Dependencies

In [None]:
# Upgrade the Hugging Face libraries used for fine-tuning to their latest releases.
# NOTE(review): `peft` is imported further down in this notebook but is not
# installed here - presumably it is preinstalled on the runtime; confirm.
!pip install -U transformers
!pip install -U accelerate

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.3
    Uninstalling transformers-4.48.3:
      Successfully uninstalled transformers-4.48.3
Successfully installed transformers-4.49.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.meta

### Dataset Parsing and checking issues

Parse XML file

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_absa_xml(file_path):
    """Parse an aspect-based sentiment XML file into one row per aspect term.

    The expected layout is a root element whose direct children are
    ``<sentence>`` elements, each holding a ``<text>`` element and an optional
    ``<aspectTerms>`` element with ``<aspectTerm term=... polarity=...>``
    children.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A DataFrame with columns ``sentence``, ``aspect`` and ``sentiment``.
        Sentences without an ``<aspectTerms>`` element contribute no rows.
        If a sentence has no ``<text>`` element, its rows carry a missing
        ``sentence`` value instead of aborting the whole parse, so the problem
        shows up in a later missing-value check.
    """
    root = ET.parse(file_path).getroot()

    data = []
    for sentence in root.findall("sentence"):
        aspects = sentence.find("aspectTerms")
        if aspects is None:
            # Nothing to label for this sentence.
            continue

        # `find` returns None when <text> is absent; guard before reading `.text`.
        text_node = sentence.find("text")
        text = text_node.text if text_node is not None else None

        for aspect in aspects.findall("aspectTerm"):
            data.append([text, aspect.get("term"), aspect.get("polarity")])

    return pd.DataFrame(data, columns=["sentence", "aspect", "sentiment"])

# Load both datasets
# The paths point at /content - presumably the XML files were uploaded to the
# Colab session beforehand; adjust them when running elsewhere.
laptop_df = parse_absa_xml("/content/Laptop_Train_v2.xml")
restaurant_df = parse_absa_xml("/content/Restaurants_Train_v2.xml")

# Combine both datasets
# ignore_index=True renumbers the rows 0..N-1 so the two sources do not share index labels.
absa_df = pd.concat([laptop_df, restaurant_df], ignore_index=True)

# Preview the first rows: one row per (sentence, aspect) pair.
absa_df.head()

Unnamed: 0,sentence,aspect,sentiment
0,I charge it at night and skip taking the cord ...,cord,neutral
1,I charge it at night and skip taking the cord ...,battery life,positive
2,The tech guy then said the service center does...,service center,negative
3,The tech guy then said the service center does...,"""sales"" team",negative
4,The tech guy then said the service center does...,tech guy,neutral


Check for Issues (Class Imbalance and Missing Values)

In [None]:
# Two quick data-quality checks: how many rows each sentiment label has,
# and how many missing values each column contains.
print(absa_df["sentiment"].value_counts())  # Check class distribution
print(absa_df.isnull().sum())  # Check for missing values

sentiment
positive    3151
negative    1671
neutral     1093
conflict     136
Name: count, dtype: int64
sentence     0
aspect       0
sentiment    0
dtype: int64


In [None]:
# Summary of the text columns: row count, number of distinct values,
# most frequent value and how often it occurs.
absa_df.describe()

Unnamed: 0,sentence,aspect,sentiment
count,6051,6051,6051
unique,3501,2304,4
top,There are several programs for school or offic...,food,positive
freq,13,357,3151


## Data Preparation Pipeline

### Data Cleaning & Normalization

In [None]:
# Install the Hugging Face `datasets` package (provides Dataset / DatasetDict used below).
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset, DatasetDict

# Normalise the sentence text in one pass:
#   1. turn any literal &quot; entity into a double quote,
#   2. collapse whitespace runs to a single space and trim the ends,
#   3. lowercase everything.
absa_df['sentence'] = (
    absa_df['sentence']
    .str.replace('&quot;', '"', regex=False)
    .map(lambda raw_text: re.sub(r'\s+', ' ', raw_text).strip())
    .str.lower()
)

# Keep only rows whose sentiment is not 'conflict', then discard exact duplicate rows.
absa_df = absa_df.loc[absa_df['sentiment'].ne('conflict')].drop_duplicates()

# Report the label balance and a summary of the cleaned frame.
print("\nSentiment distribution after cleaning (conflict dropped):")
print(absa_df["sentiment"].value_counts())

print("\nData description:")
print(absa_df.describe())


Sentiment distribution after cleaning (conflict dropped):
sentiment
positive    3121
negative    1633
neutral     1087
Name: count, dtype: int64

Data description:
                                                 sentence aspect sentiment
count                                                5841   5841      5841
unique                                               3429   2273         3
top     there are several programs for school or offic...   food  positive
freq                                                   13    339      3121


### Label Encoding

In [None]:
# Integer class ids for the three remaining sentiment labels.
# These ids become the model's label indices (the classifiers below use num_labels=3).
sentiment_mapping = {
    'positive': 1,
    'negative': 0,
    'neutral': 2
}
# Any sentiment value missing from the mapping would become NaN here.
absa_df['sentiment_encoded'] = absa_df['sentiment'].map(sentiment_mapping)

# Show each label next to its integer id as a sanity check of the mapping.
print("\nUnique sentiment labels after encoding:")
print(absa_df[['sentiment', 'sentiment_encoded']].drop_duplicates())


Unique sentiment labels after encoding:
  sentiment  sentiment_encoded
0   neutral                  2
1  positive                  1
2  negative                  0


### Splitting and Tokenization

In [None]:
# Split the DataFrame into training and testing sets (stratified)
# 20% of the rows are held out as the test set; stratifying on the sentiment
# label keeps the class proportions similar in both parts.
# NOTE(review): the split is row-wise, and the earlier describe() output shows
# the same sentence occurring in several rows (one per aspect), so a sentence
# can land in more than one split - consider a sentence-grouped split.
train_df, test_df = train_test_split(
    absa_df,
    test_size=0.2,
    stratify=absa_df['sentiment'],
    random_state=42
)

# Carve a validation set out of the remaining training rows (10% of them),
# again stratified on the sentiment label.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df['sentiment'],
    random_state=42
)

print("\nTrain shape:", train_df.shape,
      "Test shape:", test_df.shape,
      "Validation shape:", val_df.shape)

# Convert the train, test and validation DataFrames to Hugging Face Datasets
# reset_index(drop=True) discards the original row labels before conversion.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
validation_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# Create a DatasetDict with train, test and validation splits
dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset
})

dataset_dict


Train shape: (4204, 4) Test shape: (1169, 4) Validation shape: (468, 4)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'aspect', 'sentiment', 'sentiment_encoded'],
        num_rows: 4204
    })
    test: Dataset({
        features: ['sentence', 'aspect', 'sentiment', 'sentiment_encoded'],
        num_rows: 1169
    })
    validation: Dataset({
        features: ['sentence', 'aspect', 'sentiment', 'sentiment_encoded'],
        num_rows: 468
    })
})

In [None]:
# Look at the first example of each split to confirm the columns survived the conversion.
dataset_dict['train'][0], dataset_dict['test'][0], dataset_dict['validation'][0]

({'sentence': 'i hope to edit this in the next few hours, i am going to try to install my own copy of windows 7.',
  'aspect': 'Windows 7',
  'sentiment': 'neutral',
  'sentiment_encoded': 2},
 {'sentence': 'monday nights are a bargain at the $28 prix fix - this includes a three course meal plus *three* glasses of wine paired with each course.',
  'aspect': 'course',
  'sentiment': 'neutral',
  'sentiment_encoded': 2},
 {'sentence': "after 2 tries by the waiter to take it away (we hadn't even looked at it yet, we had full beers yet to drink), the manager approached and told us they needed the table for people with reservations.",
  'aspect': 'beers',
  'sentiment': 'neutral',
  'sentiment_encoded': 2})

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(example):
    """Tokenize (sentence, aspect) pairs for aspect-based sentiment classification.

    The aspect term is passed as the second segment of a sentence pair, so the
    encoded input tells the model WHICH aspect the label refers to. Encoding
    the sentence alone would give identical inputs to rows that share a
    sentence but carry different aspect labels.

    Args:
        example: A single example or a batch (as supplied by
            ``Dataset.map(..., batched=True)``) exposing ``"sentence"`` and
            ``"aspect"`` entries.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens.
    """
    return tokenizer(
        example["sentence"],
        example["aspect"],  # second segment: the aspect term being judged
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Apply tokenization to every split of the DatasetDict using map
# (batched=True hands the function a batch of examples at a time).
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Add a function to map the 'sentiment_encoded' to 'labels'
def add_labels(example):
    """Expose the encoded sentiment under the ``labels`` key.

    Works for a single example or a batch: whatever is stored under
    ``"sentiment_encoded"`` is stored again under ``"labels"``, and the same
    (mutated) mapping is returned.
    """
    encoded_sentiment = example["sentiment_encoded"]
    example["labels"] = encoded_sentiment
    return example

# Map the function over the dataset
# (batched=True: add_labels receives whole batches and copies the column as a list)
tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)

# (Optional) Remove columns that are no longer needed
# Here, you may want to remove the raw text and the original sentiment columns
# After this step only the tokenizer outputs and `labels` remain in each split.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence",
                                                        "aspect",
                                                        "sentiment",
                                                        "sentiment_encoded"])

print("\nTokenized train example:")
print(tokenized_datasets["train"][0])

tokenized_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

Map:   0%|          | 0/468 [00:00<?, ? examples/s]


Tokenized train example:
{'input_ids': [101, 1045, 3246, 2000, 10086, 2023, 1999, 1996, 2279, 2261, 2847, 1010, 1045, 2572, 2183, 2000, 3046, 2000, 16500, 2026, 2219, 6100, 1997, 3645, 1021, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4204
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1169
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 468
    })
})

## Model loading and Configuration

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig

# Load pretrained BERT with a freshly initialised 3-way classification head
# (one output per sentiment class).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3)

# Configure LoRA parameters
# Baseline configuration: rank-8 adapters on the attention query and value projections.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16, # alpha = 2 * r
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap your model with LoRA layers
model = get_peft_model(model, lora_config)
# Prints how many parameters are trainable versus the total.
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class for every sample (last axis = classes); if the model returned
            a tuple of outputs, its first element is used.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    # Some models hand back a tuple of outputs; the class scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # The predicted label is the class with the highest score.
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    # zero_division=0 yields the same value sklearn would fall back to when a
    # class is never predicted, without emitting an undefined-metric warning
    # on every evaluation.
    prec = precision_score(labels, predictions, average='weighted', zero_division=0)
    rec = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

In [None]:
import torch
from transformers import DataCollatorWithPadding

# Collates tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
training_args_lora = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # micro-batch size
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32 per device
    optim="adamw_torch",
    save_steps=200,  # checkpoint every 200 steps
    logging_steps=1,
    learning_rate=2e-4,  # step size in the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient clipping
    max_steps=-1,  # no step cap; run for num_train_epochs
    warmup_ratio=0.03,  # learning rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine"  # cosine lr scheduler
)

# No evaluation strategy is configured above, so the validation split is only
# scored when `trainer.evaluate()` is called explicitly.
trainer = Trainer(
    model=model,
    args=training_args_lora,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer (see the warning this notebook
    # printed); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Release unreferenced Python objects and cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmahkotasteam[0m ([33mmahkotasteam-asia-pacific-university-of-technology-innov[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
1,1.0741
2,0.9951
3,0.9828
4,1.0395
5,0.9442
6,1.065
7,0.9754
8,1.0356
9,0.9331
10,0.9297


TrainOutput(global_step=393, training_loss=0.7220549801833757, metrics={'train_runtime': 305.4572, 'train_samples_per_second': 41.289, 'train_steps_per_second': 1.287, 'total_flos': 827458971070464.0, 'train_loss': 0.7220549801833757, 'epoch': 2.981921979067555})

In [None]:
# Score the LoRA-1 model on the validation split (the `eval_dataset` given to `trainer`).
trainer.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.6324132084846497,
 'eval_accuracy': 0.7371794871794872,
 'eval_precision': 0.7216208342884588,
 'eval_recall': 0.7371794871794872,
 'eval_f1': 0.6919703755352902,
 'eval_runtime': 1.3264,
 'eval_samples_per_second': 352.837,
 'eval_steps_per_second': 44.482,
 'epoch': 2.981921979067555}

## LoRA-2 with r and alpha increased (r=16, alpha=32)

In [None]:
# Start again from the pretrained checkpoint so this run does not inherit earlier adapters.
model_2 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3)

# Configure LoRA parameters
# Same target modules as the baseline, with the rank and alpha doubled (r=16, alpha=32).
lora_config_2 = LoraConfig(
    r=16,
    lora_alpha=32, # alpha = 2 * r
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap your model with LoRA layers
model_2 = get_peft_model(model_2, lora_config_2)
# Prints how many parameters are trainable versus the total.
model_2.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 592,131 || all params: 110,076,678 || trainable%: 0.5379


In [None]:
# Collates tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
# Same settings as the first run except for the output directory and 5 epochs.
training_args_lora_2 = TrainingArguments(
    output_dir="./results-model_2",
    num_train_epochs=5,
    per_device_train_batch_size=2,  # micro-batch size
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32 per device
    optim="adamw_torch",
    save_steps=200,  # checkpoint every 200 steps
    logging_steps=1,
    learning_rate=2e-4,  # step size in the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient clipping
    max_steps=-1,  # no step cap; run for num_train_epochs
    warmup_ratio=0.03,  # learning rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine"  # cosine lr scheduler
)

# No evaluation strategy is configured above, so the validation split is only
# scored when `trainer_2.evaluate()` is called explicitly.
trainer_2 = Trainer(
    model=model_2,
    args=training_args_lora_2,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer (see the warning this notebook
    # printed); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Release unreferenced Python objects and cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer_2.train()

  trainer_2 = Trainer(


Step,Training Loss
1,1.1854
2,1.2079
3,1.1474
4,1.1206
5,1.217
6,1.1442
7,1.2028
8,1.1284
9,1.1407
10,1.136


TrainOutput(global_step=655, training_loss=0.6448268460863419, metrics={'train_runtime': 498.0747, 'train_samples_per_second': 42.203, 'train_steps_per_second': 1.315, 'total_flos': 1382947358515200.0, 'train_loss': 0.6448268460863419, 'epoch': 4.966698382492864})

In [None]:
# Score the LoRA-2 model on the validation split (the `eval_dataset` given to `trainer_2`).
trainer_2.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.5918135046958923,
 'eval_accuracy': 0.7713675213675214,
 'eval_precision': 0.7598617945840168,
 'eval_recall': 0.7713675213675214,
 'eval_f1': 0.7631507634389456,
 'eval_runtime': 1.3631,
 'eval_samples_per_second': 343.344,
 'eval_steps_per_second': 43.285,
 'epoch': 4.966698382492864}

## LoRA-3 with r and alpha increased to 32 and 64

In [None]:
# Start again from the pretrained checkpoint so this run does not inherit earlier adapters.
model_3 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=3)

# Configure LoRA parameters
# Same target modules as before, with rank 32 and alpha 64.
lora_config_3 = LoraConfig(
    r=32,
    lora_alpha=64, # alpha = 2 * r
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap your model with LoRA layers
model_3 = get_peft_model(model_3, lora_config_3)
# Prints how many parameters are trainable versus the total.
model_3.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,181,955 || all params: 110,666,502 || trainable%: 1.0680


In [None]:
# Collates tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
# Same settings as the earlier runs except for the output directory and 6 epochs.
training_args_lora_3 = TrainingArguments(
    output_dir="./results-model_3",
    num_train_epochs=6,
    per_device_train_batch_size=2,  # micro-batch size
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32 per device
    optim="adamw_torch",
    save_steps=200,  # checkpoint every 200 steps
    logging_steps=1,
    learning_rate=2e-4,  # step size in the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient clipping
    max_steps=-1,  # no step cap; run for num_train_epochs
    warmup_ratio=0.03,  # learning rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine"  # cosine lr scheduler
)

# No evaluation strategy is configured above, so the validation split is only
# scored when `trainer_3.evaluate()` is called explicitly.
trainer_3 = Trainer(
    model=model_3,
    args=training_args_lora_3,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer (see the warning this notebook
    # printed); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Release unreferenced Python objects and cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer_3.train()

  trainer_3 = Trainer(


Step,Training Loss
1,1.1758
2,1.1634
3,1.0982
4,1.073
5,1.126
6,1.1011
7,1.1572
8,1.1181
9,1.1321
10,1.1024


TrainOutput(global_step=786, training_loss=0.5756967399866526, metrics={'train_runtime': 576.2492, 'train_samples_per_second': 43.773, 'train_steps_per_second': 1.364, 'total_flos': 1670620073785344.0, 'train_loss': 0.5756967399866526, 'epoch': 5.959086584205519})

In [None]:
# Score the LoRA-3 model on the validation split (the `eval_dataset` given to `trainer_3`).
trainer_3.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.5980727076530457,
 'eval_accuracy': 0.7777777777777778,
 'eval_precision': 0.7657031243115098,
 'eval_recall': 0.7777777777777778,
 'eval_f1': 0.7674804274690797,
 'eval_runtime': 1.3268,
 'eval_samples_per_second': 352.721,
 'eval_steps_per_second': 44.467,
 'epoch': 5.959086584205519}

## LoRA-4 with additional target modules, `key` and `dense`

In [None]:
# Start again from the pretrained checkpoint so this run does not inherit earlier adapters.
model_4 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=3)

# Configure LoRA parameters
# Rank 32 / alpha 64 as in LoRA-3, but "key" and "dense" are added to the
# query/value targets used by the earlier configurations.
lora_config_4 = LoraConfig(
    r=32,
    lora_alpha=64, # alpha = 2 * r
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap your model with LoRA layers
model_4 = get_peft_model(model_4, lora_config_4)
# Prints how many parameters are trainable versus the total.
model_4.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,359,875 || all params: 114,844,422 || trainable%: 4.6671


In [None]:
# Collates tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
# Same settings as the LoRA-3 run except for the output directory.
training_args_lora_4 = TrainingArguments(
    output_dir="./results-model_4",
    num_train_epochs=6,
    per_device_train_batch_size=2,  # micro-batch size
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32 per device
    optim="adamw_torch",
    save_steps=200,  # checkpoint every 200 steps
    logging_steps=1,
    learning_rate=2e-4,  # step size in the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient clipping
    max_steps=-1,  # no step cap; run for num_train_epochs
    warmup_ratio=0.03,  # learning rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine"  # cosine lr scheduler
)

# No evaluation strategy is configured above, so the validation split is only
# scored when `trainer_4.evaluate()` is called explicitly.
trainer_4 = Trainer(
    model=model_4,
    args=training_args_lora_4,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer (see the warning this notebook
    # printed); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Release unreferenced Python objects and cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer_4.train()

  trainer_4 = Trainer(


Step,Training Loss
1,1.221
2,1.247
3,1.2935
4,1.2368
5,1.2244
6,1.1722
7,1.1331
8,1.1654
9,1.1083
10,1.0324


TrainOutput(global_step=786, training_loss=0.4589926237216736, metrics={'train_runtime': 990.8968, 'train_samples_per_second': 25.456, 'train_steps_per_second': 0.793, 'total_flos': 1751002987198464.0, 'train_loss': 0.4589926237216736, 'epoch': 5.959086584205519})

In [None]:
# Score the LoRA-4 model on the validation split (the `eval_dataset` given to `trainer_4`).
trainer_4.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.7281752228736877,
 'eval_accuracy': 0.7521367521367521,
 'eval_precision': 0.745759536178415,
 'eval_recall': 0.7521367521367521,
 'eval_f1': 0.748593640087723,
 'eval_runtime': 2.2541,
 'eval_samples_per_second': 207.62,
 'eval_steps_per_second': 26.174,
 'epoch': 5.959086584205519}

## LoRA-5, targeting all linear layers with r=32 and alpha=64

In [None]:
# Start again from the pretrained checkpoint so this run does not inherit earlier adapters.
model_5 = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=3)

# Configure LoRA parameters
# NOTE(review): the parameter summary printed for this model (5,359,875
# trainable) is identical to the one printed for LoRA-4, which suggests the
# bare "dense" entry already matches the intermediate/output dense layers and
# the two extra entries below are redundant - confirm against the PEFT
# target-module matching rules.
lora_config_5 = LoraConfig(
    r=32,
    lora_alpha=64, # alpha = 2 * r
    target_modules=["query", "key", "value", "dense",
                    "intermediate.dense", "output.dense"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap your model with LoRA layers
model_5 = get_peft_model(model_5, lora_config_5)
# Prints how many parameters are trainable versus the total.
model_5.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 5,359,875 || all params: 114,844,422 || trainable%: 4.6671


In [None]:
import torch
from transformers import DataCollatorWithPadding

# Collates tokenized examples into padded tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define the training arguments
# Same settings as the earlier runs except for the output directory and 5 epochs.
training_args_lora_5 = TrainingArguments(
    output_dir="./results-model_5",
    num_train_epochs=5,
    per_device_train_batch_size=2,  # micro-batch size
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32 per device
    optim="adamw_torch",
    save_steps=200,  # checkpoint every 200 steps
    logging_steps=1,
    learning_rate=2e-4,  # step size in the optimizer update
    weight_decay=0.001,
    fp16=True,  # 16-bit mixed precision
    bf16=False,  # not supported on V100
    max_grad_norm=0.3,  # gradient clipping
    max_steps=-1,  # no step cap; run for num_train_epochs
    warmup_ratio=0.03,  # learning rate warmup
    group_by_length=True,
    lr_scheduler_type="cosine"  # cosine lr scheduler
)

# No evaluation strategy is configured above, so the validation split is only
# scored when `trainer_5.evaluate()` is called explicitly.
trainer_5 = Trainer(
    model=model_5,
    args=training_args_lora_5,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer (see the warning this notebook
    # printed); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Release unreferenced Python objects and cached CUDA memory before training.
import gc
gc.collect()
torch.cuda.empty_cache()

trainer_5.train()

  trainer_5 = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmahkotasteam[0m ([33mmahkotasteam-asia-pacific-university-of-technology-innov[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
1,1.1077
2,1.1492
3,1.1379
4,1.1736
5,1.0882
6,1.1374
7,1.106
8,1.0918
9,1.0082
10,1.0196


TrainOutput(global_step=655, training_loss=0.5005192502309348, metrics={'train_runtime': 882.1958, 'train_samples_per_second': 23.827, 'train_steps_per_second': 0.742, 'total_flos': 1459402138460160.0, 'train_loss': 0.5005192502309348, 'epoch': 4.966698382492864})

In [None]:
# Score the LoRA-5 model on the validation split (the `eval_dataset` given to `trainer_5`).
trainer_5.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'eval_loss': 0.6837388277053833,
 'eval_accuracy': 0.7606837606837606,
 'eval_precision': 0.7555191514725055,
 'eval_recall': 0.7606837606837606,
 'eval_f1': 0.7578218750803828,
 'eval_runtime': 2.5757,
 'eval_samples_per_second': 181.698,
 'eval_steps_per_second': 22.906,
 'epoch': 4.966698382492864}

## Apply the `Temperature Scaling` calibration technique to model_5 to calibrate its confidence (and `eval_loss`)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from scipy.special import softmax

class ModelWithTemperature(nn.Module):
    """Wrap a classifier so that its logits are divided by a learnable temperature.

    The wrapped model must accept ``input_ids`` and ``attention_mask`` keyword
    arguments and return an object exposing a ``.logits`` tensor.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.temperature = nn.Parameter(torch.ones(1) * 1.5)  # Initialize T=1.5

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits divided by the temperature."""
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        return logits / self.temperature


def calibrate_temperature(model, valid_loader, device='cuda'):
    """
    Learn the optimal temperature on the validation set

    The network's logits are computed once, in eval mode and without gradient
    tracking, and only the scalar temperature is optimised against them. This
    keeps dropout out of the logits, avoids back-propagating through the
    network on every L-BFGS closure call, and leaves the network's own
    parameters without accumulated gradients.

    Args:
        model: Classifier returning an object with ``.logits``.
        valid_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: Device on which the model runs and the temperature is fitted.

    Returns:
        A ``ModelWithTemperature`` wrapping ``model``; the fitted value is in
        its ``temperature`` parameter. The model is left in eval mode.

    Raises:
        ValueError: If ``valid_loader`` yields no batches.
    """
    temperature_model = ModelWithTemperature(model)
    temperature_model = temperature_model.to(device)
    # Eval mode disables dropout so the cached logits are deterministic.
    temperature_model.eval()

    # Pass 1: run the network once and cache its raw (unscaled) logits.
    logit_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            logit_chunks.append(logits.float())
            label_chunks.append(labels)

    if not logit_chunks:
        raise ValueError("valid_loader produced no batches; cannot calibrate temperature")

    all_logits = torch.cat(logit_chunks)
    all_labels = torch.cat(label_chunks)

    # Pass 2: fit the temperature by minimising the NLL of the scaled logits.
    nll_criterion = nn.CrossEntropyLoss()
    optimizer = optim.LBFGS([temperature_model.temperature], lr=0.01, max_iter=50)

    def eval_step():
        # L-BFGS calls the closure repeatedly; gradients must be reset each
        # time or they accumulate across calls.
        optimizer.zero_grad()
        loss = nll_criterion(all_logits / temperature_model.temperature, all_labels)
        loss.backward()
        return loss

    optimizer.step(eval_step)
    return temperature_model

# Create validation dataloader
valid_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator
)

# Calibrate the model
print("Starting temperature calibration...")
calibrated_model = calibrate_temperature(model_5, valid_dataloader)
optimal_temperature = calibrated_model.temperature.item()
print(f"Optimal temperature: {optimal_temperature:.3f}")

# Function to get calibrated predictions
def get_calibrated_predictions(model, dataloader, temperature, device='cuda'):
    """Run ``model`` over ``dataloader`` and return temperature-scaled probabilities.

    Args:
        model: classifier returning an object with ``.logits``.
        dataloader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        temperature: divisor applied to the logits before the softmax.
        device: device each batch is moved to.

    Returns:
        ``(probs, labels)`` as NumPy arrays, one row / entry per sample.
    """
    model.eval()
    prob_rows, label_rows = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            scaled_logits = model(input_ids=ids, attention_mask=mask).logits / temperature
            batch_probs = scaled_logits.softmax(dim=-1)

            prob_rows.extend(batch_probs.cpu().numpy())
            label_rows.extend(targets.cpu().numpy())

    return np.array(prob_rows), np.array(label_rows)

# Get calibrated predictions
# Uses the same validation loader the temperature was fitted on; the returned
# labels are reused below as the reference for both ECE computations.
calibrated_probs, true_labels = get_calibrated_predictions(
    model_5,
    valid_dataloader,
    optimal_temperature
)

# Function to compute ECE
def compute_ece(probs, labels, n_bins=15):
    """Expected Calibration Error over equal-width confidence bins.

    Samples are grouped by their top-class probability into ``n_bins`` bins on
    [0, 1]; the result is the sample-weighted mean of |accuracy - confidence|
    per bin.

    Args:
        probs: array of shape (n_samples, n_classes) with class probabilities.
        labels: array of shape (n_samples,) with the true class indices.
        n_bins: number of equal-width bins.

    Returns:
        The ECE as a Python float; ``0.0`` when there are no samples.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == labels

    # Shared edges guarantee the bins tile [0, 1] with no gaps or overlaps.
    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for bin_index in range(n_bins):
        bin_lower = bin_edges[bin_index]
        bin_upper = bin_edges[bin_index + 1]
        if bin_index == n_bins - 1:
            # Close the last bin on the right: a saturated softmax yields a
            # confidence of exactly 1.0, which a half-open bin would drop.
            bin_mask = (confidences >= bin_lower) & (confidences <= bin_upper)
        else:
            bin_mask = (confidences >= bin_lower) & (confidences < bin_upper)

        if np.any(bin_mask):
            bin_accuracy = np.mean(accuracies[bin_mask])
            bin_confidence = np.mean(confidences[bin_mask])
            bin_weight = np.sum(bin_mask) / total_samples
            ece += bin_weight * np.abs(bin_accuracy - bin_confidence)

    return float(ece)

# Get uncalibrated predictions first
def get_uncalibrated_predictions(model, dataloader, device='cuda'):
    """Run ``model`` over ``dataloader`` and return plain softmax probabilities.

    Args:
        model: classifier returning an object with ``.logits``.
        dataloader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: device each batch is moved to.

    Returns:
        ``(probs, labels)`` as NumPy arrays, one row / entry per sample.
    """
    model.eval()
    prob_rows, label_rows = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            raw_logits = model(input_ids=ids, attention_mask=mask).logits
            batch_probs = raw_logits.softmax(dim=-1)

            prob_rows.extend(batch_probs.cpu().numpy())
            label_rows.extend(targets.cpu().numpy())

    return np.array(prob_rows), np.array(label_rows)

# Compute ECE before and after calibration
# The labels returned here are discarded; `true_labels` from the calibrated
# pass is used for both scores (same loader, same order).
# NOTE(review): both ECE values are measured on the same validation split the
# temperature was fitted on, so the "after" figure is likely optimistic.
uncalibrated_probs, _ = get_uncalibrated_predictions(model_5, valid_dataloader)
ece_before = compute_ece(uncalibrated_probs, true_labels)
ece_after = compute_ece(calibrated_probs, true_labels)

print(f"ECE Before Calibration: {ece_before:.3f}")
print(f"ECE After Calibration: {ece_after:.3f}")

Starting temperature calibration...


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 16.12 MiB is free. Process 22430 has 14.72 GiB memory in use. Of the allocated memory 14.59 GiB is allocated by PyTorch, and 3.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from scipy.special import softmax
import gc

class ModelWithTemperature(nn.Module):
    """Classifier wrapper that rescales logits by a single learnable temperature.

    ``model`` is invoked with ``input_ids`` / ``attention_mask`` keyword
    arguments and must return an object with a ``.logits`` attribute.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # Temperature is a one-element parameter initialised to 1.5.
        initial_temperature = torch.full((1,), 1.5)
        self.temperature = nn.Parameter(initial_temperature)

    def forward(self, input_ids, attention_mask):
        """Return ``logits / temperature`` for the given batch."""
        model_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return model_output.logits / self.temperature

def calibrate_temperature(model, valid_loader, device='cuda'):
    """
    Memory-efficient temperature calibration.

    Sub-batching and ``empty_cache()`` cannot bound memory while one summed
    loss still references the autograd graph of every forward pass. Since the
    wrapped model's weights are not being trained, its logits are instead
    computed once under ``torch.no_grad()`` and cached; L-BFGS then optimises
    only the scalar temperature against those cached logits.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning an object with ``.logits``.
        valid_loader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: device the model and batches are moved to.

    Returns:
        The ``ModelWithTemperature`` wrapper holding the fitted temperature.
        As a side effect the wrapped model is moved to ``device`` and left in
        eval mode.

    Raises:
        ValueError: if ``valid_loader`` yields no batches.
    """
    # Clear GPU memory before starting
    torch.cuda.empty_cache()
    gc.collect()

    temperature_model = ModelWithTemperature(model)
    temperature_model = temperature_model.to(device)
    temperature_model.eval()  # Ensure model is in eval mode

    # One autograd-free pass over the validation set; only logits are kept.
    logit_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = temperature_model.model(
                input_ids=input_ids, attention_mask=attention_mask
            )
            logit_chunks.append(outputs.logits.float())
            label_chunks.append(batch['labels'].to(device))

    if not logit_chunks:
        raise ValueError("valid_loader yielded no batches; cannot calibrate temperature")

    all_logits = torch.cat(logit_chunks)
    all_labels = torch.cat(label_chunks)

    nll_criterion = nn.CrossEntropyLoss()
    optimizer = optim.LBFGS([temperature_model.temperature], lr=0.01, max_iter=50)

    def eval_step():
        # Called repeatedly by L-BFGS; each call builds a tiny fresh graph
        # (division + cross-entropy) over the cached logits.
        optimizer.zero_grad()
        avg_loss = nll_criterion(all_logits / temperature_model.temperature, all_labels)
        avg_loss.backward()
        return avg_loss

    optimizer.step(eval_step)

    return temperature_model

# Create a smaller validation dataloader
# Rebinds `valid_dataloader` with batch_size=4; `tokenized_datasets` and
# `data_collator` are expected to exist from earlier cells.
valid_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=4,  # Reduced batch size
    collate_fn=data_collator,
    shuffle=False  # keep a fixed sample order across passes over the loader
)

# Clear GPU memory before starting
# empty_cache() only returns PyTorch's cached-but-unused blocks to the driver;
# tensors that are still referenced stay allocated.
torch.cuda.empty_cache()
gc.collect()

# Helpers are defined at cell level (not inside the try block) so they exist
# even when calibration fails part-way through.

def get_predictions(model, dataloader, temperature=None, device='cuda'):
    """Return ``(probs, labels)`` NumPy arrays for every sample in ``dataloader``.

    Args:
        model: classifier returning an object with ``.logits``.
        dataloader: iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        temperature: optional divisor applied to the logits before the
            softmax; ``None`` leaves the logits unscaled.
        device: device each batch is moved to.
    """
    model.eval()
    all_probs = []
    all_labels = []

    # Under no_grad nothing is retained between batches, so whole batches can
    # be processed directly without sub-batching or per-step empty_cache().
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            if temperature is not None:
                logits = logits / temperature

            probs = torch.softmax(logits, dim=-1)

            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(batch['labels'].numpy())

    return np.array(all_probs), np.array(all_labels)


def compute_ece(probs, labels, n_bins=15):
    """Expected Calibration Error over ``n_bins`` equal-width confidence bins.

    Returns the sample-weighted mean of |accuracy - confidence| per bin as a
    float, or ``0.0`` when there are no samples.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == labels

    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for bin_index in range(n_bins):
        bin_lower = bin_edges[bin_index]
        bin_upper = bin_edges[bin_index + 1]
        if bin_index == n_bins - 1:
            # Right-closed last bin so a confidence of exactly 1.0 is counted.
            bin_mask = (confidences >= bin_lower) & (confidences <= bin_upper)
        else:
            bin_mask = (confidences >= bin_lower) & (confidences < bin_upper)
        if np.any(bin_mask):
            bin_accuracy = np.mean(accuracies[bin_mask])
            bin_confidence = np.mean(confidences[bin_mask])
            bin_samples = np.sum(bin_mask)
            ece += (bin_samples / total_samples) * np.abs(bin_accuracy - bin_confidence)

    return float(ece)


print("Starting temperature calibration...")
try:
    calibrated_model = calibrate_temperature(model_5, valid_dataloader)
    optimal_temperature = calibrated_model.temperature.item()
    print(f"Optimal temperature: {optimal_temperature:.3f}")

    # Get calibrated and uncalibrated predictions
    print("Computing predictions...")
    uncalibrated_probs, true_labels = get_predictions(model_5, valid_dataloader)
    calibrated_probs, _ = get_predictions(model_5, valid_dataloader, optimal_temperature)

    # Compute ECE
    ece_before = compute_ece(uncalibrated_probs, true_labels)
    ece_after = compute_ece(calibrated_probs, true_labels)

    print(f"ECE Before Calibration: {ece_before:.3f}")
    print(f"ECE After Calibration: {ece_after:.3f}")

except RuntimeError as e:
    # CUDA OOM surfaces as a RuntimeError subclass; print hints, then re-raise
    # with the original traceback intact.
    if "out of memory" in str(e):
        print("Still running into memory issues. Try these steps:")
        print("1. Reduce sub_batch_size further")
        print("2. Move part of validation set to CPU")
        print("3. Use gradient checkpointing")
    raise

Starting temperature calibration...
Still running into memory issues. Try these steps:
1. Reduce sub_batch_size further
2. Move part of validation set to CPU
3. Use gradient checkpointing


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 22430 has 14.74 GiB memory in use. Of the allocated memory 14.58 GiB is allocated by PyTorch, and 19.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Subset
import gc

class ModelWithTemperature(nn.Module):
    """Classifier wrapper that divides logits by a learnable temperature.

    On construction, gradient checkpointing is switched on for the wrapped
    model when it exposes ``gradient_checkpointing_enable``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # One-element parameter initialised to T=1.5.
        self.temperature = nn.Parameter(torch.full((1,), 1.5))

        # Enable gradient checkpointing
        supports_checkpointing = hasattr(self.model, 'gradient_checkpointing_enable')
        if supports_checkpointing:
            self.model.gradient_checkpointing_enable()
            print("Gradient checkpointing enabled")

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits divided by the temperature."""
        model_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return model_output.logits / self.temperature

def calibrate_temperature(model, valid_loader, device='cuda', chunk_size=100):
    """
    Temperature calibration that never backpropagates through the model.

    Gradient checkpointing and chunking do not help while one summed loss
    keeps the autograd graph of every sample alive. The wrapped model's
    weights are not trained here, so its logits are computed once under
    ``torch.no_grad()`` (chunk by chunk, one sample per forward pass) and
    cached; L-BFGS then fits only the scalar temperature on the cached logits.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning an object with ``.logits``.
        valid_loader: DataLoader whose ``dataset`` and ``collate_fn`` are
            reused to build the per-chunk loaders.
        device: device the model and batches are moved to.
        chunk_size: number of dataset samples per chunk; progress is printed
            once per chunk.

    Returns:
        The ``ModelWithTemperature`` wrapper holding the fitted temperature.
        The wrapped model is moved to ``device`` and left in eval mode.

    Raises:
        ValueError: if the validation dataset is empty.
    """
    torch.cuda.empty_cache()
    gc.collect()

    temperature_model = ModelWithTemperature(model)
    temperature_model = temperature_model.to(device)
    temperature_model.eval()

    # Get total dataset size
    total_samples = len(valid_loader.dataset)
    if total_samples == 0:
        raise ValueError("validation dataset is empty; cannot calibrate temperature")

    # Single autograd-free pass: collect logits and labels for every sample.
    logit_chunks = []
    label_chunks = []
    processed_samples = 0
    with torch.no_grad():
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            chunk_dataset = Subset(valid_loader.dataset, range(start_idx, end_idx))
            chunk_loader = DataLoader(
                chunk_dataset,
                batch_size=1,  # Process one sample at a time
                collate_fn=valid_loader.collate_fn
            )

            for batch in chunk_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = temperature_model.model(
                    input_ids=input_ids, attention_mask=attention_mask
                )
                logit_chunks.append(outputs.logits.float())
                label_chunks.append(batch['labels'].to(device))
                processed_samples += input_ids.size(0)

            print(f"Processed {processed_samples}/{total_samples} samples")

    all_logits = torch.cat(logit_chunks)
    all_labels = torch.cat(label_chunks)

    nll_criterion = nn.CrossEntropyLoss()
    optimizer = optim.LBFGS([temperature_model.temperature], lr=0.01, max_iter=50)

    def eval_step():
        # Re-evaluated several times by L-BFGS; only the division and the
        # cross-entropy are part of the graph.
        optimizer.zero_grad()
        avg_loss = nll_criterion(all_logits / temperature_model.temperature, all_labels)
        avg_loss.backward()
        return avg_loss

    optimizer.step(eval_step)
    return temperature_model

# Reuses `valid_dataloader` from the earlier cell; only its dataset and
# collate_fn are used to build small per-chunk loaders.
# Helpers are defined at cell level (not inside the try block) so they exist
# even when calibration fails part-way through.

def get_predictions(model, dataloader, temperature=None, device='cuda', chunk_size=50):
    """Return ``(probs, labels)`` NumPy arrays for every sample of ``dataloader.dataset``.

    Args:
        model: classifier returning an object with ``.logits``.
        dataloader: DataLoader whose ``dataset`` and ``collate_fn`` are reused.
        temperature: optional divisor applied to the logits before the
            softmax; ``None`` leaves the logits unscaled.
        device: device each batch is moved to.
        chunk_size: number of samples per chunk; progress is printed once per
            chunk.
    """
    model.eval()
    all_probs = []
    all_labels = []
    total_samples = len(dataloader.dataset)

    # Under no_grad nothing is retained between samples, so no explicit
    # del / empty_cache() is needed inside the loop.
    with torch.no_grad():
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            chunk_dataset = Subset(dataloader.dataset, range(start_idx, end_idx))
            chunk_loader = DataLoader(
                chunk_dataset,
                batch_size=1,
                collate_fn=dataloader.collate_fn
            )

            for batch in chunk_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                if temperature is not None:
                    logits = logits / temperature

                probs = torch.softmax(logits, dim=-1)

                all_probs.extend(probs.cpu().numpy())
                all_labels.extend(batch['labels'].numpy())

            print(f"Processed predictions for {end_idx}/{total_samples} samples")

    return np.array(all_probs), np.array(all_labels)


def compute_ece(probs, labels, n_bins=15):
    """Expected Calibration Error over ``n_bins`` equal-width confidence bins.

    Returns the sample-weighted mean of |accuracy - confidence| per bin as a
    float, or ``0.0`` when there are no samples.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == labels

    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for bin_index in range(n_bins):
        bin_lower = bin_edges[bin_index]
        bin_upper = bin_edges[bin_index + 1]
        if bin_index == n_bins - 1:
            # Right-closed last bin so a confidence of exactly 1.0 is counted.
            bin_mask = (confidences >= bin_lower) & (confidences <= bin_upper)
        else:
            bin_mask = (confidences >= bin_lower) & (confidences < bin_upper)
        if np.any(bin_mask):
            bin_accuracy = np.mean(accuracies[bin_mask])
            bin_confidence = np.mean(confidences[bin_mask])
            bin_samples = np.sum(bin_mask)
            ece += (bin_samples / total_samples) * np.abs(bin_accuracy - bin_confidence)

    return float(ece)


print("Starting temperature calibration with gradient checkpointing...")
try:
    calibrated_model = calibrate_temperature(
        model_5,
        valid_dataloader,
        chunk_size=50  # Process 50 samples at a time
    )
    optimal_temperature = calibrated_model.temperature.item()
    print(f"Optimal temperature: {optimal_temperature:.3f}")

    print("Computing predictions...")
    uncalibrated_probs, true_labels = get_predictions(model_5, valid_dataloader)
    calibrated_probs, _ = get_predictions(model_5, valid_dataloader, optimal_temperature)

    ece_before = compute_ece(uncalibrated_probs, true_labels)
    ece_after = compute_ece(calibrated_probs, true_labels)

    print(f"ECE Before Calibration: {ece_before:.3f}")
    print(f"ECE After Calibration: {ece_after:.3f}")

except RuntimeError as e:
    # CUDA OOM surfaces as a RuntimeError subclass; print hints, then re-raise
    # with the original traceback intact.
    if "out of memory" in str(e):
        print("Still experiencing memory issues. Try these additional steps:")
        print("1. Reduce chunk_size further (currently 50)")
        print("2. Enable mixed precision (fp16)")
        print("3. Reduce model precision to float16")
    raise

Starting temperature calibration with gradient checkpointing...
Gradient checkpointing enabled
Still experiencing memory issues. Try these additional steps:
1. Reduce chunk_size further (currently 50)
2. Enable mixed precision (fp16)
3. Reduce model precision to float16


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 22430 has 14.74 GiB memory in use. Of the allocated memory 14.59 GiB is allocated by PyTorch, and 11.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler
import gc

class ModelWithTemperature(nn.Module):
    """Half-precision classifier wrapper with a float32 learnable temperature.

    The wrapped model is converted with ``model.half()``, which converts the
    module in place, so the caller's model object is affected as well. The
    temperature itself stays in float32: it is the quantity being optimised,
    and float16 cannot represent the small L-BFGS updates around 1.5
    accurately. ``forward`` returns float32 scaled logits.
    """

    def __init__(self, model):
        super(ModelWithTemperature, self).__init__()
        # Convert model to half precision
        self.model = model.half()
        self.temperature = nn.Parameter(torch.ones(1, dtype=torch.float32) * 1.5)

        # Enable gradient checkpointing
        if hasattr(self.model, 'gradient_checkpointing_enable'):
            self.model.gradient_checkpointing_enable()
            print("Gradient checkpointing enabled")

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits divided by the temperature."""
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type='cuda'):
            logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Scale in float32 so the division and the downstream loss are stable.
        return logits.float() / self.temperature

def calibrate_temperature(model, valid_loader, device='cuda', chunk_size=10):
    """
    Memory-efficient temperature calibration.

    The wrapped model's logits are computed once under ``torch.no_grad()``
    (chunk by chunk, one sample per forward pass) and cached as float32.
    L-BFGS then fits a float32 copy of the temperature on the cached logits,
    and the result is written back into the wrapper's parameter. No
    GradScaler is involved: nothing is backpropagated through the
    half-precision model, so there are no fp16 gradients to protect.

    ``input_ids`` are deliberately left as integer tensors; casting them to
    half makes the embedding lookup fail. Errors raised during calibration
    propagate to the caller instead of being swallowed, so a failed run can
    no longer report the initial temperature as the optimum.

    Args:
        model: classifier called as ``model(input_ids=..., attention_mask=...)``
            and returning an object with ``.logits``.
        valid_loader: DataLoader whose ``dataset`` and ``collate_fn`` are
            reused to build the per-chunk loaders.
        device: device the model and batches are moved to.
        chunk_size: number of dataset samples per chunk; progress is printed
            once per chunk.

    Returns:
        The ``ModelWithTemperature`` wrapper holding the fitted temperature.
        The wrapped model is moved to ``device`` and left in eval mode.

    Raises:
        ValueError: if the validation dataset is empty.
    """
    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()

    # Initialize model with optimizations
    temperature_model = ModelWithTemperature(model)
    temperature_model = temperature_model.to(device)
    temperature_model.eval()

    # Get total dataset size
    total_samples = len(valid_loader.dataset)
    if total_samples == 0:
        raise ValueError("validation dataset is empty; cannot calibrate temperature")

    # Single autograd-free pass: collect logits and labels for every sample.
    logit_chunks = []
    label_chunks = []
    processed_samples = 0
    with torch.no_grad():
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            chunk_dataset = Subset(valid_loader.dataset, range(start_idx, end_idx))
            chunk_loader = DataLoader(
                chunk_dataset,
                batch_size=1,
                collate_fn=valid_loader.collate_fn
            )

            for batch in chunk_loader:
                # Token ids and mask keep their integer dtype.
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = temperature_model.model(
                    input_ids=input_ids, attention_mask=attention_mask
                )
                logit_chunks.append(outputs.logits.float())
                label_chunks.append(batch['labels'].to(device))
                processed_samples += input_ids.size(0)

            print(f"Processed {processed_samples}/{total_samples} samples")

    all_logits = torch.cat(logit_chunks)
    all_labels = torch.cat(label_chunks)

    # Optimise a float32 copy so the result does not depend on the dtype of
    # the wrapper's parameter.
    temperature = temperature_model.temperature.detach().clone().float().requires_grad_(True)

    nll_criterion = nn.CrossEntropyLoss()
    optimizer = optim.LBFGS([temperature], lr=0.01, max_iter=50)

    def eval_step():
        # The closure must return a loss whose backward() has populated the
        # gradient of the optimised tensor; L-BFGS calls it repeatedly.
        optimizer.zero_grad()
        avg_loss = nll_criterion(all_logits / temperature, all_labels)
        avg_loss.backward()
        return avg_loss

    optimizer.step(eval_step)

    with torch.no_grad():
        temperature_model.temperature.copy_(temperature)

    return temperature_model

# Try to free up as much memory as possible
torch.cuda.empty_cache()
gc.collect()

# Helpers are defined at cell level (not inside the try block) so they exist
# even when calibration fails part-way through.

def get_predictions(model, dataloader, temperature=None, device='cuda', chunk_size=5):
    """Return ``(probs, labels)`` NumPy arrays for every sample of ``dataloader.dataset``.

    Args:
        model: classifier returning an object with ``.logits``.
        dataloader: DataLoader whose ``dataset`` and ``collate_fn`` are reused.
        temperature: optional divisor applied to the logits before the
            softmax; ``None`` leaves the logits unscaled.
        device: device each batch is moved to.
        chunk_size: number of samples per chunk; progress is printed once per
            chunk.
    """
    model.eval()
    all_probs = []
    all_labels = []
    total_samples = len(dataloader.dataset)

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
    with torch.no_grad(), torch.autocast(device_type='cuda'):
        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            chunk_dataset = Subset(dataloader.dataset, range(start_idx, end_idx))
            chunk_loader = DataLoader(
                chunk_dataset,
                batch_size=1,
                collate_fn=dataloader.collate_fn
            )

            for batch in chunk_loader:
                # Token ids must stay integer: the embedding lookup rejects
                # half-precision indices.
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                # Softmax in float32 regardless of the model's precision.
                logits = logits.float()
                if temperature is not None:
                    logits = logits / temperature

                probs = torch.softmax(logits, dim=-1)

                all_probs.extend(probs.cpu().numpy())
                all_labels.extend(batch['labels'].numpy())

            print(f"Processed predictions for {end_idx}/{total_samples} samples")

    return np.array(all_probs), np.array(all_labels)


def compute_ece(probs, labels, n_bins=15):
    """Expected Calibration Error over ``n_bins`` equal-width confidence bins.

    Returns the sample-weighted mean of |accuracy - confidence| per bin as a
    float, or ``0.0`` when there are no samples.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    total_samples = len(labels)
    if total_samples == 0:
        return 0.0

    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == labels

    bin_edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for bin_index in range(n_bins):
        bin_lower = bin_edges[bin_index]
        bin_upper = bin_edges[bin_index + 1]
        if bin_index == n_bins - 1:
            # Right-closed last bin so a confidence of exactly 1.0 is counted.
            bin_mask = (confidences >= bin_lower) & (confidences <= bin_upper)
        else:
            bin_mask = (confidences >= bin_lower) & (confidences < bin_upper)
        if np.any(bin_mask):
            bin_accuracy = np.mean(accuracies[bin_mask])
            bin_confidence = np.mean(confidences[bin_mask])
            bin_samples = np.sum(bin_mask)
            ece += (bin_samples / total_samples) * np.abs(bin_accuracy - bin_confidence)

    return float(ece)


print("Starting ultra memory-efficient temperature calibration...")

try:
    # Convert model to half precision before calibration
    # (nn.Module.half() converts in place and returns the same module).
    model_5 = model_5.half()

    calibrated_model = calibrate_temperature(
        model_5,
        valid_dataloader,
        chunk_size=5  # Very small chunk size
    )
    optimal_temperature = calibrated_model.temperature.item()
    print(f"Optimal temperature: {optimal_temperature:.3f}")

    print("Computing predictions...")
    uncalibrated_probs, true_labels = get_predictions(model_5, valid_dataloader)
    calibrated_probs, _ = get_predictions(model_5, valid_dataloader, optimal_temperature)

    ece_before = compute_ece(uncalibrated_probs, true_labels)
    ece_after = compute_ece(calibrated_probs, true_labels)

    print(f"ECE Before Calibration: {ece_before:.3f}")
    print(f"ECE After Calibration: {ece_after:.3f}")

except RuntimeError as e:
    # CUDA OOM surfaces as a RuntimeError subclass; print hints, then re-raise
    # with the original traceback intact.
    if "out of memory" in str(e):
        print("\nStill experiencing memory issues. Additional suggestions:")
        print("1. Try running this on CPU only (much slower but should work)")
        print("2. Reduce the validation set size temporarily")
        print("3. Try a simpler calibration method like Platt scaling")
    raise

Starting ultra memory-efficient temperature calibration...
Gradient checkpointing enabled
Error during optimization: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)
Current temperature value: 1.5
Optimal temperature: 1.500
Computing predictions...


  scaler = GradScaler()
  with torch.set_grad_enabled(True), autocast():
  with autocast():
  with torch.no_grad(), autocast():


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Subset
from torch.amp import autocast, GradScaler
import gc

class ModelWithTemperature(nn.Module):
    """Half-precision classifier wrapper with a float32 learnable temperature.

    The wrapped model is converted with ``model.half()``, which converts the
    module in place, so the caller's model object is affected as well. The
    temperature itself stays in float32: it is the quantity being optimised,
    and float16 cannot represent the small L-BFGS updates around 1.5
    accurately. ``forward`` returns float32 scaled logits.
    """

    def __init__(self, model):
        super(ModelWithTemperature, self).__init__()
        self.model = model.half()  # Convert model to half precision
        self.temperature = nn.Parameter(torch.ones(1, dtype=torch.float32) * 1.5)

        if hasattr(self.model, 'gradient_checkpointing_enable'):
            self.model.gradient_checkpointing_enable()
            print("Gradient checkpointing enabled")

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits divided by the temperature."""
        with autocast('cuda'):
            # input_ids must stay an integer tensor for the embedding lookup.
            logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Scale in float32 so the division and the downstream loss are stable.
        return logits.float() / self.temperature

def calibrate_temperature(model, valid_loader, device='cuda', chunk_size=10):
    """
    Fit a single softmax temperature on the validation set by minimising the
    negative log-likelihood (temperature scaling).

    The wrapped model's logits do not depend on the temperature, so they are
    computed exactly once under ``torch.no_grad()`` and cached. L-BFGS then
    optimises the temperature on the cached logits only, which avoids
    re-running (and back-propagating through) the network on every closure
    evaluation. No ``GradScaler`` is used: L-BFGS needs true, unscaled
    gradients, and the optimisation runs in float32.

    Args:
        model: classification model; it is wrapped in ``ModelWithTemperature``,
            which converts it to half precision.
        valid_loader: DataLoader whose ``dataset`` and ``collate_fn`` are reused
            to build small per-chunk loaders. Batches must provide the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: device the model and batches are moved to. Autocast is always
            requested for ``'cuda'``, as in the rest of this notebook.
        chunk_size: number of samples per chunk; the CUDA cache is released and
            progress is printed after each chunk.

    Returns:
        The ``ModelWithTemperature`` wrapper. Its ``temperature`` holds the
        fitted value, or the initial value if the validation set is empty, the
        optimisation raised a ``RuntimeError`` (which is printed, not
        re-raised), or the result was non-finite / non-positive.
    """
    torch.cuda.empty_cache()
    gc.collect()

    temperature_model = ModelWithTemperature(model)
    temperature_model = temperature_model.to(device)
    temperature_model.eval()

    nll_criterion = nn.CrossEntropyLoss()
    dataset = valid_loader.dataset
    total_samples = len(dataset)

    try:
        # Pass 1: collect logits and labels once, without building a graph.
        cached_logits = []
        cached_labels = []
        processed_samples = 0

        for start_idx in range(0, total_samples, chunk_size):
            end_idx = min(start_idx + chunk_size, total_samples)
            chunk_loader = DataLoader(
                Subset(dataset, range(start_idx, end_idx)),
                batch_size=1,
                collate_fn=valid_loader.collate_fn
            )

            with torch.no_grad(), autocast('cuda'):
                for batch in chunk_loader:
                    # Keep input_ids as Long, convert attention_mask to half
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device).half()

                    logits = temperature_model.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask
                    ).logits

                    # Only the small (batch, num_classes) logits are kept, in float32.
                    cached_logits.append(logits.float())
                    cached_labels.append(batch['labels'].to(device))
                    processed_samples += input_ids.size(0)

                    del input_ids, attention_mask, logits

            torch.cuda.empty_cache()
            print(f"Processed {processed_samples}/{total_samples} samples")

        if not cached_logits:
            print("Validation set is empty; temperature left at its initial value")
            return temperature_model

        all_logits = torch.cat(cached_logits)
        all_labels = torch.cat(cached_labels)

        # Pass 2: optimise a float32 copy of the temperature on the cached logits.
        temperature = (
            temperature_model.temperature.detach().clone().float().requires_grad_(True)
        )
        optimizer = optim.LBFGS([temperature], lr=0.01, max_iter=50)

        def eval_step():
            # Closure for L-BFGS: mean NLL and its gradient are consistent here.
            optimizer.zero_grad()
            loss = nll_criterion(all_logits / temperature, all_labels)
            loss.backward()
            return loss

        optimizer.step(eval_step)

        if torch.isfinite(temperature).all() and (temperature > 0).all():
            with torch.no_grad():
                # copy_ casts to whatever dtype the wrapper's parameter uses.
                temperature_model.temperature.copy_(temperature)
        else:
            print("Optimization produced a non-finite or non-positive temperature; "
                  "keeping the initial value")
    except RuntimeError as e:
        # Best-effort behaviour kept from the original: report and fall through.
        print(f"Error during optimization: {str(e)}")
        print("Current temperature value:", temperature_model.temperature.item())

    return temperature_model

# Clear memory before starting
# (release cached CUDA blocks and run the Python garbage collector)
torch.cuda.empty_cache()
gc.collect()

print("Starting type-aware temperature calibration...")

# NOTE: model_5 and valid_dataloader are not defined in this cell; they must
# already exist in the kernel (presumably from earlier cells).
try:
    # Convert model to half precision
    # (ModelWithTemperature, used by calibrate_temperature, also calls .half()
    # on the model it wraps)
    model_5 = model_5.half()

    # Fit the temperature on the validation loader; the returned wrapper
    # exposes the fitted value as its `temperature` parameter.
    calibrated_model = calibrate_temperature(
        model_5,
        valid_dataloader,
        chunk_size=5
    )
    # .item() turns the one-element temperature tensor into a Python float
    optimal_temperature = calibrated_model.temperature.item()
    print(f"Optimal temperature: {optimal_temperature:.3f}")

    def get_predictions(model, dataloader, temperature=None, device='cuda', chunk_size=5):
        """
        Run ``model`` over ``dataloader.dataset`` one sample at a time and
        collect softmax probabilities together with the labels.

        Args:
            model: callable as ``model(input_ids=..., attention_mask=...)``,
                returning an object with a ``.logits`` attribute.
            dataloader: only its ``dataset`` and ``collate_fn`` are used.
            temperature: when not ``None``, logits are divided by this value
                before the softmax.
            device: device the inputs are moved to (labels are left as-is).
            chunk_size: number of samples per chunk; progress is printed
                after every chunk.

        Returns:
            ``(probs, labels)`` as NumPy arrays built from the per-sample rows.
        """
        model.eval()

        dataset = dataloader.dataset
        n_total = len(dataset)
        prob_rows, label_rows = [], []

        for chunk_start in range(0, n_total, chunk_size):
            chunk_stop = min(chunk_start + chunk_size, n_total)
            # A fresh batch-size-1 loader over just this slice of the dataset.
            single_item_loader = DataLoader(
                Subset(dataset, range(chunk_start, chunk_stop)),
                batch_size=1,
                collate_fn=dataloader.collate_fn,
            )

            with torch.no_grad(), autocast('cuda'):
                for item in single_item_loader:
                    # Token ids keep their integer dtype; the mask is cast to half.
                    ids = item['input_ids'].to(device)
                    mask = item['attention_mask'].to(device).half()
                    targets = item['labels']

                    scores = model(input_ids=ids, attention_mask=mask).logits
                    if temperature is not None:
                        scores = scores / temperature
                    probabilities = torch.softmax(scores, dim=-1)

                    prob_rows.extend(probabilities.cpu().numpy())
                    label_rows.extend(targets.numpy())

                    # Drop references and release cached GPU memory right away.
                    del ids, mask, scores, probabilities
                    torch.cuda.empty_cache()

            print(f"Processed predictions for {chunk_stop}/{n_total} samples")

        return np.array(prob_rows), np.array(label_rows)

    # Two passes over the validation set with the same model: the first leaves
    # the logits untouched, the second divides them by the fitted temperature.
    print("Computing predictions...")
    uncalibrated_probs, true_labels = get_predictions(model_5, valid_dataloader)
    # Labels are identical in both passes, so the second copy is discarded.
    calibrated_probs, _ = get_predictions(model_5, valid_dataloader, optimal_temperature)

    def compute_ece(probs, labels, n_bins=15):
        """
        Expected Calibration Error with ``n_bins`` equal-width confidence bins.

        Each sample's confidence is its maximum class probability. Samples are
        grouped into bins over [0, 1]; the ECE is the sample-weighted mean of
        ``|accuracy - mean confidence|`` over the non-empty bins.

        Bins are ``[lower, upper)`` except the last one, which is closed on
        the right so that samples with confidence exactly 1.0 are counted
        instead of being silently dropped.

        Args:
            probs: array-like of shape (n_samples, n_classes) with class
                probabilities.
            labels: array-like of shape (n_samples,) with true class indices.
            n_bins: number of equal-width bins.

        Returns:
            The ECE as a Python float; 0.0 when there are no samples.
        """
        probs = np.asarray(probs)
        labels = np.asarray(labels)

        total_samples = len(labels)
        if total_samples == 0:
            return 0.0

        confidences = np.max(probs, axis=1)
        predictions = np.argmax(probs, axis=1)
        accuracies = predictions == labels

        # Shared edges guarantee that adjacent bins neither overlap nor leave
        # gaps (adding 1/n_bins to each lower edge can drift by an ulp).
        bin_edges = np.linspace(0, 1, n_bins + 1)

        ece = 0.0
        for bin_index in range(n_bins):
            bin_lower = bin_edges[bin_index]
            bin_upper = bin_edges[bin_index + 1]

            if bin_index == n_bins - 1:
                # Last bin includes its upper edge (confidence == 1.0).
                bin_mask = (confidences >= bin_lower) & (confidences <= bin_upper)
            else:
                bin_mask = (confidences >= bin_lower) & (confidences < bin_upper)

            if np.any(bin_mask):
                bin_accuracy = np.mean(accuracies[bin_mask])
                bin_confidence = np.mean(confidences[bin_mask])
                bin_samples = np.sum(bin_mask)
                ece += (bin_samples / total_samples) * np.abs(bin_accuracy - bin_confidence)

        return float(ece)

    # Expected calibration error of the raw and of the temperature-scaled
    # probabilities, both measured against the same labels.
    ece_before = compute_ece(uncalibrated_probs, true_labels)
    ece_after = compute_ece(calibrated_probs, true_labels)

    print(f"ECE Before Calibration: {ece_before:.3f}")
    print(f"ECE After Calibration: {ece_after:.3f}")

except RuntimeError as e:
    # Troubleshooting hints are printed only when the error message mentions
    # "out of memory"; other RuntimeErrors get no extra output.
    if "out of memory" in str(e):
        print("\nStill experiencing memory issues. Last resort options:")
        print("1. Run on CPU only")
        print("2. Use a smaller subset of validation data")
        print("3. Try Platt scaling instead")
    # Every RuntimeError is re-raised, so the cell still fails visibly.
    raise e

Starting type-aware temperature calibration...
Gradient checkpointing enabled
Processed 5/468 samples
Processed 10/468 samples
Processed 15/468 samples
Processed 20/468 samples
Processed 25/468 samples
Processed 30/468 samples
Processed 35/468 samples
Processed 40/468 samples
Processed 45/468 samples
Processed 50/468 samples
Processed 55/468 samples
Processed 60/468 samples
Processed 65/468 samples
Processed 70/468 samples
Processed 75/468 samples
Processed 80/468 samples
Processed 85/468 samples
Processed 90/468 samples
Processed 95/468 samples
Processed 100/468 samples
Processed 105/468 samples
Processed 110/468 samples
Processed 115/468 samples
Processed 120/468 samples
Processed 125/468 samples
Processed 130/468 samples
Processed 135/468 samples
Processed 140/468 samples
Processed 145/468 samples
Processed 150/468 samples
Processed 155/468 samples
Processed 160/468 samples
Processed 165/468 samples
Processed 170/468 samples
Processed 175/468 samples
Processed 180/468 samples
Proces