In [None]:
!pip install -q transformers datasets accelerate peft bitsandbytes wandb psutil

import os
import time
import psutil
import torch
import wandb
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from peft import get_peft_model, LoraConfig, TaskType

# Authenticate with Weights & Biases up front; the training runs below report to wandb.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33madichats-2003[0m ([33madichats-2003-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
pip install -U datasets huggingface_hub fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [None]:
# Load the "imdb" dataset; the cells below use its "train" and "test" splits.
dataset = load_dataset("imdb")
# Tokenizer for the same "distilbert-base-uncased" checkpoint that build_model() loads.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(example, max_length=256):
    """Tokenize the ``'text'`` entry of a dataset example (or batch of examples).

    Every sequence is truncated and padded to exactly ``max_length`` tokens, so
    all rows come out with the same length.

    Args:
        example: Mapping with a ``'text'`` entry, as passed by ``Dataset.map``.
        max_length: Fixed token length to pad/truncate to. Defaults to 256,
            the value this notebook has always used.

    Returns:
        The output of the module-level ``tokenizer`` call for that text.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Sample the small train/validation subsets FIRST, then tokenize only those rows.
# Mapping the whole DatasetDict tokenized every split (including the unused
# "unsupervised" one) just to keep 2,500 examples. shuffle(seed=42) depends only
# on the seed and the split length, so the same rows are selected either way.
train_data = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(2000))
    .map(preprocess, batched=True)
)
val_data = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(500))
    .map(preprocess, batched=True)
)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
def get_memory_usage():
    """Report current process memory and peak GPU memory, both in GiB.

    Returns:
        A ``(cpu_mem, gpu_mem)`` tuple. ``cpu_mem`` is the resident set size of
        this process. ``gpu_mem`` is the peak CUDA memory allocated since the
        peak counter was last reset, or ``0.0`` when CUDA is unavailable.

    Side effects:
        When CUDA is available, the peak-memory counter is reset after being
        read, so the next call starts tracking from that point.
    """
    bytes_per_gib = 1024**3
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    cpu_gib = rss_bytes / bytes_per_gib

    if not torch.cuda.is_available():
        return cpu_gib, 0.0

    gpu_gib = torch.cuda.max_memory_allocated() / bytes_per_gib
    torch.cuda.reset_peak_memory_stats()
    return cpu_gib, gpu_gib

In [None]:
def build_model(method="full"):
    """Build a 2-label DistilBERT sequence classifier for the given tuning method.

    Args:
        method: ``"full"`` returns the plain pretrained classifier (all weights
            trainable); ``"lora"`` wraps it with a LoRA adapter via PEFT.

    Returns:
        The model to hand to ``Trainer``.

    Raises:
        NotImplementedError: If ``method`` is neither ``"full"`` nor ``"lora"``.
    """
    # Validate before loading anything so a typo fails immediately instead of
    # after the pretrained checkpoint has been downloaded and instantiated.
    if method not in ("full", "lora"):
        raise NotImplementedError("Only 'full' and 'lora' implemented so far.")

    base = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    if method == "full":
        return base

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        # NOTE(review): "q_lin"/"v_lin" are presumably the DistilBERT attention
        # projections and "query"/"value" the BERT-style names, kept so other
        # backbones could be swapped in. Confirm against the model's modules.
        target_modules=["q_lin", "v_lin", "query", "value"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    return get_peft_model(base, lora_config)

In [None]:
def compute_metrics(eval_pred: EvalPrediction):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, argmax
            taken over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        ``{"accuracy": <fraction of rows whose argmax equals the label>}`` as a
        plain Python float.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.astype(np.float32).mean().item()}

def train_model(method="full"):
    """Fine-tune one model variant for a single epoch and record its cost.

    Builds the model with ``build_model(method)``, trains it on ``train_data``,
    evaluates on ``val_data``, and logs accuracy, wall-clock training time and
    memory usage to a dedicated wandb run named ``"{method}_run"``.

    Args:
        method: Tuning method forwarded to ``build_model`` (``"full"`` or
            ``"lora"``).

    Returns:
        ``(metrics, wall_time, cpu_mem, gpu_mem)`` where ``metrics`` is the dict
        from ``trainer.evaluate()``, ``wall_time`` is the duration of
        ``trainer.train()`` in seconds, and the memory values come from
        ``get_memory_usage()``.

    Raises:
        NotImplementedError: Propagated from ``build_model`` for unknown methods.
    """
    import gc
    import inspect

    # Start each run from a clean GPU slate. get_memory_usage() resets the peak
    # counter while the previous run's model/trainer are still alive, so without
    # this a later run's "peak" could be inflated by the earlier run's memory.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    model = build_model(method)
    run = wandb.init(project="llm-finetuning-cost-analysis", name=f"{method}_run", reinit=True)

    try:
        training_args = TrainingArguments(
            output_dir=f"./results_{method}",
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            eval_strategy="epoch",
            num_train_epochs=1,
            logging_steps=10,
            save_strategy="no",
            report_to=["wandb"],
            fp16=torch.cuda.is_available(),
            # Set explicitly: Trainer warns that it cannot infer label names
            # through the PEFT wrapper used for the "lora" method.
            label_names=["labels"],
        )

        # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
        # Pick whichever keyword the installed version accepts.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            compute_metrics=compute_metrics,
            **{tokenizer_kwarg: tokenizer},
        )

        # perf_counter is monotonic, so the duration is immune to clock changes.
        start_time = time.perf_counter()
        trainer.train()
        wall_time = time.perf_counter() - start_time

        metrics = trainer.evaluate()
        cpu_mem, gpu_mem = get_memory_usage()

        print(f"[{method.upper()}] Training time: {wall_time:.2f} seconds")
        print(f"Accuracy: {metrics['eval_accuracy']:.4f}")
        print(f"CPU Mem Used: {cpu_mem:.2f} GB, GPU Mem Used: {gpu_mem:.2f} GB")

        wandb.log({
            "final_accuracy": metrics['eval_accuracy'],
            "wall_time": wall_time,
            "cpu_mem_used_gb": cpu_mem,
            "gpu_mem_used_gb": gpu_mem
        })
    finally:
        # Always close the wandb run, even if training or evaluation fails, so
        # a crashed run does not stay open.
        run.finish()

    return metrics, wall_time, cpu_mem, gpu_mem

In [None]:
# Train each variant in turn and collect one summary row per method.
results = []
for method in ("full", "lora"):
    metrics, wall_time, cpu_mem, gpu_mem = train_model(method)
    results.append(
        dict(
            method=method,
            accuracy=metrics["eval_accuracy"],
            time_sec=wall_time,
            cpu_mem_gb=cpu_mem,
            gpu_mem_gb=gpu_mem,
        )
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2439,0.289712,0.876


[FULL] Training time: 22.73 seconds
Accuracy: 0.8760
CPU Mem Used: 2.36 GB, GPU Mem Used: 1.68 GB


0,1
cpu_mem_used_gb,▁
eval/accuracy,▁▁
eval/loss,▁▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
final_accuracy,▁
gpu_mem_used_gb,▁
train/epoch,▁▂▂▃▃▄▅▅▆▆▇████
train/global_step,▁▂▂▃▃▄▅▅▆▆▇█████

0,1
cpu_mem_used_gb,2.36047
eval/accuracy,0.876
eval/loss,0.28971
eval/runtime,1.1603
eval/samples_per_second,430.936
eval/steps_per_second,27.58
final_accuracy,0.876
gpu_mem_used_gb,1.68058
total_flos,132467398656000.0
train/epoch,1.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.638,0.637528,0.796


[LORA] Training time: 13.04 seconds
Accuracy: 0.7960
CPU Mem Used: 2.38 GB, GPU Mem Used: 0.98 GB


0,1
cpu_mem_used_gb,▁
eval/accuracy,▁▁
eval/loss,▁▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
final_accuracy,▁
gpu_mem_used_gb,▁
train/epoch,▁▂▂▃▃▄▅▅▆▆▇████
train/global_step,▁▂▂▃▃▄▅▅▆▆▇█████

0,1
cpu_mem_used_gb,2.37839
eval/accuracy,0.796
eval/loss,0.63753
eval/runtime,1.0111
eval/samples_per_second,494.505
eval/steps_per_second,31.648
final_accuracy,0.796
gpu_mem_used_gb,0.98478
total_flos,134739406848000.0
train/epoch,1.0


In [None]:
# Hourly prices used for the cost estimate.
# NOTE(review): these rates are hard-coded; confirm they are still current.
cost_table = pd.DataFrame({
    "GPU Instance": ["A100 40GB", "V100 16GB", "T4 16GB"],
    "Hourly Cost (USD)": [3.06, 2.48, 0.35],
})

# Read the A100 rate from cost_table instead of repeating the number, so the
# estimate cannot drift from the table that gets printed and exported.
a100_hourly_usd = cost_table.set_index("GPU Instance").at["A100 40GB", "Hourly Cost (USD)"]

df = pd.DataFrame(results)
# Cost = training hours x hourly rate; time_sec is the measured training time.
df["estimated_cost_usd (A100)"] = (df["time_sec"] / 3600) * a100_hourly_usd

print("\n=== Results ===")
print(df)
print("\n=== Cloud Cost Table ===")
print(cost_table)

# Save for export
cost_table.to_csv("cloud_gpu_costs.csv", index=False)
df.to_csv("finetune_results.csv", index=False)  # optionally log to wandb as artifact


=== Results ===
  method  accuracy   time_sec  cpu_mem_gb  gpu_mem_gb  \
0   full     0.876  22.726092    2.360474    1.680577   
1   lora     0.796  13.038663    2.378391    0.984779   

   estimated_cost_usd (A100)  
0                   0.019317  
1                   0.011083  

=== Cloud Cost Table ===
  GPU Instance  Hourly Cost (USD)
0    A100 40GB               3.06
1    V100 16GB               2.48
2      T4 16GB               0.35
