In [1]:
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset
from transformers import PreTrainedTokenizerFast, AutoTokenizer, AutoModelForCausalLM
import torch

def load_data(data_dir, labels_dir, tokenizer, max_length=1024):
    data_files = sorted([f for f in os.listdir(data_dir) if f.endswith('.txt')])
    label_files = sorted([f for f in os.listdir(labels_dir) if f.endswith('.txt')])

    dataset = []

    for data_file, label_file in zip(data_files, label_files):
        with open(os.path.join(data_dir, data_file), 'r', encoding='utf-8') as df, open(os.path.join(labels_dir, label_file), 'r', encoding='utf-8') as lf:
            data_content = df.read().strip()
            label_content = lf.read().strip()

            # Skip empty files
            if not data_content or not label_content:
                print(f"Skipping empty file: {data_file} or {label_file}")
                continue

            # Tokenize data and labels
            data = tokenizer(data_content,
                             padding="max_length",
                             truncation=True,
                             max_length=700,
                             return_tensors="pt")
            label = tokenizer(label_content,
                              padding="max_length",
                              truncation=True,
                              max_length=700,
                              return_tensors="pt")

            # Ensure input_ids and labels match in length
            if data["input_ids"].shape[1] != label["input_ids"].shape[1]:
                raise ValueError(f"Length mismatch in file: {data_file} or {label_file}")

            # Append to dataset
            dataset.append({"input_ids": data["input_ids"].squeeze(0),
                            "attention_mask": data["attention_mask"].squeeze(0),
                            "labels": label["input_ids"].squeeze(0)})

    return dataset


class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]  # Each item is a dictionary


# ## Training Data
# # Load data and labels
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


# Training Data
dataset_betham_train = load_data('/home/nittaak/661/train/Betham/OCR/', '/home/nittaak/661/train/Betham/GT/', tokenizer)
dataset_IAM_train = load_data('/home/nittaak/661/train/IAM/OCR/', '/home/nittaak/661/train/IAM/GT/', tokenizer)

train_bethalm_dataset = TextDataset(dataset_betham_train)
train_iam_dataset = TextDataset(dataset_IAM_train)

overall_train_dataset = ConcatDataset([train_bethalm_dataset, train_iam_dataset])
dataloader_train = DataLoader(overall_train_dataset, batch_size=2, shuffle=True)

# Validation Data
dataset_betham_val = load_data('/home/nittaak/661/validation/Betham/OCR/', '/home/nittaak/661/validation/Betham/GT/', tokenizer)
dataset_IAM_val = load_data('/home/nittaak/661/validation/IAM/OCR/', '/home/nittaak/661/validation/IAM/GT/', tokenizer)

val_bethalm_dataset = TextDataset(dataset_betham_val)
val_iam_dataset = TextDataset(dataset_IAM_val)

overall_val_dataset = ConcatDataset([val_bethalm_dataset, val_iam_dataset])
dataloader_val = DataLoader(overall_val_dataset, batch_size=2, shuffle=True)

# Test Data
dataset_betham_test = load_data('/home/nittaak/661/test/Betham/OCR/', '/home/nittaak/661/test/Betham/GT/', tokenizer)
dataset_IAM_test = load_data('/home/nittaak/661/test/IAM/OCR/', '/home/nittaak/661/test/IAM/GT/', tokenizer)

test_bethalm_dataset = TextDataset(dataset_betham_test)
test_iam_dataset = TextDataset(dataset_IAM_test)

overall_test_dataset = ConcatDataset([test_bethalm_dataset, test_iam_dataset])
dataloader_test = DataLoader(overall_test_dataset, batch_size=1, shuffle=True)

  from .autonotebook import tqdm as notebook_tqdm


Skipping empty file: ocr_072_066_004.txt or GT_072_066_004.txt
Skipping empty file: ocr_073_049_002.txt or GT_073_049_002.txt
Skipping empty file: ocr_096_098_002.txt or GT_096_098_002.txt
Skipping empty file: ocr_a06-051.txt or GT_a06-051.txt
Skipping empty file: ocr_a06-110.txt or GT_a06-110.txt
Skipping empty file: ocr_071_054_002.txt or GT_071_054_002.txt


In [2]:
import torch 

model = AutoModelForCausalLM.from_pretrained("gpt2")

device = torch.device("cuda:0")

model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
# for name, param in model.named_parameters():
#     print(name)

for name, param in model.named_parameters():
    if "transformer.h." in name:  # Check if it's part of the transformer layers
        layer_index = int(name.split(".")[2])  # Extract the layer index
        if layer_index <= 10:
            param.requires_grad = False
for name, param in model.named_parameters():
    status = "Trainable" if param.requires_grad else "Frozen"
    print(f"{name}: {status}")
from transformers import GPT2LMHeadModel

# Manually initialize lm_head
model.lm_head = torch.nn.Linear(model.config.n_embd, model.config.vocab_size, bias=False)


transformer.wte.weight: Trainable
transformer.wpe.weight: Trainable
transformer.h.0.ln_1.weight: Frozen
transformer.h.0.ln_1.bias: Frozen
transformer.h.0.attn.c_attn.weight: Frozen
transformer.h.0.attn.c_attn.bias: Frozen
transformer.h.0.attn.c_proj.weight: Frozen
transformer.h.0.attn.c_proj.bias: Frozen
transformer.h.0.ln_2.weight: Frozen
transformer.h.0.ln_2.bias: Frozen
transformer.h.0.mlp.c_fc.weight: Frozen
transformer.h.0.mlp.c_fc.bias: Frozen
transformer.h.0.mlp.c_proj.weight: Frozen
transformer.h.0.mlp.c_proj.bias: Frozen
transformer.h.1.ln_1.weight: Frozen
transformer.h.1.ln_1.bias: Frozen
transformer.h.1.attn.c_attn.weight: Frozen
transformer.h.1.attn.c_attn.bias: Frozen
transformer.h.1.attn.c_proj.weight: Frozen
transformer.h.1.attn.c_proj.bias: Frozen
transformer.h.1.ln_2.weight: Frozen
transformer.h.1.ln_2.bias: Frozen
transformer.h.1.mlp.c_fc.weight: Frozen
transformer.h.1.mlp.c_fc.bias: Frozen
transformer.h.1.mlp.c_proj.weight: Frozen
transformer.h.1.mlp.c_proj.bias: Fro

In [4]:
from cosinesimcontained import cosine_sim_bert_loss, get_cosine_sim_bert
import numpy as np

# Custom loss function
def custom_loss(output, target, model=None, tokenizer=None, device=None):
    gt_embeddings = get_embedding_pool(target, model, tokenizer, device)
    pred_embeddings = get_embedding_pool(output, model, tokenizer, device)
    return cosine_sim_bert_loss(gt_embeddings, pred_embeddings)

def compute_metrics(eval_preds):
    try:
        logits, labels = eval_preds

        # Ensure logits and labels are tensors
        if isinstance(logits, np.ndarray):
            logits = torch.tensor(logits)
        if isinstance(labels, np.ndarray):
            labels = torch.tensor(labels)

        # Decode predictions
        predicted_token_ids = logits.argmax(dim=-1)
        decoded_predictions = [
            tokenizer.decode(pred.tolist(), skip_special_tokens=True)
            for pred in predicted_token_ids
        ]

        # Decode ground truth
        labels[labels == -100] = tokenizer.pad_token_id
        decoded_labels = [
            tokenizer.decode(label.tolist(), skip_special_tokens=True)
            for label in labels
        ]

        # Compute cosine similarity
        similarities = get_cosine_sim_bert(decoded_labels, decoded_predictions)
        return {"cosine_accuracy": np.mean(similarities)}
    except Exception as e:
        print("Error during metric computation")
        print(f"Logits shape: {logits.shape}")
        print(f"Labels shape: {labels.shape}")
        raise e

In [5]:
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,  # Stop if no improvement for 3 eval steps
    early_stopping_threshold=0.01  # Minimum change to qualify as improvement
)

#training 
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    per_device_train_batch_size=3,  # Reduce batch size
    per_device_eval_batch_size=3,  # Reduce batch size
    num_train_epochs=100,
    learning_rate=5e-6,
    warmup_steps=1_100,
    weight_decay=0.0001,
    fp16=False,  # Enable mixed precision
    gradient_accumulation_steps=4,  # Simulate larger batch size
    dataloader_num_workers=0,
    load_best_model_at_end = True,
    max_grad_norm = 1.0
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=overall_train_dataset,
    eval_dataset=overall_val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[early_stopping] 
)


#Api: 139ebae25a64b3f22839c529cd6babfcacbc8e4b

# Fine-Tune the Model
trainer.train()

# Save the Fine-Tuned Model
trainer.save_model("./fine_tuned_gpt2_30_2")

  trainer = Trainer(


Step,Training Loss,Validation Loss,Cosine Accuracy
500,19.8205,19.433357,0.653775
1000,7.4331,6.726549,0.652691
1500,1.7914,2.398871,0.692972
2000,1.5472,2.139818,0.658573
2500,1.5022,2.098565,0.64224
3000,1.4732,2.081419,0.625383
3500,1.4205,2.0721,0.620243
4000,1.4346,2.064595,0.612912
4500,1.4426,2.061933,0.615884
5000,1.418,2.058184,0.609924


In [6]:
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MiB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MiB")
        print(f"  Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**2:.2f} MiB")
else:
    print("No GPU available.")

GPU 0: NVIDIA H100 PCIe
  Memory Allocated: 1350.56 MiB
  Memory Cached: 71148.00 MiB
  Total Memory: 81116.69 MiB


In [16]:
print(torch.cuda.memory_summary(device="cuda:0", abbreviated=True))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 2            |        cudaMalloc retries: 37        |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  51414 MiB |  66943 MiB |  64167 GiB |  64117 GiB |
|---------------------------------------------------------------------------|
| Active memory         |  51414 MiB |  66943 MiB |  64167 GiB |  64117 GiB |
|---------------------------------------------------------------------------|
| Requested memory      |  51409 MiB |  66938 MiB |  64152 GiB |  64102 GiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  64498 MiB |  80184 MiB |   1051 GiB |    988 GiB |
|---------------------------------------------------------------