In [2]:
import os
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset
from transformers import PreTrainedTokenizerFast, AutoTokenizer, AutoModelForCausalLM
import torch

def load_data(data_dir, labels_dir, tokenizer, max_length=1024):
    data_files = sorted([f for f in os.listdir(data_dir) if f.endswith('.txt')])
    label_files = sorted([f for f in os.listdir(labels_dir) if f.endswith('.txt')])

    dataset = []

    for data_file, label_file in zip(data_files, label_files):
        with open(os.path.join(data_dir, data_file), 'r', encoding='utf-8') as df, open(os.path.join(labels_dir, label_file), 'r', encoding='utf-8') as lf:
            data_content = df.read().strip()
            label_content = lf.read().strip()

            # Skip empty files
            if not data_content or not label_content:
                print(f"Skipping empty file: {data_file} or {label_file}")
                continue

            # Tokenize data and labels
            data = tokenizer(data_content,
                             padding="max_length",
                             truncation=True,
                             max_length=500,
                             return_tensors="pt")
            label = tokenizer(label_content,
                              padding="max_length",
                              truncation=True,
                              max_length=500,
                              return_tensors="pt")

            # Ensure input_ids and labels match in length
            if data["input_ids"].shape[1] != label["input_ids"].shape[1]:
                raise ValueError(f"Length mismatch in file: {data_file} or {label_file}")

            # Append to dataset
            dataset.append({"input_ids": data["input_ids"].squeeze(0),
                            "attention_mask": data["attention_mask"].squeeze(0),
                            "labels": label["input_ids"].squeeze(0)})

    return dataset

class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]  # Each item is a dictionary


# ## Training Data
# # Load data and labels
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Test Data
dataset_betham_test = load_data('/home/nittaak/661/test/Betham/OCR/', '/home/nittaak/661/test/Betham/GT/', tokenizer)
dataset_IAM_test = load_data('/home/nittaak/661/test/IAM/OCR/', '/home/nittaak/661/test/IAM/GT/', tokenizer)

test_bethalm_dataset = TextDataset(dataset_betham_test)
test_iam_dataset = TextDataset(dataset_IAM_test)

overall_test_dataset = ConcatDataset([test_bethalm_dataset, test_iam_dataset])
dataloader_test = DataLoader(overall_test_dataset, batch_size=1, shuffle=True)

In [5]:
import torch
from transformers import PreTrainedTokenizerFast, AutoTokenizer, AutoModelForCausalLM

device2 = torch.device("cuda:0")
model = AutoModelForCausalLM.from_pretrained("fine_tuned_gpt2_100epochs_700tokens_stopat48")
model.to(device2)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
from torch.nn.functional import softmax
from cosinesimcontained import get_cosine_sim_bert
import numpy as np
from transformers import BertTokenizer, BertModel

# Function to evaluate the model
def evaluate_model(model, dataloader, tokenizer, device):
    model.eval()
    total_loss = 0
    predictions = []
    references = []

    document = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Collect loss
            total_loss += loss.item()

            # Decode predictions
            predicted_tokens = logits.argmax(dim=-1)
            predicted_text = tokenizer.decode(predicted_tokens[0], skip_special_tokens=True)
            
            directory_gpt = f'{os.getcwd()}/test_data/GPT/'
            # 661/test_data/GPT/gpt_0.txt
            os.makedirs(directory_gpt, exist_ok=True)
            file_name_gpt = f'gpt_{document}.txt'
            file_path_gpt = os.path.join(directory_gpt, file_name_gpt)
            with open(file_path_gpt, "w+") as file:
                file.write(predicted_text)
                
            reference_text = tokenizer.decode(labels[0], skip_special_tokens=True)
            
            # str_ref_file = f'{os.getcwd()}/test_data/GT/gt_{document}.txt'
            directory_gt = f'{os.getcwd()}/test_data/GT/'
            file_name_gt = f'gt_{document}.txt'
            
            file_path_gt = os.path.join(directory_gt, file_name_gt)
            with open(file_path_gt, "w+") as file:
                file.write(predicted_text)
            document+=1

            predictions.append(predicted_text)
            references.append(reference_text)
    
    # Compute average loss
    avg_loss = total_loss / len(dataloader)
    return predictions, references, avg_loss

# Run evaluation
predictions, references, avg_loss = evaluate_model(model, dataloader_test, tokenizer, device2)
cosine_sim = []

model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# get_cosine_sim_bert(gt_arr, pred_arr, model=None, tokenizer=None, device=None)

for pred, ref in zip(predictions, references):
    cosine_sim.append(get_cosine_sim_bert(ref, pred, model = model, tokenizer=tokenizer, device=device2))
    
print(f'cosine_similarity mean: {np.mean(cosine_sim[0])}')

cosine_similarity mean: 0.5048904142379761


In [1]:
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MiB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MiB")
        print(f"  Total Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**2:.2f} MiB")
else:
    print("No GPU available.")

GPU 0: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
GPU 1: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
GPU 2: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
GPU 3: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
GPU 4: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
GPU 5: Tesla V100-SXM2-32GB
  Memory Allocated: 0.00 MiB
  Memory Cached: 0.00 MiB
  Total Memory: 32494.12 MiB
