# Instructions

To use this notebook, first ensure the file paths are sent to the right location. You should have downloaded the dataset dev-2.0 and val-2.0 and put them in a folder named data. Then create a folder for each model you plan to train, and set that model path to that folder.

An important oversight of this notebook is that lack of storage for the test performance. I currently screenshot the cells. An improvement would be to save them so some type of csv file with the model name, total training time, and accuracy.

# Setup
* Load modules
* switch to cuda
* read data

In [1]:
# Load in required Packages
import torch
import json
import requests
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering
from tqdm import tqdm



In [2]:
# Check the available device and use GPU if available, otherwise use CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Print the device being used
print(f'Working on {device}')

Working on cuda


## Load Data, Set Up Tokenizer

In [3]:
def read_data(path):
    """
    Read SQuAD data from a JSON file.

    Parameters:
    - path: Path to the JSON file containing SQuAD data

    Returns:
    - contexts: List of contexts (passages)
    - questions: List of questions
    - answers: List of answers
    """
    # Open the JSON file and load the data
    with open(path, 'r', encoding='utf-8') as f:
        squad = json.load(f)

    # Initialize lists to store contexts, questions, and answers
    contexts = []
    questions = []
    answers = []

    # Iterate over groups in the SQuAD data
    for group in squad.get('data', []):
        # Iterate over paragraphs in the group
        for passage in group.get('paragraphs', []):
            # Get the context (passage)
            context = passage.get('context', '')
            # Iterate over questions and answers in the paragraph
            for qa in passage.get('qas', []):
                # Get the question
                question = qa.get('question', '')
                # Iterate over answers for the question
                for answer in qa.get('answers', []):
                    # Append context, question, and answer to their respective lists
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    # Return the lists of contexts, questions, and answers
    return contexts, questions, answers

def add_end_index(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Check if the answer is correctly positioned
        for offset in [0, -1, -2]:
            if context[start_idx + offset:end_idx + offset] == gold_text:
                # Update answer start and end indices
                answer['answer_start'] = start_idx + offset
                answer['answer_end'] = end_idx + offset
                break  # Break loop once correct offset is found

def add_token_positions(encodings, answers):
    """
    Adds token positions for answers to encodings.

    Parameters:
    - encodings: Encodings object containing tokenized inputs
    - answers: List of dictionaries containing answer positions

    Returns:
    None (modifies encodings in place)
    """
    start_positions = []
    end_positions = []

    # Loop through each answer
    for i, answer in enumerate(answers):
        # Convert character positions to token positions
        start_positions.append(encodings.char_to_token(i, answer['answer_start']))
        end_positions.append(encodings.char_to_token(i, answer['answer_end'] - 1))

        # Handle cases where answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    # Update encodings with start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

class SQuAD_Dataset(torch.utils.data.Dataset):
    """
    Custom dataset class for SQuAD.

    Parameters:
    - encodings: Encodings object containing tokenized inputs
    """
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset.

        Parameters:
        - idx: Index of the item to retrieve

        Returns:
        Dictionary containing tensors for each key in the encodings
        """
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """
        Returns the length of the dataset.

        Returns:
        Integer representing the length of the dataset
        """
        return len(self.encodings.input_ids)


In [4]:
# Read training data
contexts, questions, answers = read_data('/accounts/grad/fangyuan_li/259/data/train-v2.0.json')
# Read validation data
valid_contexts, valid_questions, valid_answers = read_data('/accounts/grad/fangyuan_li/259/data/val-v2.0.json')
# Split train-v2.0 into train and test sets
train_contexts = contexts[5000:]
train_questions = questions[5000:]
train_answers = answers[5000:]

test_contexts = contexts[:5000]
test_questions = questions[:5000]
test_answers = answers[:5000]

# Add indexes
add_end_index(train_answers, train_contexts)
add_end_index(valid_answers, valid_contexts)
add_end_index(test_answers, test_contexts)

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

# Add token positions for training data
add_token_positions(train_encodings, train_answers)
# Add token positions for validation data
add_token_positions(valid_encodings, valid_answers)
# Add token positions for test data
add_token_positions(test_encodings, test_answers)

# Create training dataset
train_dataset = SQuAD_Dataset(train_encodings)
# Create validation dataset
valid_dataset = SQuAD_Dataset(valid_encodings)
# Create test dataset
test_dataset = SQuAD_Dataset(test_encodings)

# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

## Load Model + Freezing Layers

In [5]:
import os
import torch
from tqdm import tqdm

In [6]:
def freeze_layers(model, num_layers_to_freeze):
    """
    Freeze the first 'num_layers_to_freeze' layers of a model.

    Args:
    model (torch.nn.Module): The model whose layers are to be frozen.
    num_layers_to_freeze (int): The number of layers to freeze.

    Returns:
    None
    """
    # Check for the typical attribute in BERT-like models
    encoder_layers = model.bert.encoder.layer

    # Freeze specified number of layers in the encoder
    layer_count = 0
    for layer in encoder_layers:
        if layer_count < num_layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False
        layer_count += 1

        # Break if we have frozen the desired number of layers
        if layer_count >= num_layers_to_freeze:
            break


In [7]:
from transformers import BertForQuestionAnswering

# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 11)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def check_if_layers_are_frozen(model):
    """
    Check if the encoder layers of a model are frozen.

    Args:
    model (torch.nn.Module): The model to check.

    Returns:
    bool: True if all encoder layers are frozen, False otherwise.
    """
    encoder_layers = model.bert.encoder.layer
    all_frozen = True
    layer_count = 0

    for layer in encoder_layers:
        layer_frozen = True
        for param in layer.parameters():
            if param.requires_grad:
                layer_frozen = False
                all_frozen = False
        print(f"Layer {layer_count + 1} is {'frozen' if layer_frozen else 'not frozen'}.")
        layer_count += 1

    if all_frozen:
        print("All layers are frozen.")
    else:
        print("Some layers are not fully frozen.")

    return all_frozen

# Now, check if the layers are frozen
check_if_layers_are_frozen(model)

Layer 1 is frozen.
Layer 2 is frozen.
Layer 3 is frozen.
Layer 4 is frozen.
Layer 5 is frozen.
Layer 6 is frozen.
Layer 7 is frozen.
Layer 8 is frozen.
Layer 9 is frozen.
Layer 10 is not frozen.
Layer 11 is not frozen.
Layer 12 is not frozen.
Some layers are not fully frozen.


False

## One Layer

In [9]:
# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/fangyuan_li/259/full_data/one_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/fangyuan_li/259/full_data/one_layer/tokenizer_config.json',
 '/accounts/grad/fangyuan_li/259/full_data/one_layer/special_tokens_map.json',
 '/accounts/grad/fangyuan_li/259/full_data/one_layer/vocab.txt',
 '/accounts/grad/fangyuan_li/259/full_data/one_layer/added_tokens.json',
 '/accounts/grad/fangyuan_li/259/full_data/one_layer/tokenizer.json')

In [11]:
torch.save(model.state_dict(), '/accounts/grad/fangyuan_li/259/full_data/one_layer_model_backup/backup.pt')

In [12]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)

100%|██████████| 313/313 [00:43<00:00,  7.24it/s]


In [13]:
print(acc)

0.6498602236421726


## Two Layers

In [16]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 10)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/fangyuan_li/259/full_data/two_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 5114/5114 [27:34<00:00,  3.09it/s, loss=1.04] 
Epoch 2: 100%|██████████| 5114/5114 [27:35<00:00,  3.09it/s, loss=1.28] 
Epoch 3: 100%|██████████| 5114/5114 [27:33<00:00,  3.09it/s, loss=1.39] 


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/fangyuan_li/259/full_data/two_layer/tokenizer_config.json',
 '/accounts/grad/fangyuan_li/259/full_data/two_layer/special_tokens_map.json',
 '/accounts/grad/fangyuan_li/259/full_data/two_layer/vocab.txt',
 '/accounts/grad/fangyuan_li/259/full_data/two_layer/added_tokens.json',
 '/accounts/grad/fangyuan_li/259/full_data/two_layer/tokenizer.json')

In [17]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)

100%|██████████| 313/313 [00:42<00:00,  7.31it/s]


In [18]:
print(acc)

0.6901956869009584


## Three Layers

In [21]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 9)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/fangyuan_li/259/full_data/three_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 5114/5114 [28:27<00:00,  3.00it/s, loss=1.74] 
Epoch 2: 100%|██████████| 5114/5114 [28:31<00:00,  2.99it/s, loss=1]    
Epoch 3: 100%|██████████| 5114/5114 [28:34<00:00,  2.98it/s, loss=1.58] 


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/fangyuan_li/259/full_data/three_layer/tokenizer_config.json',
 '/accounts/grad/fangyuan_li/259/full_data/three_layer/special_tokens_map.json',
 '/accounts/grad/fangyuan_li/259/full_data/three_layer/vocab.txt',
 '/accounts/grad/fangyuan_li/259/full_data/three_layer/added_tokens.json',
 '/accounts/grad/fangyuan_li/259/full_data/three_layer/tokenizer.json')

In [22]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)

100%|██████████| 313/313 [00:43<00:00,  7.26it/s]


In [23]:
print(acc)

0.7134584664536742


# Four Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 8)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/four_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [17:13<00:00,  1.39it/s, loss=1.39] 
Epoch 2: 100%|██████████| 1441/1441 [17:13<00:00,  1.39it/s, loss=0.721]
Epoch 3: 100%|██████████| 1441/1441 [17:12<00:00,  1.40it/s, loss=0.741]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/four_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/four_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/four_layer/vocab.txt',
 '/accounts/grad/sorenraj/four_layer/added_tokens.json',
 '/accounts/grad/sorenraj/four_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:32<00:00,  2.81it/s]

0.5871432793038253





## Five Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 7)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/five_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [17:38<00:00,  1.36it/s, loss=0.828]
Epoch 2: 100%|██████████| 1441/1441 [17:35<00:00,  1.36it/s, loss=0.821]
Epoch 3: 100%|██████████| 1441/1441 [17:36<00:00,  1.36it/s, loss=0.0673]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/five_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/five_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/five_layer/vocab.txt',
 '/accounts/grad/sorenraj/five_layer/added_tokens.json',
 '/accounts/grad/sorenraj/five_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:33<00:00,  2.80it/s]

0.60295297197522





## Six Layers

In [24]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 6)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/fangyuan_li/259/full_data/six_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 5114/5114 [30:49<00:00,  2.76it/s, loss=1.47] 
Epoch 2: 100%|██████████| 5114/5114 [30:54<00:00,  2.76it/s, loss=0.594]
Epoch 3: 100%|██████████| 5114/5114 [30:54<00:00,  2.76it/s, loss=1.04] 


('/accounts/grad/fangyuan_li/259/full_data/six_layer/tokenizer_config.json',
 '/accounts/grad/fangyuan_li/259/full_data/six_layer/special_tokens_map.json',
 '/accounts/grad/fangyuan_li/259/full_data/six_layer/vocab.txt',
 '/accounts/grad/fangyuan_li/259/full_data/six_layer/added_tokens.json',
 '/accounts/grad/fangyuan_li/259/full_data/six_layer/tokenizer.json')

In [25]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 313/313 [00:42<00:00,  7.30it/s]

0.7291333865814696





## Seven Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 5)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/seven_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [18:19<00:00,  1.31it/s, loss=2.54] 
Epoch 2: 100%|██████████| 1441/1441 [18:18<00:00,  1.31it/s, loss=0.552]
Epoch 3: 100%|██████████| 1441/1441 [18:20<00:00,  1.31it/s, loss=0.589]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/seven_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/seven_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/seven_layer/vocab.txt',
 '/accounts/grad/sorenraj/seven_layer/added_tokens.json',
 '/accounts/grad/sorenraj/seven_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:30<00:00,  2.81it/s]

0.6027418946637424





## Eight Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 4)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/eight_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [18:41<00:00,  1.28it/s, loss=0.531]
Epoch 2: 100%|██████████| 1441/1441 [18:41<00:00,  1.29it/s, loss=0.955]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/eight_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/eight_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/eight_layer/vocab.txt',
 '/accounts/grad/sorenraj/eight_layer/added_tokens.json',
 '/accounts/grad/sorenraj/eight_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:30<00:00,  2.82it/s]

0.6195506867112326





## Nine Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 3)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/nine_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [19:02<00:00,  1.26it/s, loss=0.906]
Epoch 2: 100%|██████████| 1441/1441 [19:04<00:00,  1.26it/s, loss=0.505]
Epoch 3: 100%|██████████| 1441/1441 [19:04<00:00,  1.26it/s, loss=0.422]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/nine_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/nine_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/nine_layer/vocab.txt',
 '/accounts/grad/sorenraj/nine_layer/added_tokens.json',
 '/accounts/grad/sorenraj/nine_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:30<00:00,  2.82it/s]

0.6203387087758504





# Ten Layer

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 2)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/ten_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [19:27<00:00,  1.23it/s, loss=0.845]
Epoch 2: 100%|██████████| 1441/1441 [19:28<00:00,  1.23it/s, loss=1.09] 
Epoch 3: 100%|██████████| 1441/1441 [19:27<00:00,  1.23it/s, loss=0.535] 


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/ten_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/ten_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/ten_layer/vocab.txt',
 '/accounts/grad/sorenraj/ten_layer/added_tokens.json',
 '/accounts/grad/sorenraj/ten_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:31<00:00,  2.81it/s]

0.626002617365291





## Eleven Layer

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Freeze the first 11 layers of the model
freeze_layers(model, 1)

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/eleven_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [19:46<00:00,  1.21it/s, loss=0.902]
Epoch 2: 100%|██████████| 1441/1441 [19:48<00:00,  1.21it/s, loss=0.657]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/eleven_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/eleven_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/eleven_layer/vocab.txt',
 '/accounts/grad/sorenraj/eleven_layer/added_tokens.json',
 '/accounts/grad/sorenraj/eleven_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:33<00:00,  2.80it/s]

0.626248874260484





## No Layers

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/twelve_layer'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False

# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Save the best model
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= 1:  # Stops if no improvement in one epoch
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [20:10<00:00,  1.19it/s, loss=1.07] 
Epoch 2: 100%|██████████| 1441/1441 [20:10<00:00,  1.19it/s, loss=0.741]


Early stopping triggered. No improvement in validation loss for 1 epoch(s).


('/accounts/grad/sorenraj/twelve_layer/tokenizer_config.json',
 '/accounts/grad/sorenraj/twelve_layer/special_tokens_map.json',
 '/accounts/grad/sorenraj/twelve_layer/vocab.txt',
 '/accounts/grad/sorenraj/twelve_layer/added_tokens.json',
 '/accounts/grad/sorenraj/twelve_layer/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:33<00:00,  2.80it/s]

0.6342979567763936





## Gradual Unfreezing

In [None]:
def unfreeze_last_layers(model, num_layers_to_unfreeze):
    """
    Unfreeze the last 'num_layers_to_unfreeze' layers of a model.

    Args:
    model (torch.nn.Module): The model whose layers are to be unfrozen.
    num_layers_to_unfreeze (int): The number of last layers to unfreeze.

    Returns:
    None
    """
    # Initially freeze all layers
    for param in model.parameters():
        param.requires_grad = False

    # Check for the typical attribute in BERT-like models
    encoder_layers = model.bert.encoder.layer
    total_layers = len(encoder_layers)

    # Unfreeze the specified number of last layers
    layers_to_start_unfreezing = total_layers - num_layers_to_unfreeze

    for i, layer in enumerate(encoder_layers):
        if i >= layers_to_start_unfreezing:
            for param in layer.parameters():
                param.requires_grad = True

In [None]:
# Instantiate the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Define the path where the model and tokenizer will be saved
model_path = '/accounts/grad/sorenraj/gradual_unfreezing'

# Maximum number of epochs for training
MAX_EPOCHS = 20

# Optimizer definition
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Move model to the appropriate device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Set model in training mode
model.train()

# Calculate batches per half epoch
half_epoch_batches = len(train_loader) // 2

# Function to save the model and optimizer state
def save_checkpoint(epoch, batch_idx, model, optimizer, path):
    checkpoint = {
        'epoch': epoch,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, os.path.join(path, f'checkpoint_epoch{epoch}_batch{batch_idx}.pt'))

# Early stopping initialization
best_val_loss = float('inf')
epochs_no_improve = 0
early_stop = False
num_freeze = 11
freeze_layers(model, num_freeze)


# Iterate over epochs
for epoch in range(MAX_EPOCHS):
    if early_stop:
        break

    # Reset loss for each epoch
    total_loss = 0
    # Create a progress bar for the training data
    loop = tqdm(train_loader, leave=True)

    # Iterate over batches in the training data
    for batch_idx, batch in enumerate(loop):
        # Zero gradients from previous iteration
        optim.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation: compute gradients
        loss.backward()

        # Update model parameters
        optim.step()

        # Update progress bar description with current epoch
        loop.set_description(f'Epoch {epoch+1}')

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

    # Validation step after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_batch in valid_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].to(device)
            end_positions = val_batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            val_loss += outputs.loss.item()

    # Calculate average validation loss
    val_loss /= len(valid_loader)

    # Check for early stopping
    # If improving, keep training
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    # If not improving, unfreeze a layer or stop early
    else:
        epochs_no_improve += 1

        if epochs_no_improve >= 3:  # Stops if no improvement in two epochs
            print(f'Early stopping triggered. No improvement in validation loss for {epochs_no_improve} epoch(s).')
            early_stop = True

        elif epochs_no_improve <= 2 and epochs_no_improve > 0:
            num_freeze -= 1

            # Stop instead of unfreezing the "13th" layer
            if num_freeze > 12:
                early_stop = True
                print('Stopping triggered. No improvement in validation loss on last layer')

            # Otherwise, drop down a layer
            else:
                unfreeze_last_layers(model,12-num_freeze)
                print(f'unfreezing layer {12-num_freeze}')

    # Set model back to training mode
    model.train()

# Optionally, save the model and tokenizer at the end of training
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 1441/1441 [16:06<00:00,  1.49it/s, loss=2.2] 
Epoch 2: 100%|██████████| 1441/1441 [16:08<00:00,  1.49it/s, loss=2.22] 
Epoch 3: 100%|██████████| 1441/1441 [16:09<00:00,  1.49it/s, loss=1.18] 
Epoch 4: 100%|██████████| 1441/1441 [16:10<00:00,  1.48it/s, loss=0.537]
Epoch 5: 100%|██████████| 1441/1441 [16:10<00:00,  1.48it/s, loss=0.932]


unfreezing layer 2


Epoch 6: 100%|██████████| 1441/1441 [10:43<00:00,  2.24it/s, loss=0.49] 


unfreezing layer 3


Epoch 7: 100%|██████████| 1441/1441 [11:38<00:00,  2.06it/s, loss=0.164]


Early stopping triggered. No improvement in validation loss for 3 epoch(s).


('/accounts/grad/sorenraj/gradual_unfreezing/tokenizer_config.json',
 '/accounts/grad/sorenraj/gradual_unfreezing/special_tokens_map.json',
 '/accounts/grad/sorenraj/gradual_unfreezing/vocab.txt',
 '/accounts/grad/sorenraj/gradual_unfreezing/added_tokens.json',
 '/accounts/grad/sorenraj/gradual_unfreezing/tokenizer.json')

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize a list to store accuracy values
acc = []

# Iterate over batches in the validation data
for batch in tqdm(test_loader):
    with torch.no_grad():
        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask)

        # Get predicted start and end positions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Compute accuracy for start positions and end positions
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())

# Compute the average accuracy
acc = sum(acc) / len(acc)
print(acc)

100%|██████████| 1269/1269 [07:33<00:00,  2.80it/s]

0.5147015366547685



