In [None]:
import json
import torch
import numpy as np
from transformers import AdamW
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from torch.utils.data import Subset

In [None]:
# Load the question/answer records used throughout the notebook.
file_path = 'qa_dataset.json'

# Read the file as UTF-8 explicitly: JSON is UTF-8 by specification, while the
# platform default encoding (used when none is given) may be something else.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Load the pretrained tokenizer for 'bert-base-uncased', the same checkpoint
# name the classification models in the later cells are created from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Original approach (kept for reference; superseded by the per-example tokenization in the next cell)
# Prepare data

# Tokenizing the data
# inputs = tokenizer([x['question'] + " [SEP] " + x['answer'] for x in data], padding=True, truncation=True, return_tensors="pt")

# Assuming binary classification (change as needed)
# labels = torch.tensor([1 if x['answer_length'] > 100 else 0 for x in data])

# Create a dataset
# dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

In [None]:
# Prepare data

# One input text per record: question and answer joined by the "[SEP]" marker.
texts = [x['question'] + " [SEP] " + x['answer'] for x in data]

# Pad every example to the longest tokenized text, but never beyond the
# tokenizer's model limit: a longer padded sequence would exceed the number of
# positions the pretrained model supports and fail in the forward pass.
longest_text = max(len(tokenizer.encode(text)) for text in texts)
max_length = min(longest_text, tokenizer.model_max_length)

# Tokenize each text on its own; truncation keeps over-long texts within max_length.
tokenized_data = [tokenizer(text,
                            padding='max_length',
                            max_length=max_length,
                            truncation=True,
                            return_tensors="pt") for text in texts]

# Model inputs: concatenate the per-example tensors along the first dimension
input_ids = torch.cat([item['input_ids'] for item in tokenized_data], dim=0)
attention_masks = torch.cat([item['attention_mask'] for item in tokenized_data], dim=0)
# Binary target: 1 when the record's 'answer_length' exceeds 100, otherwise 0
labels = torch.tensor([1 if x['answer_length'] > 100 else 0 for x in data])

# Split into 70% train / 30% test; the fixed seed keeps the split reproducible
input_ids_train, input_ids_test, attention_masks_train, attention_masks_test, labels_train, labels_test = train_test_split(
    input_ids, attention_masks, labels, train_size=0.7, random_state=42)

# Wrap the tensors as datasets; `dataset` holds every example (used for K-fold)
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)
dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Report whether a GPU is available and bind the matching torch device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA is available. Using GPU." if device.type == "cuda" else "CUDA is not available. Using CPU.")

In [None]:
# Sanity check: print how many examples the full dataset holds, then show the
# first ten (input_ids, attention_mask, label) entries as the cell output.
print(len(dataset))
dataset[:10]

## Config 1-8

In [None]:
# Hyper-parameter grid: every combination below is trained and evaluated
# (2 batch sizes x 2 epoch counts x 2 learning rates = 8 configurations).
batch_size_list = [4, 8]
epochs_list = [25, 30]
learning_rate_list = [1e-5, 1e-4]

for batch_size in batch_size_list:
    for epochs in epochs_list:
        for learning_rate in learning_rate_list:
            # Reload the pretrained checkpoint so each configuration starts
            # from the same weights instead of continuing the previous run.
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

            print(f'Parameters: \n    Batch size: {batch_size}\n    Epochs: {epochs}\n    Learning rate:{learning_rate}\n')

            # Average training loss of each epoch, plotted once the run finishes
            epoch_losses = []

            # Shuffle the training batches; iterate the test set in a fixed order
            train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
            test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

            # Use the learning rate of the current grid point; a fixed value
            # here would make the learning-rate axis of the search meaningless.
            optimizer = AdamW(model.parameters(), lr=learning_rate)

            model.train()

            # Training loop
            for epoch in range(epochs):
                total_loss = 0
                progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

                for step, batch in enumerate(progress_bar, start=1):
                    b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
                    model.zero_grad()

                    # Passing labels makes the model output carry the loss
                    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                    loss = outputs.loss
                    total_loss += loss.item()
                    loss.backward()
                    optimizer.step()

                    # Running mean over the batches processed so far this epoch
                    progress_bar.set_postfix({'loss': total_loss / step})

                # Store the mean batch loss of the finished epoch
                avg_loss = total_loss / len(train_dataloader)
                epoch_losses.append(avg_loss)

                progress_bar.close()
            print(f'Loss on training: {epoch_losses}')

            # Evaluate on the held-out test split; no gradients are needed
            model.eval()
            predictions, true_labels = [], []
            with torch.no_grad():
                for batch in tqdm(test_dataloader, desc="Evaluating"):
                    batch = tuple(t.to(device) for t in batch)
                    b_input_ids, b_input_mask, b_labels = batch

                    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

                    logits = outputs.logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    # Predicted class = index of the largest logit
                    batch_preds = np.argmax(logits, axis=1)
                    predictions.extend(batch_preds)
                    true_labels.extend(label_ids)

            # Calculate the accuracy
            accuracy = accuracy_score(true_labels, predictions)
            print(f"Accuracy on test set: {accuracy}")

            # Plot the training loss of this configuration
            plt.plot(epoch_losses, label='Training Loss')
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.title('Training Loss Over Epochs')
            plt.legend()
            plt.show()

## Config 9

In [None]:
# Config 9: a single run with batch size 2 and learning rate 1e-6.
epochs = 25
batch_size = 2
# Average training loss of each epoch, plotted at the end of the cell
epoch_losses = []

# Shuffle the training batches; iterate the test set in a fixed order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Start from the pretrained checkpoint
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-6)

model.train()
# Training loop
for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

    for step, batch in enumerate(progress_bar, start=1):
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        model.zero_grad()

        # Passing labels makes the model output carry the loss
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Running mean over the batches processed so far; dividing by the
        # total batch count would under-report the loss until the last step.
        progress_bar.set_postfix({'loss': total_loss / step})

    # Store the mean batch loss of the finished epoch
    avg_loss = total_loss / len(train_dataloader)
    epoch_losses.append(avg_loss)

    progress_bar.close()
    print(f"Epoch {epoch+1} finished. Loss: {avg_loss}")

# Evaluate on the held-out test split; no gradients are needed
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Predicted class = index of the largest logit
        batch_preds = np.argmax(logits, axis=1)
        predictions.extend(batch_preds)
        true_labels.extend(label_ids)

# Calculate the accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy}")

# Plotting the training loss
plt.plot(epoch_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.show()

## Config 10

In [None]:
# Config 10: a single run with batch size 8 and learning rate 2e-5. The
# resulting `model`, `optimizer`, `epoch_losses` and `test_dataloader` are
# consumed by the saving, plotting and evaluation cells that follow.
epochs = 25
batch_size = 8
# Average training loss of each epoch
epoch_losses = []

# Shuffle the training batches; iterate the test set in a fixed order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Start from the pretrained checkpoint
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
# Training loop
for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

    for step, batch in enumerate(progress_bar, start=1):
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        model.zero_grad()

        # Passing labels makes the model output carry the loss
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        # Running mean over the batches processed so far; dividing by the
        # total batch count would under-report the loss until the last step.
        progress_bar.set_postfix({'loss': total_loss / step})

    # Store the mean batch loss of the finished epoch
    avg_loss = total_loss / len(train_dataloader)
    epoch_losses.append(avg_loss)

    progress_bar.close()
    print(f"Epoch {epoch+1} finished. Loss: {avg_loss}")

In [None]:
# Persist the trained weights in the 'models' directory (created on demand).
model_save_path = 'models'
final_model_save_file = os.path.join(model_save_path, 'bert_final_model_1.pt')
os.makedirs(model_save_path, exist_ok=True)
torch.save(model.state_dict(), final_model_save_file)

# Summarise the run; learning rate and betas are read back from the optimizer's defaults.
adam_betas = optimizer.defaults['betas']
print("\n".join([
    "Training Parameters:",
    f"Batch Size: {batch_size}",
    f"Epochs: {epochs}",
    f"Learning Rate: {optimizer.defaults['lr']}",
    f"Beta1: {adam_betas[0]}",
    f"Beta2: {adam_betas[1]}",
]))

print(f"Training completed. Final model saved to {final_model_save_file}")

In [None]:
# Draw the per-epoch average training loss on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(epoch_losses, label='Training Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training Loss Over Epochs')
ax.legend()
plt.show()

### Evaluation

In [None]:
# Score the most recently trained model on the held-out test batches.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Inference mode
model.eval()

predictions, true_labels = [], []

# Gradients are never needed here, so the whole loop runs under no_grad
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Predicted class = index of the largest logit in each row
        predictions.extend(np.argmax(outputs.logits.detach().cpu().numpy(), axis=1))
        true_labels.extend(b_labels.to('cpu').numpy())

# Fraction of test examples whose predicted class matches the label
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy}")

### K-Fold

In [None]:
# K-fold cross-validation over the full dataset: each fold fine-tunes a fresh
# pretrained model on the training part and scores it on the held-out part.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One dict of metrics per fold
fold_performance = []

# Parameters
batch_size = 8
epochs = 25
learning_rate = 2e-5

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start the KFold cross-validation
for fold, (train_ids, test_ids) in enumerate(kf.split(dataset)):
    print(f"FOLD {fold}")
    print("-------------------------------")

    # Split the data into training and validation sets for the current fold
    train_subset = Subset(dataset, train_ids)
    test_subset = Subset(dataset, test_ids)

    train_dataloader = DataLoader(train_subset, sampler=RandomSampler(train_subset), batch_size=batch_size)
    test_dataloader = DataLoader(test_subset, sampler=SequentialSampler(test_subset), batch_size=batch_size)

    # Reload the pretrained checkpoint so folds do not share trained weights
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.to(device)

    # Define the optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model.train()

    # Training loop for the current fold
    for epoch in range(epochs):
        total_loss = 0

        train_progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{epochs}, Fold {fold+1}/{n_splits}")

        for step, batch in enumerate(train_progress_bar):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            model.zero_grad()

            # Passing labels makes the model output carry the loss
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Running mean over the batches processed so far this epoch
            train_progress_bar.set_postfix(loss=total_loss/(step+1))

        train_progress_bar.close()

    # total_loss was reset at the start of every epoch, so this is the mean
    # batch loss of the final training epoch of the fold.
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation step
    model.eval()
    total_eval_loss = 0
    correct_predictions = 0
    evaluated_examples = 0

    eval_progress_bar = tqdm(test_dataloader, desc=f"Validation, Fold {fold+1}/{n_splits}")

    with torch.no_grad():
        for batch in eval_progress_bar:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs.loss
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Count hits and examples instead of averaging per-batch accuracies:
            # the last batch can be smaller, and a mean of batch means would
            # give its examples more weight than the others.
            preds = np.argmax(logits, axis=1)
            correct_predictions += int(np.sum(preds == label_ids))
            evaluated_examples += len(label_ids)

    eval_progress_bar.close()

    # Accuracy over every validation example of the fold
    avg_val_accuracy = correct_predictions / evaluated_examples
    print(f"Validation accuracy: {avg_val_accuracy}")

    # Mean of the per-batch validation losses
    avg_val_loss = total_eval_loss / len(test_dataloader)

    # Record the statistics of this fold
    fold_performance.append({
        'fold': fold,
        'train_loss': avg_train_loss,
        'val_loss': avg_val_loss,
        'val_accuracy': avg_val_accuracy
    })

# Calculate and print the average performance across all folds
average_performance = {
    'avg_train_loss': np.mean([x['train_loss'] for x in fold_performance]),
    'avg_val_loss': np.mean([x['val_loss'] for x in fold_performance]),
    'avg_val_accuracy': np.mean([x['val_accuracy'] for x in fold_performance])
}
print(f"Average performance across all folds: {average_performance}")