<p><span style="font-size: 24pt;"><strong>Data Manipulation</strong></span></p>

In [3]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig, BertTokenizer

<p><span style="font-size: 24pt;"><strong>Creating DataLoaders</strong></span></p>

In [4]:
# Directory holding the per-task CSV files. Kept in one place so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = '/home/jovyan/pub/bert_pruning'

# One CSV per task. Only the sentiment frame (sm_rt_df) is tokenized below;
# the code that consumes the other three frames is currently disabled.
sm_enfr_df = pd.read_csv(f'{DATA_DIR}/transl_data.csv')
sm_rt_df = pd.read_csv(f'{DATA_DIR}/rt_data.csv')
sm_summ_df = pd.read_csv(f'{DATA_DIR}/summ_data.csv')
sm_mcq_df = pd.read_csv(f'{DATA_DIR}/mc_data.csv')

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- Disabled: translation dataset -------------------------------------------
# enfr_inputs = tokenizer(sm_enfr_df['en'].tolist(), return_tensors='pt', padding=True, truncation=True)
# enfr_labels = tokenizer(sm_enfr_df['fr'].tolist(), return_tensors='pt', padding=True, truncation=True).input_ids
# print("Translation Data Sizes")
# print(f"Input IDs: {enfr_inputs['input_ids'].size()}")
# print(f"Attention Mask: {enfr_inputs['attention_mask'].size()}")
# print(f"Labels: {enfr_labels.size()}")
# translation_dataset = TensorDataset(enfr_inputs['input_ids'], enfr_inputs['attention_mask'], enfr_labels)
# translation_dataloader = DataLoader(translation_dataset, batch_size=4096, shuffle=True)

# Sentiment classification dataset
# Reviews are padded to the longest review and truncated by the tokenizer;
# the label is 1 when 'Freshness' equals 'fresh' and 0 for any other value.
rt_inputs = tokenizer(sm_rt_df['Review'].tolist(), return_tensors='pt', padding=True, truncation=True)
rt_labels = torch.tensor([1 if label == 'fresh' else 0 for label in sm_rt_df['Freshness'].tolist()])
print("\nSentiment Classification Data Sizes")
print(f"Input IDs: {rt_inputs['input_ids'].size()}")
print(f"Attention Mask: {rt_inputs['attention_mask'].size()}")
print(f"Labels: {rt_labels.size()}")
classification_dataset = TensorDataset(rt_inputs['input_ids'], rt_inputs['attention_mask'], rt_labels)
classification_dataloader = DataLoader(classification_dataset, batch_size=8, shuffle=True)

# --- Disabled: text summarization dataset ------------------------------------
# text_inputs = tokenizer(sm_summ_df['text'].tolist(), return_tensors='pt', padding=True, truncation=True)
# summary_labels = tokenizer(sm_summ_df['summary'].tolist(), return_tensors='pt', padding=True, truncation=True).input_ids
# print("\nSummarization Data Sizes")
# print(f"Text Input IDs: {text_inputs['input_ids'].size()}")
# print(f"Text Attention Mask: {text_inputs['attention_mask'].size()}")
# print(f"Summary Labels: {summary_labels.size()}")
# summarization_dataset = TensorDataset(text_inputs['input_ids'], text_inputs['attention_mask'], summary_labels)
# summarization_dataloader = DataLoader(summarization_dataset, batch_size=4096, shuffle=True)

# --- Disabled: multiple choice dataset ---------------------------------------
# mcq_inputs = tokenizer(sm_mcq_df['question'].tolist(), return_tensors='pt', padding=True, truncation=True)
# mcq_labels = torch.tensor([1 if label == 'correct' else 0 for label in sm_mcq_df['cop'].tolist()])
# print("\nMultiple Choice Data Sizes")
# print(f"Question Input IDs: {mcq_inputs['input_ids'].size()}")
# print(f"Question Attention Mask: {mcq_inputs['attention_mask'].size()}")
# print(f"Labels: {mcq_labels.size()}")
# mcq_dataset = TensorDataset(mcq_inputs['input_ids'], mcq_inputs['attention_mask'], mcq_labels)
# mcq_dataloader = DataLoader(mcq_dataset, batch_size=4096, shuffle=True)

# Inspecting the DataLoaders: 1 batch
def inspect_dataloader(dataloader, name):
    """Print the tensor sizes of the first batch produced by ``dataloader``.

    Args:
        dataloader: Iterable whose batches unpack into
            ``(input_ids, attention_mask, labels)``; each element must
            provide ``.size()``.
        name: Label shown in the printed heading.

    Only the heading is printed when ``dataloader`` yields no batches.
    """
    print(f"\n{name} DataLoader Inspection:")

    # Pull exactly one batch; the sentinel distinguishes "no batches" from
    # any value a batch could legitimately take.
    nothing = object()
    first_batch = next(iter(dataloader), nothing)
    if first_batch is nothing:
        return

    input_ids, attention_mask, labels = first_batch
    print(f"Input IDs size: {input_ids.size()}")
    print(f"Attention Mask size: {attention_mask.size()}")
    print(f"Labels size: {labels.size()}")

# Only the sentiment-classification loader is constructed in this notebook;
# the inspections for the other tasks stay disabled alongside their loaders.
#inspect_dataloader(translation_dataloader, "Translation")
inspect_dataloader(dataloader=classification_dataloader, name="Sentiment Classification")
#inspect_dataloader(summarization_dataloader, "Summarization")
#inspect_dataloader(mcq_dataloader, "Multiple Choice")


Sentiment Classification Data Sizes
Input IDs: torch.Size([23455, 116])
Attention Mask: torch.Size([23455, 116])
Labels: torch.Size([23455])

Sentiment Classification DataLoader Inspection:
Input IDs size: torch.Size([8, 116])
Attention Mask size: torch.Size([8, 116])
Labels size: torch.Size([8])


<p><span style="font-size: 24pt;"><strong>Defining <code>BertCustomHead</code> and training</strong></span></p>

In [None]:
class BertCustomHead(nn.Module):
    """
    BERT encoder followed by a task-specific linear head.

    A linear head and a loss function are created for every supported task
    ('sequence_classification', 'token_classification', 'multiple_choice');
    ``task_type`` selects which head ``forward`` applies.

    Args:
        config: Configuration handed to ``BertModel``; ``config.hidden_size``
            is the input width of every head.
        num_classes: Output width of the two classification heads (the
            multiple-choice head always has a single output).
        task_type: Key of the head used by ``forward``.

    Raises:
        ValueError: If ``task_type`` is not a supported task name.
    """
    # Task names for which a head and a loss function are created.
    TASK_TYPES = ('sequence_classification', 'token_classification', 'multiple_choice')

    def __init__(self, config, num_classes, task_type='sequence_classification'):
        super(BertCustomHead, self).__init__()

        # Reject a bad task name before paying for the encoder construction.
        if task_type not in self.TASK_TYPES:
            raise ValueError("Invalid task type. Supported types: 'sequence_classification', 'token_classification', 'multiple_choice'")

        # NOTE(review): BertModel(config) builds the encoder from the config
        # object alone; no checkpoint is loaded in this class. If pretrained
        # weights are expected, they must be loaded separately - confirm.
        self.bert = BertModel(config)
        self.task_type = task_type

        # nn.ModuleDict (rather than a plain dict) registers the heads as
        # sub-modules, so their weights appear in parameters()/state_dict()
        # and follow the model on .to(device). With a plain dict the optimizer
        # never sees the head and it stays on the CPU.
        self.heads = nn.ModuleDict({
            'sequence_classification': nn.Linear(config.hidden_size, num_classes),
            'token_classification': nn.Linear(config.hidden_size, num_classes),
            'multiple_choice': nn.Linear(config.hidden_size, 1)
        })
        # Still indexable by task name, exactly like the heads.
        self.loss_fns = nn.ModuleDict({
            'sequence_classification': nn.CrossEntropyLoss(),
            'token_classification': nn.CrossEntropyLoss(),
            'multiple_choice': nn.BCEWithLogitsLoss()
        })

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, next_sentence_labels=None):
        """
        Encode the inputs and apply the head selected by ``self.task_type``.

        ``next_sentence_labels`` is accepted for interface compatibility but
        is not used.

        Returns:
            The active head applied to the encoder's ``pooler_output``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # NOTE(review): every task, including 'token_classification', is fed
        # the pooled output, so the result has no per-token dimension -
        # confirm this is intended.
        pooled_output = outputs.pooler_output
        return self.heads[self.task_type](pooled_output)

def train(model, dataloaders, optimizer, num_epochs=5, device='cuda'):
    """
    Training loop for `num_epochs` epochs.

    Each epoch iterates over every dataloader in turn. The dataloader's name
    selects the task, and the task selects both the model head
    (via ``model.task_type``) and the loss (via ``model.loss_fns``).

    Args:
        model: Module exposing ``task_type`` (read by its ``forward``) and a
            ``loss_fns`` mapping indexed by task type.
        dataloaders: Mapping of name -> iterable of
            ``(input_ids, attention_mask, labels)`` batches. Supported names:
            'translation', 'classification', 'summarization', 'multiple_choice'.
        optimizer: Optimizer stepped once per batch.
        num_epochs: Number of passes over all dataloaders.
        device: Device the model and every batch are moved to.

    Raises:
        ValueError: If ``dataloaders`` contains an unsupported name.
    """
    # Task type trained by each supported dataloader name.
    task_for_loader = {
        'translation': 'sequence_classification',
        'classification': 'sequence_classification',
        'summarization': 'token_classification',
        'multiple_choice': 'multiple_choice',
    }
    # Fail before any training work instead of midway through the first epoch.
    for name in dataloaders:
        if name not in task_for_loader:
            raise ValueError("Invalid data loader name. Supported names: 'translation', 'classification', 'summarization', 'multiple_choice'")

    model.to(device)
    model.train()

    task_losses = {task_name: [] for task_name in dataloaders.keys()}

    # forward() picks its head from model.task_type, so the attribute is
    # switched per dataloader below and restored once training finishes.
    initial_task_type = getattr(model, 'task_type', None)
    try:
        for epoch in range(num_epochs):
            total_loss = 0.0
            for name, dataloader in dataloaders.items():
                task_type = task_for_loader[name]
                # Keep the head used by forward() in step with the loss used
                # below; otherwise the logits come from the wrong head.
                model.task_type = task_type

                for batch in dataloader:
                    # Move batch to device
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, attention_mask, labels = batch

                    # Forward pass
                    logits = model(input_ids, attention_mask)

                    if name == 'translation':
                        # NOTE(review): flattening only lines up with the
                        # logits if there is one label per example - confirm
                        # the intended translation objective.
                        labels = labels.view(-1)
                    elif task_type == 'multiple_choice':
                        # BCEWithLogitsLoss needs float targets with exactly
                        # the logits' shape, e.g. (batch, 1) rather than (batch,).
                        labels = labels.float().view_as(logits)

                    # Calculate loss
                    loss = model.loss_fns[task_type](logits, labels)

                    # Backward pass
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item()
                    task_losses[name].append(loss.item())

            print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {total_loss}")
            for task_name, losses in task_losses.items():
                # An empty dataloader records no loss; skip it rather than
                # indexing an empty list.
                if losses:
                    print(f"Task: {task_name}, Last Loss: {losses[-1]}")
    finally:
        model.task_type = initial_task_type

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the two-class sequence classifier from the 'bert-base-uncased' config.
# NOTE(review): only the configuration is fetched here; whether the encoder
# should also start from the pretrained checkpoint needs confirming.
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertCustomHead(
    config,
    num_classes=2,
    task_type='sequence_classification',
)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# train() derives the task from the dataloader's name, so the key matters.
dataloaders = dict(classification=classification_dataloader)

# Train the model
train(model, dataloaders, optimizer, num_epochs=5, device=device)



