<p><span style="font-size: 24pt;"><strong>Data Manipulation</strong></span></p>

In [2]:
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig, BertTokenizer, BertOnlyNSPHead

In [3]:
file_path = '/home/jovyan/pub/en-fr.csv'
enfr_df = pd.read_csv(file_path)

In [4]:
enfr_df['en'] = enfr_df['en'].astype(str).fillna('')
enfr_df['fr'] = enfr_df['fr'].astype(str).fillna('')
# Dimension
print("Dimensions of the dataframe:")
print(enfr_df.shape)
# Cardinality
print("Cardinality of categorical features in the dataframe:")
print(enfr_df.select_dtypes(include="object").nunique())

Dimensions of the dataframe:
(22520376, 2)
Cardinality of categorical features in the dataframe:
en    20317468
fr    21358854
dtype: int64


In [5]:
sm_enfr_df = enfr_df.sample(n=18949, random_state=42)

In [6]:
rt_path = '/home/jovyan/pub/rt_reviews.csv'
rt_df = pd.read_csv(rt_path, encoding='latin1')

In [7]:
rt_df['Review'] = rt_df['Review'].astype(str).fillna('')
rt_df['Freshness'] = rt_df['Freshness'].replace({'fresh': 0, 'rotten': 1})
# Dimension
print("Dimensions of the dataframe:")
print(rt_df.shape)
# Duplicates 
duplicates_count = rt_df.duplicated().sum()
duplicates_percentage = 100 * duplicates_count / len(rt_df)
print(f"Duplicates in the dataframe: {duplicates_count} ({duplicates_percentage:.2f}%)")
# Cardinality 
print("Cardinality of categorical features in the dataframe:")
print(rt_df.select_dtypes(include="object").nunique())

  rt_df['Freshness'] = rt_df['Freshness'].replace({'fresh': 0, 'rotten': 1})


Dimensions of the dataframe:
(480000, 2)
Duplicates in the dataframe: 140284 (29.23%)
Cardinality of categorical features in the dataframe:
Review    339697
dtype: int64


In [8]:
sm_rt_df = rt_df.sample(n=18949, random_state=42)

In [9]:
file_path = '/home/jovyan/pub/bs_train.parquet'
summ_df = pd.read_parquet(file_path)

In [10]:
summ_df['text'] = summ_df['text'].astype(str).fillna('')
summ_df['summary'] = summ_df['summary'].astype(str).fillna('')
summ_df['title'] = summ_df['title'].astype(str).fillna('')
# Dimension
print("Dimensions of the dataframe:")
print(summ_df.shape)
# Duplicates 
duplicates_count = summ_df.duplicated().sum()
duplicates_percentage = 100 * duplicates_count / len(summ_df)
print(f"Duplicates in the dataframe: {duplicates_count} ({duplicates_percentage:.2f}%)")
# Cardinality (for categorical features)
print("Cardinality of categorical features in the dataframe:")
print(summ_df.select_dtypes(include="object").nunique())

Dimensions of the dataframe:
(18949, 3)
Duplicates in the dataframe: 0 (0.00%)
Cardinality of categorical features in the dataframe:
text       18941
summary    18949
title      17106
dtype: int64


In [11]:
sm_summ_df = summ_df.sample(n=18949, random_state=42)

In [12]:
data = pd.read_json('/home/jovyan/pub/mcq.json', lines=True)
single_choice_data = data[data['choice_type'] == 'single']

In [13]:
# dropping Nan rows 
single_choice_data.loc[:, 'question'] = single_choice_data['question'].astype(str).fillna('')
single_choice_data.loc[:, 'exp'] = single_choice_data['exp'].astype(str).fillna('')
single_choice_data.loc[:, 'cop'] = single_choice_data['cop'].astype(str).fillna('')
single_choice_data.loc[:, 'opa'] = single_choice_data['opa'].astype(str).fillna('')
single_choice_data.loc[:, 'opb'] = single_choice_data['opb'].astype(str).fillna('')
single_choice_data.loc[:, 'opc'] = single_choice_data['opc'].astype(str).fillna('')
single_choice_data.loc[:, 'opd'] = single_choice_data['opd'].astype(str).fillna('')
single_choice_data.loc[:, 'subject_name'] = single_choice_data['subject_name'].astype(str).fillna('')
single_choice_data.loc[:, 'topic_name'] = single_choice_data['topic_name'].astype(str).fillna('')
# dropping redundant cols
single_choice_data = single_choice_data.drop(columns=['choice_type'])
single_choice_data = single_choice_data.drop(columns=['id'])
# Basic Sanity checks
# Dimension
print("Dimensions of the dataframe:")
print(single_choice_data.shape)
# Duplicates 
duplicates_count = single_choice_data.duplicated().sum()
duplicates_percentage = 100 * duplicates_count / len(single_choice_data)
print(f"Duplicates in the dataframe: {duplicates_count} ({duplicates_percentage:.2f}%)")
# Cardinality (for categorical features)
print("Cardinality of categorical features in the dataframe:")
print(single_choice_data.select_dtypes(include="object").nunique())

  single_choice_data.loc[:, 'cop'] = single_choice_data['cop'].astype(str).fillna('')


Dimensions of the dataframe:
(120765, 9)
Duplicates in the dataframe: 0 (0.00%)
Cardinality of categorical features in the dataframe:
question        120765
exp             103814
cop                  4
opa              51427
opb              53743
opc              54864
opd              56004
subject_name        21
topic_name        2307
dtype: int64


In [14]:
sm_single_choice_data = single_choice_data.sample(n=18949, random_state=42)

<p><span style="font-size: 24pt;"><strong>Creating DataLoaders</strong></span></p>

In [16]:
# Sampling small subset of the dataset
sm_enfr_df = enfr_df.sample(n=18949, random_state=42)
sm_rt_df = rt_df.sample(n=18949, random_state=42)
sm_summ_df = summ_df.sample(n=18949, random_state=42) 
sm_mcq_df = single_choice_data.sample(n=18949, random_state=42) 

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Trranslation dataset
enfr_inputs = tokenizer(sm_enfr_df['en'].tolist(), return_tensors='pt', padding=True, truncation=True)
enfr_labels = tokenizer(sm_enfr_df['fr'].tolist(), return_tensors='pt', padding=True, truncation=True).input_ids
print("Translation Data Sizes")
print(f"Input IDs: {enfr_inputs['input_ids'].size()}")
print(f"Attention Mask: {enfr_inputs['attention_mask'].size()}")
print(f"Labels: {enfr_labels.size()}")
translation_dataset = TensorDataset(enfr_inputs['input_ids'], enfr_inputs['attention_mask'], enfr_labels)
translation_dataloader = DataLoader(translation_dataset, batch_size=8, shuffle=True)

# Sentiment classification dataset
rt_inputs = tokenizer(sm_rt_df['Review'].tolist(), return_tensors='pt', padding=True, truncation=True)
rt_labels = torch.tensor([1 if label == 'fresh' else 0 for label in sm_rt_df['Freshness'].tolist()])
print("\nSentiment Classification Data Sizes")
print(f"Input IDs: {rt_inputs['input_ids'].size()}")
print(f"Attention Mask: {rt_inputs['attention_mask'].size()}")
print(f"Labels: {rt_labels.size()}")
classification_dataset = TensorDataset(rt_inputs['input_ids'], rt_inputs['attention_mask'], rt_labels)
classification_dataloader = DataLoader(classification_dataset, batch_size=8, shuffle=True)

# Text Summarization dataset
text_inputs = tokenizer(sm_summ_df['text'].tolist(), return_tensors='pt', padding=True, truncation=True)
summary_labels = tokenizer(sm_summ_df['summary'].tolist(), return_tensors='pt', padding=True, truncation=True).input_ids
print("\nSummarization Data Sizes")
print(f"Text Input IDs: {text_inputs['input_ids'].size()}")
print(f"Text Attention Mask: {text_inputs['attention_mask'].size()}")
print(f"Summary Labels: {summary_labels.size()}")
summarization_dataset = TensorDataset(text_inputs['input_ids'], text_inputs['attention_mask'], summary_labels)
summarization_dataloader = DataLoader(summarization_dataset, batch_size=8, shuffle=True)

# Multiple choice dataset
mcq_inputs = tokenizer(sm_mcq_df['question'].tolist(), return_tensors='pt', padding=True, truncation=True)
mcq_labels = torch.tensor([1 if label == 'correct' else 0 for label in sm_mcq_df['cop'].tolist()])
print("\nMultiple Choice Data Sizes")
print(f"Question Input IDs: {mcq_inputs['input_ids'].size()}")
print(f"Question Attention Mask: {mcq_inputs['attention_mask'].size()}")
print(f"Labels: {mcq_labels.size()}")
mcq_dataset = TensorDataset(mcq_inputs['input_ids'], mcq_inputs['attention_mask'], mcq_labels)
mcq_dataloader = DataLoader(mcq_dataset, batch_size=8, shuffle=True)

# Inspecting the DataLoaders: 1 batch
def inspect_dataloader(dataloader, name):
    print(f"\n{name} DataLoader Inspection:")
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        print(f"Input IDs size: {input_ids.size()}")
        print(f"Attention Mask size: {attention_mask.size()}")
        print(f"Labels size: {labels.size()}")
        break  

inspect_dataloader(translation_dataloader, "Translation")
inspect_dataloader(classification_dataloader, "Sentiment Classification")
inspect_dataloader(summarization_dataloader, "Summarization")
inspect_dataloader(mcq_dataloader, "Multiple Choice")

Translation Data Sizes
Input IDs: torch.Size([18949, 512])
Attention Mask: torch.Size([18949, 512])
Labels: torch.Size([18949, 512])

Sentiment Classification Data Sizes
Input IDs: torch.Size([18949, 116])
Attention Mask: torch.Size([18949, 116])
Labels: torch.Size([18949])

Summarization Data Sizes
Text Input IDs: torch.Size([18949, 512])
Text Attention Mask: torch.Size([18949, 512])
Summary Labels: torch.Size([18949, 512])

Multiple Choice Data Sizes
Question Input IDs: torch.Size([18949, 278])
Question Attention Mask: torch.Size([18949, 278])
Labels: torch.Size([18949])

Translation DataLoader Inspection:
Input IDs size: torch.Size([8, 512])
Attention Mask size: torch.Size([8, 512])
Labels size: torch.Size([8, 512])

Sentiment Classification DataLoader Inspection:
Input IDs size: torch.Size([8, 116])
Attention Mask size: torch.Size([8, 116])
Labels size: torch.Size([8])

Summarization DataLoader Inspection:
Input IDs size: torch.Size([8, 512])
Attention Mask size: torch.Size([8, 512

<p><span style="font-size: 24pt;"><strong>Defining `BertCustomHead` and training</strong></span></p>

In [3]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertConfig
from torch.utils.data import DataLoader


class BertCustomHead(nn.Module):
    """
    Defines the BertCustomHead module. 
    """
    def __init__(self, config, num_classes, task_type='sequence_classification'):
        super(BertCustomHead, self).__init__()
        self.bert = BertModel(config)
        self.task_type = task_type
        self.heads = {
            'sequence_classification': nn.Linear(config.hidden_size, num_classes),
            'token_classification': nn.Linear(config.hidden_size, num_classes),
            'multiple_choice': nn.Linear(config.hidden_size, 1),
            'next_sentence_prediction': BertOnlyNSPHead(config)  
        }
        self.loss_fns = {
            'sequence_classification': nn.CrossEntropyLoss(),
            'token_classification': nn.CrossEntropyLoss(),
            'multiple_choice': nn.BCEWithLogitsLoss(),
            'next_sentence_prediction': nn.CrossEntropyLoss()
        }
        
        if task_type not in self.heads:
            raise ValueError("Invalid task type. Supported types: 'sequence_classification', 'token_classification', 'multiple_choice', 'next_sentence_prediction'")

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, next_sentence_labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        return self.heads[self.task_type](pooled_output)

def train(model, dataloaders, optimizer, num_epochs=5, device='cuda'):
    """
    Training loop for `num_epochs` epochs.
    
    Example usage:
    
            config = BertConfig.from_pretrained('bert-base-uncased')
            model = BertCustomHead(config, num_classes=2, task_type='sequence_classification')
            optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

            dataloaders = {
                'translation': translation_dataloader,
            }

            train(model, dataloaders, optimizer, num_epochs=5, device='cuda')

    """
    model.to(device)
    model.train()

    task_losses = {task_name: [] for task_name in dataloaders.keys()}

    for epoch in range(num_epochs):
        total_loss = 0.0
        for name, dataloader in dataloaders.items():
            for batch in dataloader:
                # Move batch to device
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch

                # Determine the task type based on the name of the data loader
                if name == 'translation':
                    task_type = 'sequence_classification'
                    labels = labels.view(-1)  # Flatten the labels for sequence classification
                elif name == 'classification':
                    task_type = 'sequence_classification'
                elif name == 'summarization':
                    task_type = 'token_classification'
                elif name == 'multiple_choice':
                    task_type = 'multiple_choice'
                    # Ensure labels have the same batch size as logits
                    labels = labels.unsqueeze(1).float()  # Add a dimension to match logits
                    labels = labels.expand(-1, logits.size(1))  # Expand labels to match logits shape
                else:
                    raise ValueError("Invalid data loader name. Supported names: 'translation', 'classification', 'summarization', 'multiple_choice'")    
                # Forward pass
                logits = model(input_ids, attention_mask)

                # Calculate loss
                loss = model.loss_fns[task_type](logits, labels)

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                task_losses[name].append(loss.item())
                
        print(f"Epoch {epoch+1}/{num_epochs}, Total Loss: {total_loss}")
        for task_name, losses in task_losses.items():
            print(f"Task: {task_name}, Last Loss: {losses[-1]}")





NameError: name 'BertOnlyNSPHead' is not defined