In [None]:
!pip install transformers==4.28.0
!pip install optuna

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, optuna, re, string
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import BertModel, BertTokenizer

# One seed for Python's `random`, NumPy and PyTorch so that shuffling and
# weight initialisation repeat from run to run.
SEED = 15
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine (or Colab runtime) that has no CUDA device attached.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Accession IDs; one cleaned CSV exists per ID.
unique_accessions = [
    'VH134067', 'VH139380', 'VH266015', 'VH266510', 'VH269384',
    'VH271613', 'VH302907', 'VH304954', 'VH507804', 'VH525628',
]

# Maps accession ID -> DataFrame read from that accession's CSV.
dfs = {}

for accession in unique_accessions:
    # Files are named "df_cleaned<ACCESSION>.csv" inside the Drive folder.
    path = '/content/drive/MyDrive/NAEP_Comp/'
    df_name = f'df_cleaned{accession}'
    df = pd.read_csv(f'{path}{df_name}.csv')
    dfs[accession] = df

In [None]:
# Select the frame loaded for accession 'VH525628'; the following cells operate on `df`.
df = dfs['VH525628']


In [None]:
# Drop rows that have no text to score. `.copy()` yields an independent frame,
# so the later column assignment on `df` does not raise pandas'
# SettingWithCopyWarning. The original index is kept on purpose.
df = df.dropna(subset = ['predict_from']).copy()

In [None]:
def preprocess(text):
    """Normalise one raw text string before tokenisation.

    Steps, in order: lowercase; remove URLs (together with the rest of their
    line); decode the HTML entities ``&amp;``, ``&lt;`` and ``&gt;``; remove
    ``#`` signs and ``@mentions``; drop non-ASCII characters; strip
    punctuation other than ``.``, ``!`` and ``?``; collapse runs of ``!``,
    ``?`` and ``.`` to a single character; remove apostrophes and
    parentheses; collapse whitespace.

    Args:
        text (str): Raw text.

    Returns:
        str: Lowercase, single-spaced, cleaned text.
    """
    text = text.lower()

    # Remove hyperlinks. `https?` already matches both http:// and https://.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    # Decode HTML entities. `&amp;?` is a regular expression, so it has to go
    # through re.sub; str.replace would look for the literal text "&amp;?".
    text = re.sub(r'&amp;?', ' and ', text)
    # The decoded < and > are removed again by the punctuation pass below.
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # Remove hashtag signs and @mentions.
    text = re.sub(r"#", "", text)
    text = re.sub(r"(?:\@)\w+", '', text)

    # Remove non-ASCII characters.
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuation except . ! ?
    # The hyphen is escaped: an unescaped `,-/` inside a character class is a
    # range that also contains `.`, which would delete every full stop and
    # decimal point.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse repeated sentence punctuation to a single character.
    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)

    # Remove apostrophes and parentheses.
    text = re.sub(r"['()]", "", text)

    # Collapse every run of whitespace to one space and trim the ends.
    return " ".join(text.split())

In [None]:
# Overwrite the text column with its cleaned version (see preprocess above).
df['predict_from'] = df['predict_from'].apply(preprocess)

In [None]:
from transformers.models.bert.modeling_bert import BertForSequenceClassification
# Instantiate the BERT sequence classifier with hidden states exposed.
# The checkpoint id is written out here because `MODEL` is only assigned in a
# later cell; referencing it at this point raises NameError on a fresh kernel
# (Restart & Run All). `.to(device)` follows the notebook-wide `device`.
model = BertForSequenceClassification.from_pretrained("tbs17/MathBERT", output_hidden_states=True).to(device)

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Hugging Face checkpoint id; used when loading the tokenizer in the next cell.
MODEL = "tbs17/MathBERT"

In [None]:
# Tokenizer for the MODEL checkpoint; geIdsAndMasks() below reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL, do_lower_case=True)

In [None]:
def geIdsAndMasks(df, max_length=80):
    """Tokenise ``df['predict_from']`` with the module-level ``tokenizer``.

    All texts are encoded in one batched tokenizer call. Each sequence gets
    the tokenizer's special tokens and is truncated or padded to exactly
    ``max_length`` tokens.

    Args:
        df: DataFrame with a ``predict_from`` column holding the texts.
        max_length (int): Fixed sequence length. Defaults to 80, the value
            that was previously hard-coded.

    Returns:
        tuple: ``(input_ids, attention_masks)`` - the tokenizer's
        ``'input_ids'`` and ``'attention_mask'`` outputs, requested as
        PyTorch tensors with one row per text.
    """
    sentences = df['predict_from'].tolist()
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Tokenise every text in `df` with the MathBERT tokenizer.
# NOTE(review): `imput_ids` looks like a typo for `input_ids`; neither spelling is
# read again in the visible cells - confirm nothing else uses it before renaming.
imput_ids, attention_masks = geIdsAndMasks(df)

In [None]:
from transformers import ElectraConfig, AutoConfig, AutoModelForSequenceClassification, BertConfig, BertModel,AutoModel

In [None]:
# Load the MathBERT encoder as a plain BertModel, configured with output_hidden_states=True.
# This rebinds `model`, replacing the BertForSequenceClassification instance created earlier.
config = BertConfig.from_pretrained("tbs17/MathBERT", output_hidden_states=True)
model = BertModel.from_pretrained("tbs17/MathBERT", config=config)

In [None]:
class BertForSentenceClassification(BertModel):
    """BERT encoder with a linear classification head on the pooled output.

    The head maps the encoder's ``pooler_output`` (size ``config.hidden_size``)
    to ``config.num_labels`` unnormalised scores (logits).
    """

    def __init__(self, config):
        """
        @param    config: BERT configuration; `hidden_size` and `num_labels` size the head
        """
        super().__init__(config)
        # No softmax layer is added: CrossEntropyLoss works on raw logits.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, labels=None, **kwargs):
        """Run the encoder and classify its pooled output.

        @param    labels: optional class-index tensor; when given, the loss is returned too
        @param    kwargs: forwarded unchanged to `BertModel.forward`
                  (e.g. `input_ids`, `attention_mask`)
        @return   `(logits,)` without labels, `(logits, loss)` with labels
        """
        encoder_outputs = super().forward(**kwargs)
        logits = self.classifier(encoder_outputs.pooler_output)
        if labels is None:
            return (logits,)
        return (logits, self.loss(logits, labels))

In [None]:

class DistillBertClassifier(nn.Module):
    """DistilBERT encoder followed by a small feed-forward classification head.
    """
    def __init__(self, freeze_bert=False, num_labels = 3):
        """
        @param    freeze_bert (bool): Set `True` to freeze the encoder weights so only
                  the head is trained; `False` (default) fine-tunes the encoder too
        @param    num_labels (int): Number of output classes (size of the logits)
        """
        super(DistillBertClassifier, self).__init__()

        # Instantiate the pretrained encoder
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased', num_labels = num_labels)

        # Input width of the head comes from the encoder's own config instead of a
        # hard-coded 768; H is the head's hidden width, D_out the number of classes.
        D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels

        # Two linear layers with a ReLU in between
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

        # Optionally freeze the encoder
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        @param    input_ids: token-id tensor passed to the encoder
        @param    attention_mask: mask tensor passed to the encoder
        @return   logits: output of the classification head
        """
        # Feed input to the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First output of the encoder, taken at sequence position 0
        # (the `[CLS]` token) for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed the [CLS] vector to the classifier to compute logits
        return self.classifier(last_hidden_state_cls)

In [None]:

class ElectraModel(nn.Module):
    """ELECTRA (base discriminator) encoder followed by a small feed-forward
    classification head.
    """
    def __init__(self, freeze_bert=False, num_labels = 3):
        """
        @param    freeze_bert (bool): Set `True` to freeze the encoder weights so only
                  the head is trained; `False` (default) fine-tunes the encoder too
        @param    num_labels (int): Number of output classes (size of the logits)
        """
        super(ElectraModel, self).__init__()

        # Instantiate the pretrained encoder
        self.bert = AutoModel.from_pretrained('google/electra-base-discriminator',num_labels=  num_labels)

        # Input width of the head comes from the encoder's own config instead of a
        # hard-coded 768; H is the head's hidden width, D_out the number of classes.
        D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels

        # Two linear layers with a ReLU in between
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

        # Optionally freeze the encoder
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        @param    input_ids: token-id tensor passed to the encoder
        @param    attention_mask: mask tensor passed to the encoder
        @return   logits: output of the classification head
        """
        # Feed input to the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # First output of the encoder, taken at sequence position 0
        # (the `[CLS]` token) for the classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed the [CLS] vector to the classifier to compute logits
        return self.classifier(last_hidden_state_cls)

In [None]:
# Sanity check: number of missing texts left in `df` (rows with missing text were dropped earlier).
df['predict_from'].isna().sum()

In [None]:
def datapreprocess(df, max_len=50, test_size=0.2, random_state=42):
    """Tokenise the texts for all three encoders and make one stratified split.

    Every text in ``df.predict_from`` is encoded with the MathBERT, ELECTRA and
    DistilBERT tokenizers, padded/truncated to ``max_len`` tokens. All encoded
    arrays, the labels and the DataFrame index are then split in a single
    ``train_test_split`` call, so position *i* in every returned train (or
    test) array refers to the same row of ``df``.

    @param    df: DataFrame with `predict_from` (text) and `assigned_score` (label) columns
    @param    max_len (int): fixed token length per sequence (default 50)
    @param    test_size (float): fraction of rows held out for testing (default 0.2)
    @param    random_state (int): seed for the split (default 42)
    @return   15-tuple, in this order:
              train_inputs1, train_inputs2, train_inputs3,
              test_inputs1, test_inputs2, test_inputs3,
              train_masks1, train_masks2, train_masks3,
              test_masks1, test_masks2, test_masks3,
              train_labels, test_labels, test_indexes
              (suffix 1 = MathBERT, 2 = ELECTRA, 3 = DistilBERT)
    """
    checkpoints = (
        'tbs17/MathBERT',
        'google/electra-base-discriminator',
        'distilbert-base-uncased',
    )

    # The tokenizers insert their own special tokens (add_special_tokens=True),
    # so the text is passed as is. Wrapping it in a literal "[CLS] ... [SEP]"
    # as well would put those markers into every sequence twice.
    texts = df.predict_from.tolist()
    labels = df.assigned_score.values
    indexes = df.index.values

    # Flat list: [ids_1, masks_1, ids_2, masks_2, ids_3, masks_3]
    encoded = []
    for checkpoint in checkpoints:
        tokenizer_for_checkpoint = AutoTokenizer.from_pretrained(checkpoint, do_lower_case=True)
        batch = tokenizer_for_checkpoint(
            texts, add_special_tokens=True, padding='max_length', max_length=max_len,
            return_tensors='pt', truncation=True)
        encoded.append(batch['input_ids'])
        encoded.append(batch['attention_mask'])

    # One call splits everything with the same row permutation.
    (train_inputs1, test_inputs1, train_masks1, test_masks1,
     train_inputs2, test_inputs2, train_masks2, test_masks2,
     train_inputs3, test_inputs3, train_masks3, test_masks3,
     train_labels, test_labels,
     _train_indexes, test_indexes) = train_test_split(
        *encoded, labels, indexes,
        random_state=random_state, test_size=test_size, stratify=labels)

    return (train_inputs1, train_inputs2, train_inputs3,
            test_inputs1, test_inputs2, test_inputs3,
            train_masks1, train_masks2, train_masks3,
            test_masks1, test_masks2, test_masks3,
            train_labels, test_labels, test_indexes)

In [None]:
# Unpack the split: suffix 1 = MathBERT, 2 = ELECTRA, 3 = DistilBERT tokenisation.
(
    train_inputs1, train_inputs2, train_inputs3,
    test_inputs1, test_inputs2, test_inputs3,
    train_masks1, train_masks2, train_masks3,
    test_masks1, test_masks2, test_masks3,
    train_labels, test_labels, test_indexes,
) = datapreprocess(df)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert the label arrays to torch.Tensor
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(test_labels)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training loaders, one per tokenisation (1 = MathBERT, 2 = ELECTRA, 3 = DistilBERT).
# Each RandomSampler shuffles independently, so batches drawn from different
# training loaders are NOT row-aligned with each other.
train_data1 = TensorDataset(train_inputs1, train_masks1, train_labels)
train_sampler1 = RandomSampler(train_data1)
train_dataloader1 = DataLoader(train_data1, sampler=train_sampler1, batch_size=batch_size)

train_data2 = TensorDataset(train_inputs2, train_masks2, train_labels)
train_sampler2 = RandomSampler(train_data2)
train_dataloader2 = DataLoader(train_data2, sampler=train_sampler2, batch_size=batch_size)

train_data3 = TensorDataset(train_inputs3, train_masks3, train_labels)
train_sampler3 = RandomSampler(train_data3)
train_dataloader3 = DataLoader(train_data3, sampler=train_sampler3, batch_size=batch_size)

# Validation loaders (sequential order). Every dataset is
# (input_ids, attention_mask, labels); loaders 2 and 3 must receive their
# attention masks in the second slot, not a second copy of the input ids.
val_data1 = TensorDataset(test_inputs1, test_masks1, val_labels)
val_sampler1 = SequentialSampler(val_data1)
val_dataloader1 = DataLoader(val_data1, sampler=val_sampler1, batch_size=batch_size)

val_data2 = TensorDataset(test_inputs2, test_masks2, val_labels)
val_sampler2 = SequentialSampler(val_data2)
val_dataloader2 = DataLoader(val_data2, sampler=val_sampler2, batch_size=batch_size)

val_data3 = TensorDataset(test_inputs3, test_masks3, val_labels)
val_sampler3 = SequentialSampler(val_data3)
val_dataloader3 = DataLoader(val_data3, sampler=val_sampler3, batch_size=batch_size)

In [None]:
# Use one class count for all three models so every head emits the same number
# of logits (ELECTRA and DistilBERT would otherwise fall back to their default of 3).
# NOTE(review): initialize_model() builds the same three models again and the
# later call rebinds these names - this cell may be redundant.
num_labels = df['assigned_score'].nunique()
mathbert = BertForSentenceClassification.from_pretrained("tbs17/MathBERT", num_labels=num_labels)
electrbert = ElectraModel(freeze_bert=False, num_labels=num_labels)
distill = DistillBertClassifier(freeze_bert=False, num_labels=num_labels)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Build the three classifiers and one AdamW optimizer for each of them.

    The number of classes is read from the module-level ``df`` and used for
    all three models, so their logits have the same width.

    @param    epochs (int): kept for backward compatibility; currently unused
              because no learning-rate scheduler is created
    @return   (mathbert, electrbert, distill, optimizer1, optimizer2, optimizer3)
              where optimizer N belongs to the N-th model
    """
    # Run on the GPU when there is one, otherwise on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_labels = df['assigned_score'].nunique()

    # Instantiate the classifiers and move them to the device
    mathbert = BertForSentenceClassification.from_pretrained("tbs17/MathBERT", num_labels=num_labels)
    electrbert = ElectraModel(freeze_bert=False, num_labels=num_labels)
    distill = DistillBertClassifier(freeze_bert=False, num_labels=num_labels)
    networks = (mathbert, electrbert, distill)
    for network in networks:
        network.to(device)

    # One optimizer per model, all with the same hyper-parameters
    optimizer1, optimizer2, optimizer3 = (
        AdamW(network.parameters(), lr=5e-5, eps=1e-8) for network in networks
    )

    return mathbert, electrbert, distill, optimizer1, optimizer2, optimizer3

In [None]:
# Build the models and their optimizers; this rebinds mathbert / electrbert / distill from the earlier cell.
mathbert, electrbert, distill,optimizer1, optimizer2, optimizer3  = initialize_model(4)

In [None]:
from sklearn.metrics import cohen_kappa_score

In [None]:
import torch.optim as optim



# Define the loss function
loss_function = nn.CrossEntropyLoss()

# NOTE(review): this optimizer wraps `model` (the plain BertModel loaded earlier)
# and is never stepped below. It is kept only so the name `optimizer` stays
# defined for any cell that may rely on it; the loop uses `optimizer2`.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of training epochs
num_epochs = 4

mathbert.to(device)
electrbert.to(device)
distill.to(device)

mathbert.train()
distill.train()

# Only the ELECTRA classifier is trained and validated in this loop;
# `mathbert` and `distill` are moved to the device but never updated.
for epoch in range(num_epochs):
    # ---- Training phase ----
    # Re-enter train mode each epoch, because validation switches to eval mode.
    electrbert.train()

    train_loss = 0
    train_correct = 0
    total_train = 0
    for batch in train_dataloader2:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Clear gradients
        optimizer2.zero_grad()

        # Forward pass and loss on this batch's own labels
        logits = electrbert(b_input_ids, b_attn_mask)
        loss = loss_function(logits, b_labels)

        # Backward pass and weight update
        loss.backward()
        optimizer2.step()

        # Accumulate the loss once per step
        train_loss += loss.item()

        # Accuracy against the labels of the same batch the logits came from
        preds = torch.argmax(logits, dim=1).flatten()
        train_correct += (preds == b_labels).sum().item()
        total_train += b_labels.size(0)

    avg_train_loss = train_loss / len(train_dataloader2)
    print(avg_train_loss)

    # ---- Validation phase ----
    # Put the model being evaluated into eval mode.
    electrbert.eval()
    val_accuracy = []
    val_loss = []
    val_predictions = []
    # Rebinds `val_labels` (earlier the label tensor) to a plain list of ints;
    # the validation datasets already hold their own reference to the tensor.
    val_labels = []
    with torch.no_grad():
        for batch in val_dataloader2:
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Forward pass on the validation batch itself
            logits = electrbert(b_input_ids, b_attn_mask)

            # Calculate loss
            loss = loss_function(logits, b_labels)
            val_loss.append(loss.item())

            # Calculate accuracy
            preds = torch.argmax(logits, dim=1).flatten()
            accuracy = (preds == b_labels).cpu().numpy().mean() * 100
            val_accuracy.append(accuracy)
            val_predictions.extend(preds.tolist())
            val_labels.extend(b_labels.tolist())

    # Calculate average loss and accuracy
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    quadratic_kappa = cohen_kappa_score(val_labels, val_predictions, weights='quadratic')
    # Print training progress for each epoch
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Val Loss: {val_loss:.4f}, Kappa: {quadratic_kappa:.4f}")
    print("--------------------")
