# Imports and Downloads

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.metrics import classification_report


In [2]:
def seed_everything(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(1)

# CNNLSTMClassifiers - To load old models

In [3]:
class CNNLSTMClassifier(nn.Module):
    def __init__(self, bert, in_channels, cnn_out_channels=64, lstm_hidden_dim=64, num_classes=2):
        super(CNNLSTMClassifier, self).__init__()
        self.bert = bert
        self.cnn = nn.Conv1d(in_channels=in_channels, out_channels=cnn_out_channels, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(cnn_out_channels, lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)
        
    def forward(self, input_ids, attention_mask):
        with torch.set_grad_enabled(self.bert.training):
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_output.last_hidden_state.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        
        cnn_out = self.cnn(embeddings)
        
        lstm_out, _ = self.lstm(cnn_out.permute(0, 2, 1))  # (batch, seq_len, lstm_hidden_dim)
        
        logits = self.fc(lstm_out[:, -1, :])  # Use last hidden state for classification
        return logits

# Calculating weights for models

In [5]:
models = [ "distilbert-base-uncased", "huawei-noah/TinyBERT_General_4L_312D", "google/electra-small-discriminator"]
val_f1s = [0.8959, 0.8635, 0.8382]

In [6]:
model_weightages = []
total_f1s = sum(val_f1s)
for i in range(len(val_f1s)):
    model_weightages.append(round((val_f1s[i]/total_f1s), 4)) 
print(model_weightages)

[0.3449, 0.3324, 0.3227]


# Ensembling Model

Here, ensembling will be performed on the already trained baseline models. These transformers are frozen (ie, not trained again). Soft voting is used where each embedding is given a weight, and these weights are marked as a trainable parameter.

In [7]:
class CNNLSTMEnsemble(nn.Module):
    def __init__(self, distilbert, tinybert, electra, model_weightages, cnn_out_channels=64, lstm_hidden_dim=64, num_classes=2):
        super(CNNLSTMEnsemble, self).__init__()
        self.distilbert = distilbert.bert
        self.tinybert = tinybert.bert
        self.electra = electra.bert
        self.model_weights = nn.Parameter(torch.FloatTensor(model_weightages), requires_grad=True).to(device)
        self.tinybert_projection = nn.Linear(312, 768)  # Project TinyBERT to 768
        self.electra_projection = nn.Linear(256, 768)   # Project Electra to 768
        self.cnn = nn.Conv1d(in_channels=768, out_channels=cnn_out_channels, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(cnn_out_channels, lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)

    def forward(self, distilbert_input_ids, tinybert_input_ids, electra_input_ids, distilbert_attention_mask, tinybert_attention_mask, electra_attention_mask):
        with torch.no_grad():
            distilbert_output = self.distilbert(input_ids=distilbert_input_ids, attention_mask=distilbert_attention_mask).last_hidden_state
            tinybert_output = self.tinybert(input_ids=tinybert_input_ids, attention_mask=tinybert_attention_mask).last_hidden_state
            electra_output = self.electra(input_ids=electra_input_ids, attention_mask=electra_attention_mask).last_hidden_state

        tinybert_embedding =  F.relu(self.tinybert_projection(tinybert_output)).permute(0, 2, 1)
        electra_embedding =  F.relu(self.electra_projection(electra_output)).permute(0, 2, 1) 
        distilbert_embedding = distilbert_output.permute(0, 2, 1)  

        model_weights_n = F.softmax(self.model_weights, dim=0)
        combined_embedding = (model_weights_n[0] * tinybert_embedding 
                              + model_weights_n[1] * electra_embedding 
                              + model_weights_n[2] * distilbert_embedding)

        cnn_out = self.cnn(combined_embedding)
        lstm_out, _ = self.lstm(cnn_out.permute(0, 2, 1))

        logits = self.fc(lstm_out[:, -1, :])

        return logits

In [8]:
class CNNLSTMHybridEnsemble(nn.Module):
    def __init__(self, distilbert, tinybert, electra, cnn_out_channels=64, lstm_hidden_dim=64, num_classes=2):
        super(CNNLSTMHybridEnsemble, self).__init__()
        self.distilbert = distilbert.bert
        self.tinybert = tinybert.bert
        self.electra = electra.bert
        
        self.tinybert_projection = nn.Linear(312, 768)  # Project TinyBERT to 768
        self.electra_projection = nn.Linear(256, 768)   # Project ELECTRA to 768
        
        self.combination_mlp = nn.Sequential(
            nn.Linear(768 * 3, 768),
            nn.ReLU(),
            nn.Linear(768, 768)
        )
        
        self.cnn = nn.Conv1d(in_channels=768, out_channels=cnn_out_channels, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(cnn_out_channels, lstm_hidden_dim, batch_first=True)
        
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)

    def forward(self, distilbert_input_ids, tinybert_input_ids, electra_input_ids, distilbert_attention_mask, tinybert_attention_mask, electra_attention_mask):
        with torch.no_grad():
            # Get frozen outputs from each pretrained model
            distilbert_output = self.distilbert(input_ids=distilbert_input_ids, attention_mask=distilbert_attention_mask).last_hidden_state
            tinybert_output = self.tinybert(input_ids=tinybert_input_ids, attention_mask=tinybert_attention_mask).last_hidden_state
            electra_output = self.electra(input_ids=electra_input_ids, attention_mask=electra_attention_mask).last_hidden_state

        tinybert_embedding = F.relu(self.tinybert_projection(tinybert_output)).permute(0, 2, 1)
        electra_embedding = F.relu(self.electra_projection(electra_output)).permute(0, 2, 1)
        distilbert_embedding = distilbert_output.permute(0, 2, 1)

        combined_embedding = torch.cat((distilbert_embedding, tinybert_embedding, electra_embedding), dim=1)
        combined_embedding = self.combination_mlp(combined_embedding.permute(0, 2, 1)).permute(0, 2, 1)

        cnn_out = self.cnn(combined_embedding)
        lstm_out, _ = self.lstm(cnn_out.permute(0, 2, 1))

        # Fully connected layer to get logits
        logits = self.fc(lstm_out[:, -1, :])


# Dataloaders

In [9]:
class CustomDataset(Dataset):
    def __init__(self, distilbert_tokens, electra_tokens, tinybert_tokens, labels):
        self.distilbert_input_ids = distilbert_tokens['input_ids']
        self.electra_input_ids = electra_tokens['input_ids']
        self.tinybert_input_ids = tinybert_tokens['input_ids']
        
        self.distilbert_attention_mask = distilbert_tokens['attention_mask']
        self.electra_attention_mask =  electra_tokens['attention_mask']
        self.tinybert_attention_mask = tinybert_tokens['attention_mask']
        
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'distilbert_input_ids': self.distilbert_input_ids[idx],
            'electra_input_ids': self.electra_input_ids[idx],
            'tinybert_input_ids': self.tinybert_input_ids[idx], 
            
            'distilbert_attention_mask': self.distilbert_attention_mask[idx],
            'electra_attention_mask': self.electra_attention_mask[idx], 
            'tinybert_attention_mask': self.tinybert_attention_mask[idx], 
            
            'labels': self.labels[idx]
        }

In [10]:
train_DB_inputs = torch.load('intermediates/DB_inputs.pt')
dev_DB_inputs = torch.load('intermediates/DB_dev_inputs.pt')
test_DB_inputs = torch.load('intermediates/DB_test_inputs.pt')

train_TB_inputs = torch.load('intermediates/TB_inputs.pt')
dev_TB_inputs = torch.load('intermediates/TB_dev_inputs.pt')
test_TB_inputs = torch.load('intermediates/TB_test_inputs.pt')

train_EL_inputs = torch.load('intermediates/EL_inputs.pt')
dev_EL_inputs = torch.load('intermediates/EL_dev_inputs.pt')
test_EL_inputs = torch.load('intermediates/EL_test_inputs.pt')

train_labels = torch.load('intermediates/labels.pt')
dev_labels = torch.load('intermediates/labels_dev.pt')
test_labels = torch.load('intermediates/labels_test.pt')

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
print(train_DB_inputs['input_ids'].shape)
print(train_EL_inputs['input_ids'].shape)
print(train_TB_inputs['input_ids'].shape)
print(train_labels.shape)


torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767])


In [12]:
train_dataset = CustomDataset(train_DB_inputs, train_EL_inputs, train_TB_inputs, train_labels)
dev_dataset = CustomDataset(dev_DB_inputs, dev_EL_inputs, dev_TB_inputs, dev_labels)
test_dataset = CustomDataset(test_DB_inputs, test_EL_inputs, test_TB_inputs, test_labels)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=10, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False) 

# Adaptive Model Training

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [14]:
tinybert = torch.load('models/TB_model.pt')
electra = torch.load('models/EL_model.pt')
distilbert = torch.load('models/DB_model.pt')

# model_tinybert = CNNLSTMClassifier(tinybert)
# model_distilbert = CNNLSTMClassifier(distilbert)
# model_electra = CNNLSTMClassifier(electra, in_channels=256)

# model_tinybert.load_state_dict(torch.load('models/TB_model_dict.pt'))
# model_distilbert.load_state_dict(torch.load('models/DB_model_dict.pt'))
# model_electra.load_state_dict(torch.load('models/EL_model_dict.pt'))


In [15]:
ensemble_model = CNNLSTMEnsemble(distilbert, tinybert, electra, model_weightages)
ensemble_model = ensemble_model.to(device)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(ensemble_model.parameters(), lr=2e-5)
num_epochs = 1
ensemble_model.train()

CNNLSTMEnsemble(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): 

In [None]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=1, accumulation_steps=10, device='cuda'):
    model.train()
    model.zero_grad()
    for epoch in range(num_epochs):
        total_loss = 0
        for i, data in enumerate(tqdm(dataloader)):
            distilbert_input_ids = data['distilbert_input_ids'].to(device)
            tinybert_input_ids = data['tinybert_input_ids'].to(device)
            electra_input_ids = data['electra_input_ids'].to(device)
            
            distilbert_attention_mask = data['distilbert_attention_mask'].to(device)
            tinybert_attention_mask = data['tinybert_attention_mask'].to(device)
            electra_attention_mask = data['electra_attention_mask'].to(device)
            
            labels = data['labels'].to(device)
            
            outputs = model(distilbert_input_ids, tinybert_input_ids, electra_input_ids, distilbert_attention_mask, tinybert_attention_mask, electra_attention_mask)
            loss = criterion(outputs, labels)
            loss = loss / accumulation_steps
            loss.backward(retain_graph=True)
            total_loss += loss.item()
            
            if ((i + 1) % accumulation_steps == 0) or (i+1 == len(dataloader)):
                optimizer.step()
                model.zero_grad()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')
    return model

In [28]:
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    print("Classification Report:\n")
    with torch.no_grad():
        for batches in tqdm(dataloader):
            distilbert_input_ids = batches['distilbert_input_ids'].to(device)
            tinybert_input_ids = batches['tinybert_input_ids'].to(device)
            electra_input_ids = batches['electra_input_ids'].to(device)
            
            distilbert_attention_mask = batches['distilbert_attention_mask'].to(device)
            tinybert_attention_mask = batches['tinybert_attention_mask'].to(device)
            electra_attention_mask = batches['electra_attention_mask'].to(device)
            
            labels = batches['labels'].to(device)
            
            outputs = model(distilbert_input_ids, tinybert_input_ids, electra_input_ids, distilbert_attention_mask, tinybert_attention_mask, electra_attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    report = classification_report(true_labels, predictions, digits=4)
    print(report)

In [29]:
def test_model(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []
    print("Classification Report:\n")
    with torch.no_grad():
        for batches in tqdm(dataloader):
            distilbert_input_ids = batches['distilbert_input_ids'].to(device)
            tinybert_input_ids = batches['tinybert_input_ids'].to(device)
            electra_input_ids = batches['electra_input_ids'].to(device)
            
            distilbert_attention_mask = batches['distilbert_attention_mask'].to(device)
            tinybert_attention_mask = batches['tinybert_attention_mask'].to(device)
            electra_attention_mask = batches['electra_attention_mask'].to(device)
            
            labels = batches['labels'].to(device)
            
            outputs = model(distilbert_input_ids, tinybert_input_ids, electra_input_ids, distilbert_attention_mask, tinybert_attention_mask, electra_attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    report = classification_report(true_labels, predictions, digits=4)
    print(report)
    return predictions, true_labels

In [24]:
ensemble_model = train_model(ensemble_model, train_loader, criterion, optimizer, num_epochs, device=device)

100%|██████████| 61077/61077 [3:12:56<00:00,  5.28it/s]  

Epoch 1/1, Loss: 0.0157





In [25]:
torch.save(ensemble_model, 'models/ensemble_model_ae.pt')

In [30]:
evaluate_model(ensemble_model, dev_loader, device)

Classification Report:



100%|██████████| 26176/26176 [1:15:37<00:00,  5.77it/s]


              precision    recall  f1-score   support

           0     0.9221    0.8128    0.8640     98328
           1     0.8948    0.9587    0.9257    163430

    accuracy                         0.9039    261758
   macro avg     0.9085    0.8857    0.8948    261758
weighted avg     0.9051    0.9039    0.9025    261758



In [32]:
all_predictions, all_labels = test_model(ensemble_model, test_loader, device)

Classification Report:



100%|██████████| 7395/7395 [21:18<00:00,  5.78it/s]

              precision    recall  f1-score   support

           0     0.8358    0.5261    0.6457     34675
           1     0.6847    0.9087    0.7809     39266

    accuracy                         0.7293     73941
   macro avg     0.7602    0.7174    0.7133     73941
weighted avg     0.7555    0.7293    0.7175     73941






In [33]:
torch.save(all_predictions, 'predictions/ensemble_ae_predictions.pt')