In [1]:
import torch

# For some reason needed or torchtext will not work...
torch.utils.data.datapipes.utils.common.DILL_AVAILABLE = torch.utils._import_utils.dill_available()

from tqdm import tqdm
from torchtext.datasets import IMDB
import re
from transformers import AutoTokenizer,AutoModel
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
import torch.nn.functional as F

train_iter, test_iter = IMDB()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# Ensure the reproducibility of results

from transformers import set_seed

seed = 42

torch.manual_seed(seed)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Set Seed for transfomers
set_seed(seed)


# Dataset processing

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_and_encode(batch, tokenizer, max_length=512):
    inputs = tokenizer.batch_encode_plus(
        batch,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return inputs['input_ids'], inputs['attention_mask'].to(torch.bool)


def process_dataset(iterator, tokenizer):
    texts = []
    labels = []
    #label 1 is negative, 2 is positive
    for i, (label, text) in enumerate(iterator):
        cleaned_text = clean_text(text)
        texts.append(cleaned_text)
        labels.append(1 if label == 2 else 0)


    input_ids, attention_masks = tokenize_and_encode(texts, tokenizer)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

In [4]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-mini')#'kanishka/GlossBERT')

train_input_ids, train_attention_masks, train_labels = process_dataset(train_iter, tokenizer)
test_input_ids, test_attention_masks, test_labels = process_dataset(test_iter, tokenizer)



In [5]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

vocab_size = tokenizer.vocab_size

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data) # SequentialSampler 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [10]:
len(test_data)

25000

In [None]:
# Used only for testing

#for elem in train_dataloader:
#    input_ids_batch, attention_mask_batch, target_batch = elem
#    output = model(input_ids_batch.to(device), attention_mask = attention_mask_batch.to(device), granularity_level=3)
#    break

torch.Size([32, 512, 768])
torch.Size([32, 768])


# Model definition

In [6]:
class NestedFFN(nn.Module):
    def __init__(self, d_model, d_ff, num_granularities=4):
        super(NestedFFN, self).__init__()

        # Initialize FFN layers
        self.num_granularities = num_granularities
        self.d_model = d_model
        self.d_ff = d_ff

        # Create weight matrices for W1 and W2 with the largest size
        self.W1 = nn.Parameter(torch.randn(d_ff, d_model))
        self.W2 = nn.Parameter(torch.randn(d_ff, d_model))

        # Calculate the sizes of each granularity
        self.granularity_sizes = [d_ff // (2 ** i) for i in range(num_granularities)]
        self.granularity_sizes_mix = []

        # This is for mix' n' match
        for i in range(num_granularities-1) :
            self.granularity_sizes_mix.append(int(1/2 * (self.granularity_sizes[i] + self.granularity_sizes[i+1])))

        #print(self.granularity_sizes_mix)

    def forward(self, x, granularity_level):
        assert 0 <= granularity_level < self.num_granularities, "Invalid granularity level"

        # m_i Number of neuron selected
        m_i = self.granularity_sizes[granularity_level]

        # Perform the FFN operation with the selected subset of weights
        hidden = F.gelu(x @ self.W1[:m_i, :].T)
        output = F.gelu( hidden @ self.W2[:m_i, :] )

        return output
    
    # This function is used only at inference, where we choose different granulaties that the model is not explicitly trained on that granularities
    def forward_mix(self, x, granularity_level):
        # m_i Number of neuron selected
        m_i = self.granularity_sizes_mix[granularity_level]

        # Perform the FFN operation with the selected subset of weights
        hidden = F.gelu(x @ self.W1[:m_i, :].T)
        output = F.gelu( hidden @ self.W2[:m_i, :] )

        return output

class TransformerLayer(nn.Module):
    def __init__(self, d_model, num_heads, nested_ffn, granularity_level, dropout=0.1):
        super(TransformerLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        self.granularity_level = granularity_level
        self.nested_ffn = nested_ffn
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, inference, src_mask=None, src_key_padding_mask=None):
        src2, _ = self.self_attn(src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        src = src + self.dropout(src2)
        src = self.layernorm1(src)

        if inference == False:
            src2 = self.nested_ffn(src, self.granularity_level)
        else:
            src2 = self.nested_ffn.forward_mix(src, self.granularity_level)

        src = src + self.dropout(src2)
        src = self.layernorm2(src)
        return src

class Transformer(nn.Module):
    def __init__(self, d_model, num_layers, num_heads, nested_ffn, num_granularities=4, dropout=0.1):
        super(Transformer, self).__init__()
        self.models = [ ]
        # We Stack l Layers with the same granularity_level
        # Creating M1, M2, ... , Mg
        for id in range(num_granularities):

          self.models.append( nn.ModuleList([
            TransformerLayer(d_model, num_heads, nested_ffn, id, dropout).to(device)
            for _ in range(num_layers)
          ]))

        self.layernorm = nn.LayerNorm(d_model)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, granularity_level = 0, inference = False):
      # So granularity_level indicates the model M_i that we want to use
        for layer in self.models[granularity_level]:
            src = layer(src, inference, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        src = self.layernorm(src)
        return src
    

class SentimentTransformer(nn.Module):
    def __init__(self,d_ff, num_layers, num_heads, granularity_levels=4, dropout=0.1,num_granularities = 4):
        super(SentimentTransformer, self).__init__()

        self.bert = AutoModel.from_pretrained('prajjwal1/bert-mini') #'kanishka/GlossBERT') #, torch_dtype=torch.float16)

        for param in self.bert.parameters():
            param.requires_grad = False

        d_model = self.bert.config.hidden_size
        self.nested_ffn = NestedFFN(d_model, d_ff, num_granularities)

        self.transformer = Transformer(d_model, num_layers, num_heads, self.nested_ffn, granularity_levels, dropout)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(d_model, 512)
        self.fc2 = nn.Linear(512, 1)  # Binary classification
        

    
    def forward(self, input_ids, attention_mask=None, granularity_level=0, inference = False):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state 

        src = self.transformer(hidden_states, granularity_level=granularity_level, inference = inference)
        src = self.relu(self.fc1(torch.mean(src, dim=1)))
        
        src = self.fc2(src)

        return src

# Training

In [7]:
# Hyperparameters
d_ff = 2048
num_granularities = 4
num_layers = 2  
num_heads = 4 
dropout = 0
epochs = 10 
learning_rate = 0.001

# Hyperparameters for single transformer, the rest is the same
#num_granularities = 1
#num_layers = 1  

In [8]:
torch.cuda.empty_cache()

In [9]:

model = SentimentTransformer( d_ff, num_layers, num_heads, dropout=dropout,num_granularities=num_granularities).to(device)



In [10]:
# Count the total number of parameters
total_params = sum(p.numel() for p in model.parameters())
# Count the number of parameters that require gradients
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Total parameters: 12351745
Trainable parameters: 1181185


In [11]:

model.train()

# Initialize the model

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0.0
    for batch in tqdm(train_dataloader, unit='batch'):
        input_ids_batch, attention_mask_batch, target_batch = batch

        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        target_batch = target_batch.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()

        # Compute the loss for each granularity level and combine them
        losses = []
        for granularity_level in range(num_granularities):

            output = model(input_ids_batch, attention_mask = attention_mask_batch, granularity_level=granularity_level)
            loss = criterion(output.flatten(), target_batch.float())
            losses.append(loss)
            
        # Combine the losses
        combined_loss = sum(losses) / num_granularities

        # Backpropagation
        combined_loss.backward()

        # Update parameters
        optimizer.step()

        # Accumulate loss for reporting
        total_loss += combined_loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/batch_size}")


100%|██████████| 782/782 [11:55<00:00,  1.09batch/s]


Epoch 1/10, Loss: 12.342774744145572


100%|██████████| 782/782 [19:05<00:00,  1.46s/batch]


Epoch 2/10, Loss: 11.283162438310683


100%|██████████| 782/782 [19:56<00:00,  1.53s/batch]


Epoch 3/10, Loss: 10.998607359360904


100%|██████████| 782/782 [20:15<00:00,  1.55s/batch]


Epoch 4/10, Loss: 10.67746351333335


100%|██████████| 782/782 [20:23<00:00,  1.57s/batch]


Epoch 5/10, Loss: 10.388783525675535


100%|██████████| 782/782 [20:08<00:00,  1.55s/batch]


Epoch 6/10, Loss: 9.99887270713225


100%|██████████| 782/782 [19:14<00:00,  1.48s/batch]


Epoch 7/10, Loss: 9.659495792817324


100%|██████████| 782/782 [19:12<00:00,  1.47s/batch]


Epoch 8/10, Loss: 9.365285453386605


100%|██████████| 782/782 [18:48<00:00,  1.44s/batch]


Epoch 9/10, Loss: 9.054775128606707


100%|██████████| 782/782 [19:12<00:00,  1.47s/batch]

Epoch 10/10, Loss: 8.775679647922516





# Evaluation

In [12]:

model.eval()

# Number of correct prediction made by M1,...Mg
correct_model = [ 0 for _ in range(num_granularities) ]
# Number of correct prediction made by mix' n' match
correct_model_mix = [ 0 for _ in range(len(model.nested_ffn.granularity_sizes_mix)) ]

total = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader, unit='batch'):
        input_ids_batch, attention_mask_batch, target_batch = batch
        
        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        target_batch = target_batch.to(device)

        for granularity_level in range(num_granularities):
            output = model(input_ids_batch, attention_mask = attention_mask_batch, granularity_level=granularity_level)
            output = torch.sigmoid(output)
            output = torch.tensor([True if prob >0.5 else False for prob in output.flatten()]).to(device)

            correct_prediction = torch.eq(output, target_batch)

            correct_model[granularity_level] += torch.sum(correct_prediction).item()

        for granularity_level in range(len(model.nested_ffn.granularity_sizes_mix)):
            output = model(input_ids_batch, attention_mask = attention_mask_batch, granularity_level=granularity_level, inference = True)
            output = torch.sigmoid(output)
            output = torch.tensor([True if prob >0.5 else False for prob in output.flatten()]).to(device)

            correct_prediction = torch.eq(output, target_batch)

            correct_model_mix[granularity_level] += torch.sum(correct_prediction).item()

        total += torch.sum(target_batch)






100%|██████████| 782/782 [16:23<00:00,  1.26s/batch]


In [13]:
accuracy = [n_correct/len(test_data) for n_correct in correct_model]

n_sub_models_neurons = [d_ff // (2 ** i) for i in range(num_granularities)]

print('Accuracy of each sub-model', accuracy)
print('Each sub-model has number of neurons:', n_sub_models_neurons )
print("So for example, the first sub model with ", n_sub_models_neurons[0], "neurons, has an accuracy of ", accuracy[0])

Accuracy of each sub-model [0.82616, 0.82548, 0.80956, 0.79756]
Each sub-model has number of neurons: [2048, 1024, 512, 256]
So for example, the first sub model with  2048 neurons, has an accuracy of  0.82616


In [14]:
accuracy = [n_correct/len(test_data) for n_correct in correct_model_mix]

n_sub_models_neurons_mix = model.nested_ffn.granularity_sizes_mix

print('Accuracy of each sub-model with different granularities', accuracy)
print('Each sub-model has number of neurons:', n_sub_models_neurons_mix )
print("So for example, the first sub model with ", n_sub_models_neurons_mix[0], "neurons, has an accuracy of ", accuracy[0])

Accuracy of each sub-model with different granularities [0.81008, 0.77784, 0.78828]
Each sub-model has number of neurons: [1536, 768, 384]
So for example, the first sub model with  1536 neurons, has an accuracy of  0.81008


In [14]:
torch.save(model.state_dict(),'model_weights_transformer_2')
#model.load_state_dict(torch.load('model_weights_1'))