In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel

  from .autonotebook import tqdm as notebook_tqdm


# Task 1: Sentence Transformer Implementation

Implement a sentence transformer model using any deep learning framework of your choice. 
This model should be able to encode input sentences into fixed-length embeddings. 
Test your implementation with a few sample sentences and showcase the obtained embeddings. 
Describe any choices you had to make regarding the model architecture outside of the transformer backbone.

## Prepare the dataset  

In [642]:
import pandas as pd
import numpy as np

# Load the dataset
classification_df = pd.read_csv('data/classification_data.csv')
classification_df.head(3)

Unnamed: 0,sentence,label
0,The sun set behind the ancient ruins of Machu ...,Travel
1,"Exploring the bustling markets of Marrakech, t...",Travel
2,"After a long hike, adventurers finally reached...",Travel


In [413]:
# 
classification_df['label'], unique_labels = pd.factorize(classification_df['label'])
num_classes = len(classification_df.label.unique())
# label_array = classification_df['label'].to_numpy()
# one_hot_labels = torch.eye(num_classes)[label_array]
# classification_df['label'] = list(one_hot_labels.numpy())
classification_df.head(3)

Unnamed: 0,sentence,label
0,The sun set behind the ancient ruins of Machu ...,0
1,"Exploring the bustling markets of Marrakech, t...",0
2,"After a long hike, adventurers finally reached...",0


In [414]:
# # Encode classification labels
# classification_df['label'], unique_labels = pd.factorize(classification_df['label'])
# num_classes = len(unique_labels)
# print(num_classes)
# classification_df['one_hot'] = torch.eye(num_classes)[classification_df['label']]
# # print(one_hot)
# # classification_df.describe()
# classification_df.head(3)

In [415]:
# Load the sentiment dataset
sentiment_df = pd.read_csv('data/sentiment_data.csv')
# Transform text label to numerical label 
sentiment_df['label'], unique_labels = pd.factorize(sentiment_df['label'])
num_sentiment = len(unique_labels)
sentiment_df.head(3)

# one_hot_labels = pd.get_dummies(sentiment_df['label'])
# sent_labels = list(one_hot_labels.columns)
# print(sent_labels)
# # Assign the one-hot encoded array as a new column to the DataFrame
# sentiment_df['label'] = list(one_hot_labels.to_numpy(dtype=np.float32))
# sentiment_df.head(3)

Unnamed: 0,sentence,label
0,The tourist's trip to Machu Picchu was ruined ...,0
1,The chaotic crowds and overwhelming noise made...,0
2,"After getting lost on the hike, the adventurer...",0


## Combine the dataset

In [416]:
from sklearn.model_selection import train_test_split

classification_df['task'] = 'class'
sentiment_df['task'] = 'sent'
combined_df = pd.concat([classification_df, sentiment_df])

# Split the combined dataset
train_df, eval_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['task'])

# Ensure the columns are correctly formatted
train_df = train_df[['sentence', 'label', 'task']]
eval_df = eval_df[['sentence', 'label', 'task']]

In [417]:
train_df.head(3)

Unnamed: 0,sentence,label,task
15,The long-awaited sequel to the blockbuster hit...,3,class
2,"After getting lost on the hike, the adventurer...",0,sent
10,The underdog team's loss in the championship g...,0,sent


In [419]:
# num_classes = len(class_labels)
# num_sentiment = len(sent_labels)

## Tokenize the dataset

Using the distilled Bert model to create tokens with a maximum length of 128.

In [420]:
from src.sentrans.constants import MODEL_NAME

# Load the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)

In [421]:
from src.sentrans.constants import TOKEN_MAX_LENGTH

# tokenize classification dataset
tokenized_train_data = tokenizer(train_df['sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
print(tokenized_train_data.keys())

# tokenize classification dataset
tokenized_eval_data = tokenizer(eval_df['sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
print(tokenized_eval_data.keys())

# # tokenize sentiment dataset
# tokenized_sent_data = tokenizer(sentiment_df['Sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
# print(tokenized_sent_data.keys())

dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


Using the model() function, we will get the embeddings for each token in each sentence.

In [422]:
# Create embeddings
def create_embeddings(tokenized_data):
    input_ids = tokenized_data['input_ids']
    attention_mask = tokenized_data['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state
    return embeddings

In [423]:
embeddings = create_embeddings(tokenized_train_data)

print("Embeddings shape:", embeddings.shape)  

Embeddings shape: torch.Size([80, 32, 768])


Embeddings of size 768 are created for each token in each sentence. 

Hence the size of the embeddings is
(number_of_sentences, max_sequence_length, embedding_size)

In [424]:
for i in range(0,2):
    print('\nSentence: \n',classification_df.sentence[i],'\n Embedding: \n', embeddings[i])
    print('\n Embedding Shape: \n', embeddings[i].shape)


Sentence: 
 The sun set behind the ancient ruins of Machu Picchu, casting a golden hue over the Andes mountains. 
 Embedding: 
 tensor([[-0.0950, -0.3981,  0.1139,  ..., -0.2301,  0.4218,  0.1480],
        [-0.3557, -0.7085, -0.2099,  ..., -0.0974,  1.0732, -0.3508],
        [-0.0573, -0.7418,  0.5210,  ..., -0.2448,  0.3817, -0.0507],
        ...,
        [-0.0080, -0.4159,  0.4840,  ..., -0.1446,  0.3752, -0.2440],
        [-0.1118, -0.3623,  0.2918,  ..., -0.1980,  0.2387, -0.1145],
        [-0.2808, -0.3713,  0.3383,  ..., -0.2427,  0.2749, -0.1207]])

 Embedding Shape: 
 torch.Size([32, 768])

Sentence: 
 Exploring the bustling markets of Marrakech, travelers are captivated by the vibrant colors and aromatic spices. 
 Embedding: 
 tensor([[-0.0209,  0.0239, -0.1220,  ...,  0.1055,  0.3616,  0.4078],
        [ 0.0467, -0.2368, -0.1285,  ..., -0.1304,  0.2988,  0.1410],
        [-0.3015, -0.1668,  0.1919,  ..., -0.1315,  0.0691,  0.0147],
        ...,
        [ 0.1730,  0.1865,  0.

# Task 2: Mutli-Task Learning

## Create a Dataset object

In [578]:
class NewsDataset(Dataset):
    def __init__(self, encodings, labels,task):
        self.encodings = encodings
        self.labels = labels
        self.tasks = task

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['task'] = self.tasks[idx]#, dtype=torch.str)
        return item

In [579]:
# Create datasets using the pre-tokenized data
train_dataset = NewsDataset(tokenized_train_data, train_df['label'].tolist(),train_df['task'].tolist())
eval_dataset = NewsDataset(tokenized_eval_data, eval_df['label'].tolist(), eval_df['task'].tolist())


In [580]:
for d in train_dataset:
    print(d)
    break

{'input_ids': tensor([  101,  1996,  2146,  1011, 19605,  8297,  2000,  1996, 27858,  2718,
        14408, 21967,  9501,  2007,  2049, 14726,  5107,  3896,  1998, 13940,
         9994,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'label': tensor(3), 'task': 'class'}


## DataLoader and Iterator

In [581]:

# from src.sentrans.constants import BATCH_SIZE
batch_size = 4 #BATCH_SIZE

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=True)

# Create an iterator for each DataLoader
train_iter = iter(train_loader)
eval_iter = iter(eval_loader)

## Collator

In [582]:
# Define the custom collator
class SentenceDataCollator:
    def __call__(self, batch):
        # print('batch', batch)
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        labels = torch.tensor([item['label'] for item in batch])#, dtype=torch.long)
        tasks = [item['task'] for item in batch]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': labels,
            'task': tasks
        }

data_collator = SentenceDataCollator()

## Define Model

In [605]:
import torch
import torch.nn as nn
from transformers import DistilBertModel
from src.sentrans.constants import MODEL_NAME

class MultiTaskBERT(nn.Module):
    def __init__(self, num_classes_topic, num_classes_sentiment):
        super(MultiTaskBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.1)
        self.classifier_topic = nn.Linear(self.bert.config.hidden_size, num_classes_topic)
        self.classifier_sentiment = nn.Linear(self.bert.config.hidden_size, num_classes_sentiment) 

    def forward(self, input_ids, attention_mask, task=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Take the [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        if 'class' in task:
            return self.classifier_topic(pooled_output)
        elif 'sent' in task:
            return self.classifier_sentiment(pooled_output)
        else:
            return None


In [606]:
# bert = DistilBertModel.from_pretrained(MODEL_NAME)

In [607]:
# print(bert)

## Training Arguments

In [608]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='results',             # Output directory
    num_train_epochs=5,              # Increase number of epochs for small datasets
    per_device_train_batch_size=1,    # Adjust batch size per device during training
    per_device_eval_batch_size=1,     # Adjust batch size for evaluation
    warmup_steps=10,                  # Reduce warmup steps
    weight_decay=0.01,                # Weight decay
    logging_dir='logs',             # Directory for storing logs
    logging_steps=50,                  # Log every 5 steps
    eval_strategy="steps",      # Evaluation strategy
    # evaluation_strategy="no",      # Evaluation strategy
    eval_steps=100,                    # Evaluate every 10 steps
    save_steps=200,                    # Save every 20 steps
    save_total_limit=1                # Keep only the last saved model
)


In [609]:
print(num_classes,num_sentiment)

5 3


In [610]:
# Initialize the custom multi-task model
model = MultiTaskBERT(num_classes_topic=num_classes, num_classes_sentiment=num_sentiment)

# Trainer

In [633]:
from transformers import Trainer
import torch.nn.functional as F

# Define a custom trainer
class SentenceTrainer(Trainer):
    def compute_loss(self, model, inputs):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['label']
        tasks = inputs['task']
        # print(input_ids, attention_mask, labels,tasks)
        outputs = model(input_ids, attention_mask, tasks) 
        
        # Convert labels to one-hot encoded labels
        # num_classes = outputs[0].size(-1)  # Assuming all output logits have the same number of classes
        # one_hot_labels = F.one_hot(labels, num_classes=num_classes).float()
       
        # print(outputs, labels)
        return nn.CrossEntropyLoss()(outputs, labels)
    
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        tasks = inputs['task']
        tasks = inputs['task']
        
        # We do not need to calculate gradients for validation
        with torch.no_grad():
            logits = []
            for i, task in enumerate(tasks):
                task_output = model(input_ids=input_ids[i].unsqueeze(0), attention_mask=attention_mask[i].unsqueeze(0), task=task)
                logits.append(task_output)

        logits = torch.cat(logits)

        if prediction_loss_only:
            return (None, logits, None)

        # labels = torch.argmax(inputs['label'], dim=1)
        labels = inputs['label']
        return (None, logits, labels)

### Metric

In [634]:
from datasets import load_metric

# Define metric
metric = load_metric("accuracy")
# metric = evaluate.load("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = metric.compute(predictions=preds, references=labels)
    return {"accuracy": accuracy}

In [635]:
# Initialize the Trainer
trainer = SentenceTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # Use the classification dataset for the initial example
    eval_dataset=eval_dataset,         # Use the sentiment dataset for evaluation
    data_collator=data_collator
)

In [613]:
trainer.train()

 25%|██▌       | 100/400 [02:26<07:20,  1.47s/it]
 12%|█▎        | 50/400 [00:24<03:08,  1.86it/s]
 12%|█▎        | 50/400 [00:24<03:08,  1.86it/s]

{'loss': 1.2808, 'grad_norm': 26.699920654296875, 'learning_rate': 4.4871794871794874e-05, 'epoch': 0.62}


 25%|██▌       | 100/400 [00:52<03:03,  1.64it/s]
 25%|██▌       | 100/400 [00:52<03:03,  1.64it/s]

{'loss': 0.8011, 'grad_norm': 8.837166786193848, 'learning_rate': 3.846153846153846e-05, 'epoch': 1.25}



[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 25%|██▌       | 100/400 [00:53<03:03,  1.64it/s]
[A
[A

{'eval_runtime': 1.3234, 'eval_samples_per_second': 15.112, 'eval_steps_per_second': 15.112, 'epoch': 1.25}


 38%|███▊      | 150/400 [01:24<02:36,  1.60it/s]
 38%|███▊      | 150/400 [01:24<02:36,  1.60it/s]

{'loss': 0.4922, 'grad_norm': 1.303548812866211, 'learning_rate': 3.205128205128206e-05, 'epoch': 1.88}


 50%|█████     | 200/400 [01:52<01:52,  1.77it/s]
 50%|█████     | 200/400 [01:52<01:52,  1.77it/s]

{'loss': 0.2143, 'grad_norm': 0.1587505340576172, 'learning_rate': 2.564102564102564e-05, 'epoch': 2.5}



[A
[A
[A
[A
[A
[A
[A
[A
                                                 

 50%|█████     | 200/400 [01:53<01:52,  1.77it/s]
[A
[A

{'eval_runtime': 1.0985, 'eval_samples_per_second': 18.206, 'eval_steps_per_second': 18.206, 'epoch': 2.5}


 62%|██████▎   | 250/400 [02:25<01:36,  1.55it/s]
 62%|██████▎   | 250/400 [02:25<01:36,  1.55it/s]

{'loss': 0.1451, 'grad_norm': 0.22588986158370972, 'learning_rate': 1.923076923076923e-05, 'epoch': 3.12}


 75%|███████▌  | 300/400 [02:57<00:59,  1.67it/s]
 75%|███████▌  | 300/400 [02:57<00:59,  1.67it/s]

{'loss': 0.0465, 'grad_norm': 0.12452692538499832, 'learning_rate': 1.282051282051282e-05, 'epoch': 3.75}



[A
[A
[A
[A
[A
[A
[A
                                                 

 75%|███████▌  | 300/400 [02:58<00:59,  1.67it/s]
[A
[A

{'eval_runtime': 1.0158, 'eval_samples_per_second': 19.69, 'eval_steps_per_second': 19.69, 'epoch': 3.75}


 88%|████████▊ | 350/400 [03:31<00:32,  1.55it/s]
 88%|████████▊ | 350/400 [03:31<00:32,  1.55it/s]

{'loss': 0.0678, 'grad_norm': 0.4132500886917114, 'learning_rate': 6.41025641025641e-06, 'epoch': 4.38}


100%|██████████| 400/400 [04:03<00:00,  1.43it/s]
100%|██████████| 400/400 [04:03<00:00,  1.43it/s]

{'loss': 0.0268, 'grad_norm': 0.025203969329595566, 'learning_rate': 0.0, 'epoch': 5.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

100%|██████████| 400/400 [04:04<00:00,  1.43it/s]
[A
[A

{'eval_runtime': 1.1839, 'eval_samples_per_second': 16.893, 'eval_steps_per_second': 16.893, 'epoch': 5.0}



100%|██████████| 400/400 [04:08<00:00,  1.61it/s]

{'train_runtime': 248.6484, 'train_samples_per_second': 1.609, 'train_steps_per_second': 1.609, 'train_loss': 0.38433348208665846, 'epoch': 5.0}





TrainOutput(global_step=400, training_loss=0.38433348208665846, metrics={'train_runtime': 248.6484, 'train_samples_per_second': 1.609, 'train_steps_per_second': 1.609, 'total_flos': 0.0, 'train_loss': 0.38433348208665846, 'epoch': 5.0})

## Eval

In [636]:
predictions, labels, metrics = trainer.predict(eval_dataset)


100%|██████████| 20/20 [00:00<00:00, 33.25it/s]


In [639]:
predicted_classes = torch.argmax(torch.tensor(predictions), dim=-1)

In [641]:
print(predicted_classes)
print(labels)

tensor([0, 1, 0, 0, 1, 2, 4, 2, 3, 4, 3, 0, 4, 2, 2, 1, 0, 0, 0, 1])
[0 1 0 0 1 2 4 1 3 4 3 0 4 2 2 1 0 0 0 1]


# Task 3. Training Considerations

Implications and advantages of each scenario and explain your rationale as to how the model
 should be trained given the following:

If the entire network should be frozen.
If only the transformer backbone should be frozen.
If only one of the task-specific heads (either for Task A or Task B) should be frozen.