In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel

: 

# Task 1: Sentence Transformer Implementation

Implement a sentence transformer model using any deep learning framework of your choice. 
This model should be able to encode input sentences into fixed-length embeddings. 
Test your implementation with a few sample sentences and showcase the obtained embeddings. 
Describe any choices you had to make regarding the model architecture outside of the transformer backbone.

## Prepare the dataset  

In [140]:
import pandas as pd
import numpy as np

# Load the dataset
classification_df = pd.read_csv('data/classification_data.csv')
classification_df.head(3)

Unnamed: 0,sentence,label
0,The sun set behind the ancient ruins of Machu ...,Travel
1,"Exploring the bustling markets of Marrakech, t...",Travel
2,"After a long hike, adventurers finally reached...",Travel


In [141]:
# Encode classification labels
classification_df['label'], unique_labels = pd.factorize(classification_df['label'])
num_classes = len(classification_df.label.unique())

# Create a dictionary mapping numerical values to original labels
class_label_decode = {index: label for index, label in enumerate(unique_labels)}

classification_df.head(3)


Unnamed: 0,sentence,label
0,The sun set behind the ancient ruins of Machu ...,0
1,"Exploring the bustling markets of Marrakech, t...",0
2,"After a long hike, adventurers finally reached...",0


In [138]:
# Load the sentiment dataset
sentiment_df = pd.read_csv('data/sentiment_data.csv')
# Transform text label to numerical label 
sentiment_df['label'], unique_labels = pd.factorize(sentiment_df['label'])
num_sentiment = len(unique_labels)

# Create a dictionary mapping numerical values to original labels
sent_label_decode = {index: label for index, label in enumerate(unique_labels)}

sentiment_df.head(3)

Unnamed: 0,sentence,label
0,The tourist's trip to Machu Picchu was ruined ...,0
1,The chaotic crowds and overwhelming noise made...,0
2,"After getting lost on the hike, the adventurer...",0


In [139]:
print(sent_label_decode)

{0: 'Negative', 1: 'Positive', 2: 'Neutral'}


## Combine the dataset

In [15]:
from sklearn.model_selection import train_test_split

classification_df['task'] = 'class'
sentiment_df['task'] = 'sent'
combined_df = pd.concat([classification_df, sentiment_df])

# Split the combined dataset
train_df, eval_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['task'])

# Ensure the columns are correctly formatted
train_df = train_df[['sentence', 'label', 'task']]
eval_df = eval_df[['sentence', 'label', 'task']]

In [16]:
train_df.head(3)

Unnamed: 0,sentence,label,task
47,"In response to growing public pressure, the go...",1,sent
22,"After a long hike, adventurers finally reached...",1,sent
16,Award-winning actor Smith delivers a powerhous...,3,class


In [17]:
# num_classes = len(class_labels)
# num_sentiment = len(sent_labels)

## Tokenize the dataset

Using the distilled Bert model to create tokens with a maximum length of 128.

In [18]:
from src.sentrans.constants import MODEL_NAME

# Load the tokenizer and} model
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)

In [19]:
from src.sentrans.constants import TOKEN_MAX_LENGTH

# tokenize classification dataset
tokenized_train_data = tokenizer(train_df['sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
print(tokenized_train_data.keys())

# tokenize classification dataset
tokenized_eval_data = tokenizer(eval_df['sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
print(tokenized_eval_data.keys())

# # tokenize sentiment dataset
# tokenized_sent_data = tokenizer(sentiment_df['Sentence'].tolist(), padding=True, truncation=True,max_length=TOKEN_MAX_LENGTH, return_tensors='pt')
# print(tokenized_sent_data.keys())

dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


Using the model() function, we will get the embeddings for each token in each sentence.

In [20]:
# Create embeddings
def create_embeddings(tokenized_data):
    input_ids = tokenized_data['input_ids']
    attention_mask = tokenized_data['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state
    return embeddings

In [21]:
embeddings = create_embeddings(tokenized_train_data)

print("Embeddings shape:", embeddings.shape)  

Embeddings shape: torch.Size([80, 32, 768])


Embeddings of size 768 are created for each token in each sentence. 

Hence the size of the embeddings is
(number_of_sentences, max_sequence_length, embedding_size)

In [22]:
for i in range(0,2):
    print('\nSentence: \n',classification_df.sentence[i],'\n Embedding: \n', embeddings[i])
    print('\n Embedding Shape: \n', embeddings[i].shape)


Sentence: 
 The sun set behind the ancient ruins of Machu Picchu, casting a golden hue over the Andes mountains. 
 Embedding: 
 tensor([[-0.1222, -0.1128, -0.2531,  ..., -0.1338,  0.0045,  0.4976],
        [-0.7761, -0.4002, -0.6663,  ..., -0.2066,  0.2082, -0.0224],
        [-0.7013, -0.1719,  0.0192,  ..., -0.3123, -0.3600, -0.1614],
        ...,
        [-0.2623, -0.1712,  0.2383,  ..., -0.0115, -0.1635,  0.1747],
        [-0.0526, -0.1568,  0.2685,  ...,  0.0012, -0.2301,  0.2412],
        [-0.0841, -0.1908,  0.1006,  ...,  0.0780, -0.2866,  0.0928]])

 Embedding Shape: 
 torch.Size([32, 768])

Sentence: 
 Exploring the bustling markets of Marrakech, travelers are captivated by the vibrant colors and aromatic spices. 
 Embedding: 
 tensor([[-0.1070, -0.0393,  0.1511,  ...,  0.0250,  0.2682,  0.3162],
        [ 0.1359, -0.3061,  0.1702,  ..., -0.1851,  0.1301,  0.1654],
        [ 0.0027, -0.2043,  0.2608,  ..., -0.1751, -0.1059,  0.3911],
        ...,
        [ 0.3709,  0.2008,  0.

# Task 2: Mutli-Task Learning

## Create a Dataset object

In [121]:
class NewsDataset(Dataset):
    def __init__(self, encodings, labels,task):
        self.encodings = encodings
        self.labels = labels
        self.tasks = task

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['task'] = self.tasks[idx]#, dtype=torch.str)
        return item

In [24]:
# Create datasets using the pre-tokenized data
train_dataset = NewsDataset(tokenized_train_data, train_df['label'].tolist(),train_df['task'].tolist())
eval_dataset = NewsDataset(tokenized_eval_data, eval_df['label'].tolist(), eval_df['task'].tolist())


In [25]:
for d in train_dataset:
    print(d)
    break

{'input_ids': tensor([  101,  1999,  3433,  2000,  3652,  2270,  3778,  1010,  1996,  2231,
         2623, 12720,  8818,  2000,  4769, 22575, 16440,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'label': tensor(1), 'task': 'sent'}


## Collator

In [27]:
# Define the custom collator
class SentenceDataCollator:
    def __call__(self, batch):
        # print('batch', batch)
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        labels = torch.tensor([item['label'] for item in batch])#, dtype=torch.long)
        tasks = [item['task'] for item in batch]
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': labels,
            'task': tasks
        }

data_collator = SentenceDataCollator()

## Define Model

In [28]:
import torch
import torch.nn as nn
from transformers import DistilBertModel
from src.sentrans.constants import MODEL_NAME

class MultiTaskBERT(nn.Module):
    def __init__(self, num_classes_topic, num_classes_sentiment,MODEL_NAME = MODEL_NAME):
        super(MultiTaskBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        self.classifier_topic = nn.Linear(self.bert.config.hidden_size, num_classes_topic)
        self.classifier_sentiment = nn.Linear(self.bert.config.hidden_size, num_classes_sentiment) 

    def forward(self, input_ids, attention_mask, task=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Take the [CLS] token representation
        pooled_output = self.dropout(pooled_output)
        if 'class' in task:
            return self.classifier_topic(pooled_output)
        elif 'sent' in task:
            return self.classifier_sentiment(pooled_output)
        else:
            return None


## Training Arguments

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='results',             # Output directory
    num_train_epochs=2,              # Increase number of epochs for small datasets
    per_device_train_batch_size=1,    # Adjust batch size per device during training
    per_device_eval_batch_size=1,     # Adjust batch size for evaluation
    warmup_steps=10,                  # Reduce warmup steps
    weight_decay=0.01,                # Weight decay
    logging_dir='logs',             # Directory for storing logs
    logging_steps=50,                  # Log every 5 steps
    eval_strategy="steps",      # Evaluation strategy
    # evaluation_strategy="no",      # Evaluation strategy
    eval_steps=40,                    # Evaluate every 10 steps
    save_steps=100,                    # Save every 20 steps
    save_total_limit=1                # Keep only the last saved model
)


In [32]:
print(num_classes,num_sentiment)

5 3


In [33]:
# Initialize the custom multi-task model
from src.sentrans.constants import MODEL_NAME
model = MultiTaskBERT(num_classes_topic=num_classes, num_classes_sentiment=num_sentiment,MODEL_NAME=MODEL_NAME)

# Trainer

In [108]:
from transformers import Trainer
import torch.nn.functional as F

# Define a custom trainer
class SentenceTrainer(Trainer):
    def compute_loss(self, model, inputs):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['label']
        tasks = inputs['task']
        # print(input_ids, attention_mask, labels,tasks)
        outputs = model(input_ids, attention_mask, tasks) 
        return nn.CrossEntropyLoss()(outputs, labels)
    
    
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        if inputs['task']:
            tasks = inputs['task'] 
        else: 
            tasks = ['class','sent'] 
            with torch.no_grad():
                logits = []
                for task in tasks:
                    task_output = model(input_ids=input_ids[0].unsqueeze(0), attention_mask=attention_mask[0].unsqueeze(0), task=task)
                    logits.append(task_output)
                print(logits)

        # We do not need to calculate gradients for validation
        with torch.no_grad():
            logits = []
            for i, task in enumerate(tasks):
                task_output = model(input_ids=input_ids[i].unsqueeze(0), attention_mask=attention_mask[i].unsqueeze(0), task=task)
                logits.append(task_output)

        logits = torch.cat(logits)

        if prediction_loss_only:
            return (None, logits, None)
        
        labels = inputs['label']

        # labels = torch.argmax(inputs['label'], dim=1)
        return (None, logits, labels)

### Metric

In [109]:
from datasets import load_metric

# Define metric
metric = load_metric("accuracy")
# metric = evaluate.load("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = metric.compute(predictions=preds, references=labels)
    return {"accuracy": accuracy}

In [110]:
# Initialize the Trainer
trainer = SentenceTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # Use the classification dataset for the initial example
    eval_dataset=eval_dataset,         # Use the sentiment dataset for evaluation
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [93]:
trainer.train()

                                                
 25%|██▌       | 40/160 [00:20<01:09,  1.73it/s]

{'eval_accuracy': {'accuracy': 0.8}, 'eval_runtime': 1.0084, 'eval_samples_per_second': 19.833, 'eval_steps_per_second': 19.833, 'epoch': 0.5}


 31%|███▏      | 50/160 [00:26<01:06,  1.65it/s]

{'loss': 0.2256, 'grad_norm': 0.15037643909454346, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.62}


                                                
 50%|█████     | 80/160 [00:46<00:48,  1.66it/s]

{'eval_accuracy': {'accuracy': 0.85}, 'eval_runtime': 0.9481, 'eval_samples_per_second': 21.095, 'eval_steps_per_second': 21.095, 'epoch': 1.0}


 62%|██████▎   | 100/160 [00:58<00:36,  1.65it/s]

{'loss': 0.1057, 'grad_norm': 0.03196905925869942, 'learning_rate': 2e-05, 'epoch': 1.25}


                                                 
 75%|███████▌  | 120/160 [01:14<00:24,  1.63it/s]

{'eval_accuracy': {'accuracy': 0.85}, 'eval_runtime': 0.9637, 'eval_samples_per_second': 20.753, 'eval_steps_per_second': 20.753, 'epoch': 1.5}


 94%|█████████▍| 150/160 [01:34<00:08,  1.23it/s]

{'loss': 0.0409, 'grad_norm': 0.35310038924217224, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.88}


                                                 
100%|██████████| 160/160 [01:44<00:00,  1.54it/s]

{'eval_accuracy': {'accuracy': 0.85}, 'eval_runtime': 1.2389, 'eval_samples_per_second': 16.143, 'eval_steps_per_second': 16.143, 'epoch': 2.0}
{'train_runtime': 104.2467, 'train_samples_per_second': 1.535, 'train_steps_per_second': 1.535, 'train_loss': 0.11689814766868949, 'epoch': 2.0}





TrainOutput(global_step=160, training_loss=0.11689814766868949, metrics={'train_runtime': 104.2467, 'train_samples_per_second': 1.535, 'train_steps_per_second': 1.535, 'total_flos': 0.0, 'train_loss': 0.11689814766868949, 'epoch': 2.0})

## Evaluate Model

In [152]:
for i in eval_dataset:
    print(i)
    break

{'input_ids': tensor([  101,  8354,  2083,  1996, 27375,  5380,  1997,  1996,  7139,  1010,
        10885,  2545,  2024,  2033,  6491, 11124,  5422,  2011,  1996,  5053,
         1997,  1996,  5133,  3470,  1012,   102,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0]), 'label': tensor(0), 'task': 'class'}


In [105]:
predictions, labels, metrics = trainer.predict(eval_dataset)

100%|██████████| 20/20 [00:00<00:00, 27.26it/s]


In [106]:
metrics

{'test_accuracy': {'accuracy': 0.85},
 'test_runtime': 0.8988,
 'test_samples_per_second': 22.253,
 'test_steps_per_second': 22.253}

## Predictions on sentences

In [149]:
def predict_sentence(sentence):
    # Create a DataFrame for the input sentence
    pred_df = pd.DataFrame({'sentence': [sentence, sentence], 'label': [0, 0], 'task': ['class', 'sent']})
    
    # Tokenize the input sentence
    inputs = tokenizer(pred_df['sentence'].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=TOKEN_MAX_LENGTH)
    
    # Create dataset object for the encodings
    pred_dataset = NewsDataset(inputs, pred_df.label.tolist(), pred_df.task.tolist())
    
    # Get predictions
    predictions, labels, metrics = trainer.predict(pred_dataset)
    
    # Get the predicted class and sentiment
    predicted_class = torch.argmax(torch.tensor(predictions[0]), dim=-1).item()
    predicted_sentiment = torch.argmax(torch.tensor(predictions[1]), dim=-1).item()
    
    return predicted_class, predicted_sentiment

In [150]:
sample = 'The frustration of lost luggage did not overshadow the excitement of exploring the vibrant streets of Barcelona.'

In [151]:
topic, sentiment = predict_sentence(sample)
print(f"Predicted Topic: {class_label_decode[topic]}, Predicted Sentiment: {sent_label_decode[sentiment]}")

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 40.20it/s]

Predicted Topic: Travel, Predicted Sentiment: Positive





In [649]:
trainer.save_model("model")
torch.save(model.state_dict(), 'model/pytorch_model.bin')

# Task 3. Training Considerations

Implications and advantages of each scenario and explain your rationale as to how the model
 should be trained given the following:

If the entire network should be frozen.
If only the transformer backbone should be frozen.
If only one of the task-specific heads (either for Task A or Task B) should be frozen.