In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\siddu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Task 1

- I will be implementing a sentence transformer model using pytorch. 
- In a real-world setting, I would rather use a pre-trained model from Hugging Face, because that would save a lot of time and perform better than a model trained from scratch locally. 
- For OA purposes, I will be making a very basic transformer from scratch using just pytorch. However, I will be using NLTK for testing the transformer.
- Design choice will be below the code

In [3]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        """
        Fixed sinusoidal positional encoding that is added to token embeddings
        so the model can distinguish token positions.

        For position `pos` and frequency index `i`:
            pe[pos, 2i]     = sin(pos * 10000^(-2i / d_model))
            pe[pos, 2i + 1] = cos(pos * 10000^(-2i / d_model))

        Design note: the table is precomputed and stored as a buffer, so it adds
        no trainable parameters. Learned position embeddings (as used in BERT)
        are a common alternative.

        - d_model: embedding size. Both even and odd sizes are supported; with an
          odd size the last channel is a sine channel without a cosine partner.
        - max_len: longest sequence length the table is built for.
        """
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)  # table of shape (max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # shape: (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000) / d_model))
        angles = position * div_term  # shape: (max_len, ceil(d_model / 2))

        # sine on even channels, cosine on odd channels
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd channels, so trim the angles to
        # match; without the trim an odd d_model raises a shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model) so it broadcasts over the batch
        self.register_buffer('pe', pe)  # persistent buffer (saved with the model, not trained)

    def forward(self, x):
        """
        Add the positional table to `x`.

        - x: tensor whose dim 1 is the sequence dimension and whose last dim is d_model.
        Returns a tensor with the same shape as `x`.
        Raises ValueError if the sequence is longer than the `max_len` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]  # slice the table to the input's sequence length


In [4]:
class SentenceTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=2, dropout=0.1, max_len=50):
        """
        A simple Sentence Transformer that encodes sentences into fixed-length vectors.
        - vocab_size: no. of words in the vocabulary.
        - d_model: dimensionality of embeddings and transformer hidden size.
        - num_heads: no. of attention heads in the Transformer.
        - num_layers: no. of Transformer encoder layers.
        - dropout: dropout probability.
        - max_len: max seqn length supported by the positional encoding.
        """
        super(SentenceTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)  # token indices -> (batch_size, seq_len, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)  # add positional information

        # Transformer encoder layers.
        # batch_first=True is required because forward() feeds batch-major tensors
        # (batch_size, seq_len, d_model). With the default (batch_first=False) the
        # encoder reads dim 0 as the sequence, so attention would run across the
        # sentences of a batch instead of across the tokens of each sentence.
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Pooling layer to collapse the token dimension into one fixed-size sentence embedding
        self.pooling = nn.AdaptiveAvgPool1d(1)  # mean pooling over token dimension

    def forward(self, x):
        """
        Forward pass through the model.
        - x: token indices of shape (batch_size, seq_len).
        Returns sentence embeddings of shape (batch_size, d_model).

        Note: no padding mask is applied, so every position (including any
        padding tokens) takes part in attention and in the mean.
        """
        x = self.embedding(x)  # (batch_size, seq_len, d_model)
        x = self.pos_encoder(x)  # add positional encoding, shape unchanged
        x = self.transformer_encoder(x)  # contextualise tokens, shape unchanged

        # Pooling step
        x = x.permute(0, 2, 1)  # (batch_size, d_model, seq_len): pooling acts on the last dim
        x = self.pooling(x).squeeze(-1)  # mean over tokens -> (batch_size, d_model)
        return x  # output embeddings

In [5]:
def nltk_tokenizer(sentence, vocab, max_len=6):
    """
    Turn a sentence into a fixed-length list of vocabulary indices.

    The sentence is lower-cased and split with NLTK's `word_tokenize`. Each
    token is looked up in `vocab`, falling back to the "<UNK>" entry when it is
    missing. The result is right-padded with the "<PAD>" entry, or truncated,
    so that it has exactly `max_len` items.
    """
    words = word_tokenize(sentence.lower())

    ids = []
    for word in words:
        ids.append(vocab.get(word, vocab["<UNK>"]))

    missing = max_len - len(ids)
    if missing > 0:
        ids.extend([vocab["<PAD>"]] * missing)  # pad up to max_len

    return ids[:max_len]  # truncate anything beyond max_len

In [6]:
# Fix the seed so the randomly initialised weights (and therefore the printed
# embeddings) are the same on every Restart & Run All.
torch.manual_seed(42)

vocab_size = 10000  # Assume a vocabulary of 10k words
d_model = 128
sentence_transformer = SentenceTransformer(vocab_size, d_model)
# Inference only: switch off dropout, otherwise the same sentence yields a
# different embedding on every call.
sentence_transformer.eval()

# Dummy vocabulary
dummy_vocab = {
    "<PAD>": 0, "<UNK>": 1, "fetch": 2, "rewards": 3, "dog": 4, "loves": 5,
    "treats": 6, "and": 7, "belly": 8, "rubs": 9, "woof": 10
}

# Sample sentences
sample_sentences = [
    "My dog asked for a raise, I told him to fetch his own paycheck",
    "I threw a ball for my dog... he brought back a receipt",
    "Why don't dogs use phones? Too many collar ID issues"
]

# Convert sentences to token indices (padded/truncated by nltk_tokenizer's default max_len)
tokenized_inputs = [nltk_tokenizer(sent, dummy_vocab) for sent in sample_sentences]
input_tensor = torch.tensor(tokenized_inputs)  # Shape: (batch_size, seq_len)

# Run the sentences through the model; no gradients are needed for inference
with torch.no_grad():
    embeddings = sentence_transformer(input_tensor)

# Print sentence embeddings
print("Sentence Embeddings:")
for i, sent in enumerate(sample_sentences):
    print(f"Sentence: {sent}")
    print(f"Embedding: {embeddings[i].detach().numpy()}")


Sentence Embeddings:
Sentence: My dog asked for a raise, I told him to fetch his own paycheck
Embedding: [-1.4645281  -0.02077602 -1.5391865   0.4521033   0.06101049 -0.8322439
  0.23527747  0.6854346   0.3462597  -0.11951858 -0.04943123 -0.86029387
  0.11193186  1.8133273  -1.2709274   2.0255535  -0.28470692 -1.2981143
  0.749053   -0.49446544 -0.0879648   0.33344948 -0.9736144   0.06899364
 -0.5991185  -0.48273894  0.46165535  0.8682397  -0.11344402  0.82438844
  1.7020359   0.5912024   0.41512123  0.04551455 -2.5492058   1.4014207
 -0.3305967   1.6340065   1.6474422  -0.23978777 -0.5385265  -0.6233191
  0.300016   -0.825381    0.4184396   0.74974346  0.35828504 -0.03175083
 -0.51528144 -1.1523126   0.5120567   0.60272986 -1.456618    0.80301356
 -0.6090191   2.0761003  -0.30633292  0.8451187  -0.72727567  0.35224783
 -0.28926584  0.47845015  1.0099648   0.05782488 -0.6429448  -0.4509724
 -1.938521   -0.17681621 -0.6862107   0.43777862 -1.0125352   0.6615725
 -0.5226415   1.4313277  



#### Design Choices Outside the Transformer Backbone

For Task 1, I made several architectural choices outside the transformer backbone to keep the model simple, efficient, and easy to run. I also kept it general, since I don't have a specific target dataset.

##### 1. Token Embedding Layer (nn.Embedding)
- Choice: Used nn.Embedding(vocab_size, d_model), mapping each word index to a d_model-dimensional vector.  
- Alternative:  
  - Could have used pretrained word embeddings (e.g., GloVe, Word2Vec) for better initialization.  
  - Opted for learnable embeddings to let the model adapt to the dataset.  

##### 2. Positional Encoding
- Choice: Used a sine-cosine positional encoding to preserve sequence structure.  
- Alternative:  
  - Could have used learned positional embeddings (like BERT), but sine-cosine encoding allows better generalization to longer sequences.  

##### 3. Sentence Pooling (nn.AdaptiveAvgPool1d)
- Choice: Used mean pooling, averaging token embeddings to create a fixed-size representation of the sentence.  
- Alternative:  
  - CLS Token Approach (as in BERT) – Uses a special [CLS] token representation as the sentence embedding.  
  - Max Pooling – Selects the most activated token feature instead of averaging.  

##### 4. Vocabulary Handling (<UNK> and <PAD> Tokens)
- Choice:  
  - <UNK> (Unknown Token): Replaces words not in the vocabulary.  
  - <PAD> (Padding Token): Ensures uniform sequence lengths by padding shorter sentences.  


##### 5. Fixed max_len=6 for Token Sequences
- Choice: Set max_len=6 for batch processing, meaning sentences longer than 6 tokens are truncated, and shorter ones are padded.  

---

## Task 2

- I will expand the Sentence Transformer to handle a multi-task learning setting. This will allow the model to handle:
    - Task A: Sentence Classification
    - Task B: Sentiment Analysis


In [7]:
class MultiTaskSentenceTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, num_heads=4, num_layers=2, dropout=0.1, max_len=50, num_classes_A=3):
        """
        Multi-Task Sentence Transformer that encodes sentences into fixed-length vectors and supports multiple tasks.
        - vocab_size: no. of words in the vocabulary.
        - d_model: dimensionality of embeddings and transformer hidden size.
        - num_heads: no. of attention heads in the Transformer.
        - num_layers: no. of Transformer encoder layers.
        - dropout: dropout probability.
        - max_len: max seqn length supported by the positional encoding.
        - num_classes_A: no. of output classes for Task A (sentence classification).

        Task B (sentiment analysis) has no class count: its head emits one score per sentence.
        """
        super(MultiTaskSentenceTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)  # token indices -> (batch_size, seq_len, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)  # Add positional information

        # Transformer encoder layers (batch_first=True: inputs are (batch_size, seq_len, d_model))
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Pooling layer to collapse the token dimension into one fixed-size sentence embedding
        self.pooling = nn.AdaptiveAvgPool1d(1)  # Mean pooling over token dimension

        # Task A: Sentence Classification Head
        self.task_A_head = nn.Linear(d_model, num_classes_A)  # one logit per class

        # Task B: Sentiment Analysis Head
        self.task_B_head = nn.Linear(d_model, 1)  # one raw score; forward() squashes it with tanh

        self._init_weights()

    def _init_weights(self):
        """
        Initialise weight matrices with Kaiming He (uniform, ReLU gain) and biases with zeros.

        Only parameters whose name ends in "bias" are zeroed. The other 1-D
        parameters are the LayerNorm scale weights; zeroing those would make
        every LayerNorm output equal to its (zero) bias and wipe out the encoder
        output, so they keep their default initialisation.
        """
        for name, param in self.named_parameters():
            if param.dim() > 1:
                nn.init.kaiming_uniform_(param, nonlinearity='relu')
            elif name.endswith("bias"):
                nn.init.zeros_(param)

    def forward(self, x):
        """
        Forward pass through the model.
        - x: Input token indices (batch_size, seq_len).
        Returns:
        - task_A_output: Sentence classification logits (batch_size, num_classes_A).
        - task_B_output: Sentiment score in [-1, 1] (batch_size, 1).

        Raw, differentiable outputs are returned so they can be passed to a loss
        function. Use `predict` to get hard class predictions.
        """
        x = self.embedding(x)  # (batch_size, seq_len, d_model)
        x = self.pos_encoder(x)  # Add positional encoding, shape unchanged
        x = self.transformer_encoder(x)  # Contextualise tokens, shape unchanged

        # Pooling step
        x = x.permute(0, 2, 1)  # (batch_size, d_model, seq_len): pooling acts on the last dim
        x = self.pooling(x).squeeze(-1)  # Mean pooling, shape: (batch_size, d_model)

        # Separate outputs for each task
        task_A_output = self.task_A_head(x)  # Sentence Classification logits
        task_B_output = torch.tanh(self.task_B_head(x))  # Sentiment score between -1 and 1

        return task_A_output, task_B_output

    @torch.no_grad()
    def predict(self, x):
        """
        Convenience wrapper around forward() for inference.
        - x: Input token indices (batch_size, seq_len).
        Returns:
        - task_A_prediction: predicted class index per sentence (batch_size,).
        - task_B_prediction: sentiment score per sentence in [-1, 1] (batch_size,).

        Does not change the module's train/eval mode; call `.eval()` first to disable dropout.
        """
        task_A_output, task_B_output = self(x)
        task_A_prediction = torch.argmax(task_A_output, dim=1)
        task_B_prediction = task_B_output.squeeze(-1)
        return task_A_prediction, task_B_prediction

#### Changes Made to the Architecture in Task 2 from Task 1 to Support Multi-Task Learning

In Task 2, I expanded the sentence transformer model to support multi-task learning, allowing it to handle both sentence classification and sentiment analysis simultaneously. The modifications made were:

##### 1. Addition of Multiple Task-Specific Heads  
- Task 1 had a single output head for generating fixed-length sentence embeddings.  
- Task 2 introduces two separate fully connected layers:  
  - `task_A_head`: A linear layer for sentence classification that outputs logits corresponding to predefined categories.  
  - `task_B_head`: A linear layer that outputs a single continuous value representing sentiment, scaled between -1 (negative) and 1 (positive) using torch.tanh().  

##### 2. Shared Transformer Backbone  
- I use the same transformer encoder and positional encoding. 

##### 3. Weight Initialization  
- I initialized weights with `Kaiming He initialization`, which is designed for deep networks using ReLU activations.  
- I chose Kaiming He over other options like Xavier and Gaussian because Kaiming He is tailored to ReLU activations.
- Bias parameters are initialized to zeros to prevent biases from affecting early learning dynamics.  

---


### Task 3

#### 1. Freezing the Entire Network  
- **Implications:**
  - The model doesn't update any of its parameters during training. We freeze the whole network when applying a pre-trained transformer to limited data for which its embeddings are already well suited.  
  - In the case of Fetch Rewards, you would use a pre-trained model directly on new customer data without any training. This could be message classification (what the review is about), sentiment analysis, or other tasks.  
- **Advantages:**  
  - When there is little data, training the model again on this could lead to overfitting. Freezing the network solves this issue! 
  - Reduces computational cost and time.

#### 2. Freezing Only the Transformer Backbone  
- **Implications:**  
  - The transformer layers would be frozen, but we train the task-specific heads again on new data and update their parameters. 
  - For instance, if Fetch already has a pre-trained model that classifies user reviews by message type, we could add a second classification task that classifies the same reviews by product type. To do this, only the classification heads have to be trained. 
- **Advantages:**  
  - Reduces the computational time and cost and the number of trainable parameters. 
  - Uses the representations from pretraining while still adapting to the new task.  

#### 3. Freezing One of the Task-Specific Heads  
- **Implications:**  
  - One of the tasks will rely entirely on pre-trained weights, while the other task learns new task-specific features.  
  - For instance, if a model classifies sentences well but doesn't predict sentiment well enough for certain sentences, we can freeze the classification head and train the sentiment analysis head. 
- **Advantages:**  
  - Useful if one task generalizes well and does not need additional fine-tuning while focusing learning on the second task.  
  - Avoids overfitting on one of the tasks while optimizing the other. 

---

#### Transfer Learning

Considering a scenario at Fetch rewards where transfer learning would be beneficial:
We want to analyze customer feedback on receipts to detect purchase satisfaction and classify receipt categories 

#### 1. Choosing a Pre-Trained Model  
- BERT, RoBERTa, or T5 would be good choices as they have been pre-trained on large language corpora and can be fine-tuned for specific NLP tasks.  
- I would use RoBERTa as it works well for sentiment analysis (brand satisfaction in this case), and also because I am more familiar with it, having used it before. 

#### 2. Layers to Freeze/Unfreeze  
- RoBERTa-base has 12 encoder layers; as a rule of thumb, the first 9 capture general language representations and the last 3 capture more task-specific patterns. I would choose the number of layers to freeze depending on the size of the dataset.
- The smaller the dataset, the more layers should be frozen. If it's a small dataset, I would freeze the first 9 layers and fine-tune only the last 3 (task-dependent) layers to prevent overfitting. If the dataset is very big, or if the data is different from what the model was pre-trained on, I would freeze 4 or fewer layers. 
- If I have enough time, I'd just check validation accuracy vs number of frozen layers in [3, 4, 6, 9] to confirm my intuition.

#### 3. Rationale Behind These Choices  
- RoBERTa is ideal for sentiment analysis and receipt classification due to its strong contextual understanding. Freezing more layers (e.g., first 9) retains general language knowledge for small datasets, while freezing fewer layers (e.g., 3-6) allows adaptation to data-specific terminology if the dataset is large or domain-specific. I would start by freezing 9 layers and fine-tune only the last 3, adjusting based on validation accuracy and overfitting trends.  

---

### Task 4

In [8]:
# --- Hyperparameters ---
vocab_size, d_model, num_classes_A = 10000, 128, 4
learning_rate, epochs, batch_size = 0.001, 10, 32

# --- Model and optimiser ---
model = MultiTaskSentenceTransformer(
    vocab_size=vocab_size,
    d_model=d_model,
    num_classes_A=num_classes_A,
)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# --- One loss per task ---
criterion_A = nn.CrossEntropyLoss()  # Task A: sentence classification
criterion_B = nn.MSELoss()  # Task B: sentiment score regression


In [9]:
# Hypothetical data: no real dataset is available, so build a small synthetic
# one with the structure the training loop expects. Each batch yields
# (inputs, labels_A, labels_B). Replace this with a real DataLoader for actual
# training. Leaving `dataloader = None` makes the training cell fail with
# "TypeError: 'NoneType' object is not iterable".
num_samples = 256  # size of the synthetic dataset
seq_len = 6  # tokens per synthetic sentence

synthetic_inputs = torch.randint(0, vocab_size, (num_samples, seq_len))  # random token ids
synthetic_labels_A = torch.randint(0, num_classes_A, (num_samples,))  # random class indices
synthetic_labels_B = torch.rand(num_samples) * 2 - 1  # random sentiment scores in [-1, 1)

dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(synthetic_inputs, synthetic_labels_A, synthetic_labels_B),
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Joint training: both task losses are summed and back-propagated through the
# shared encoder in a single step. The model must return raw Task A logits and
# Task B scores (not argmax indices) for the losses to be differentiable.
model.train()  # enable dropout for training

for epoch in range(epochs):
    total_loss_A, total_loss_B = 0, 0

    for inputs, labels_A, labels_B in dataloader:
        optimizer.zero_grad()

        # Forward pass
        task_A_output, task_B_output = model(inputs)

        # Compute losses
        # CrossEntropyLoss expects integer class indices as targets.
        loss_A = criterion_A(task_A_output, labels_A.long())
        # squeeze(-1) drops only the trailing size-1 dim; a bare squeeze() would
        # also drop the batch dim when a batch holds a single sample and
        # mismatch the target shape.
        loss_B = criterion_B(task_B_output.squeeze(-1), labels_B.float())

        # Backpropagation on the combined (equally weighted) loss
        loss = loss_A + loss_B
        loss.backward()
        optimizer.step()

        total_loss_A += loss_A.item()
        total_loss_B += loss_B.item()

    print(f"epoch {epoch+1}: Loss A = {total_loss_A/len(dataloader)}, Loss B= {total_loss_B/len(dataloader)}")