In [None]:
#Task 1 Implementing a Transformer Encoder


In [None]:
#Reading in the AG News Dataset
import glob
import pandas as pd
#Collect the CSV files in sorted order: glob gives no ordering guarantee, and the
#row order of the combined dataframe feeds the seeded train/test split later on
ag_news = sorted(glob.glob('./archive-2/*.csv'))
#Fail with a clear message instead of an opaque pd.concat error when nothing matched
if not ag_news:
    raise FileNotFoundError("No CSV files found matching './archive-2/*.csv'")
#Make an empty list
df_list = []
#Convert the files into pandas dataframes
for file in ag_news:
    df = pd.read_csv(file)
    #Append the dataframe to the list
    df_list.append(df)

#Use pandas concat function to make all the dataframes into one
#(ignore_index=True gives the combined frame a fresh 0..n-1 index)
ag_news_pd = pd.concat(df_list,ignore_index=True)

#Sanity Check
print(ag_news_pd.head())
print(ag_news_pd.columns)
    


   Class Index                                              Title  \
0            3                  Fears for T N pension after talks   
1            4  The Race is On: Second Private Team Sets Launc...   
2            4      Ky. Company Wins Grant to Study Peptides (AP)   
3            4      Prediction Unit Helps Forecast Wildfires (AP)   
4            4        Calif. Aims to Limit Farm-Related Smog (AP)   

                                         Description  
0  Unions representing workers at Turner   Newall...  
1  SPACE.com - TORONTO, Canada -- A second\team o...  
2  AP - A company founded by a chemistry research...  
3  AP - It's barely dawn when Mike Fitzpatrick st...  
4  AP - Southern California's smog-fighting agenc...  
Index(['Class Index', 'Title', 'Description'], dtype='object')


In [None]:
#Build a single 'Text' column: the title, one space, then the description
ag_news_pd['Text'] = ag_news_pd['Title'].astype(str).str.cat(
    ag_news_pd['Description'].astype(str), sep=" "
)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

#This is a function to tokenize the dataset
def word_tokenizer(text):
    """Tokenize ``text`` with nltk's ``word_tokenize`` and return the resulting tokens."""
    return word_tokenize(text)


def pad_truncate(text,max_sentence_length=128, pad_token="<pad>"):
    """
    Force a token list to exactly ``max_sentence_length`` entries.

    Args:
        text: list of tokens
        max_sentence_length: target length of the returned list
        pad_token: token appended to lists that are too short

    Returns:
        A new list: the first ``max_sentence_length`` tokens when the input is
        longer, otherwise the input followed by enough ``pad_token`` entries.
    """
    #Keep only the leading tokens when the list is too long
    clipped = text[:max_sentence_length] if len(text) > max_sentence_length else text
    #Number of pad tokens still needed (0 when the list was clipped)
    missing = max_sentence_length - len(clipped)
    return clipped + [pad_token] * missing

#This is where the attention masking occurs
def create_attention_mask(sentence, pad_token="<pad>"):
    """
    Build a padding mask for one token list.

    Returns a list with the same length as ``sentence`` holding 0 at every
    position equal to ``pad_token`` and 1 everywhere else.
    """
    return [0 if token == pad_token else 1 for token in sentence]

#Apply it to every line in the dataset
#Tokenize every row, then truncate/pad each token list to the fixed length.
#NOTE: 'Text' is overwritten in place (raw strings become token lists), so this
#cell is not meant to be run twice on the same dataframe
ag_news_pd['Text'] = ag_news_pd['Text'].apply(word_tokenizer).apply(pad_truncate)

#With the padded token lists in place, derive the matching padding masks
#(1 = real token, 0 = <pad>)
ag_news_pd['Attention Mask'] = ag_news_pd['Text'].apply(create_attention_mask)


<class 'pandas.core.frame.DataFrame'>


In [None]:
#Build vocabulary from padded texts (remove <pad> tokens)
from collections import Counter
all_tokens = []
#Go through every token list in the Text column
for token_list in ag_news_pd['Text']:
    # Filter out <pad> tokens for vocabulary building
    all_tokens.extend(t for t in token_list if t != '<pad>')

counter = Counter(all_tokens)

# Create vocabulary (Necessary for Transformer) with special tokens first
# 0 for the pad token, 1 for unknown tokens, and 2 for Classification token
vocab = {'<pad>': 0, '<unk>': 1, '[CLS]': 2}
current_id = len(vocab)
#Add tokens to the vocab
for token in counter.keys():
    #Never overwrite a reserved special-token id: if the corpus happened to contain
    #e.g. '<unk>', reassigning it would leave the largest id equal to len(vocab),
    #which is out of range for an embedding sized with len(vocab)
    if token in vocab:
        continue
    vocab[token] = current_id
    current_id += 1

#Convert the string tokens to IDs for the Transformer Encoder
def convert_to_ids(token_list):
    """
    Translate a list of string tokens into integer ids.

    '[CLS]' always maps to 2, tokens present in the module-level ``vocab`` map
    to their stored id, and every other token maps to 1 (the <unk> id).
    """
    return [
        2 if token == '[CLS]' else vocab.get(token, 1)
        for token in token_list
    ]

#Create a new column of input ids from the tokenized 'Text' column
ag_news_pd['Input IDs'] = ag_news_pd['Text'].apply(convert_to_ids)

In [None]:
#Preview the first three rows of the dataframe as it looks now
ag_news_pd.head(3)

In [None]:
#Implementing an encoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class TransformerEncoder(nn.Module):
    """
    Transformer encoder for sequence classification.

    Pipeline: token embedding + fixed sinusoidal positional encoding ->
    ``num_layers`` post-norm encoder layers (multi-head self-attention and a
    position-wise feed-forward network, each with a residual connection) ->
    final LayerNorm -> the representation at position 0 -> MLP classifier.

    Args:
        vocab_size: number of rows in the embedding table
        num_classes: number of output logits
        d_model: model/embedding width (must be divisible by ``num_heads``)
        num_heads: number of attention heads
        d_feedforward: hidden width of the feed-forward network
        num_layers: number of encoder layers
        max_len: longest sequence the positional encoding supports
        dropout: dropout probability used throughout the model

    Raises:
        ValueError: if ``d_model`` is not divisible by ``num_heads``
    """
    def __init__(self, vocab_size, num_classes, d_model = 256, num_heads = 8, d_feedforward = 512, num_layers = 4, max_len = 128, dropout = 0.1):
        super().__init__()

        #The heads split d_model evenly; fail early instead of in a later .view()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        #Need this for the functions down below
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.max_len = max_len

        ########### EMBEDDING
        # (initialize with vocab size and dimensions of the model)
        self.embedding = nn.Embedding(vocab_size, d_model)

        ########### POSITIONAL ENCODING
        #Fixed sinusoidal table, stored as a buffer (saved with the model, not trained)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model)
        )
        #Sine on even columns, cosine on odd columns
        pe[:, 0::2] = torch.sin(position * div_term)
        #Slice div_term so an odd d_model (one fewer odd column) still works
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        #unsqueeze is not in-place: keep the result so pe is [1, max_len, d_model]
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

        ########### ATTENTION COMPONENTS
        #Linear projections for Q, K, and V (one set per encoder layer)
        self.W_q = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(num_layers)]) #Weights for Q
        self.W_v = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(num_layers)]) #Weights for V
        self.W_k = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(num_layers)]) #Weights for K
        self.W_o = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(num_layers)]) #Output projection weights

        ########### FEEDFORWARD NETWORK
        self.ffn1 = nn.ModuleList([nn.Linear(d_model,d_feedforward) for _ in range(num_layers)])
        self.ffn2 = nn.ModuleList([nn.Linear(d_feedforward,d_model) for _ in range(num_layers)])

        ########### LAYER NORMALIZATION
        self.norm1 = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(num_layers)])
        self.norm2 = nn.ModuleList([nn.LayerNorm(d_model) for _ in range(num_layers)])
        self.finalNorm = nn.LayerNorm(d_model)

        ########### DROPOUT LAYERS
        self.pos_dropout = nn.Dropout(dropout) #After adding the positional encoding
        self.attn_dropout = nn.Dropout(dropout) #On the attention weights
        self.ffn_dropout = nn.Dropout(dropout) #Inside the feedforward network
        self.res_dropout = nn.Dropout(dropout) #On sub-layer outputs before the residual add

        ########### CLASSIFICATION HEAD (DENSE LAYER)
        self.cls_dropout = nn.Dropout(dropout) #Dropout for the classification head
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )

        #store the number of layers
        self.num_layers = num_layers

        #Initialize the weights
        self._init_weights()

    #A function to initialize the weights for layers
    def _init_weights(self):
        """
        Xavier-uniform initialization for every parameter with more than one
        dimension (the linear weight matrices and the embedding table).
        1-D parameters (biases, LayerNorm weights) keep their defaults.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    #Multi-Head Attention
    def _multi_head_attention(self, x, layer_idx, mask = None):
        """
        Multi-head self-attention for a specific layer in the encoder

        Args:
            x: [batch_size, sequence_length, d_model]
            layer_idx: the encoding layer
            mask: optional; nonzero = position may be attended to, 0 = masked out.
                  Either [batch_size, sequence_length] (key padding mask) or any
                  shape broadcastable to [batch_size, num_heads, seq_len, seq_len],
                  e.g. [batch_size, 1, sequence_length, sequence_length]

        Returns:
            output: [batch_size, sequence_length, d_model]
            attn_weights: [batch_size, num_heads, sequence_length, sequence_length]
        """
        batch_size, seq_length, _ = x.shape

        # Linear projections for Q, K, V, then split into heads:
        # [batch, seq, d_model] -> [batch, heads, seq, d_k]
        Q = self.W_q[layer_idx](x).view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1,2)
        K = self.W_k[layer_idx](x).view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1,2)
        V = self.W_v[layer_idx](x).view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1,2)

        #Scaled Dot Product Attention - Scores: [batch, heads, seq_q, seq_k]
        scores = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(self.d_k)

        #Apply the attention mask (if provided)
        if mask is not None:
            if mask.dim() == 2: #[batch_size, seq_length]
                mask = mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, seq_len]
            #Use the most negative finite value rather than -inf so that a row whose
            #keys are all masked produces a uniform distribution instead of NaN
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        #Softmax over the key axis (last dim) so each query's weights sum to 1
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        #Apply the attention to values using torch matmul
        context = torch.matmul(attn_weights, V)

        #Concatenate the heads: [batch, heads, seq, d_k] -> [batch, seq, d_model]
        context = context.transpose(1,2).contiguous().view(
            batch_size, seq_length, self.d_model
        )

        #Final Linear Projections
        output = self.W_o[layer_idx](context)

        return output, attn_weights

    def _feed_forward(self, x, layer_idx):
        """
        Position-wise feed-forward network for a specific layer:
        Linear -> ReLU -> Dropout -> Linear
        """
        x = self.ffn1[layer_idx](x)
        x = F.relu(x)
        x = self.ffn_dropout(x)
        x = self.ffn2[layer_idx](x)
        return x

    def _encoder_layer(self, x, layer_idx, mask = None):
        """
        Process through one single encoder layer (post-norm: the LayerNorm is
        applied after each residual addition).

        ``mask`` follows the convention of ``_multi_head_attention``
        (nonzero = attend, 0 = masked out).
        """
        #Multihead attention with residual
        attn_output, _ = self._multi_head_attention(x, layer_idx, mask)
        x = x + self.res_dropout(attn_output)
        x = self.norm1[layer_idx](x)

        #Feed forward with residual
        ff_output = self._feed_forward(x, layer_idx)
        x = x + self.res_dropout(ff_output)
        x = self.norm2[layer_idx](x)

        return x

    def _embed(self, input_ids):
        """Token embedding plus positional encoding (before dropout)."""
        seq_len = input_ids.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        x = self.embedding(input_ids) #[batch_size, seq_len, d_model]
        return x + self.pe[:, :seq_len, :]

    def forward(self, input_ids, attention_mask = None):
        """
        Completes the forward pass for the Transformer

        Args:
            input_ids: [batch_size, seq_length]
            attention_mask: [batch_size, seq_length] where 1 = real token, 0 = padding

        Returns:
            logits: [batch_size, num_classes]

        Raises:
            ValueError: if seq_length is larger than ``max_len``
        """
        #Token embedding + positional encoding, then positional dropout
        x = self.pos_dropout(self._embed(input_ids))

        #The attention code masks positions where the mask is 0, which is exactly
        #the padding convention of attention_mask, so it is passed through unchanged
        for layer_idx in range(self.num_layers):
            x = self._encoder_layer(x, layer_idx, attention_mask)

        #Get the final layer norm
        x = self.finalNorm(x)

        #Use the representation at position 0 as the sequence summary.
        #NOTE: this assumes the caller places a [CLS] token first; if the input has
        #no [CLS] prefix, position 0 is simply the first token of the text
        cls_token = x[:, 0, :]  # [batch, d_model]
        cls_token = self.cls_dropout(cls_token) #Apply cls dropout

        #Classification Head (logits)
        return self.classifier(cls_token)

    def _forward(self, input_ids, attention_mask = None):
        """Backward-compatible alias for ``forward`` (the original method name)."""
        return self.forward(input_ids, attention_mask)

    #EXTRA HELPER FUNCTIONS
    def get_attention_weights(self, input_ids, attention_mask = None, layer_idx = 0):
        """
        Return the attention weights of encoder layer ``layer_idx``.

        The input is run through all preceding encoder layers first so the
        weights correspond to what that layer sees in a real forward pass.
        Dropout is disabled for the computation and the previous train/eval
        mode is restored afterwards; no gradients are recorded.

        Args:
            input_ids: [batch_size, seq_length]
            attention_mask: [batch_size, seq_length] where 1 = real token, 0 = padding
            layer_idx: index of the encoder layer (negative values count from the end)

        Returns:
            attn_weights: [batch_size, num_heads, seq_length, seq_length]

        Raises:
            IndexError: if ``layer_idx`` is out of range
        """
        if layer_idx < 0:
            layer_idx += self.num_layers
        if not 0 <= layer_idx < self.num_layers:
            raise IndexError(f"layer_idx out of range for {self.num_layers} layers")

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                x = self._embed(input_ids)
                #Advance through the layers before the requested one
                for prev_idx in range(layer_idx):
                    x = self._encoder_layer(x, prev_idx, attention_mask)
                #Get the attention weights from the specified layer
                _, attn_weights = self._multi_head_attention(x, layer_idx, attention_mask)
        finally:
            self.train(was_training)
        return attn_weights

    def get_embeddings(self, input_ids):
        """
        Get token embeddings before position encoding
        """
        return self.embedding(input_ids)

    def count_parameters(self):
        """
        Count the total trainable parameters
        """
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
        

In [None]:
#Split the dataset and convert them into tensors for PyTorch
import sklearn
from sklearn.model_selection import train_test_split

def dataframe_to_tensors(X_df, y_series):
    """
    Turn the model inputs and labels into ``torch.long`` tensors.

    Args:
        X_df (pandas DataFrame): must provide the columns 'Input IDs' and
            'Attention Mask'
        y_series (pandas Series): the class label of each row

    Returns:
        (input_ids, attention_mask, labels) as three ``torch.long`` tensors
    """
    def _as_long(values):
        #Materialize the pandas values as a plain Python list, then build the tensor
        return torch.tensor(values.tolist(), dtype=torch.long)

    return (
        _as_long(X_df['Input IDs']),
        _as_long(X_df['Attention Mask']),
        _as_long(y_series),
    )

#Split the dataframe into a train/test set (80-20), then convert both halves
#to tensors for the PyTorch library
X = ag_news_pd[["Attention Mask", "Input IDs"]]
#nn.CrossEntropyLoss expects class indices in [0, num_classes). The sample rows
#printed above show 'Class Index' values of 3 and 4 for a 4-class problem, i.e.
#the column is 1-based, so shift the labels to start at 0.
#Subtracting the minimum leaves already 0-based labels unchanged.
y = ag_news_pd["Class Index"] - ag_news_pd["Class Index"].min()
X_train_df, X_test_df, y_train_series,y_test_series= train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

#Convert to Torch tensor
train_input_ids, train_attention_mask, train_labels = dataframe_to_tensors(X_train_df, y_train_series)
test_input_ids, test_attention_mask, test_labels = dataframe_to_tensors(X_test_df, y_test_series)

In [None]:
#Model sizes: one embedding row per vocabulary entry, four output classes
vocab_size = len(vocab)
num_classes = 4
model = TransformerEncoder(
    vocab_size=vocab_size,
    num_classes=num_classes,
    d_model=256,
    num_heads=8,
    d_feedforward=512,
    num_layers=4,
    max_len=128,
    dropout=0.01,
)
#SANITY CHECK: Check model size
print(f"Model parameters: {model.count_parameters():,}")

#Take the first 32 training rows as a throwaway batch: both are [32, 128]
batch_input_ids, batch_masks = train_input_ids[:32], train_attention_mask[:32]

#One forward pass to confirm the output shape
logits = model(batch_input_ids, batch_masks)
print(f"Logits shape: {logits.shape}")  # [32, 4]

#Attention weights for a single example
attn_weights = model.get_attention_weights(batch_input_ids[:1], batch_masks[:1])
print(f"Attention weights shape: {attn_weights.shape}")  # [1, 8, 128, 128]


In [None]:
#Train the model
from torch.utils.data import DataLoader, TensorDataset

#Wrap the tensors in datasets so the DataLoader can batch them.
#Batch size matches the 32-row batch used in the sanity check above.
batch_size = 32
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
#Shuffle only the training data; keep the test order fixed for evaluation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

criterion = nn.CrossEntropyLoss()
#parameters is a method: it must be called to hand the optimizer the tensors.
#NOTE(review): lr=1e-2 is kept from the original cell, but it is high for Adam on
#a Transformer; consider something closer to 1e-4 if the loss does not decrease
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    num_seen = 0
    for input_ids, attention_mask, labels in train_loader:
        #Standard step: clear old gradients, forward, loss, backward, update
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        #Weight the batch-mean loss by batch size so the epoch average is exact
        running_loss += loss.item() * labels.size(0)
        num_seen += labels.size(0)

    print(f"Epoch {epoch + 1}/{epochs} - train loss: {running_loss / max(num_seen, 1):.4f}")
    

In [None]:
#Model's Evaluation

from sklearn import metrics

#Print out the model's accuracy, precision, recall, and f1-score

#Create confusion matrix from sklearn metrics

In [None]:
#Task 2 Setting Up a Transformer Decoder