# Transformer code with components split for better understanding and easier manipulation


In [1]:
import pandas as pd
import numpy as np
import sys
import os 

sys.path.append(os.path.abspath('..')) 

In [2]:
# Import Pre-Processing
from src.pre_processing import clean_dataset

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DCCN9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DCCN9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DCCN9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import torch
# Environment check: report whether PyTorch can see a CUDA device, which CUDA
# version this PyTorch build targets, and how many GPUs are visible.
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of GPUs:", torch.cuda.device_count())
# Only query the device name when a GPU exists; index 0 is the first device.
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
CUDA version: 12.9
Number of GPUs: 1
GPU name: NVIDIA GeForce MX450


In [4]:
# Load the three dataset splits from JSON files.
# Paths are relative to the notebook's working directory.
train_df = pd.read_json('../Dataset/train.json')
test_df = pd.read_json('../Dataset/test.json')
validation_df = pd.read_json('../Dataset/validation.json')

# Pre Processing


Call the pre-processing function `src.pre_processing.clean_dataset()`.  
Structure :
`clean_dataset(dataset, MIN_WORD_LENGTH=3, MAX_WORD_LENGTH=50, method=None, pos_filter=False, stop_w=False)`

- dataset
- min_word_len --> Set to 3
- max_word_len --> Set to 50 (the `MAX_WORD_LENGTH` default in the signature above)
- method (stem,lemmatize) optional
- pos_filter (optional)
- stop_word (optional)


In [5]:
# Clean every split with exactly the same settings. The train split used to
# pass method='' while test/validation passed method=None; using None
# everywhere guarantees the three splits are pre-processed identically.
clean_train_dataset = clean_dataset(train_df, stop_w=False, method=None, pos_filter=False)
clean_test_dataset = clean_dataset(test_df, stop_w=False, method=None, pos_filter=False)
clean_validation_dataset = clean_dataset(validation_df, stop_w=False, method=None, pos_filter=False)

# Row-wise concatenation of the cleaned premise and hypothesis token lists
# (training split only); used to build the vocabulary and train the embeddings.
clean_t_dataset = clean_train_dataset['premise'] + clean_train_dataset['hypothesis']

In [6]:
# Preview the first combined premise + hypothesis token lists.
clean_t_dataset.head()

0    [pluto, rotates, once, on, its, axis, every, 6...
1    [glenn, once, per, day, the, earth, rotates, a...
2    [geysers, periodic, gush, of, hot, water, at, ...
3    [facts, liquid, water, droplets, can, be, chan...
4    [by, comparison, the, earth, rotates, on, its,...
dtype: object

## Create a unique list of words from both premise and hypothesis


In [7]:
# Vocabulary: every distinct token that appears in the combined
# premise + hypothesis token lists of the training split.
unique_words = {token for tokens in clean_t_dataset for token in tokens}

print(f"Count of unique words: {len(unique_words)}")
# Vocabulary sizes observed for the different pre-processing settings:
# No lemma, stem, stop_w, pos_filter = 22134
# With stop_w = 21917
# With stop_w, lemma (took 25 sec) = 18396
# With stop_w, stem (took 10s) = 15403
# Just Stem = 15565
# Just lemma = 18587
# Just Pos Filtering = 21115
# What are we removing in the pos filtering???

Count of unique words: 22134


Sort the words so that they are always in the same order (this keeps the word-to-index mapping stable between runs)


In [8]:
# sorted() accepts any iterable and already returns a new list.
unique_words_list = sorted(unique_words)

## Create a dictionary of word-index


Use the enumerate to map a unique index to a unique word


In [9]:
# Index -> word comes straight from enumerate; word -> index is its inverse.
id2word = dict(enumerate(unique_words_list))
word2id = {word: index for index, word in id2word.items()}

# Build the Embedding Model


Build the word embeddings with gensim's `FastText` (the model is trained directly in this notebook; `embedding_model.py` is not imported here)


In [10]:
from gensim.models import FastText

In [11]:
# Embedding Hyperparameters
# NOTE(review): emdedding_batch_size and embedding_learn_rate are not passed to
# the FastText(...) call in this notebook, and vocab_size is not read by any
# visible cell -- confirm whether they are still needed.
emdedding_batch_size = 1024
embedding_learn_rate = 0.001
embedding_size = 200  # dimensionality of each word vector (FastText vector_size)
embedding_no_of_epochs = 8  # FastText training epochs
embedding_window_size = 3  # FastText context window
vocab_size = len(unique_words)  # computed before the special tokens are added
# sg --> 0 CBOW, 1 skip-gram (the FastText call below uses sg=1)


In [12]:
# Train FastText on the combined premise + hypothesis token lists of the
# training split. No seed is passed, so the vectors can differ between runs.
fast_text_model = FastText(clean_t_dataset, # Both premise and Hypothesis 
                           vector_size=embedding_size,
                           window=embedding_window_size,
                           sg=1,  # skip-gram
                           epochs=embedding_no_of_epochs
                           )

In [13]:
# Qualitative sanity check of the embeddings: nearest neighbours of 'saturn'.
fast_text_model.wv.most_similar('saturn', topn=10)

[('saturns', 0.9502517580986023),
 ('neptune', 0.8259016871452332),
 ('1980', 0.8240558505058289),
 ('1979', 0.7945947647094727),
 ('1981', 0.7814427018165588),
 ('voyager', 0.7696138024330139),
 ('1980s', 0.7681811451911926),
 ('180', 0.7674251198768616),
 ('1989', 0.7645599246025085),
 ('80', 0.7556518912315369)]

# Start of the Transformer Implementation


## Let's first add the necessary special tokens: [PAD], [UNK], [CLS], [SEP]


In [14]:
# Append the special tokens to the vocabulary. Each missing token gets the next
# free id, so re-running this cell does not create duplicates.
SPECIAL_TOKENS = ['[PAD]','[UNK]','[CLS]','[SEP]']
for special in SPECIAL_TOKENS:
    if special in word2id:
        continue
    next_id = len(word2id)
    word2id[special] = next_id
    id2word[next_id] = special

# Padding, unknown-word, classification and separator ids, in that order.
PAD_ID, UNK_ID, CLS_ID, SEP_ID = (word2id[name] for name in SPECIAL_TOKENS)

In [15]:
# Transformer Hyper params
# Fixed length of every encoded example ([CLS] + premise + [SEP] + hypothesis);
# longer sequences are truncated and shorter ones padded with [PAD].
MAX_SEQ_LEN = 64
#embedding_size

In [16]:
def prepare_transformer_data(df, word2id, max_len):
    """Encode premise/hypothesis pairs as fixed-length id sequences.

    Every row becomes ``[CLS] premise [SEP] hypothesis``, cut to ``max_len``
    tokens and right-padded with ``[PAD]``.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns holding
            lists of tokens, and a ``label`` column holding ``'neutral'`` or
            ``'entails'``.
        word2id: Mapping token -> id. It must contain the ``[PAD]``,
            ``[UNK]``, ``[CLS]`` and ``[SEP]`` entries; the special ids are
            read from this mapping so the function always agrees with the
            vocabulary it was given (it no longer depends on notebook globals).
        max_len: Length of every returned sequence.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` of ``torch.long``
        tensors with shapes ``(n, max_len)``, ``(n, max_len)`` and ``(n,)``.
        The mask is 1 for real tokens and 0 for padding. An empty ``df``
        yields tensors of shape ``(0, max_len)``, ``(0, max_len)`` and ``(0,)``.

    Raises:
        KeyError: If a special token is missing from ``word2id`` or a label is
            neither ``'neutral'`` nor ``'entails'``.
    """
    pad_id = word2id['[PAD]']
    unk_id = word2id['[UNK]']
    cls_id = word2id['[CLS]']
    sep_id = word2id['[SEP]']

    label_map = {'neutral': 0, 'entails': 1}

    input_ids = []
    attention_mask = []
    labels = []

    # Iterating the three columns directly avoids the per-row Series that
    # DataFrame.iterrows() builds.
    for premise_toks, hypothesis_toks, label in zip(df['premise'], df['hypothesis'], df['label']):
        if label not in label_map:
            raise KeyError(f"Unknown label {label!r}; expected one of {sorted(label_map)}")

        tokens = (
            [cls_id]
            + [word2id.get(w, unk_id) for w in premise_toks]
            + [sep_id]
            + [word2id.get(w, unk_id) for w in hypothesis_toks]
        )
        # NOTE: truncation keeps the head of the sequence, so a very long
        # premise can push part (or all) of the hypothesis out of the window.
        tokens = tokens[:max_len]

        n_real = len(tokens)
        pad_len = max_len - n_real  # never negative after truncation

        input_ids.append(tokens + [pad_id] * pad_len)
        attention_mask.append([1] * n_real + [0] * pad_len)  # 0 flags padding
        labels.append(label_map[label])

    # The explicit reshape keeps the (n, max_len) layout even when n == 0.
    n_rows = len(input_ids)
    return (torch.tensor(input_ids, dtype=torch.long).reshape(n_rows, max_len),
            torch.tensor(attention_mask, dtype=torch.long).reshape(n_rows, max_len),
            torch.tensor(labels, dtype=torch.long))
    

In [17]:
# Build train/test/validation tensors: token ids, attention masks and labels,
# every sequence padded or truncated to MAX_SEQ_LEN.
x_train, train_masks, y_train = prepare_transformer_data(clean_train_dataset, word2id, MAX_SEQ_LEN)
x_test,  test_masks,  y_test  = prepare_transformer_data(clean_test_dataset,  word2id, MAX_SEQ_LEN)
x_valid,  valid_masks,  y_valid  = prepare_transformer_data(clean_validation_dataset,  word2id, MAX_SEQ_LEN)

print("Shape of x_train:", x_train.shape)
print("Shape of train_masks:", train_masks.shape)
print("Shape of y_train:", y_train.shape)

# Sanity check on the first training example: the mask sum is the number of
# real (non-padding) tokens and the count of zeros is the amount of padding.
i = 0
print("\n--- Example ---")
print("First example real length (mask sum):", int(train_masks[i].sum()))
print("First example PAD count:", int((train_masks[i]==0).sum()))

Shape of x_train: torch.Size([22905, 64])
Shape of train_masks: torch.Size([22905, 64])
Shape of y_train: torch.Size([22905])

--- Example ---
First example real length (mask sum): 22
First example PAD count: 42


In [18]:
# Attention mask of the first training example: 1 = real token, 0 = [PAD].
train_masks[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
# sentence = []
# for x in x_train[0]:
#     sentence.append(id2word[int(x)])

# sentence

## Positional Encoding


In [20]:
import math
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        d_model: Size of the last (feature) dimension of the input. Both even
            and odd values are supported.
        max_len: Number of positions precomputed; inputs passed to
            ``forward`` must not be longer than this along dimension 1.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, max_len=512, dropout=0.1):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dims
        # For odd d_model there is one fewer odd column than even columns, so
        # trim div_term accordingly (the unsliced form raised a shape error).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])  # odd dims
        # Registered as a buffer: moves with .to(device) and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor):
        """Return ``dropout(x + pe)``; positions are indexed along dimension 1 of ``x``."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
        

In [None]:

def build_embedding_matrix(word2id, pretrained_vectors, embedding_size):
    """Build a ``(len(word2id), embedding_size)`` float32 embedding matrix.

    Row ``word2id[w]`` receives ``pretrained_vectors[w]`` when that lookup
    succeeds and the vector has ``embedding_size`` entries. Otherwise the row
    is drawn from N(0, 0.02). The ``'[PAD]'`` row is left as zeros.

    Args:
        word2id: Mapping token -> row index.
        pretrained_vectors: Object indexable by token that returns a numpy
            vector and raises ``KeyError`` for unknown tokens.
        embedding_size: Number of columns of the matrix.

    Returns:
        The embedding matrix as a float32 ``numpy.ndarray``.
    """
    def _random_row():
        # Small random initialisation used whenever no usable vector exists.
        return np.random.normal(0.0, 0.02, size=(embedding_size,)).astype(np.float32)

    matrix = np.zeros((len(word2id), embedding_size), dtype=np.float32)

    for token, row in word2id.items():
        if token == '[PAD]':
            continue  # padding keeps its all-zero vector
        try:
            vector = pretrained_vectors[token]
            # Fall back to a random row if the dimensions don't match.
            matrix[row] = (
                vector.astype(np.float32)
                if vector.shape[0] == embedding_size
                else _random_row()
            )
        except KeyError:
            # Token unknown to the pretrained vectors.
            matrix[row] = _random_row()

    print("Embedding matrix created with shape:", matrix.shape)

    return matrix

In [22]:
# Build the embedding matrix from the trained FastText vectors.
# NOTE(review): the same call is repeated right before the model is
# instantiated further down, so one of the two is redundant.
embedding_matrix = build_embedding_matrix(word2id, fast_text_model.wv, embedding_size)

Embedding matrix created with shape: (22138, 200)


In [None]:
# Transformer Hyperparameters
SEED = 42  # fixed seed so weight init, dropout and batch shuffling are repeatable
EMBEDDING_DIM = embedding_size
# The attention layer requires the embedding size to be divisible by the number
# of heads. The previous value (3) does not divide 200 and made model
# construction fail; 4 is the value shown in the recorded results.
N_HEADS = 4
N_LAYERS = 1
DROPOUT = 0.3
NUM_CLASSES = 2
BATCH_SIZE = 64
LR = 0.0002
WD = 0.2
NUM_EPOCHS = 3

# Fail fast, next to the values, instead of deep inside the model constructor.
assert EMBEDDING_DIM % N_HEADS == 0, (
    f"EMBEDDING_DIM ({EMBEDDING_DIM}) must be divisible by N_HEADS ({N_HEADS})"
)

# Seed the generators used by the random embedding rows (numpy) and by the
# model weights, dropout and DataLoader shuffling (torch).
np.random.seed(SEED)
torch.manual_seed(SEED)

In [25]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder sentence-pair classifier that reads the [CLS] position.

    Pipeline: pretrained (fine-tuned) embeddings scaled by ``sqrt(d_model)``
    -> sinusoidal positional encoding -> ``n_layers`` encoder layers ->
    linear layer on the first token's output.

    Args:
        embedding_matrix: Array of shape ``(vocab_size, embed_dim)`` used to
            initialise the embedding layer (the weights stay trainable).
        num_classes: Number of output logits.
        n_heads: Attention heads per encoder layer; must divide ``embed_dim``.
        n_layers: Number of stacked encoder layers.
        dropout: Dropout used in the positional encoding and encoder layers.
        pad_idx: Id of the padding token. Defaults to the notebook-level
            ``PAD_ID`` when omitted.
        max_len: Longest sequence the positional encoding supports. Defaults
            to the notebook-level ``MAX_SEQ_LEN`` when omitted.
        dim_feedforward: Hidden size of each encoder layer's feed-forward block.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``n_heads``.
    """

    def __init__(self, embedding_matrix, num_classes, n_heads, n_layers, dropout,
                 pad_idx=None, max_len=None, dim_feedforward=256):
        super().__init__()
        embed_dim = embedding_matrix.shape[1]
        if embed_dim % n_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})"
            )
        # Keep the old behaviour (notebook globals) when the new optional
        # arguments are not supplied.
        if pad_idx is None:
            pad_idx = PAD_ID
        if max_len is None:
            max_len = MAX_SEQ_LEN
        self.d_model = embed_dim

        # Embedding layer: starts from the pretrained vectors, fine-tuned during training
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), freeze=False, padding_idx=pad_idx
        )

        # Positional encoding
        self.pos_encoder = PositionalEncoding(embed_dim, max_len=max_len, dropout=dropout)

        # Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
            batch_first=True  # inputs are (batch, seq, feature)
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Final classification head
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, src, src_mask):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            src: Token ids, shape ``(batch, seq)``.
            src_mask: Attention mask of the same shape; entries equal to 0 are
                treated as padding and ignored by attention.
        """
        # The encoder expects True at the positions it should ignore.
        padding_mask = (src_mask == 0)

        # Apply embedding (scaled by sqrt(d_model)) and positional encoding
        embedded = self.embedding(src) * math.sqrt(self.d_model)
        pos_encoded = self.pos_encoder(embedded)

        # Pass through the transformer encoder
        encoded = self.transformer_encoder(pos_encoded, src_key_padding_mask=padding_mask)

        # Use the output of the [CLS] token (first token) for classification
        cls_output = encoded[:, 0, :]

        # Final logits from the classifier
        logits = self.classifier(cls_output)
        return logits

In [26]:
# First, build the embedding matrix with build_embedding_matrix (defined above).
# This overwrites the matrix built earlier with the same inputs.
embedding_matrix = build_embedding_matrix(word2id, fast_text_model.wv, EMBEDDING_DIM)

# Now, instantiate the model from the transformer hyperparameters
transformer_model = TransformerClassifier(
    embedding_matrix=embedding_matrix,
    num_classes=NUM_CLASSES,
    n_heads=N_HEADS,
    n_layers=N_LAYERS,
    dropout=DROPOUT
)


Embedding matrix created with shape: (22138, 200)


In [27]:
from torch.utils.data import TensorDataset, DataLoader


In [28]:
# Datasets: one (token ids, attention mask, label) triple per example and split
train_ds, test_ds, valid_ds = (
    TensorDataset(ids, masks, targets)
    for ids, masks, targets in (
        (x_train, train_masks, y_train),
        (x_test, test_masks, y_test),
        (x_valid, valid_masks, y_valid),
    )
)

# Dataloaders: only the training split is shuffled; evaluation order stays fixed
train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
test_loader = DataLoader(dataset=test_ds, shuffle=False, batch_size=BATCH_SIZE, num_workers=0)
valid_loader = DataLoader(dataset=valid_ds, shuffle=False, batch_size=BATCH_SIZE, num_workers=0)

In [29]:
# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
transformer_model.to(device)

# Weighted cross-entropy: errors on class 1 ('entails') count 1.5x as much as
# errors on class 0 ('neutral').
class_weights = torch.tensor([1.0, 1.5], device=device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(params=transformer_model.parameters(), lr=LR, weight_decay=WD)

In [30]:
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch is moved to.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all examples seen.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in loader:
        inputs, mask, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(inputs, mask)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        batch_size = targets.size(0)
        # Weight the batch-mean loss by the batch size so the final average
        # is per example even if the last batch is smaller.
        running_loss += batch_loss.item() * batch_size
        n_correct += (outputs.argmax(1) == targets).sum().item()
        n_seen += batch_size

    return running_loss / n_seen, n_correct / n_seen

@torch.no_grad()  # nothing inside this function tracks gradients
def evaluate(model, loader, criterion, device):
    """Compute mean loss and accuracy of ``model`` over ``loader`` without updating it.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch is moved to.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all examples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0

    for batch in loader:
        ids, mask, targets = (tensor.to(device) for tensor in batch)

        outputs = model(ids, mask)
        batch_loss = criterion(outputs, targets)

        batch_size = targets.size(0)
        # Weight the batch-mean loss by the batch size for a per-example average.
        loss_sum += batch_loss.item() * batch_size
        n_correct += (outputs.argmax(1) == targets).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen


In [31]:
# Select the device again and move the model there (repeats the setup cell,
# so this cell also works when that one was changed in between).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer_model.to(device)
print(device)
# Main training loop: one training pass followed by a validation pass per epoch.
# NOTE(review): no checkpoint of the best validation epoch is kept, so the final
# test evaluation always uses the weights from the last epoch.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(transformer_model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(transformer_model, valid_loader, criterion, device)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

print("Training complete.")

# Final evaluation on the held-out test split.
test_loss, test_acc = evaluate(transformer_model, test_loader, criterion, device)
print(f"FINAL TEST - Loss: {test_loss:.4f}, Acc: {test_acc:.4f}")


cuda


  output = torch._nested_tensor_from_mask(


Epoch 1/3 - Train Loss: 0.6915, Train Acc: 0.5838 - Val Loss: 0.6514, Val Acc: 0.6628
Epoch 2/3 - Train Loss: 0.6333, Train Acc: 0.6565 - Val Loss: 0.6546, Val Acc: 0.6674
Epoch 3/3 - Train Loss: 0.5975, Train Acc: 0.6865 - Val Loss: 0.8009, Val Acc: 0.6406
Training complete.
FINAL TEST - Loss: 0.7495, Acc: 0.6899


In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# Collect predictions and gold labels for the whole test split
transformer_model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        logits = transformer_model(input_ids.to(device), attention_mask.to(device))
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Headline metrics (F1, precision and recall are reported for class 1, 'entails')
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("F1 Score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(all_labels, all_preds))
print("-" * 40)
print(classification_report(all_labels, all_preds, target_names=["neutral","entails"]))

# Record the configuration that produced these numbers next to the results
print('-' * 40)
print('Hyperparameters')
for setting_label, setting_value in (
    ('Batch Size', BATCH_SIZE),
    ('Num of Epochs', NUM_EPOCHS),
    ('Learning Rate', LR),
    ('Num Classes', NUM_CLASSES),
    ('Number of Heads', N_HEADS),
    ('Numer of Layers', N_LAYERS),
    ('Embedding Dimensions', EMBEDDING_DIM),
    ('Dropout', DROPOUT),
    ('Weight Decay', WD),
):
    print(setting_label, setting_value)


Accuracy: 0.6898674242424242
F1 Score: 0.4527986633249791
Precision: 0.7633802816901408
Recall: 0.32185273159144895
----------------------------------------
              precision    recall  f1-score   support

     neutral       0.68      0.93      0.78      1270
     entails       0.76      0.32      0.45       842

    accuracy                           0.69      2112
   macro avg       0.72      0.63      0.62      2112
weighted avg       0.71      0.69      0.65      2112

----------------------------------------
Hyperparameters
Batch Size 128
Num of Epochs 3
Learning Rate 0.0002
Num Classes 2
Number of Heads 4
Numer of Layers 2
Embedding Dimensions 200
Dropout 0.4
Weight Decay 0.2
