#1: Data Loading and Preprocessing

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from transformers import BertModel
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

# Load the SMS spam dataset straight from the Hugging Face Hub (needs network access)
df = pd.read_parquet("hf://datasets/ucirvine/sms_spam/plain_text/train-00000-of-00001.parquet")

# Rename columns for clarity (assumes the file has exactly two columns: text, then label)
df.columns = ['sms', 'label']

# Drop rows with a missing text or label
df.dropna(subset=['sms', 'label'], inplace=True)

# Normalise the labels to integers: 1 for spam, 0 for ham.
# The previous mapping (`1 if x == 'spam' else 0`) silently turned EVERY label into 0
# whenever the column holds integer class ids instead of strings, which made the model
# learn a constant "ham" prediction. Handle both representations explicitly.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Unknown strings become NaN here and make the astype(int) below fail loudly.
    df['label'] = df['label'].astype(str).str.strip().str.lower().map({'ham': 0, 'spam': 1})
df['label'] = df['label'].astype(int)

# Split the dataset into train and validation sets.
# Stratify so both splits keep the same spam/ham ratio (the classes are imbalanced).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['sms'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Initialize the tokenizer (we'll use BERT tokenizer)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and pad sequences
def tokenize_and_pad(texts, tokenizer, max_length=64):
    """Tokenize ``texts`` in one call and return whatever ``tokenizer`` returns.

    :param texts: The text(s) handed to ``tokenizer`` as its first positional argument.
    :param tokenizer: Callable tokenizer (a Hugging Face tokenizer in this notebook).
    :param max_length: Truncation limit forwarded to the tokenizer.
    :return: The tokenizer output, requested as PyTorch tensors.

    Note: with Hugging Face tokenizers ``padding=True`` pads to the longest sequence
    of this call, not to ``max_length``; ``max_length`` only caps truncation.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

# Encode each split separately with the shared tokenizer (64-token truncation limit)
train_encodings = tokenize_and_pad(texts=train_texts, tokenizer=tokenizer, max_length=64)
val_encodings = tokenize_and_pad(texts=val_texts, tokenizer=tokenizer, max_length=64)

# Create a custom Dataset class
class SMSDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    :param encodings: Mapping (anything with ``.items()``) from field name to an
        indexable batch, e.g. the tokenizer output with ``input_ids`` and
        ``attention_mask``. Values may be tensors or plain nested lists.
    :param labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning; detach().clone()
            # is the documented equivalent and keeps the same copy semantics.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the label under ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (driven by the label sequence)."""
        return len(self.labels)

# Wrap each split in a dataset, then batch it (32 examples per batch).
# Only the training loader shuffles; validation keeps the original order.
train_dataset = SMSDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SMSDataset(encodings=val_encodings, labels=val_labels)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# 2 Regular Transformer architecture

In [None]:
import torch.nn as nn
from transformers import BertModel

class SimpleTransformer(nn.Module):
    """BERT encoder followed by extra Transformer encoder layers and a linear classifier.

    :param hidden_size: Width of the extra layers and the classifier input. It must
        equal the hidden size of the loaded BERT checkpoint and be divisible by the
        12 attention heads used below.
    :param num_attention_blocks: Number of extra encoder layers stacked on top of BERT.
    :param num_classes: Number of output logits.
    """

    def __init__(self, hidden_size=768, num_attention_blocks=2, num_classes=2):
        super().__init__()

        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Extra Transformer encoder blocks (`num_attention_blocks` of them).
        # batch_first=True is essential: BERT returns [batch, seq, hidden], while the
        # layer's default layout is [seq, batch, hidden]; without it self-attention
        # would mix tokens across different examples of the batch.
        self.transformer_blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=hidden_size,
                nhead=12,  # Number of attention heads
                dim_feedforward=hidden_size * 4,
                batch_first=True,
            ) for _ in range(num_attention_blocks)
        ])

        # Output layer for classification
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits of shape [batch_size, num_classes].

        :param input_ids: Token ids, [batch_size, seq_len].
        :param attention_mask: Optional mask, [batch_size, seq_len], non-zero for
            real tokens and 0 for padding (the tokenizer's convention).
        """
        # BERT outputs: [batch_size, seq_len, hidden_size]
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # PyTorch's key padding mask uses the opposite polarity: True = ignore.
        padding_mask = None
        if attention_mask is not None:
            padding_mask = attention_mask == 0

        # Pass through the additional blocks, keeping padding out of the attention
        for block in self.transformer_blocks:
            hidden_states = block(hidden_states, src_key_padding_mask=padding_mask)

        # Use the hidden state of the [CLS] token (first position) for classification
        cls_output = hidden_states[:, 0, :]  # Shape: [batch_size, hidden_size]

        # Classify the [CLS] token output
        logits = self.classifier(cls_output)
        return logits

# Build the baseline model with its default configuration spelled out
model = SimpleTransformer(hidden_size=768, num_attention_blocks=2, num_classes=2)

# 3 Training Simple

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device  # bare expression so the notebook echoes the chosen device

device(type='cpu')

In [None]:


# Choose the device (GPU when available, CPU otherwise) and move the model onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# AdamW over all model parameters, plus cross-entropy on the raw logits
optimizer = AdamW(params=model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training function
def train(model, train_loader, optimizer, loss_fn, device=None):
    """Run one training epoch and return ``(mean_batch_loss, accuracy)``.

    :param model: Module called as ``model(input_ids, attention_mask=...)`` and
        returning logits of shape [batch_size, num_classes].
    :param train_loader: Iterable of batches; each batch is a mapping with
        ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
    :param optimizer: Optimizer over the model parameters.
    :param loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
    :param device: Device the batches are moved to. Defaults to the device of the
        model's first parameter, so the function no longer depends on a global.
    :return: Tuple of the average per-batch loss and the fraction of correct
        predictions (measured in training mode); ``(0.0, 0.0)`` for an empty loader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0.0
    correct_preds = 0
    total_preds = 0
    num_batches = 0  # counted manually so loaders without __len__ work too

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        logits = model(input_ids, attention_mask=attention_mask)

        # Compute loss, then backpropagate and update the weights
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Bookkeeping for the epoch metrics
        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)
        num_batches += 1

    # Guard against an empty loader instead of raising ZeroDivisionError
    if num_batches == 0:
        return 0.0, 0.0

    accuracy = correct_preds / total_preds if total_preds else 0.0
    return total_loss / num_batches, accuracy

# Evaluation function
def evaluate(model, val_loader, device=None):
    """Return the classification accuracy of ``model`` over ``val_loader``.

    :param model: Module called as ``model(input_ids, attention_mask=...)`` and
        returning logits of shape [batch_size, num_classes].
    :param val_loader: Iterable of batches; each batch is a mapping with
        ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
    :param device: Device the batches are moved to. Defaults to the device of the
        model's first parameter, so the function no longer depends on a global.
    :return: Fraction of correctly classified examples; ``0.0`` when the loader
        yields no examples (instead of raising ZeroDivisionError).

    The model is switched to evaluation mode and left in it.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct_preds = 0
    total_preds = 0

    # No gradients are needed for scoring
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask=attention_mask)

            # Predicted class = index of the largest logit
            preds = logits.argmax(dim=1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)

    if total_preds == 0:
        return 0.0
    return correct_preds / total_preds

# Run the epochs: one training pass, one validation pass, then a three-line report
num_epochs = 1
for epoch in range(num_epochs):
    train_loss, train_acc = train(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        loss_fn=loss_fn,
    )
    val_acc = evaluate(model=model, val_loader=val_loader)

    print(
        f"Epoch {epoch + 1}/{num_epochs}",
        f"Training loss: {train_loss:.4f}, Training accuracy: {train_acc:.4f}",
        f"Validation accuracy: {val_acc:.4f}",
        sep="\n",
    )


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/1
Training loss: 0.0089, Training accuracy: 0.9930
Validation accuracy: 1.0000


# 4 Custom stuff

In [None]:
# prompt: test the model on a single custom example

import torch

# Assuming 'model' and 'tokenizer' are defined from the previous code

def predict_single_example(text, model, tokenizer):
    """Classify one text and return the predicted class index.

    :param text: Raw input string.
    :param model: Module called as ``model(input_ids, attention_mask=...)`` and
        returning logits of shape [1, num_classes].
    :param tokenizer: Callable returning a mapping with ``'input_ids'`` and
        ``'attention_mask'`` tensors.
    :return: Index of the largest logit as a Python int (1 = spam, 0 = ham in
        this notebook).

    The model is switched to evaluation mode and left in it.
    """
    # Tokenize the input text (truncated to 64 tokens, as during training)
    inputs = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors="pt")

    # Send the inputs to wherever the model actually lives. Choosing the device
    # from CUDA availability alone breaks when the model is on the CPU of a GPU machine.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Put the model in evaluation mode (disables dropout)
    model.eval()

    # Make the prediction without tracking gradients
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)

    # Get the predicted class (0 or 1)
    predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class

# Try the classifier on one hand-picked message
custom_text = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"  # Example spam text
predicted_label = predict_single_example(text=custom_text, model=model, tokenizer=tokenizer)

# Report the input and the prediction (1 for spam, 0 for ham)
print(
    f"Input text: {custom_text}",
    f"Predicted label: {predicted_label}",
    sep="\n",
)

Input text: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Predicted label: 0


# 2 Feelings-Based Transformer architecture

In [None]:


class FeelTransformer(nn.Module):
    """BERT encoder followed by ``FeelEncoderLayer`` blocks and a linear classifier.

    :param hidden_size: Width of the extra layers and the classifier input. It must
        equal the hidden size of the loaded BERT checkpoint and be divisible by the
        12 attention heads used below.
    :param num_attention_blocks: Number of ``FeelEncoderLayer`` blocks (default 4).
    :param num_classes: Number of output logits.
    """

    def __init__(self, hidden_size=768, num_attention_blocks=4, num_classes=2):
        super().__init__()

        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Extra encoder blocks (`num_attention_blocks` of them)
        self.transformer_blocks = nn.ModuleList([
            FeelEncoderLayer(
                hidden_size=hidden_size,
                num_heads=12,  # Number of attention heads
                dim_feedforward=hidden_size * 4
            ) for _ in range(num_attention_blocks)
        ])

        # Output layer for classification
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits of shape [batch_size, num_classes].

        :param input_ids: Token ids, [batch_size, seq_len].
        :param attention_mask: Optional mask forwarded to BERT only; the extra
            blocks attend over every position, as before.
        """
        # BERT outputs: [batch_size, seq_len, hidden_size]
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # FeelEncoderLayer works on [seq_len, batch_size, hidden_size]. Feeding it the
        # batch-first tensor would make attention run across the examples of a batch
        # instead of across the tokens of each example, so switch layouts first.
        hidden_states = hidden_states.transpose(0, 1)

        # Pass through the additional blocks on top of BERT
        for block in self.transformer_blocks:
            hidden_states = block(hidden_states)

        # [CLS] is the first sequence position; in this layout that is index 0 of dim 0
        cls_output = hidden_states[0]  # Shape: [batch_size, hidden_size]

        # Classify the [CLS] token output
        logits = self.classifier(cls_output)
        return logits

# Build the feelings-based model with its defaults spelled out (this rebinds `model`).
# NOTE: the constructor needs FeelEncoderLayer and EmotionLayer, which are defined in
# later cells of this notebook; run those cells first.
model = FeelTransformer(hidden_size=768, num_attention_blocks=4, num_classes=2)

In [None]:
class FeelEncoderLayer(nn.Module):
    """Post-norm Transformer encoder layer that also owns an (unused so far) EmotionLayer.

    :param hidden_size: Model width; must be divisible by ``num_heads``.
    :param num_heads: Number of attention heads.
    :param dim_feedforward: Inner width of the feed-forward network.
    :param dropout: Dropout probability for attention weights and both residual branches.
    :param activation_function: Activation module for the feed-forward network.
        ``None`` (default) creates a fresh ``nn.ReLU()`` per layer; a module instance
        in the signature would be shared by every layer built with the default.
    """

    def __init__(self, hidden_size=768, num_heads=12, dim_feedforward=768*4, dropout=0.1, activation_function=None):
        super().__init__()

        if activation_function is None:
            activation_function = nn.ReLU()

        # Multi-head self-attention (sequence-first layout: [seq_len, batch, hidden])
        self.self_attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, dropout=dropout)
        # Experimental emotion layer; constructed here but not yet applied in forward()
        self.self_emotion = EmotionLayer(embed_dim=hidden_size, emotion_dim=hidden_size)

        # Layer normalization after the attention residual
        self.norm1 = nn.LayerNorm(hidden_size)

        # Feedforward network (with configurable activation)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, dim_feedforward),  # First linear layer
            activation_function,                      # Activation
            nn.Linear(dim_feedforward, hidden_size)   # Second linear layer
        )

        # Layer normalization after the feed-forward residual
        self.norm2 = nn.LayerNorm(hidden_size)

        # Dropout layers
        self.dropout1 = nn.Dropout(dropout)  # Dropout after attention
        self.dropout2 = nn.Dropout(dropout)  # Dropout after feed-forward

    def forward(self, x, mask=None):
        """Apply self-attention and the feed-forward network, each with residual + norm.

        :param x: Input of shape [seq_len, batch_size, hidden_size].
        :param mask: Optional padding mask of shape [batch_size, seq_len]; ``True``
            marks positions that attention must ignore (PyTorch's key-padding
            convention, i.e. the inverse of a tokenizer attention mask).
        :return: Tensor with the same shape as ``x``.
        """
        # A [batch, seq] padding mask belongs in `key_padding_mask`; `attn_mask`
        # expects a [seq, seq] (or per-head) mask and rejects this shape.
        attn_output, _ = self.self_attention(x, x, x, key_padding_mask=mask, need_weights=False)

        # TODO: the emotion layer (self.self_emotion) is intentionally not applied yet.

        x = x + self.dropout1(attn_output)  # Add residual connection
        x = self.norm1(x)  # Apply layer normalization

        # Feedforward layer with residual connection and layer norm
        ff_output = self.feed_forward(x)
        x = x + self.dropout2(ff_output)  # Add residual connection
        x = self.norm2(x)  # Apply layer normalization

        return x


In [None]:
#the math

# delta_feel = X * EmotionSpace * downScale
# feel += delta_feel
#

# feel_pdf = softmax ( query_feel * X )
# X += feel_pdf * feel

In [None]:
class EmotionLayer(nn.Module):
    """Experimental layer that derives a "feel" vector from X and adds it back to X.

    NOTE(review): the contractions in ``forward`` only line up when
    ``embed_dim == emotion_dim`` and the sequence length equals that size as well
    (the final outer product has shape (batch, emotion_dim, embed_dim) and is added
    to X of shape (batch, seq_len, embed_dim)). The intended math for general
    shapes still has to be decided by the author; this revision only removes the
    two errors that made every call fail.
    """

    def __init__(self, embed_dim, emotion_dim, downscale=1.0):
        """
        Initialize the EmotionLayer.
        :param embed_dim: Dimensionality of the input X (same as embed_dim of attention layers).
        :param emotion_dim: Dimensionality of the EmotionSpace.
        :param downscale: Scaling factor for delta_feel calculation.
        """
        super().__init__()
        self.emotion_space = nn.Parameter(torch.randn(emotion_dim, embed_dim))
        self.downscale = downscale
        # Learned base feeling; forward() adds a per-example offset to it
        self.feel = nn.Parameter(torch.zeros(emotion_dim))

    def forward(self, X, query_feel):
        """
        Forward pass of the EmotionLayer.
        :param X: Input tensor of shape (seq_len, batch_size, embed_dim).
        :param query_feel: Query tensor for calculating feel_pdf, shape (batch_size, emotion_dim).
        :return: Updated X tensor of shape (seq_len, batch_size, embed_dim).
        """
        # Transpose X to (batch_size, seq_len, embed_dim) for processing
        X = X.transpose(0, 1)

        # Project every token with emotion_space, then average over the sequence
        delta_feel = torch.einsum('bsi,ij->bsj', X, self.emotion_space)
        delta_feel = torch.mean(delta_feel, dim=1) * self.downscale  # one vector per example

        # Keep the per-batch feeling in a local variable. Assigning the sum back to
        # `self.feel` raised TypeError (a plain tensor cannot replace a registered
        # nn.Parameter), and its batch-shaped value would not fit the parameter anyway.
        feel = self.feel + delta_feel

        # Softmax over the projected query. torch.softmax avoids depending on the
        # `F` alias, which is only imported in a later notebook cell.
        feel_pdf = torch.softmax(
            torch.einsum('bi,ij->bj', query_feel, self.emotion_space.T), dim=-1
        )

        # Outer product of the distribution and the feeling, added to X
        X_updated = X + torch.einsum('bi,bj->bij', feel_pdf, feel)

        # Transpose X back to (seq_len, batch_size, embed_dim)
        return X_updated.transpose(0, 1)


#5 Comparing the FeelTransformer with the regular Transformer

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# make this single Feeling head
class SingleHeadAttention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    :param embed_dim: Size of the last dimension of the inputs and of the output.
    :param dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, embed_dim, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.dropout = nn.Dropout(dropout)
        self.query_linear = nn.Linear(embed_dim, embed_dim)
        self.key_linear = nn.Linear(embed_dim, embed_dim)
        self.value_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, attention_mask=None):
        """Return ``(output, attention_weights)``.

        :param query: Tensor of shape [..., query_len, embed_dim].
        :param key: Tensor of shape [..., key_len, embed_dim].
        :param value: Tensor of shape [..., key_len, embed_dim].
        :param attention_mask: Optional additive mask broadcastable to
            [..., query_len, key_len]; use ``-inf`` to block a position.
        :return: Output of shape [..., query_len, embed_dim] and the (dropped-out)
            attention weights of shape [..., query_len, key_len].
        """
        # Local import: `math` is not imported anywhere else in this notebook, so
        # the original `math.sqrt` call raised NameError on the first forward pass.
        import math

        # Compute query, key, and value vectors
        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        # Scaled dot-product scores
        attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.embed_dim)

        # Apply the additive attention mask if provided
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalise over the key dimension
        attention_weights = F.softmax(attention_scores, dim=-1)

        # Apply dropout to attention weights
        attention_weights = self.dropout(attention_weights)

        # Weighted sum of the values
        output = torch.matmul(attention_weights, value)

        return output, attention_weights