# Part III
Using the previous two tutorials, please answer the following using an encorder-decoder approach and an LSTM compared approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for name for male or female classification. In subseuqent iterations, this could be expanded to included more classifications.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [None]:
### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

# ENCODER DECODER APPROACH

In [1]:
import nltk
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [2]:
from nltk.corpus import names
import pandas as pd
import random

# Get names from the dataset
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create a dataframe with names and their respective labels
data = pd.DataFrame({
    'name': male_names + female_names,
    'gender': ['male'] * len(male_names) + ['female'] * len(female_names)
})

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

In [3]:
# Required Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
import random
import math

# Data Preparation
from nltk.corpus import names
import pandas as pd

# Get names from the dataset
male_names = names.words("male.txt")
female_names = names.words("female.txt")

# Create a DataFrame with names and labels
data = pd.DataFrame({
    "name": male_names + female_names,
    "gender": ["male"] * len(male_names) + ["female"] * len(female_names),
})

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Split into training and testing datasets
train_data, test_data = train_test_split(data, test_size=0.2)

# Encode labels (male = 0, female = 1)
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data["gender"])
test_labels = label_encoder.transform(test_data["gender"])

# Tokenize the names
char_vocab = set("".join(data["name"].str.lower()))
char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(char_vocab))}  # 0 is reserved for padding
max_len = max(data["name"].str.len())

# Convert names to sequences of indices
def name_to_indices(name, char_to_idx, max_len):
    indices = [char_to_idx[char] for char in name.lower()]
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))  # padding
    return indices

train_sequences = torch.tensor(
    [name_to_indices(name, char_to_idx, max_len) for name in train_data["name"]]
)
test_sequences = torch.tensor(
    [name_to_indices(name, char_to_idx, max_len) for name in test_data["name"]]
)

# Define DataLoader for training and testing
batch_size = 32
train_loader = DataLoader(
    list(zip(train_sequences, train_labels)), batch_size=batch_size, shuffle=True
)
test_loader = DataLoader(
    list(zip(test_sequences, test_labels)), batch_size=batch_size, shuffle=False
)



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the custom Multihead Attention module
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(CustomMultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        # Learnable parameters for the query, key, and value
        self.query_weight = nn.Linear(embed_dim, embed_dim)
        self.key_weight = nn.Linear(embed_dim, embed_dim)
        self.value_weight = nn.Linear(embed_dim, embed_dim)

        # Output projection after attention
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value):
        # Linear transformations
        query = self.query_weight(query)
        key = self.key_weight(key)
        value = self.value_weight(value)

        # Scaled dot-product attention
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)

        # Weighted sum
        attention_output = torch.matmul(attention_weights, value)

        # Final output projection
        attention_output = self.out_proj(attention_output)

        return attention_output

# Custom Encoder Module
class CustomEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers):
        super(CustomEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attention_layers = nn.ModuleList(
            [CustomMultiheadAttention(embed_dim, num_heads) for _ in range(num_layers)]
        )

        # Feedforward layers after each attention layer
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embed_dim)

        self.activation = nn.ReLU()

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        for attention_layer in self.attention_layers:
            attended = attention_layer(embedded, embedded, embedded)

            # Apply the feedforward network
            ff = self.activation(self.linear1(attended))
            embedded = self.linear2(ff) + attended  # Residual connection

        return embedded

# Custom Decoder Module
class CustomDecoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_layers, num_heads):
        super(CustomDecoder, self).__init__()
        self.attention_layers = nn.ModuleList(
            [CustomMultiheadAttention(embed_dim, num_heads) for _ in range(num_layers)]
        )

        # Feedforward layers after each attention layer
        self.linear1 = nn.Linear(embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embed_dim)

        self.activation = nn.ReLU()

    def forward(self, encoded):
        decoded = encoded
        for attention_layer in self.attention_layers:
            attended = attention_layer(decoded, decoded, decoded)

            # Apply the feedforward network
            ff = self.activation(self.linear1(attended))
            decoded = self.linear2(ff) + attended  # Residual connection

        return decoded

# Transformer-based model for classification with encoder and decoder
class Transformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes):
        super(Transformer, self).__init__()
        self.encoder = CustomEncoder(vocab_size, embed_dim, num_heads, hidden_dim, num_layers)
        self.decoder = CustomDecoder(embed_dim, hidden_dim, num_layers, num_heads)
        self.fc_layer = nn.Linear(embed_dim, num_classes)  # Classification head

    def forward(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        pooled_output = decoded.mean(dim=1)  # Global average pooling
        logits = self.fc_layer(pooled_output)  # Classification head

        return logits


In [7]:
# Set up the necessary imports
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters
epochs = 10
learning_rate = 0.001

# Instantiate the transformer model
# Define the necessary parameters
vocab_size = len(char_to_idx) + 1  # Include padding
embedding_dim = 128  # Change based on your model's embedding dimension
num_heads = 8  # Number of attention heads
hidden_dim = 256  # Hidden dimension for the feedforward network
num_layers = 3  # Number of layers in the encoder and decoder
num_classes = 2  # Male and female classification

# Instantiate the transformer model
model = Transformer(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, num_classes)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    # Iterate over the training data
    for inputs, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute loss
        loss = loss_fn(outputs, labels)

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer.step()

        # Accumulate loss for monitoring
        epoch_loss += loss.item()

    # Print average loss for the epoch
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

# Testing and evaluation
model.eval()
test_loss = 0
correct = 0
total_samples = 0

with torch.no_grad():
    # Iterate over the test data
    for inputs, labels in test_loader:
        # Forward pass
        outputs = model(inputs)

        # Compute loss
        test_loss += loss_fn(outputs, labels).item()

        # Get predictions
        preds = outputs.argmax(dim=1)

        # Compute accuracy
        correct += (preds == labels).sum().item()
        total_samples += len(labels)

# Compute test loss and accuracy
test_loss /= len(test_loader)
accuracy = correct / total_samples

print(f"Test Loss: {test_loss}, Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/10, Loss: 0.6616339980058334
Epoch 2/10, Loss: 0.6783966583822241
Epoch 3/10, Loss: 0.6674214619487974
Epoch 4/10, Loss: 0.6651779601921388
Epoch 5/10, Loss: 0.66380307617499
Epoch 6/10, Loss: 0.6699749467959956
Epoch 7/10, Loss: 0.6727512373996141
Epoch 8/10, Loss: 0.6679438891722329
Epoch 9/10, Loss: 0.8821982647006835
Epoch 10/10, Loss: 0.6583422143854688
Test Loss: 0.6562270057201386, Test Accuracy: 63.00%


# LSTM APPROACH

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from nltk.corpus import names

# Data preparation
# Get names from the dataset
male_names = names.words("male.txt")
female_names = names.words("female.txt")

# Create a dataframe with names and their respective labels
data = pd.DataFrame({
    "name": male_names + female_names,
    "gender": ["male"] * len(male_names) + ["female"] * len(female_names),
})

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Encode the labels
label_encoder = LabelEncoder()
data["gender_encoded"] = label_encoder.fit_transform(data["gender"])

# Tokenize the names into character indices
char_vocab = set("".join(data["name"].str.lower()))
char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(char_vocab))}  # 0 is reserved for padding
max_len = max(data["name"].str.len())

# Convert names to sequences of indices with padding
def name_to_indices(name, char_to_idx, max_len):
    indices = [char_to_idx[char] for char in name.lower()]
    if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))  # padding
    return indices

data["name_indices"] = data["name"].apply(lambda x: name_to_indices(x, char_to_idx, max_len))

# Split into training and testing datasets
train_data, test_data = train_test_split(data, test_size=0.2)

# Define a custom Dataset class
class NameDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.names = torch.tensor(list(data["name_indices"].values.tolist()))
        self.labels = torch.tensor(data["gender_encoded"].values)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        return self.names[idx], self.labels[idx]

# DataLoader
batch_size = 32
train_loader = DataLoader(NameDataset(train_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(NameDataset(test_data), batch_size=batch_size, shuffle=False)

# Define the LSTM classifier
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Embed the input sequences
        x = self.embedding(x)

        # Pass through the LSTM layer
        lstm_out, _ = self.lstm(x)

        # Take the output from the last timestep
        output = lstm_out[:, -1, :]

        # Pass through the fully connected layer
        logits = self.fc(output)

        return logits

# Instantiate the model
embedding_dim = 64  # Set the embedding dimension
hidden_dim = 128  # LSTM hidden dimension
num_layers = 2  # Number of LSTM layers
num_classes = 2  # Male and female classification

vocab_size = len(char_to_idx) + 1  # Include padding
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_layers, num_classes)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = loss_fn(outputs, labels)  # Compute loss
        loss.backward()  # Backward propagation
        optimizer.step()  # Update parameters

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

# Test and evaluate the model
model.eval()
test_loss = 0
correct = 0
total_samples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)  # Forward pass
        test_loss += loss_fn(outputs, labels).item()

        # Get predictions and calculate accuracy
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total_samples += len(labels)

test_loss /= len(test_loader)
accuracy = correct / total_samples

print(f"Test Loss: {test_loss}, Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/10, Loss: 0.5390259247628888
Epoch 2/10, Loss: 0.432989945438639
Epoch 3/10, Loss: 0.4003668297175786
Epoch 4/10, Loss: 0.3745928277172635
Epoch 5/10, Loss: 0.35526387427170675
Epoch 6/10, Loss: 0.3357337306267652
Epoch 7/10, Loss: 0.3135626289862484
Epoch 8/10, Loss: 0.29492099036523445
Epoch 9/10, Loss: 0.27399233155813646
Epoch 10/10, Loss: 0.2552909168166731
Test Loss: 0.4179069498181343, Test Accuracy: 82.32%


# LSTM has better results and accuracy than Encoder Decoder Approach