<a href="https://colab.research.google.com/github/Natural-Language-Processing-YU/M3_Assignment/blob/main/scripts/m3_assignment_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encoder-decoder approach and, for comparison, an LSTM approach.

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for classifying names as male or female. In subsequent iterations, this could be expanded to include more classes.

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment.

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [None]:
### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
None
### END CODE HERE ###

# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources

**LSTM APPROACH**



[link text](https://)

In [8]:
import nltk

# Download the NLTK "names" corpus; later cells read it through nltk.corpus.names
nltk.download('names')


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [9]:
import nltk
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the male and female name lists from the NLTK names corpus
male_names = nltk.corpus.names.words('male.txt')
female_names = nltk.corpus.names.words('female.txt')

# Label the names: 0 = male, 1 = female.
# A name that is listed in both files appears twice, once with each label.
labeled_names = [(name, 0) for name in male_names] + [(name, 1) for name in female_names]

# Shuffle with a fixed seed so that re-running the cell yields the same order
# (and therefore the same train/test split below). A private Random instance
# is used so the global `random` state is left untouched.
random.Random(42).shuffle(labeled_names)

# Feature extraction function
def name_features(name):
    """Wrap ``name`` in a one-entry feature dict stored under the key 'name'."""
    features = {}
    features['name'] = name
    return features

# Pair every name's feature dict with its gender label
featuresets = [(name_features(person), label) for person, label in labeled_names]

# Hold out 20% of the examples for testing (fixed random_state for the split itself)
train_set, test_set = train_test_split(featuresets, random_state=42, test_size=0.2)

# Length of the longest name; used below as the padded sequence length
max_name_length = max(len(pair[0]) for pair in labeled_names)

# Prepare data for LSTM
def prepare_data_for_lstm(data, max_len):
    """Turn (feature dict, label) pairs into padded integer arrays.

    Args:
        data: iterable of ``(features, label)`` pairs where ``features['name']``
            is the name string.
        max_len: sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        ``(X, y)``: ``X`` is the array of per-character code points (``ord``)
        padded by ``pad_sequences`` with its default settings, ``y`` is the
        array of labels in the same order.
    """
    # Single pass over `data`: encode each name and keep its label alongside.
    encoded = [([ord(ch) for ch in feats['name']], target) for feats, target in data]
    code_points = [vec for vec, _ in encoded]
    targets = [target for _, target in encoded]
    padded = pad_sequences(code_points, maxlen=max_len)
    return np.array(padded), np.array(targets)

# Vectorise both splits to the common padded length
X_train_lstm, y_train_lstm = prepare_data_for_lstm(train_set, max_name_length)
X_test_lstm, y_test_lstm = prepare_data_for_lstm(test_set, max_name_length)

# Character-level classifier: embedding -> LSTM -> single sigmoid unit.
# The embedding has 128 rows because inputs are raw code points from ord();
# this assumes every character in a name has a code point below 128.
lstm_model = Sequential()
lstm_model.add(Embedding(128, 32, input_length=max_name_length))
lstm_model.add(LSTM(64))
lstm_model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the 0/1 labels and the sigmoid output
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit for 10 epochs; the held-out split is reported as validation data each epoch
lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=10,
    batch_size=64,
    validation_data=(X_test_lstm, y_test_lstm),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x791a0c2fe050>

In [10]:
# Score the trained LSTM on the held-out split and report both metrics
loss, accuracy = lstm_model.evaluate(x=X_test_lstm, y=y_test_lstm)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predict on new names
def predict_gender(name):
    """Classify a single name with the trained LSTM model.

    The name is encoded as code points, padded to ``max_name_length`` and fed
    to ``lstm_model``.

    Returns:
        'female' when the model's output is at least 0.5, otherwise 'male'.
    """
    code_points = np.array([ord(ch) for ch in name])
    batch = pad_sequences([code_points], maxlen=max_name_length)
    score = lstm_model.predict(batch)[0][0]
    return 'female' if score >= 0.5 else 'male'

# Spot-check two names; expected output: male (John), then female (Alice)
for sample_name in ('John', 'Alice'):
    print(predict_gender(sample_name))


Test Loss: 0.402023583650589
Test Accuracy: 0.8112019896507263
male
female


**ENCODER - DECODER APPROACH**

In [16]:
import nltk
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the male and female name lists from the NLTK names corpus
male_names = nltk.corpus.names.words('male.txt')
female_names = nltk.corpus.names.words('female.txt')

# Label the names: 0 = male, 1 = female.
# A name that is listed in both files appears twice, once with each label.
labeled_names = [(name, 0) for name in male_names] + [(name, 1) for name in female_names]

# Shuffle with a fixed seed so that re-running the cell yields the same order
# (and therefore the same train/test split below). A private Random instance
# is used so the global `random` state is left untouched.
random.Random(42).shuffle(labeled_names)

# Feature extraction function
def name_features(name):
    """Return the feature dict for ``name``: a single 'name' entry holding it.

    NOTE(review): this repeats the definition already given in the LSTM cell.
    """
    return dict([('name', name)])

# Pair every name's feature dict with its gender label
featuresets = [(name_features(person), label) for person, label in labeled_names]

# Hold out 20% of the examples for testing (fixed random_state for the split itself)
train_set, test_set = train_test_split(featuresets, random_state=42, test_size=0.2)

# Length of the longest name; used below as the padded encoder length
max_name_length = max(len(pair[0]) for pair in labeled_names)

# Prepare data for encoder and decoder
def prepare_data_for_encoder_decoder(data, max_len):
    """Build padded encoder/decoder arrays from (feature dict, label) pairs.

    Each name is encoded as its code points (``ord``). The decoder input is the
    same sequence prefixed with the start token 1; the decoder target is the
    sequence followed by the end token 2.

    NOTE(review): the label in each pair is read but never used, so the arrays
    returned here carry no gender information.

    Args:
        data: iterable of ``(features, label)`` pairs with the name under
            ``features['name']``.
        max_len: ``maxlen`` for the encoder input; decoder arrays use
            ``max_len + 1`` to make room for the extra token.

    Returns:
        ``(encoder_input, decoder_input, decoder_target)`` as produced by
        ``pad_sequences``; the two decoder arrays are padded with
        ``padding='post'``.
    """
    start_token, end_token = 1, 2
    char_ids = [[ord(ch) for ch in feats['name']] for feats, _unused_label in data]
    shifted_in = [[start_token] + ids for ids in char_ids]
    shifted_out = [ids + [end_token] for ids in char_ids]

    decoder_len = max_len + 1
    return (
        pad_sequences(char_ids, maxlen=max_len),
        pad_sequences(shifted_in, maxlen=decoder_len, padding='post'),
        pad_sequences(shifted_out, maxlen=decoder_len, padding='post'),
    )

# Prepare train and test data for encoder and decoder.
# The decoder targets are the names themselves (plus an end token), so the
# network below is trained to reproduce its input characters.
X_train_enc_dec, X_train_dec_in, y_train_dec_out = prepare_data_for_encoder_decoder(train_set, max_name_length)
X_test_enc_dec, X_test_dec_in, y_test_dec_out = prepare_data_for_encoder_decoder(test_set, max_name_length)

# Encoder Model: embed the character ids and run an LSTM; only the final
# hidden and cell states are kept (encoder_outputs is not used afterwards).
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(128, 32)(encoder_inputs)
encoder_lstm = LSTM(64, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder Model: a second LSTM initialised with the encoder states; it returns
# one output per time step, which the dense layer maps to a 128-way softmax.
# decoder_embedding, decoder_lstm and decoder_dense are reused further down
# to build the inference decoder.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(128, 32)(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(128, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the training model: (encoder input, decoder input) -> per-step token distribution
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model; the sparse loss takes the integer token ids directly as targets
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model for 10 epochs, reporting the held-out split as validation data
model.fit([X_train_enc_dec, X_train_dec_in], y_train_dec_out,
          batch_size=64,
          epochs=10,
          validation_data=([X_test_enc_dec, X_test_dec_in], y_test_dec_out))

# Save the model to an HDF5 file in the current working directory
model.save('encoder_decoder_model.h5')

# Define inference encoder model: maps an input sequence to the encoder's [h, c] states
encoder_model = Model(encoder_inputs, encoder_states)

# Define inference decoder model.
# It reuses the trained decoder_embedding / decoder_lstm / decoder_dense, but
# takes the LSTM state as explicit inputs and returns the updated state, so
# the decoder can be advanced step by step.
decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Note: decoder_outputs, state_h and state_c are rebound here, replacing the
# training-graph tensors of the same names.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Function to decode sequence
def decode_sequence(input_seq):
    """Greedily decode one encoded name with the inference encoder/decoder.

    Args:
        input_seq: array holding a single sequence of character code points,
            i.e. shaped like ``np.array([[ord(c) for c in name]])``.

    Returns:
        The decoded characters joined into a string, with surrounding
        whitespace stripped. Decoding ends at the end token (id 2), which is
        not included in the result, or after ``max_name_length + 1`` steps.
    """
    # Token ids must match the ones used when the training data was built.
    start_token, end_token = 1, 2

    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    decoded_chars = []
    # Bound the loop by step count rather than by decoded length: padding
    # predictions (id 0) add no characters, so a length-based bound alone
    # could never be reached.
    for _ in range(max_name_length + 1):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # The end token terminates decoding; the previous check against '\n'
        # could never match because the end token is id 2.
        if sampled_token_index == end_token:
            break
        # Id 0 is padding and contributes no character.
        if sampled_token_index != 0:
            decoded_chars.append(chr(sampled_token_index))

        # Feed the sampled token and the updated state back into the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(decoded_chars).strip()

# Test predictions
def predict_gender(name):
    """Label a name using the encoder-decoder's decoded output.

    The name is encoded as code points, decoded with ``decode_sequence`` and
    the decoded text is compared with the name minus its first character.

    NOTE(review): the encoder-decoder is trained only to reproduce names (the
    gender labels are never used when its training arrays are built), so this
    rule does not reflect a learned gender signal. This definition also
    replaces the LSTM-based ``predict_gender`` defined in an earlier cell.

    Returns:
        'female' if the stripped decoded text equals ``name[1:]`` stripped,
        otherwise 'male'.
    """
    input_seq = np.array([[ord(ch) for ch in name]])
    decoded_name = decode_sequence(input_seq)
    if decoded_name.strip() == name[1:].strip():
        return 'female'
    return 'male'

# Run the encoder-decoder based predict_gender on one sample name and print the label
print(predict_gender('John'))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


male
