In [1]:
from nltk.corpus import names
import random
from pprint import pprint
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

# This example uses Naive Bayes approach to classify names by gender.
# Note that “gender” here is a traditional biological gender with two options.
# In some cultures this notation is considered obsolete.

In [2]:
# Initial data input

labeled_names = (
    [(name, 'female') for name in names.words('female.txt')[:2000]] 
    + [(name, 'male') for name in names.words('male.txt')[:2000]]
)
random.shuffle(labeled_names)

pprint(labeled_names[:10])

[('Caroljean', 'female'),
 ('Merry', 'male'),
 ('Deirdre', 'female'),
 ('Cristionna', 'female'),
 ('Gabie', 'female'),
 ('Augustina', 'female'),
 ('Marion', 'male'),
 ('Lindy', 'male'),
 ('Kingsly', 'male'),
 ('Frans', 'male')]


In [3]:
# Prepare data for training

feature_meta = {
    "index": 0,
    "map": {},
}

def extract_features(name):
    """Map name ending to numeric values"""
    endings = [name[-1], name[-2:]]
    result = []
    for ending in endings:
        if ending in feature_meta["map"]:
            feature_index = feature_meta["map"][ending]
            result.append(feature_index)
        else:
            feature_meta["index"] += 1
            feature_meta["map"][ending] = feature_meta["index"]
            feature_meta["index"] = feature_meta["index"]
            result.append(feature_meta["index"])
    return result


features = [extract_features(item[0]) for item in labeled_names]
labels = [item[1] for item in labeled_names]

print("Features:")
pprint(features[:10])
print(f"{len(features)} total")
print()

print("Labels:")
pprint(labels[:10])
print(f"{len(labels)} total")
print()

# Split data into training and testing
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)

print("Training features:")
pprint(train_features[:10])
print(f"{len(train_features)} total")
print()

print("Training labels:")
pprint(train_labels[:10])
print(f"{len(train_labels)} total")
print()

print("Testing features:")
pprint(test_features[:10])
print(f"{len(test_features)} total")
print()

print("Testing labels:")
pprint(test_labels[:10])
print(f"{len(test_labels)} total")
print()

Features:
[[1, 2],
 [3, 4],
 [5, 6],
 [7, 8],
 [5, 9],
 [7, 8],
 [1, 10],
 [3, 11],
 [3, 12],
 [13, 14]]
4000 total

Labels:
['female',
 'male',
 'female',
 'female',
 'female',
 'female',
 'male',
 'male',
 'male',
 'male']
4000 total

Training features:
[[46, 63],
 [66, 220],
 [7, 44],
 [1, 10],
 [3, 93],
 [24, 190],
 [7, 44],
 [119, 120],
 [5, 9],
 [17, 41]]
3200 total

Training labels:
['male',
 'male',
 'female',
 'male',
 'male',
 'male',
 'female',
 'male',
 'male',
 'male']
3200 total

Testing features:
[[13, 104],
 [66, 67],
 [5, 9],
 [29, 122],
 [3, 93],
 [24, 50],
 [1, 10],
 [1, 10],
 [7, 88],
 [78, 118]]
800 total

Testing labels:
['female',
 'male',
 'female',
 'male',
 'female',
 'female',
 'female',
 'male',
 'female',
 'male']
800 total



In [7]:
# Train classifier
classifier = GaussianNB()
classifier.fit(train_features, train_labels)

# Classify test features
predicted_test_labels = classifier.predict(test_features)

# Run metrics
accuracy_metric = accuracy_score(predicted_test_labels, test_labels)
recall_metric = recall_score(predicted_test_labels, test_labels, average="weighted")
f1_metric = f1_score(predicted_test_labels, test_labels, average="weighted")

print("Accuracy:", accuracy_metric)
print("Recall:", recall_metric)
print("F1:", f1_metric)

Accuracy: 0.62125
Recall: 0.62125
F1: 0.6437022509240417


In [5]:
# Simple demo
for name in ["Bobert", "Boberta", "Bober", "Boberia"]:
    endings = extract_features(name)
    predictions = classifier.predict([endings])
    pprint(f"{name} {endings} => {predictions}")    

"Bobert [46, 63] => ['male']"
"Boberta [7, 44] => ['female']"
"Bober [54, 60] => ['male']"
"Boberia [7, 59] => ['female']"
