In [1]:
from nltk.corpus import names
from nltk import NaiveBayesClassifier
import random
from pprint import pprint
from nltk.classify import apply_features

# This example uses a Naive Bayes classifier to classify names by gender.
# Note that "gender" here refers to biological sex (XX or XY chromosomes).

In [2]:
# Initial data input

# Fixed seed so the shuffle -- and therefore the train/test split derived
# from it -- is identical on every Restart & Run All.
SHUFFLE_SEED = 42

# Pair every name with the label of the corpus file it was read from.
labeled_names = (
    [(name, 'female') for name in names.words('female.txt')]
    + [(name, 'male') for name in names.words('male.txt')]
)

# A dedicated Random instance keeps the shuffle reproducible without
# reseeding (or otherwise disturbing) the module-level random generator.
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

# Preview a handful of (name, label) pairs.
pprint(labeled_names[:10])

[('Chrissie', 'female'),
 ('Olva', 'female'),
 ('Kiersten', 'female'),
 ('Lara', 'female'),
 ('Jammie', 'female'),
 ('Louella', 'female'),
 ('Jordain', 'female'),
 ('Garfield', 'male'),
 ('Belinda', 'female'),
 ('Gusti', 'female')]


In [3]:
# Prepare data for training

def extract_features(name: str) -> dict:
    """Build the feature dict that describes how a name ends.

    Args:
        name: The name to featurize.

    Returns:
        A dict with two string features: ``"last_1"`` is the final
        character and ``"last_2"`` is the final two characters (the whole
        name when it is shorter than two characters). Both values are
        ``""`` when ``name`` is empty.
    """
    # Slicing (name[-1:]) yields the same value as indexing (name[-1]) for
    # any non-empty string, but returns "" instead of raising IndexError
    # when the name is empty.
    return {"last_1": name[-1:], "last_2": name[-2:]}

# Eagerly featurize every labeled name; this list is used for the preview
# below and for sizing the train/test split.
featuresets = [
    (extract_features(person), label) for person, label in labeled_names
]
print("Featuresets:")
pprint(featuresets[:10])
print()

# First 80% of the labeled names go to training, the remainder to testing.
split_index = round(0.8 * len(featuresets))
train_names = labeled_names[:split_index]
test_names = labeled_names[split_index:]

# Wrap each split with apply_features so extract_features is applied to the
# names of that split.
train_featureset = apply_features(extract_features, train_names)
test_featureset = apply_features(extract_features, test_names)

# Show the first entries of each split under its own heading.
for heading, preview in (
    ("Train featureset:", train_featureset[:10]),
    ("Test featureset:", test_featureset[:10]),
):
    print(heading)
    pprint(preview)
    print()

Featuresets:
[({'last_1': 'e', 'last_2': 'ie'}, 'female'),
 ({'last_1': 'a', 'last_2': 'va'}, 'female'),
 ({'last_1': 'n', 'last_2': 'en'}, 'female'),
 ({'last_1': 'a', 'last_2': 'ra'}, 'female'),
 ({'last_1': 'e', 'last_2': 'ie'}, 'female'),
 ({'last_1': 'a', 'last_2': 'la'}, 'female'),
 ({'last_1': 'n', 'last_2': 'in'}, 'female'),
 ({'last_1': 'd', 'last_2': 'ld'}, 'male'),
 ({'last_1': 'a', 'last_2': 'da'}, 'female'),
 ({'last_1': 'i', 'last_2': 'ti'}, 'female')]

Train featureset:
[({'last_1': 'e', 'last_2': 'ie'}, 'female'), ({'last_1': 'a', 'last_2': 'va'}, 'female'), ...]

Test featureset:
[({'last_1': 's', 'last_2': 'es'}, 'male'), ({'last_1': 'a', 'last_2': 'ia'}, 'female'), ...]



In [4]:
# Fit a Naive Bayes classifier on the training split only.
classifier = NaiveBayesClassifier.train(train_featureset)

# Print the ten feature values the classifier reports as most informative.
classifier.show_most_informative_features(n=10)

Most Informative Features
                  last_2 = 'na'           female : male   =     86.3 : 1.0
                  last_2 = 'ia'           female : male   =     79.1 : 1.0
                  last_2 = 'la'           female : male   =     61.7 : 1.0
                  last_2 = 'rd'             male : female =     39.2 : 1.0
                  last_1 = 'a'            female : male   =     34.3 : 1.0
                  last_2 = 'ra'           female : male   =     33.4 : 1.0
                  last_2 = 'sa'           female : male   =     33.4 : 1.0
                  last_2 = 'us'             male : female =     31.9 : 1.0
                  last_1 = 'k'              male : female =     28.6 : 1.0
                  last_2 = 'ta'           female : male   =     21.0 : 1.0


In [5]:
# Classify a few example names and print each predicted label.

for name in ("Bobert", "Boberta", "Bober", "Boberia"):
    features = extract_features(name)
    prediction = classifier.classify(features)
    print(f"{name} => {prediction}")
    

Bobert => male
Boberta => female
Bober => male
Boberia => female
