In [1]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
from nltk.classify import NaiveBayesClassifier

Basic Imports, tee everything up

In [2]:
nltk.download('names')

names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Aman\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.


Download and import names data, randomly shuffle it to prevent bias.

In [26]:
train_set = names[1000:]
dev_test_set = names[500:1000]
test_set = names[:500]
names[1]

('Kostas', 'male')

Train Test Split method

In [16]:
def gender_features(name):
    features = {}
    features["last_letter"] = name[-1].lower()
    features["first_letter"] = name[0].lower()
    features["length"] = len(name)
    features["vowel_count"] = sum(1 for letter in name if letter in 'aeiouAEIOU')
    features["consonant_count"] = len(name) - features["vowel_count"]
    features["last_two_letters"] = name[-2:].lower() if len(name) > 1 else name[-1].lower()
    features["first_two_letters"] = name[:2].lower() if len(name) > 1 else name[0].lower()
    features["is_last_vowel"] = (name[-1].lower() in 'aeiou')
    features["is_first_vowel"] = (name[0].lower() in 'aeiou')
    features["num_distinct_letters"] = len(set(name.lower()))
    features["begin_with_vowel"] = name[0].lower() in 'aeiou'
    features["end_with_vowel"] = name[-1].lower() in 'aeiou'
    features["count_vowel_first_half"] = sum(1 for letter in name[:len(name)//2] if letter in 'aeiouAEIOU')
    features["count_vowel_second_half"] = sum(1 for letter in name[len(name)//2:] if letter in 'aeiouAEIOU')
    return features

Randomly guessing on interesting features, derived the following feature set from concepts in the natural language processing book


In [29]:
train_features = apply_features(gender_features, train_set)
dev_test_features = apply_features(gender_features, dev_test_set)
classifier = NaiveBayesClassifier.train(train_features)

print("Accuracy on dev-test set:", nltk.classify.accuracy(classifier, dev_test_features))
classifier.show_most_informative_features(10)



Accuracy on dev-test set: 0.768
Most Informative Features
        last_two_letters = 'na'           female : male   =     90.5 : 1.0
        last_two_letters = 'la'           female : male   =     71.3 : 1.0
        last_two_letters = 'ia'           female : male   =     37.5 : 1.0
             last_letter = 'a'            female : male   =     34.7 : 1.0
        last_two_letters = 'us'             male : female =     33.5 : 1.0
        last_two_letters = 'sa'           female : male   =     33.1 : 1.0
        last_two_letters = 'ta'           female : male   =     31.6 : 1.0
             last_letter = 'k'              male : female =     30.2 : 1.0
             last_letter = 'f'              male : female =     27.6 : 1.0
        last_two_letters = 'ra'           female : male   =     24.7 : 1.0


Lets train and test

In [18]:
test_features = apply_features(gender_features, test_set)

print("Accuracy on test set:", nltk.classify.accuracy(classifier, test_features))

Accuracy on test set: 0.734


Let's Score

Let's redefine the model to just take the two most impactful features. We can name it gender_features_slim

In [27]:
def gender_features_slim(name):
    features = {}
    features["last_two_letters"] = name[-2:].lower() if len(name) > 1 else name[-1].lower()
    features["last_letter"] = name[-1].lower()
    return features

In [None]:
Same Structure

In [30]:
train_features = apply_features(gender_features_slim, train_set)
dev_test_features = apply_features(gender_features_slim, dev_test_set)
classifier = NaiveBayesClassifier.train(train_features)

print("Accuracy on dev-test set:", nltk.classify.accuracy(classifier, dev_test_features))
classifier.show_most_informative_features(10)


Accuracy on dev-test set: 0.802
Most Informative Features
        last_two_letters = 'na'           female : male   =     90.5 : 1.0
        last_two_letters = 'la'           female : male   =     71.3 : 1.0
        last_two_letters = 'ia'           female : male   =     37.5 : 1.0
             last_letter = 'a'            female : male   =     34.7 : 1.0
        last_two_letters = 'us'             male : female =     33.5 : 1.0
        last_two_letters = 'sa'           female : male   =     33.1 : 1.0
        last_two_letters = 'ta'           female : male   =     31.6 : 1.0
             last_letter = 'k'              male : female =     30.2 : 1.0
             last_letter = 'f'              male : female =     27.6 : 1.0
        last_two_letters = 'ra'           female : male   =     24.7 : 1.0


Train and Test

In [31]:
test_features = apply_features(gender_features, test_set)

print("Accuracy on test set:", nltk.classify.accuracy(classifier, test_features))

Accuracy on test set: 0.786


Finally, lets score it.

Essentially, the longer feature set in gender_features includes various features like the first letter, name length, and counts of vowels and consonants, it doesn't necessarily lead to better performance. By focusing on the most informative features, gender_features_slim can achieve similar or better accuracy due to reduced noise and overfitting. The key here is that the most informative features often carry the bulk of the predictive power, and this is reinforced by show_most_informative_features where these two features have most of the prediction value. 