In [46]:
import random

import nltk
from nltk.corpus import names

# Make sure the "names" corpus is present locally before it is read below.
nltk.download("names")

[nltk_data] Downloading package names to /home/bitmap4/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [47]:
# Pair every corpus name with its gender label: all names from male.txt first,
# followed by all names from female.txt.
labelled_names = [
    (name, label)
    for label, fileid in (("male", "male.txt"), ("female", "female.txt"))
    for name in names.words(fileid)
]
print(labelled_names[:10])


[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male')]


In [48]:
# Shuffle so the train/dev/test slices taken later contain a mix of both labels.
# A fixed seed keeps the split -- and every score computed from it --
# reproducible across kernel restarts.
SHUFFLE_SEED = 42

# Sort into a canonical order first: the shuffle is in-place, so without this
# re-running the cell would shuffle an already-shuffled list and produce a
# different order each time.
labelled_names.sort()

# Use a dedicated generator so the module-level `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(labelled_names)
print(labelled_names[:10])

[('Tamara', 'female'), ('Vivi', 'female'), ('Georgie', 'male'), ('Meagan', 'female'), ('Janaye', 'female'), ('Rafaela', 'female'), ('Oralla', 'female'), ('Cori', 'female'), ('Quincy', 'male'), ('Boyd', 'male')]


### Defining Gender Features

In [120]:
def gender_features(name):
    """Extract the features used to guess the gender of a name.

    Args:
        name: The name to featurise, as a string.

    Returns:
        A dict with two entries:

        * ``"last_two_letters"`` -- the final two characters of ``name``,
          lower-cased (the whole string if it is shorter than two characters).
        * ``"vowel_count"`` -- the number of characters in ``name`` that are
          one of a, e, i, o, u, ignoring case.
    """
    return {
        # Lower-case the suffix so it is case-insensitive like the vowel count:
        # "ANNA" and "Anna" must map to the same "na" feature value.
        "last_two_letters": name[-2:].lower(),
        "vowel_count": sum(1 for letter in name if letter.lower() in "aeiou"),
    }

In [121]:
# Convert each (name, gender) pair into the (feature-dict, label) form that
# the classifier consumes, preserving the order of `labelled_names`.
feature_sets = [
    (gender_features(person), label)
    for person, label in labelled_names
]
print(feature_sets[:10])

[({'last_two_letters': 'ra', 'vowel_count': 3}, 'female'), ({'last_two_letters': 'vi', 'vowel_count': 2}, 'female'), ({'last_two_letters': 'ie', 'vowel_count': 4}, 'male'), ({'last_two_letters': 'an', 'vowel_count': 3}, 'female'), ({'last_two_letters': 'ye', 'vowel_count': 3}, 'female'), ({'last_two_letters': 'la', 'vowel_count': 4}, 'female'), ({'last_two_letters': 'la', 'vowel_count': 3}, 'female'), ({'last_two_letters': 'ri', 'vowel_count': 2}, 'female'), ({'last_two_letters': 'cy', 'vowel_count': 2}, 'male'), ({'last_two_letters': 'yd', 'vowel_count': 1}, 'male')]


### Splitting into training, testing and dev datasets

In [122]:
# The first 500 examples are held out as the test set, the next 500 form the
# dev set used for error analysis, and everything after that is training data.
test_set = feature_sets[:500]
dev_set = feature_sets[500:1000]
train_set = feature_sets[1000:]
print(len(train_set), len(dev_set), len(test_set))

6944 500 500


### Training the classifier

In [123]:
# Fit NLTK's Naive Bayes classifier on the training slice only; the dev and
# test slices are left out so they can be used for evaluation.
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [151]:
# Spot-check the classifier on a hand-typed input.
# NOTE(review): this string is two lowercase words, so `gender_features` takes
# "il" (the end of the second word) as the suffix and counts vowels across the
# whole string -- confirm that classifying a full name this way is intended.
classifier.classify(gender_features("ishaan romil"))

'female'

In [125]:
# Fraction of dev-set examples for which the classifier's guess matches the label.
nltk.classify.accuracy(classifier, dev_set)

0.768

### Likelihood Ratios

In [126]:
# Print the feature values with the most lopsided female:male / male:female
# likelihood ratios (called with no argument; the output below lists ten rows).
classifier.show_most_informative_features()

Most Informative Features
        last_two_letters = 'na'           female : male   =    157.2 : 1.0
        last_two_letters = 'la'           female : male   =     70.3 : 1.0
        last_two_letters = 'ia'           female : male   =     37.0 : 1.0
        last_two_letters = 'ra'           female : male   =     35.0 : 1.0
        last_two_letters = 'rt'             male : female =     33.0 : 1.0
        last_two_letters = 'rd'             male : female =     29.2 : 1.0
        last_two_letters = 'us'             male : female =     25.9 : 1.0
        last_two_letters = 'do'             male : female =     23.9 : 1.0
        last_two_letters = 'ta'           female : male   =     23.0 : 1.0
        last_two_letters = 'ld'             male : female =     21.0 : 1.0


### Errors

In [127]:
# Collect every dev-set name the classifier gets wrong as (tag, guess, name).
# Features are recomputed from the name itself instead of zipping `dev_set`
# against a slice of `labelled_names`: the zip is only correct while the two
# lists are still in the same order, and it silently pairs the wrong names
# with the wrong features if `labelled_names` is reshuffled after
# `feature_sets` was built.
errors = []
for name, tag in labelled_names[500:1000]:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

# Show only the mistakes whose ending is one of the six most informative
# suffixes listed above -- the cases where the strongest features misled
# the classifier.
for true_tag, predicted, wrong_name in errors:
    if wrong_name[-2:] in ("na", "la", "ia", "ra", "rt", "rd"):
        print(f'Tag: {true_tag}\tGuess: {predicted}\tName: {wrong_name}')

Tag: female	Guess: male	Name: Hildegaard
Tag: male	Guess: female	Name: Dana
Tag: female	Guess: male	Name: Nert
