In [1]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features

In [2]:
# Define the features of the naive bayes classifier.
def gender_feature(word):
    return{'last_letter': word[-1],
            'first_letter': word[0],
            'length': len(word),
            'second_Last': word[-2]}

gender_feature('Shrek')

{'last_letter': 'k', 'first_letter': 'S', 'length': 5, 'second_Last': 'e'}

In [3]:
def gender_feature2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [4]:
gender_feature2('Beltran')

{'first_letter': 'b',
 'last_letter': 'n',
 'count(a)': 1,
 'has(a)': True,
 'count(b)': 1,
 'has(b)': True,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 0,
 'has(h)': False,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 1,
 'has(l)': True,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 1,
 'has(t)': True,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [5]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + 
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [6]:
feature_sets = [(gender_feature(n), gender) for (n, gender) in labeled_names]
train_set, test_set = feature_sets[500:], feature_sets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
classifier.classify(gender_feature('Beltran'))

'male'

In [8]:
print(nltk.classify.accuracy(classifier, test_set))

0.778


In [9]:
classifier.show_most_informative_features(100)

Most Informative Features
             last_letter = 'k'              male : female =     46.4 : 1.0
             last_letter = 'a'            female : male   =     33.4 : 1.0
             last_letter = 'v'              male : female =     17.3 : 1.0
             last_letter = 'f'              male : female =     17.0 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'm'              male : female =      9.2 : 1.0
             last_letter = 'o'              male : female =      8.9 : 1.0
             second_Last = 'o'              male : female =      7.3 : 1.0
             last_letter = 'r'              male : female =      7.3 : 1.0
             second_Last = 'u'              male : female =      6.4 : 1.0
             last_letter = 'w'              male : female =      5.3 : 1.0
            first_letter = 'W'              male : female =      4.8 : 1.0

In [10]:
train_set = apply_features(gender_feature, labeled_names[500:])
test_set = apply_features(gender_feature, labeled_names[:500])

In [11]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [12]:
# Naive Bayes classifier algorithm works by looking at a set of features,
# assuming that they are independent of each other, and calculating the probability
# that a thing belongs to a certain category based on those features using Bayes' theorem.
train_set = [(gender_feature(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_feature(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_feature(n), gender) for (n, gender) in test_names]
classifier2 = nltk.NaiveBayesClassifier.train(train_set)

In [13]:
print(nltk.classify.accuracy(classifier,devtest_set))

0.78


In [14]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier2.classify(gender_feature(name))
    if guess != tag:
        errors.append((tag, guess, name))
print(len(errors))

222


In [15]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Allis                         
correct=female   guess=male     name=Barry                         
correct=female   guess=male     name=Bel                           
correct=female   guess=male     name=Blair                         
correct=female   guess=male     name=Buffy                         
correct=female   guess=male     name=Cathleen                      
correct=female   guess=male     name=Charo                         
correct=female   guess=male     name=Christian                     
correct=female   guess=male     name=Chrystal                      
correct=female   guess=male     name=Cleo                          
correct=female   guess=male     name=Cloris                        
correct=female   guess=male     name=Clovis                        
correct=female   guess=male     name=Cristal                       
correct=female   guess=male     name=Daloris                       
correct=female   guess=male     name=Darb       