## Naive Bayes Classifiers
A powerful and intuitive technique. File this one away: it'll often teach you a lot about a problem, even if it doesn't "win" the accuracy game. First, some examples from NLTK.

In [None]:
import nltk

from nltk.corpus import names
import random

# Build (name, label) observations from the NLTK names corpus:
# every entry of male.txt first, followed by every entry of female.txt.
labeled_names = [
    (name, label)
    for label, fileid in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(fileid)
]

# Shuffle in place so that contiguous slices taken later can serve as
# training and test sets.
random.shuffle(labeled_names)

In [None]:
def gender_features(word):
    """Return the feature dict for a name.

    For the purposes of this toy example, the last letter of ``word`` is the
    only feature, stored under the key ``'last_letter'``.
    """
    final_char = word[-1]
    return {'last_letter': final_char}

In [None]:
# Pair each name's feature dict with its gender label.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# The first 500 examples are held out for testing; the rest train the model.
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Accuracy of the trained classifier on test_set (the examples kept out of train_set)
print(nltk.classify.accuracy(classifier, test_set))

In [None]:
# First names of the people in our class, as a list of lowercase strings.
our_class = "david lane tina jessica kayla anne bailey chris raelin levi dustin jim brandon jess".split()

# Print the label the classifier assigns to each student.
for student in our_class:
    print("".join([student, " classified as ", classifier.classify(gender_features(student))]))

# Three errors on this simple model: Lane, Raelin, Jess
# (the 3 below is that hand-counted number of misses, not a computed value)
print(1 - 3 / len(our_class))

In [None]:
# Let's just look at all the features. Usually you'd only show a few.
# (26 presumably corresponds to one row per possible last letter.)
classifier.show_most_informative_features(26)

# Pretty wrong on "s" for our Jess's. "n" and "e" are coin flips, almost...

---

## Examining Errors

It's useful to look at some of the values you miss on. The code below will print out the misses so you can build some new features.

In [None]:
# Number of names reserved for the final test set and for the dev-test set.
test_size = 500
devtest_size = 1000

# labeled_names was shuffled when it was built, so contiguous slices are used:
#   [0, test_size)                         -> test_names
#   [test_size, test_size + devtest_size)  -> devtest_names (for error analysis)
#   [test_size + devtest_size, end)        -> train_names
test_names = labeled_names[:test_size]
devtest_names = labeled_names[test_size:test_size + devtest_size]
train_names = labeled_names[test_size + devtest_size:]

In [None]:
# Retrain on train_names only. The classifier from the earlier cell was fit on
# featuresets[500:], which contains every dev-test name, so its misses on
# devtest_names would not be genuine held-out errors.
classifier = nltk.NaiveBayesClassifier.train(
    [(gender_features(n), gender) for (n, gender) in train_names])

# Collect one (correct label, predicted label, name) tuple per dev-test miss.
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

Read the results of the cells below, and form some hypotheses of additional features to add. 

In [None]:
# Print every miss in aligned columns, ordered by the natural tuple sort of
# (correct label, guess, name).
row_template = 'correct={:<8} guess={:<8s} name={:<30}'
for tag, guess, name in sorted(errors):
    print(row_template.format(tag, guess, name))

In [None]:
def gender_features_2(word):
    """Return a richer feature dict for a name.

    The name is lowercased first so that capitalised corpus names and
    lowercase hand-typed names produce the same feature values.

    Features:
        last_letter:  final character of the name.
        first_letter: first character of the name.
        last_two:     final two characters (the whole name if it has one letter).
        y_s:          number of times the letter "y" occurs in the name.

    Raises:
        IndexError: if ``word`` is empty.
    """
    word = word.lower()
    return {'last_letter': word[-1],
            'first_letter': word[0],    # index 0, not 1, is the first letter
            'last_two': word[-2:],      # a slice, so both trailing letters are kept
            'y_s': word.count('y')}

In [None]:
# NOTE: labeled_names is deliberately not reshuffled here. train_names,
# devtest_names and test_names are slices (copies) taken earlier, so a shuffle
# would not change them; it would only reorder labeled_names, so that
# re-running the split cell would move earlier test names into training.

# Train and evaluate with the SAME feature extractor: a classifier fit on
# gender_features dicts cannot make use of the extra gender_features_2 features.
train_set = [(gender_features_2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features_2(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

In [None]:
# Score the held-out test names with the same feature extractor used for the
# dev-test evaluation (gender_features_2); the earlier test_set was built with
# gender_features.
print(nltk.classify.accuracy(
    classifier,
    [(gender_features_2(n), gender) for (n, gender) in test_names]))

In [None]:
# First names of the people in our class, as a list of lowercase strings.
our_class = "david lane tina jessica kayla anne bailey chris raelin levi dustin jim brandon jess".split()

# Print the label assigned to each student when the richer features are used.
for student in our_class:
    print("".join([student, " classified as ", classifier.classify(gender_features_2(student))]))

# Even worse on our class. Now misses Levi too.

In [None]:
# Show the 30 most informative features of the current classifier.
classifier.show_most_informative_features(30)