## Naive Bayes Classifiers
A powerful and intutitive technique. File this one away, it'll often teach you a lot about a problem, even if it doesn't "win" the accuracy game. First some examples from NLTK.

In [1]:
import nltk

from nltk.corpus import names
import random

# Create some labeled observations
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

# shuffle so that we can have a training and test set
random.shuffle(labeled_names)

In [2]:
# For the purposes of this toy example, we just use the last letters as our only feature
def gender_features(word):
    return {'last_letter': word[-1]}

For this next line, read a bit about what's going on with this classifier [here](http://www.nltk.org/book/ch06.html). 

In [3]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
# Test vs train
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [5]:
our_class = "ethan rachael bryce guedem marty chris ryan omid christabelle hayley mary kelly mike ashley liz hannah jason".split()

for student in our_class :
    print(student + " classified as " + classifier.classify(gender_features(student)))

# Errors on Bryce, Guedem, Marty, Kelly, Mike, Liz
print(1-6/len(our_class)) #65% accuracy

ethan classified as male
rachael classified as male
bryce classified as female
guedem classified as male
marty classified as female
chris classified as male
ryan classified as male
omid classified as male
christabelle classified as female
hayley classified as female
mary classified as female
kelly classified as female
mike classified as female
ashley classified as female
liz classified as male
hannah classified as female
jason classified as male
0.6470588235294117


In [11]:
from collections import Counter

num_males = 0

for item in featuresets :
    dd, gender = item
    
    if gender == "male" :
        num_males += 1
    

    

num_males

2943

In [12]:
Counter([gender for dd, gender in featuresets])

Counter({'female': 5001, 'male': 2943})

In [6]:
# let's just look at all the features. Usually you'd only show a few
classifier.show_most_informative_features(26)

Most Informative Features
             last_letter = 'a'            female : male   =     34.7 : 1.0
             last_letter = 'k'              male : female =     30.5 : 1.0
             last_letter = 'f'              male : female =     17.1 : 1.0
             last_letter = 'p'              male : female =     11.8 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0
             last_letter = 'd'              male : female =      9.7 : 1.0
             last_letter = 'm'              male : female =      9.3 : 1.0
             last_letter = 'o'              male : female =      8.8 : 1.0
             last_letter = 'r'              male : female =      7.0 : 1.0
             last_letter = 'w'              male : female =      6.5 : 1.0
             last_letter = 'g'              male : female =      5.4 : 1.0
             last_letter = 'z'              male : female =      4.3 : 1.0
             last_letter = 't'              male : female =      4.1 : 1.0

In [24]:
random.shuffle(labeled_names) # Use this to shuffle in place to build training and test set

In [25]:
test_size = 500
devtest_size = 1000

train_names = labeled_names[(test_size + devtest_size):]
devtest_names = labeled_names[test_size:(test_size + devtest_size)]
test_names = labeled_names[:test_size]

In [26]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

Read the results of the cells below, and form some hypotheses of additional features to add. 

In [27]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Adel                          
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Alison                        
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Allyn                         
correct=female   guess=male     name=Anabel                        
correct=female   guess=male     name=Ardis                         
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Averyl                        
correct=female   guess=male     name=Ayn                           
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Beryl                         
correct=female   guess=male     name=Bliss      

In [13]:
import re
hyphen_space = re.compile(r'[ -]') # we haven't covered regular expressions yet....

# build your own function. Here's an example to get you started
def gender_features_2(word):
    ''' This function should take in a word and return a dictionary
        with the name of the feature as the key and the value 
        as the feature value. '''
    ll = word[-1]
    penultimate = word[-2]
    last_3 = word[-3:]
    last_4 = word[-4:]
    
    if hyphen_space.search(word) :
        double = True
    else :
        double = False
    
    ret_dict = {'last_letter':ll,
                'penultimate_y':(penultimate=="y"),
                'last_3_ann_een':(last_3 in {"ann","een"}),
                'last_4_lynn' : (last_4 == "lynn"),
                'double_name' : double}
    
    return (ret_dict)

In [14]:
gender_features_2("jason")

{'double_name': False,
 'last_3_ann_een': False,
 'last_4_lynn': False,
 'last_letter': 'n',
 'penultimate_y': False}

In [21]:
random.sample(our_class,1)

['hannah']

In [14]:
train_set = [(gender_features_2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features_2(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.765


In [15]:
# Once you're done tweaking your code, run this one. 
print(nltk.classify.accuracy(classifier, test_set))

0.744


In [16]:
classifier.show_most_informative_features(30)

Most Informative Features
             last_letter = 'k'              male : female =     40.4 : 1.0
             last_letter = 'a'            female : male   =     37.4 : 1.0
             last_letter = 'f'              male : female =     28.8 : 1.0
             last_letter = 'p'              male : female =     18.7 : 1.0
             last_letter = 'w'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =      9.6 : 1.0
             last_letter = 'v'              male : female =      9.2 : 1.0
             last_letter = 'm'              male : female =      7.8 : 1.0
             last_letter = 'o'              male : female =      7.7 : 1.0
             last_letter = 'r'              male : female =      6.9 : 1.0
             last_letter = 'z'              male : female =      6.4 : 1.0
          last_3_ann_een = True           female : male   =      6.3 : 1.0
             last_letter = 'u'              male : female =      4.7 : 1.0

In [17]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Aeriell                       
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Allys                         
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Ayn                           
correct=female   guess=male     name=Babs                          
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Brook                         
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Candis                        
correct=female   guess=male     name=Carlyn                        
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Caroleen                      
correct=female   guess=male     name=Caryl      