In [11]:
import nltk

In [12]:
def gender_features(word):
    """Return the single-feature dict used by the first gender classifier.

    Args:
        word: The name to featurize.

    Returns:
        A dict with one key, 'last_letter', holding the final character of
        ``word`` (an empty string when ``word`` is empty).
    """
    # Slicing instead of indexing keeps an empty name from raising IndexError;
    # for any non-empty name the result is the same as word[-1].
    return {'last_letter': word[-1:]}


# Quick sanity check; as the last expression of the cell its value is displayed.
gender_features('Shrek')

{'last_letter': 'k'}

In [13]:
# Fetch the 'names' corpus that the following cells read via nltk.corpus.names.
nltk.download('names')

[nltk_data] Downloading package names to C:\Users\DANG
[nltk_data]     CUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [14]:
from nltk.corpus import names

# One flat list of (name, label) pairs built from the two corpus files.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)

import random

# Shuffle with a dedicated, fixed-seed generator so that the positional
# train/dev/test slices taken later (and therefore every reported accuracy)
# are identical after a kernel restart, without touching the global RNG state.
random.Random(42).shuffle(labeled_names)

In [15]:
# Turn every labelled name into a (feature dict, label) pair.
featuresets = [(gender_features(name), label) for name, label in labeled_names]

# The first 500 pairs are held out for testing; everything after them trains.
test_set = featuresets[:500]
train_set = featuresets[500:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [16]:
# Spot-check the trained classifier on a single name.
classifier.classify(gender_features('Neo')) 

'male'

In [17]:
# Second spot-check, using a name that ends in a different letter.
classifier.classify(gender_features('Trinity')) 

'female'

In [18]:
# Accuracy of the current classifier on the 500 held-out feature sets.
print(nltk.classify.accuracy(classifier, test_set))

0.778


In [19]:
# Print the 5 feature values the classifier reports as most informative.
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'k'              male : female =     31.6 : 1.0
             last_letter = 'p'              male : female =     18.6 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'm'              male : female =     10.4 : 1.0


---

In [20]:
from nltk.classify import apply_features 

In [21]:
# Same 500-name split as before, but built through apply_features, which the
# NLTK docs describe as producing the feature sets on demand instead of
# materialising the whole list up front.
train_set, test_set = (
    apply_features(gender_features, labeled_names[500:]),
    apply_features(gender_features, labeled_names[:500]),
)

In [22]:
def gender_features2(name):
    """Build a richer, case-insensitive feature dict for a name.

    Args:
        name: The name to featurize.

    Returns:
        A dict containing:
          * 'first_letter' / 'last_letter': the lower-cased first and last
            characters (empty strings when ``name`` is empty);
          * 'count(x)': how many times letter x occurs, for each of a-z;
          * 'has(x)': whether letter x occurs at all, for each of a-z.
    """
    # Lower-case once; the original recomputed this on every loop iteration.
    lowered = name.lower()
    features = {}
    # Slices (rather than indexing) so an empty name yields '' instead of
    # raising IndexError; non-empty names give the same value as before.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

In [23]:
# Rebuild the feature sets with the richer gender_features2 extractor, keep the
# same 500-item hold-out, retrain Naive Bayes and report held-out accuracy.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.792


In [24]:
# Three-way positional split of the shuffled names:
#   [0, 500)     -> final test set
#   [500, 1500)  -> dev-test set used for error analysis
#   [1500, end)  -> training set
test_names = labeled_names[:500]
devtest_names = labeled_names[500:1500]
train_names = labeled_names[1500:]

In [25]:
# Featurize each split with gender_features, then fit on the training portion.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
test_set = [(gender_features(name), label) for name, label in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Score on the dev-test split only; test_set is not evaluated in this cell.
print(nltk.classify.accuracy(classifier, devtest_set))

0.755


In [26]:
# Gather every dev-test name the current classifier labels incorrectly,
# stored as (correct tag, predicted tag, name).
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue
    errors.append((tag, guess, name))

In [27]:
# One left-aligned row per misclassified name; sorting the tuples orders the
# rows by correct tag, then guess, then name.
for tag, guess, name in sorted(errors):
    row = 'correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name)
    print(row)

correct=female   guess=male     name=Abagael                       
correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Adriaens                      
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Anabel                        
correct=female   guess=male     name=Anais                         
correct=female   guess=male     name=Angil                         
correct=female   guess=male     name=April                         
correct=female   guess=male     name=Arabel                        
correct=female   guess=male     name=Avril                         
correct=female   guess=male     name=Avrit                         
correct=female   guess=male     name=Beatriz                       
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Bell       

In [28]:
def gender_features(word):
    """Describe a name by its one- and two-character suffixes.

    Note: this reuses the name of the last-letter extractor defined earlier in
    this notebook, so once this cell has run ``gender_features`` refers to
    this suffix-based version.

    Args:
        word: The name to featurize.

    Returns:
        A dict with 'suffix1' (the last character) and 'suffix2' (the last two
        characters); slicing means short or empty names yield shorter strings
        rather than raising.
    """
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [29]:
# Re-featurize the training and dev-test names with whichever gender_features
# is currently defined, retrain, and report dev-test accuracy.
train_set = [(gender_features(name), label) for name, label in train_names]
devtest_set = [(gender_features(name), label) for name, label in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.777


---

In [45]:
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import KFold

In [36]:
# Wrap a default-configured scikit-learn LogisticRegression in NLTK's
# SklearnClassifier so it can be trained on (feature dict, label) pairs below.
classifier = SklearnClassifier(LogisticRegression())

In [32]:
# NOTE(review): train_set and test_set are reassigned from gender_features2
# feature sets in the following cell before anything reads these values, so
# this split has no effect on the logistic-regression result.
train_set, test_set = (
    apply_features(gender_features, labeled_names[500:]),
    apply_features(gender_features, labeled_names[:500]),
)

In [37]:
# Featurize all names with gender_features2 and hold out the first 500 pairs.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]

test_set = featuresets[:500]
train_set = featuresets[500:]

# Fit the wrapped model; the name is rebound to whatever train() returns.
classifier = classifier.train(train_set)

In [38]:
# Held-out accuracy of the classifier trained in the previous cell.
print(nltk.classify.accuracy(classifier, test_set)) 

0.816


In [47]:
# 10-fold splitter with shuffling and a fixed random_state for repeatable folds.
# NOTE(review): not consumed by any cell visible in this section.
cross_validation = KFold(n_splits=10, shuffle=True, random_state=42)

---

# Lab 3