In [8]:
import nltk
import random

In [9]:
from nltk.corpus import names
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     /Users/danielyakubov/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [69]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = []

def name_feature_extraction(name):
    return {'last_letter': name[-1], 'last_two_letters': name[-2:]}

# build a list of names labeled by gender
for m_name, f_name in zip(male_names, female_names):
    labeled_names.append((m_name, 'M'))
    labeled_names.append((f_name, 'F'))

random.shuffle(labeled_names)

#adding features
labeled_names = [(name_feature_extraction(name), label) for name, label in labeled_names]

# dividing into train, test
tenth = len(labeled_names)//10
train = labeled_names[:tenth*9]
test = labeled_names[tenth*9:]

# training naive bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train)

In [70]:
# some small tests to see effects
for name in ['Neo', 'Trinity', 'Morpheus', 'Daniel', 'Isabela']:
    name_features = name_feature_extraction(name)
    print(name, classifier.classify(name_features))

Neo M
Trinity F
Morpheus M
Daniel M
Isabela F


In [71]:
# actual accuracy
print(nltk.classify.accuracy(classifier, test))

0.7828282828282829


In [72]:
# most informative
classifier.show_most_informative_features()

Most Informative Features
        last_two_letters = 'na'                F : M      =     89.2 : 1.0
        last_two_letters = 'la'                F : M      =     59.8 : 1.0
        last_two_letters = 'us'                M : F      =     36.9 : 1.0
        last_two_letters = 'ia'                F : M      =     35.4 : 1.0
             last_letter = 'a'                 F : M      =     33.1 : 1.0
        last_two_letters = 'ta'                F : M      =     32.8 : 1.0
        last_two_letters = 'rt'                M : F      =     32.3 : 1.0
        last_two_letters = 'ra'                F : M      =     25.9 : 1.0
        last_two_letters = 'sa'                F : M      =     25.7 : 1.0
             last_letter = 'k'                 M : F      =     18.7 : 1.0


In [44]:
print(len(labeled_names))

5886


In [73]:
# different task
# POS tagging

In [92]:
from nltk.corpus import brown

In [114]:
def pos_features(prev_word, word, suc_word):
    return {'suf1': word[-1],
           'suf2': word[-2:],
           'pre1': word[0],
           'pre2': word[:2],
           'previous_word': prev_word,
           'succeeding_word': suc_word}

tagged_data = [(w.lower(), tag) for w, tag in brown.tagged_words(categories='news')]
# first item built in
tagged_features = [(pos_features('NA', tagged_data[0][0], tagged_data[1][0]), tagged_data[0][1])]

for i, (word, tag) in enumerate(list(tagged_data)[1:-1]):
    prev_word, _ = tagged_data[i-1]
    suc_word, _ = tagged_data[i+1]
    tagged_features.append((pos_features(prev_word, word, suc_word), tag))
# last item
tagged_features.append((pos_features(tagged_data[-2][0], tagged_data[-1][0], 'NA'), tagged_data[-1][1]))

random.shuffle(tagged_features)

tenth = len(tagged_features)//10
test_set = tagged_features[:tenth]
dev_set = tagged_features[tenth:tenth*2]
train_set = tagged_features[tenth*2:]

# training the classifier
pos_classifier = nltk.NaiveBayesClassifier.train(train_set)

# sanity check
QA = ['Becky', 'said', 'the', 'car', 'is', 'red']
QA = [w.lower() for w in QA]

QA_features = [pos_features('NA', QA[0], QA[1])]

for i, word in enumerate(QA[1:-1]):
    QA_features.append(pos_features(QA[i-1], word, QA[i+1]))
QA_features.append(pos_features(QA[-2], QA[-1], 'NA'))

for item in QA_features:
    print(pos_classifier.classify(item))

BE
VBD
AT
NN
BEZ
VBN


In [116]:
# most informative features
pos_classifier.show_most_informative_features()

Most Informative Features
                    suf1 = '.'                 . : NN     =   6304.0 : 1.0
                    suf2 = 'he'               AT : NN     =   4946.1 : 1.0
                    pre2 = 'an'               CC : IN     =   4215.1 : 1.0
                    pre1 = 'c'               MD* : IN     =   2765.7 : 1.0
                    suf2 = 'ho'              WPS : NN     =   2688.2 : 1.0
                    pre1 = "'"                '' : NNS    =   2615.6 : 1.0
                    suf2 = 'to'               TO : JJ     =   2007.1 : 1.0
                    pre1 = 'l'               RBR : IN     =   1958.3 : 1.0
                    suf1 = 'h'               ABX : NNS    =   1951.8 : 1.0
                    pre2 = 'it'              PPS : NN     =   1779.8 : 1.0


In [117]:
# accuracy
print(nltk.classify.accuracy(pos_classifier, dev_set))
print(nltk.classify.accuracy(pos_classifier, test_set))

0.8189955246146196
0.8203878667329687
