In [1]:
from nltk.corpus import names
import nltk

In [2]:
import random

In [3]:
names = ([(name,'male') for name in names.words('male.txt')] + [(name,'female') for name in names.words('female.txt')])

In [4]:
random.shuffle(names)

In [5]:
def gender_features(word):
    return {'last_letter': word[-1]}
gender_features('shrek')

{'last_letter': 'k'}

In [6]:
featuresets = [(gender_features(n),g) for (n,g) in names]


In [7]:
train_set , test_set = featuresets[500:],featuresets[:500]

In [8]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
classifier.classify(gender_features('Neo'))


'male'

In [10]:
classifier.classify(gender_features('Trinity'))

'female'

In [11]:
print (nltk.classify.accuracy(classifier,test_set))

0.786


In [12]:
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     30.8 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =      9.9 : 1.0
             last_letter = 'm'              male : female =      8.2 : 1.0
             last_letter = 'o'              male : female =      8.1 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'w'              male : female =      6.2 : 1.0
             last_letter = 'g'              male : female =      4.9 : 1.0
             last_letter = 'b'              male : female =      4.6 : 1.0
             last_letter = 's'              male : female =      4.2 : 1.0

#### Document classification 

In [13]:
from nltk.corpus import movie_reviews

In [14]:
documents = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

In [15]:
random.shuffle(documents)

In [16]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [17]:
word_features = all_words.most_common(2000)

In [18]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contain(%s)' % str(word)] = (word in document_words)
    return features 

In [19]:
print (document_features(movie_reviews.words('pos/cv957_8737.txt')))

{"contain((',', 77717))": False, "contain(('the', 76529))": False, "contain(('.', 65876))": False, "contain(('a', 38106))": False, "contain(('and', 35576))": False, "contain(('of', 34123))": False, "contain(('to', 31937))": False, 'contain(("\'", 30585))': False, "contain(('is', 25195))": False, "contain(('in', 21822))": False, "contain(('s', 18513))": False, 'contain((\'"\', 17612))': False, "contain(('it', 16107))": False, "contain(('that', 15924))": False, "contain(('-', 15595))": False, "contain((')', 11781))": False, "contain(('(', 11664))": False, "contain(('as', 11378))": False, "contain(('with', 10792))": False, "contain(('for', 9961))": False, "contain(('his', 9587))": False, "contain(('this', 9578))": False, "contain(('film', 9517))": False, "contain(('i', 8889))": False, "contain(('he', 8864))": False, "contain(('but', 8634))": False, "contain(('on', 7385))": False, "contain(('are', 6949))": False, "contain(('t', 6410))": False, "contain(('by', 6261))": False, "contain(('be'

In [20]:
featuresets = [(document_features(d),c) for (d,c) in  documents] 

In [21]:
train_set ,test_set = featuresets[100:], featuresets[:100]

In [22]:
classifierdoc = nltk.NaiveBayesClassifier.train(train_set)

In [23]:
print (nltk.classify.accuracy(classifierdoc,test_set))

0.48


### part of speech tagging 

In [29]:
import nltk
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] +=1
    suffix_fdist[word[-2:]] +=1
    suffix_fdist[word[-3:]] +=1
common_suffixes = []
for suffix in suffix_fdist.most_common(100):
    common_suffixes.append(str(suffix.__getitem__(0)))
print (common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [31]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes :
        features['endswith(%s)' % suffix ] = word.lower().endswith(suffix)
    return features 

In [32]:
tagged_words = brown.tagged_words(categories = 'news')

In [33]:
featuresets  = [(pos_features(n),g) for (n,g) in tagged_words]

In [34]:
size = int(len(featuresets) * 0.1)

In [35]:
train_set , test_set = featuresets[size:] , featuresets[:size]

In [36]:
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [37]:
nltk.classify.accuracy(classifier,test_set)

0.6270512182993535

In [38]:
classifier.classify(pos_features('cats'))

'NNS'

### Sentence ssegmentation 

In [39]:
sents = nltk.corpus.treebank_raw.sents()

In [40]:
tokens = []

In [41]:
boundaries = set()

In [42]:
offset = 0

In [44]:
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [45]:
def punct_features(tokens,i):
    return {'next-word-capitalized' : tokens[i+1][0].isupper(),'prevword' : tokens[i-1].lower() , 'punct': tokens[i] , 'prev-word-is-one-char' : len(tokens[i-1]) == 1}

In [46]:
featuresets = [(punct_features(tokens,i),(i in boundaries)) for i in range(1,len(tokens)-1) if tokens[i] in '.?!']

In [47]:
size = int(len(featuresets)*0.1)
train_set ,test_set = featuresets[size:],featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier,test_set)

0.936026936026936