In [1]:
import nltk
from nltk.corpus import gutenberg

# List the file ids of the texts exposed by NLTK's Gutenberg corpus reader.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [2]:
# Austen: two novels supply the training sentences, a third is held out for testing.
author1_train = (
    gutenberg.sents('austen-emma.txt')
    + gutenberg.sents('austen-persuasion.txt')
)
author1_test = gutenberg.sents('austen-sense.txt')

# Number of sentences in the training and test portions, one per line.
print(len(author1_train), len(author1_test), sep='\n')

11464
4999


In [3]:
# Shakespeare: two plays supply the training sentences, a third is held out for testing.
author2_train = (
    gutenberg.sents('shakespeare-caesar.txt')
    + gutenberg.sents('shakespeare-hamlet.txt')
)
author2_test = gutenberg.sents('shakespeare-macbeth.txt')

# Number of sentences in the training and test portions, one per line.
print(len(author2_train), len(author2_test), sep='\n')

5269
1907


In [4]:
def statistics(gutenberg_data):
    """Print rounded per-work text statistics for Gutenberg corpus files.

    For every file id in ``gutenberg_data`` one line is printed containing,
    in order: characters per token, tokens per sentence, tokens per distinct
    lower-cased token, and the file id itself. Each ratio is passed through
    ``round()`` before printing.

    Args:
        gutenberg_data: iterable of file ids understood by the module-level
            ``gutenberg`` corpus reader.

    Returns:
        None. Results are written to standard output.
    """
    for work in gutenberg_data:
        # Fetch the token sequence once; it feeds both the token count and
        # the vocabulary size, so there is no need to read it twice.
        tokens = gutenberg.words(work)

        num_chars = len(gutenberg.raw(work))
        num_words = len(tokens)
        num_sents = len(gutenberg.sents(work))
        # Lower-casing merges case variants such as "The" and "the".
        num_vocab = len({token.lower() for token in tokens})

        print(round(num_chars / num_words),
              round(num_words / num_sents),
              round(num_words / num_vocab),
              work)

# The six works under study: three per author.
gutenberg_data = [
    'austen-emma.txt',
    'austen-persuasion.txt',
    'austen-sense.txt',
    'shakespeare-caesar.txt',
    'shakespeare-hamlet.txt',
    'shakespeare-macbeth.txt',
]
statistics(gutenberg_data)

5 25 26 austen-emma.txt
5 26 17 austen-persuasion.txt
5 28 22 austen-sense.txt
4 12 9 shakespeare-caesar.txt
4 12 8 shakespeare-hamlet.txt
4 12 7 shakespeare-macbeth.txt


In [5]:
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

# Label every training sentence with its author and pool both authors together.
all_sents = (
    [(sent, 'austen') for sent in author1_train]
    + [(sent, 'shakespeare') for sent in author2_train]
)

print(f"Dataset size = {len(all_sents)} sentences")

# The author labels are what the splitter stratifies on.
values = [label for _, label in all_sents]
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the splitter yields a single (train, pretest) pair of index arrays.
train_index, pretest_index = next(split.split(all_sents, values))
strat_train_set = [all_sents[position] for position in train_index]
strat_pretest_set = [all_sents[position] for position in pretest_index]

Dataset size = 16733 sentences


In [6]:
def cat_prportions(data, cat):
    """Return the fraction of items in ``data`` that are labelled ``cat``.

    Args:
        data: sized collection of indexable items whose element at index 1
            is the category label (for example ``(sentence, label)`` pairs).
        cat: the label to look for.

    Returns:
        float: number of matching items divided by ``len(data)``, or ``0.0``
        when ``data`` is empty.
    """
    total = len(data)
    if total == 0:
        # Nothing to count; return 0.0 rather than raising ZeroDivisionError.
        return 0.0
    matches = sum(1 for item in data if item[1] == cat)
    return matches / total

# Compare the class balance of the full dataset with that of both stratified splits.
categories = ['austen', 'shakespeare']
rows = [['Category', 'Overall', 'Stratified train', 'Stratified prestest']]

for cat in categories:
    proportions = [
        cat_prportions(subset, cat)
        for subset in (all_sents, strat_train_set, strat_pretest_set)
    ]
    rows.append([cat] + [f'{proportion:0.6f}' for proportion in proportions])

# Each column is as wide as its longest cell so the printed table lines up.
column_widths = [max(map(len, column)) for column in zip(*rows)]
for row in rows:
    print(''.join(f' {cell:{width}} ' for cell, width in zip(row, column_widths)))

 Category     Overall   Stratified train  Stratified prestest 
 austen       0.685113  0.685119          0.685091            
 shakespeare  0.314887  0.314881          0.314909            


In [7]:
# Held-out evaluation data: labelled sentences from the works kept out of the training pool.
test_set = (
    [(sent, 'austen') for sent in author1_test]
    + [(sent, 'shakespeare') for sent in author2_test]
)

In [8]:
def get_featues(text):
    """Turn a tokenised text into a bag-of-words presence dictionary.

    Args:
        text: iterable of hashable tokens.

    Returns:
        dict mapping every distinct token to ``True``, keyed in order of
        first appearance.
    """
    return {token: True for token in text}

# Convert every labelled sentence into a (feature dict, label) pair.
train_features = [(get_featues(tokens), author) for tokens, author in strat_train_set]
pretest_features = [(get_featues(tokens), author) for tokens, author in strat_pretest_set]

# Sanity check: training-set size followed by two sample feature dictionaries.
print(len(train_features), train_features[0][0], train_features[100][0], sep='\n')

13386
{'Why': True, 'aske': True, 'you': True, 'this': True, '?': True}
{'Ham': True, '.': True}


In [9]:
from nltk import NaiveBayesClassifier, classify

# Fit a Naive Bayes model on the bag-of-words features of the stratified training split.
classifier = NaiveBayesClassifier.train(train_features)

# Score on the data the model was fitted on and on the stratified pretest split.
nb_train_accuracy = classify.accuracy(classifier, train_features)
nb_pretest_accuracy = classify.accuracy(classifier, pretest_features)
print(f"Accuracy on the training set = {nb_train_accuracy}")
print(f"Accuracy on the pretest set = {nb_pretest_accuracy}")

classifier.show_most_informative_features(50)

Accuracy on the training set = 0.9783355744808009
Accuracy on the pretest set = 0.9611592470869436
Most Informative Features
                    King = True           shakes : austen =    202.3 : 1.0
                    thou = True           shakes : austen =    192.2 : 1.0
                    been = True           austen : shakes =    154.5 : 1.0
                    only = True           austen : shakes =    125.2 : 1.0
                     own = True           austen : shakes =    105.6 : 1.0
                       d = True           shakes : austen =     65.2 : 1.0
                    doth = True           shakes : austen =     60.2 : 1.0
                   quite = True           austen : shakes =     52.1 : 1.0
                     Tis = True           shakes : austen =     50.0 : 1.0
                    Lord = True           shakes : austen =     48.9 : 1.0
                   think = True           austen : shakes =     41.5 : 1.0
                     She = True           austen :

In [10]:
# Featurise the held-out test sentences with the same extractor used for the training split.
test_feature = [(get_featues(sent), label) for sent, label in test_set]

In [11]:
# Accuracy of the current `classifier` on the held-out test features.
classify.accuracy(classifier, test_feature)

0.8964668404286128

In [12]:
# For each word (feature), count the number of training sentences in which it occurs

from collections import Counter

# Flat accumulator: extract_words() appends each sentence's distinct tokens to this list.
words = []

def extract_words(text, words):
    """Append the distinct tokens of ``text`` to ``words`` and return that list.

    Args:
        text: iterable of hashable tokens (one sentence).
        words: list used as an accumulator; it is extended in place.

    Returns:
        The same ``words`` object that was passed in.
    """
    distinct_tokens = set(text)
    words.extend(distinct_tokens)
    return words

# Pool the distinct tokens of every training sentence. Each sentence contributes
# a given token at most once, so the counts below are per-sentence frequencies.
for sents, label in strat_train_set:
    words = extract_words(sents, words)

counts = Counter(words)

# Print a bounded summary rather than the whole Counter, which holds one entry
# per distinct token seen in the training split.
print(f"{len(counts)} distinct tokens; 20 most frequent:")
print(counts.most_common(20))



In [14]:
from nltk import DecisionTreeClassifier

# Keep only mid-frequency words: those whose count exceeds `minimum_count`
# but stays below 20% of the number of training sentences.
minimum_count = 200
maximum_count = 0.2 * len(strat_train_set)

selected_words = [
    word
    for word, count in counts.items()
    if minimum_count < count < maximum_count
]
print(len(selected_words))

def get_features_dt(text, selected_words):
    """Build presence features restricted to an allowed vocabulary.

    Args:
        text: iterable of hashable tokens.
        selected_words: container supporting ``in``; only tokens found in it
            become features.

    Returns:
        dict mapping each retained token to ``True``, keyed in order of
        first appearance.
    """
    return {token: True for token in text if token in selected_words}

# Re-featurise all three splits using only the selected vocabulary.
# NOTE: this rebinds `train_features` and `pretest_features`, which earlier held the
# full bag-of-words features that the Naive Bayes classifier was trained and scored on.
train_features = [(get_features_dt(sent, selected_words), label) for sent, label in strat_train_set]
pretest_features = [(get_features_dt(sent, selected_words), label) for sent, label in strat_pretest_set]
test_features = [(get_features_dt(sent, selected_words), label) for sent, label in test_set]

# NOTE: `classifier` is rebound as well; after this line it refers to the decision tree.
classifier = DecisionTreeClassifier.train(train_features)

165


In [15]:
from nltk import classify

# Score the current classifier on each split, then report one line per split.
dt_train_accuracy = classify.accuracy(classifier, train_features)
dt_pretest_accuracy = classify.accuracy(classifier, pretest_features)
dt_test_accuracy = classify.accuracy(classifier, test_features)

print(f'Accuracy on the trainig features: {dt_train_accuracy}')
print(f'Accuracy on the pretest features: {dt_pretest_accuracy}')
print(f'Accuracy on the test features: {dt_test_accuracy}')

Accuracy on the trainig features: 0.8096518750933811
Accuracy on the pretest features: 0.7914550343591276
Accuracy on the test features: 0.8088618592528236
