In [92]:
from nltk.corpus import brown
import numpy as np
import pandas as pd

In [3]:
from collections import Counter

In [85]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=True)
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn.linear_model import LogisticRegression

## The Brown Corpus is a good dataset to begin training with
The corpus is split into genres or categories shown below. 

In [5]:
categories = brown.categories()

In [6]:
# Count of words per section in the Brown Corpus
dict(zip(categories, [len(brown.words(categories=cat)) for cat in categories]))

{'adventure': 69342,
 'belles_lettres': 173096,
 'editorial': 61604,
 'fiction': 68488,
 'government': 70117,
 'hobbies': 82345,
 'humor': 21695,
 'learned': 181888,
 'lore': 110299,
 'mystery': 57169,
 'news': 100554,
 'religion': 39399,
 'reviews': 40704,
 'romance': 70022,
 'science_fiction': 14470}

## Build a classifier to tell us whether a text is Adventure or Romance

In [8]:
classes = ['adventure', 'romance']

In [30]:
adventure_paras = brown.paras(categories='adventure')
adventure_tokens = brown.words(categories='adventure')

In [31]:
romance_paras = brown.paras(categories='romance')
romance_tokens = brown.words(categories='romance')

Before we start extracting features from our data, let's split the data into training, test, and dev sets so that we can evaluate our neural network once we have finished. 

Let's do a 75 / 15 / 10 split between training, test, and dev sets.

In [20]:
def split(dataset, train=0.80, test=0.20, dev=None):
    train_end = round(len(dataset) * train)
    if dev:
        test_end = train_end + round(len(dataset) * test)
        return dataset[:train_end], dataset[train_end:test_end], dataset[test_end:]
    else:
        return dataset[:train_end], dataset[train_end:]

In [21]:
adventure_train, adventure_test, adventure_dev = split(adventure_paras,
                                                       train=0.75,
                                                       test=0.15,
                                                       dev=0.10)

romance_train, romance_test, romance_dev = split(romance_paras,
                                                       train=0.75,
                                                       test=0.15,
                                                       dev=0.10)

Next we need to start extracting features. We already have our tokens, let's start by creating a list of features that say whether or not we saw a given word in the text.

While we do that, think about what other types of features you think would be useful in differentiating between adventure and romance texts.

In [58]:
# count of paragraphs in the adventure training set
print(len(adventure_train))
print(len(romance_train))

1040
940


Our first step in creating a list of features that indicate if we saw a given word is to find out what the vocabulary is. The vocabulary is the set of all word types we encountered.

In [27]:
len(romance_paras)

1253

In [28]:
len(adventure_paras)

1387

In [32]:
# First we need to add in a zero count for all vocab then add counts for each observed
vocab = set(adventure_tokens+romance_tokens)
len(vocab)

13469

In [42]:
def get_feature_dict(paragraph, vocab):
    # take one training example and return a sparse feature vector
    # init feature vec with zeros
    feature_vec = np.zeros(len(vocab), dtype=int)
    feature_dict = dict(zip(vocab, feature_vec))
    for sentence in paragraph: 
        for token in sentence:
            # ensure that feature is already in vocab
            if token in feature_dict.keys():
                feature_dict[token] = 1  # one hot encoding
    return feature_dict

In [44]:
# get_feature_dict(adventure_paras[0], vocab)

In [53]:
# training data
D_train_adv = [get_feature_dict(paragraph, vocab) for paragraph in adventure_train]
D_train_rom = [get_feature_dict(paragraph, vocab) for paragraph in romance_train]

# test data
D_test_adv = [get_feature_dict(paragraph, vocab) for paragraph in adventure_test]
D_test_rom = [get_feature_dict(paragraph, vocab) for paragraph in romance_test]

What is D?

D is a convention of Scikit Learn that represents an array of dictionaries. These dictionaries can be plugged into SKlearn's DictVectorizer to give us our numpy arrays that we will train and test our neural network on later.

Since we have two classes, romance and adventure, we have two dictionaries in our collection D.

Once we have added all of our features and their counts for each set of examples we will fit them into our feature vector using: X = v.fit_transform(D)

In [122]:
# D_train_adv[0]

So far we have a dictionary for each class that shows which word types appear in that text. We have repeated this for each test set (train, test, dev). This set of features will probably be useful for our model, but a more useful set of features might be the bigrams or trigrams that appear. Let's add those to the dictionaries, D.

In [59]:
X_train = v.fit_transform(D_train_adv+D_train_rom)
# X_train_rom = v.fit_transform(D_train_rom)

X_test = v.fit_transform(D_test_adv+D_test_rom)
# X_test_rom = v.fit_transform(D_test_rom)

In [60]:
X_train

<1980x13469 sparse matrix of type '<class 'numpy.float64'>'
	with 78282 stored elements in Compressed Sparse Row format>

In [61]:
X_test

<396x13469 sparse matrix of type '<class 'numpy.float64'>'
	with 14347 stored elements in Compressed Sparse Row format>

In [63]:
len(D_train_adv)

1040

In [71]:
def get_y(D_class1, D_class2):
    # return list of gold labels
    y1 = [0] * len(D_class1)
    y2 = [1] * len(D_class2)
    y = y1+y2
    return y

In [77]:
y_train = get_y(D_train_adv, D_train_rom)
y_test = get_y(D_test_adv, D_test_rom)

In [79]:
classifier = Perceptron().fit(X_train, y_train)

In [81]:
classifier.predict(X_test)

array([0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1,

In [82]:
def fit_and_evaluate_one(classifier, X_train, y_train, X_test, y_test): 
    # fitting
    classifier.fit(X_train, y_train)
    
    # accuracy
    results = {}
    y_hat_train = classifier.predict(X_train)
    accur_train = sum(y_hat_train == y_train) / len(y_train)  # train accuracy
    y_hat_test = classifier.predict(X_test)
    accur_test = sum(y_hat_test == y_test) / len(y_test)  # test accuracy
    results['accur_train'] = accur_train
    results['accur_test'] = accur_test

    return results

In [83]:
fit_and_evaluate_one(Perceptron(), X_train, y_train, X_test, y_test)

{'accur_test': 0.55303030303030298, 'accur_train': 0.97272727272727277}

In [86]:
# Classifiers to be fitted and evaluated
classifiers = [MultinomialNB(), 
               LogisticRegression(), 
               # svm.SVC(kernel='rbf'), 
               Perceptron(), 
               tree.DecisionTreeClassifier()]

In [88]:
def fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test):
    results = {}
    for c in classifiers:
        cname = str(c).split('(')[0]
        print('Training '+cname+ '...')
        results[cname] = fit_and_evaluate_one(c, X_train, y_train, X_test, y_test)
    return results

In [90]:
fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


{'DecisionTreeClassifier': {'accur_test': 0.54292929292929293,
  'accur_train': 1.0},
 'LogisticRegression': {'accur_test': 0.56060606060606055,
  'accur_train': 0.99494949494949492},
 'MultinomialNB': {'accur_test': 0.61111111111111116,
  'accur_train': 0.95303030303030301},
 'Perceptron': {'accur_test': 0.55303030303030298,
  'accur_train': 0.97272727272727277}}

In [93]:
result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
result_df = pd.DataFrame.from_dict(result_dict).transpose().sort_values('accur_test', ascending=False)
result_df

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
MultinomialNB,0.611111,0.95303
LogisticRegression,0.560606,0.994949
Perceptron,0.55303,0.972727
DecisionTreeClassifier,0.527778,1.0


What does this sparse parameter do?

# What other applications could this type of model work for? Classifying fake news?