In [1]:
from nltk.corpus import brown
import numpy as np
import pandas as pd

In [2]:
from collections import Counter

In [3]:
from sklearn.feature_extraction import DictVectorizer
# Shared vectorizer, used further down to turn lists of feature dicts into sparse matrices
v = DictVectorizer(sparse=True)
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn.linear_model import LogisticRegression

## The Brown Corpus is a good dataset to begin training with
The corpus is split into genres or categories shown below. 

In [4]:
# Names of the categories (genres) that the Brown Corpus is divided into
categories = brown.categories()

In [5]:
# Number of word tokens in each category of the Brown Corpus
{genre: len(brown.words(categories=genre)) for genre in categories}

{'adventure': 69342,
 'belles_lettres': 173096,
 'editorial': 61604,
 'fiction': 68488,
 'government': 70117,
 'hobbies': 82345,
 'humor': 21695,
 'learned': 181888,
 'lore': 110299,
 'mystery': 57169,
 'news': 100554,
 'religion': 39399,
 'reviews': 40704,
 'romance': 70022,
 'science_fiction': 14470}

## Build a classifier to tell us whether a text is News or Lore

In [6]:
# The two Brown categories the classifier in this notebook tells apart
classes = ['news', 'lore']

In [7]:
# News category: its paragraphs (the units we classify) and its flat token stream (used for the vocabulary)
news_paras = brown.paras(categories='news')
news_tokens = brown.words(categories='news')

In [8]:
# Lore category: its paragraphs (the units we classify) and its flat token stream (used for the vocabulary)
lore_paras = brown.paras(categories='lore')
lore_tokens = brown.words(categories='lore')

Before we start extracting features from our data, let's split the data into training, test, and dev sets so that we can evaluate our neural network once we have finished. 

Let's do a 75 / 15 / 10 split between training, test, and dev sets.

In [9]:
def split(dataset, train=0.80, test=0.20, dev=None):
    """Cut ``dataset`` into consecutive train / test (and optionally dev) slices.

    The order of ``dataset`` is preserved; nothing is shuffled.

    Parameters
    ----------
    dataset : sliceable sequence
        The examples to divide.
    train : float
        Fraction of ``dataset`` placed in the leading training slice.
    test : float
        Fraction placed in the test slice. Only consulted for a three-way
        split; in a two-way split the test slice is everything after train.
    dev : float or None
        When truthy, a three-way split is produced and the dev slice is
        whatever remains after train and test. Its numeric value is not
        otherwise used.

    Returns
    -------
    tuple
        ``(train, test)`` when ``dev`` is falsy, else ``(train, test, dev)``.
    """
    total = len(dataset)
    train_stop = round(total * train)
    if not dev:
        # two-way split: the remainder after the training slice is test data
        return dataset[:train_stop], dataset[train_stop:]
    test_stop = train_stop + round(total * test)
    # three-way split: dev takes everything left over after train and test
    return dataset[:train_stop], dataset[train_stop:test_stop], dataset[test_stop:]

In [10]:
# Same 75 / 15 / 10 proportions for both categories
news_train, news_test, news_dev = split(
    news_paras, train=0.75, test=0.15, dev=0.10
)

lore_train, lore_test, lore_dev = split(
    lore_paras, train=0.75, test=0.15, dev=0.10
)

Next we need to start extracting features. We already have our tokens, so let's start by creating a list of features that say whether or not we saw a given word in the text.

While we do that, think about what other types of features you think would be useful in differentiating between news and lore texts.

In [11]:
# number of paragraphs in the news and lore training sets, printed in that order
print(len(news_train))
print(len(lore_train))

1676
902


Our first step in creating a list of features that indicate if we saw a given word is to find out what the vocabulary is. The vocabulary is the set of all word types we encountered.

In [12]:
# total number of news paragraphs, before splitting
len(news_paras)

2234

In [13]:
# total number of lore paragraphs, before splitting
len(lore_paras)

1203

In [14]:
# The vocabulary is the set of distinct word types over all news and lore tokens.
# Each feature dict built later starts with a zero for every vocab entry and then
# marks the entries that were observed.
# NOTE(review): the token lists cover the whole categories, so this presumably also
# includes words that occur only in the test/dev paragraphs — confirm that is intended.
vocab = set(news_tokens+lore_tokens)
len(vocab)

22522

In [15]:
def get_feature_dict(paragraph, vocab):
    """Return a binary bag-of-words feature dict for one paragraph.

    Parameters
    ----------
    paragraph : iterable of iterables of str
        One example: a sequence of sentences, each a sequence of tokens.
    vocab : iterable of str
        Every word type that must appear as a key in the result.

    Returns
    -------
    dict
        Maps each entry of ``vocab`` to the int 1 if that token occurs
        anywhere in ``paragraph`` and to the int 0 otherwise. Tokens that
        are not in ``vocab`` are ignored.
    """
    # Start from an explicit 0 for every vocab entry so that all examples
    # share exactly the same key set. Plain ints are used throughout, so no
    # throwaway numpy array is allocated per paragraph.
    feature_dict = dict.fromkeys(vocab, 0)
    for sentence in paragraph:
        for token in sentence:
            # skip out-of-vocabulary tokens rather than adding new keys
            if token in feature_dict:
                feature_dict[token] = 1  # presence flag, not a count
    return feature_dict

In [16]:
# get_feature_dict(lore_paras[0], vocab)

In [32]:
# training data: one feature dict per paragraph, kept separate per category
D_train_news = [get_feature_dict(paragraph, vocab) for paragraph in news_train]
D_train_lore = [get_feature_dict(paragraph, vocab) for paragraph in lore_train]

# test data
D_test_news = [get_feature_dict(paragraph, vocab) for paragraph in news_test]
D_test_lore = [get_feature_dict(paragraph, vocab) for paragraph in lore_test]

# dev data
D_dev_news = [get_feature_dict(paragraph, vocab) for paragraph in news_dev]
D_dev_lore = [get_feature_dict(paragraph, vocab) for paragraph in lore_dev]

What is D?

D is a scikit-learn convention for a list of dictionaries, one dictionary of features per example. These dictionaries can be passed to scikit-learn's DictVectorizer to give us the feature matrices (sparse matrices here, because we created the vectorizer with sparse=True) that we will train and test our neural network on later.

Since we have two classes, news and lore, we build two lists of dictionaries for each data split: one list per class, with one dictionary per paragraph.

Once we have added all of our features and their counts for each set of examples we will fit them into our feature vector using: X = v.fit_transform(D)

In [18]:
# D_train_news[0]

So far we have, for each class, one dictionary per paragraph that shows which word types appear in that paragraph. We have repeated this for each data split (train, test, dev). This set of features will probably be useful for our model, but a more useful set of features might be the bigrams or trigrams that appear. Let's add those to the dictionaries, D.

In [33]:
# Learn the feature-name -> column mapping from the training examples only
# (news rows first, then lore rows).
X_train = v.fit_transform(D_train_news+D_train_lore)
# Held-out splits reuse the mapping fitted on the training data instead of
# re-fitting the vectorizer, so their columns are guaranteed to line up with
# X_train and nothing is learned from test/dev data.
X_test = v.transform(D_test_news+D_test_lore)
X_dev = v.transform(D_dev_news+D_dev_lore)

In [20]:
# show the shape and sparsity summary of the training matrix
X_train

<2578x22522 sparse matrix of type '<class 'numpy.float64'>'
	with 106941 stored elements in Compressed Sparse Row format>

In [21]:
# show the shape and sparsity summary of the test matrix
X_test

<515x22522 sparse matrix of type '<class 'numpy.float64'>'
	with 25841 stored elements in Compressed Sparse Row format>

In [22]:
def get_y(D_class1, D_class2):
    """Build the gold-label list for two groups of examples.

    Every item of ``D_class1`` is labelled 0 and every item of ``D_class2``
    is labelled 1. The labels for the first group come first, matching a
    feature matrix built from ``D_class1 + D_class2``.
    """
    zeros_for_first = len(D_class1) * [0]
    ones_for_second = len(D_class2) * [1]
    return zeros_for_first + ones_for_second

In [34]:
# Gold labels per split: 0 for each news example, 1 for each lore example,
# in the same news-then-lore order used to build the X matrices
y_train = get_y(D_train_news, D_train_lore)
y_test = get_y(D_test_news, D_test_lore)
y_dev = get_y(D_dev_news, D_dev_lore)

In [24]:
# Quick first model: a perceptron fitted on the training matrix and labels
classifier = Perceptron().fit(X_train, y_train)

In [25]:
# predicted labels for the test examples (0 = news, 1 = lore)
classifier.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1,

In [26]:
def fit_and_evaluate_one(classifier, X_train, y_train, X_test, y_test):
    """Fit one classifier and report its accuracy on the training and test data.

    Parameters
    ----------
    classifier : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training data.
    X_train, X_test : feature matrices accepted by ``classifier``
    y_train, y_test : sequences of gold labels
        One label per row of the corresponding ``X``. Lists and arrays are
        both accepted.

    Returns
    -------
    dict
        ``{'accur_train': float, 'accur_test': float}``: the fraction of
        correctly predicted labels on each set.
    """
    classifier.fit(X_train, y_train)

    def _accuracy(X, y_true):
        # Coerce both sides to arrays so the comparison is element-wise even
        # when predict() or the labels are plain Python lists (comparing two
        # lists with == yields a single bool, which cannot be summed).
        y_pred = np.asarray(classifier.predict(X))
        return float(np.mean(y_pred == np.asarray(y_true)))

    return {
        'accur_train': _accuracy(X_train, y_train),
        'accur_test': _accuracy(X_test, y_test),
    }

In [27]:
# train/test accuracy of a freshly fitted perceptron
fit_and_evaluate_one(Perceptron(), X_train, y_train, X_test, y_test)

{'accur_test': 0.67766990291262141, 'accur_train': 0.99650892164468585}

In [28]:
# Classifiers to be fitted and evaluated
classifiers = [MultinomialNB(), 
               LogisticRegression(), 
               # svm.SVC(kernel='rbf'), 
               Perceptron(), 
               # Seeded: an unseeded decision tree gives a different test
               # accuracy each time the comparison is re-run.
               tree.DecisionTreeClassifier(random_state=0)]

In [29]:
def fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test):
    """Fit and score every classifier in ``classifiers``.

    Each classifier is passed to ``fit_and_evaluate_one`` with the given
    train/test data, and a progress line is printed before each fit.

    Returns
    -------
    dict
        Maps each classifier's name (the part of ``str(classifier)`` before
        the first ``'('``) to the result of ``fit_and_evaluate_one``. Two
        classifiers with the same name share a key, so the later one wins.
    """
    scores_by_model = {}
    for model in classifiers:
        # e.g. "Perceptron(alpha=...)" -> "Perceptron"
        label, _, _ = str(model).partition('(')
        print('Training ' + label + '...')
        scores_by_model[label] = fit_and_evaluate_one(
            model, X_train, y_train, X_test, y_test
        )
    return scores_by_model

In [30]:
# fit every classifier in the list and show train/test accuracy per classifier
fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


{'DecisionTreeClassifier': {'accur_test': 0.68737864077669908,
  'accur_train': 1.0},
 'LogisticRegression': {'accur_test': 0.72233009708737861,
  'accur_train': 0.99844840961986037},
 'MultinomialNB': {'accur_test': 0.72427184466019412,
  'accur_train': 0.98564778898370831},
 'Perceptron': {'accur_test': 0.67766990291262141,
  'accur_train': 0.99650892164468585}}

In [31]:
# Run the comparison and tabulate it: one row per classifier, best test accuracy first
result_dict = fit_and_evaluate_all(classifiers, X_train, y_train, X_test, y_test)
result_df = (
    pd.DataFrame(result_dict)  # columns = classifiers, rows = metrics
    .T                         # flip so that each classifier is a row
    .sort_values(by='accur_test', ascending=False)
)
result_df

Training MultinomialNB...
Training LogisticRegression...
Training Perceptron...
Training DecisionTreeClassifier...


Unnamed: 0,accur_test,accur_train
MultinomialNB,0.724272,0.985648
LogisticRegression,0.72233,0.998448
DecisionTreeClassifier,0.704854,1.0
Perceptron,0.67767,0.996509


# What other applications could this type of model work for?

# How could we approach a task like classifying fake news?

# Now that we have run a few quick classifiers on this dataset, how do you think the neural network will compare?