AIM:  We're challenged to predict the author of excerpts from horror stories by Edgar Allan Poe, Mary Shelley, and HP Lovecraft


In [1]:
# We get the data:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv('./train.csv') # read the train csv file into the program 



## Preprocessing and Feature Engineering


In [2]:
# default top 5 rows of the dataframe
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
df.dropna(axis=0) #dropping the first column
df.set_index('id', inplace = True) # set the dataframe index to id 
df.head() # default top 5 rows of the dataframe

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
from nltk import word_tokenize, pos_tag # import for pre processing and tagging

def countPartOfSpeech(text, partOfSpeech):
    text = word_tokenize(text) # tokenize
    tags = pos_tag(text)  # tag each token to a part of speech 

    counter = 0 # counter to track the count of words (pos)
    
    if (partOfSpeech == 'adjectives'):
        for word in tags: # loop thru tags
            if(word[1] == 'JJ' or word[1] == 'JJR' or word[1] == 'JJS'): # tags return a tuple, so to get pos, we need to access 1 index
                counter += 1
                    
    if (partOfSpeech == 'nouns'):
        for word in tags:
            if(word[1] == 'NN' or word[1] == 'NNS' or word[1] == 'NNPS' or word[1] == 'NNP'):  # tags return a tuple, so to get pos, we need to access 1 index
                counter += 1          
        
    if (partOfSpeech == 'verbs'):
        for word in tags:
            if(word[1] == 'VB' or word[1] == 'VBD' or word[1] == 'VBG' or word[1] == 'VBN' or word[1] == 'VBP'or word[1] == 'VBZ'): # tags return a tuple, so to get pos, we need to access 1 index
                counter += 1
                
    return counter 

In [5]:
from nltk import word_tokenize, pos_tag, pos_tag_sents
import re
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

#creating a function to encapsulate preprocessing, to mkae it easy to replicate on  submission data
def processing(df):
    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
#     df['tokens'] = df['processed'].apply(word_tokenize).apply(pos_tag)
#     tags = nltk.pos_tag(tokens)
#     df['tags'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
    
    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords]))
    #get the average word length
    df['avg_word_length'] = df['processed'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    #get the average word length
    df['commas'] = df['text'].apply(lambda x: x.count(','))
    #get the count of part of speech : adjectives
    df['adjectives'] = df['processed'].apply(lambda word: countPartOfSpeech(word, 'adjectives'))
    #get the count of part of speech : nouns
    df['nouns'] = df['processed'].apply(lambda word: countPartOfSpeech(word, 'nouns'))
    #get the count of part of speech : verbs
    df['verbs'] = df['processed'].apply(lambda word: countPartOfSpeech(word, 'verbs'))
                                            
    
#     df['nouns'] = df['processed'].apply(lambda x: nltk.pos_tag(x))


    return(df)

df = processing(df)

df.head()

Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4,2,12,6
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0,1,2,2
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4,5,10,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3,6,10,5
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2,1,6,6


### Creating a Pipeline

First step, split your data into training and testing.

In [6]:
from sklearn.model_selection import train_test_split

features= [c for c in df.columns.values if c  not in ['id','text','author']]
numeric_features= [c for c in df.columns.values if c  not in ['id','text','author','processed']]
target = 'author'

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1,1,4,2
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4,7,8,7
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9,3,18,10
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0,1,8,3
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1,0,4,1


In [7]:
df[target]

id
id26305    EAP
id17569    HPL
id11008    EAP
id27763    MWS
id12958    HPL
          ... 
id17718    EAP
id08973    EAP
id05267    EAP
id17513    EAP
id00393    HPL
Name: author, Length: 19579, dtype: object

Now for the tricky parts.

First thing I want to do is define how to process my variables. The standard preprocessing apply the same preprocessing to the whole dataset, but in cases where you have heterogeneous data, this doesn't quite work. So first thing I'm going to do is create a selector transformer that simply returns the one column in the dataset by the key value I pass. 

I was having difficulty getting the selector to play nicely, so I made two different selectors for either text or numeric columns. The return type is different, but other than that they work the same.

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

text.fit_transform(X_train)

<13117x21516 sparse matrix of type '<class 'numpy.float64'>'
	with 148061 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.preprocessing import StandardScaler

length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

length.fit_transform(X_train)

array([[-0.50769254],
       [ 0.88000324],
       [ 2.24907223],
       ...,
       [-0.46112557],
       [-0.14447015],
       [-0.39593181]])

We can see that the transformer pipeline returns a matrix for the column it's called on, so now all that's left to do is join the results from several transformed variables into a single dataset. I'll go ahead and make a pipeline for every variable in the data, then join them all together. 

First, I'll transform all the numeric columns with the standard scaler, but of course you can change the scaler for any column as you desire.

In [11]:
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])
adjectives =  Pipeline([
                ('selector', NumberSelector(key='adjectives')),
                ('standard', StandardScaler()),
            ])
nouns =  Pipeline([
                ('selector', NumberSelector(key='nouns')),
                ('standard', StandardScaler()),
            ])
verbs =  Pipeline([
                ('selector', NumberSelector(key='verbs')),
                ('standard', StandardScaler()),
            ])


 Use a FeatureUnion to join the feature processing pipelines.

The syntax is the same as a regular pipeline, it's just an array of tuple, with the (name, object) format. 

The feature union itself is not a pipeline, it's just a union, so you need to do *one more step* to make it useable: pass it to a pipeline, with the same structure, an array of tuples, with the simple (name, object) format. . As you can see, we get a pipeline-ception going on the more complex you get! 

You can then apply all those transformations at once with a single fit, transform, or fit_transform call.

In [12]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                      ('adjectives', adjectives),
                     ('verbs', verbs),
                     ('nouns', nouns)])

feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<13117x21524 sparse matrix of type '<class 'numpy.float64'>'
	with 252997 stored elements in Compressed Sparse Row format>

To add a model to the mix and generate predictions as well, you can add a model at the end of the pipeline. The syntax is, you guessed it, an array of tuples, merging the transformations with a model. 

We can see the raw accuracy is at 63%. Not bad for a start.


In [13]:
from sklearn.linear_model import LogisticRegression

logModel = LogisticRegression()
pipeline = Pipeline([
    ('features',feats),
    ('classifier', logModel),
])

pipeline.fit(X_train, y_train)


preds = pipeline.predict(X_test)
np.mean(preds == y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7785515320334262

In [14]:
print(pipeline.score(X_train, y_train))

0.9223907905771137


In [15]:
# specify hyperparameter to tune for logistic regression
hyperparameters = [
    { 'classifier__penalty' : ['l2','none'],
     'classifier__solver' : ['newton-cg', 'sag'],
    }
]

In [16]:

pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__text', 'features__length', 'features__words', 'features__words_not_stopword', 'features__avg_word_length', 'features__commas', 'features__adjectives', 'features__verbs', 'features__nouns', 'features__text__memory', 'features__text__steps', 'features__text__verbose', 'features__text__selector', 'features__text__tfidf', 'features__text__selector__key', 'features__text__tfidf__analyzer', 'features__text__tfidf__binary', 'features__text__tfidf__decode_error', 'features__text__tfidf__dtype', 'features__text__tfidf__encoding', 'features__text__tfidf__input', 'features__text__tfidf__lowercase', 'features__text__tfidf__max_df', 'features__text__tfidf__max_features', 'features__text__tfidf__min_df', 'features__text__tfidf__ngram_range', 'features__text__tfidf__norm', 'features__text__tfidf__preprocessor', 'features_

In [19]:
from sklearn.model_selection import GridSearchCV #import for grid search cross validation

In [20]:
clf = GridSearchCV(pipeline, hyperparameters, cv = 2) # specify the cv for 2 fold and hyper parameters

In [21]:
import warnings
warnings.filterwarnings('ignore')

In [22]:
best_clf = clf.fit(X_train, y_train) # fit the model with cv = 2

In [23]:
best_clf 

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text',
                                                                        Pipeline(steps=[('selector',
                                                                                         TextSelector(key='processed')),
                                                                                        ('tfidf',
                                                                                         TfidfVectorizer(stop_words='english'))])),
                                                                       ('length',
                                                                        Pipeline(steps=[('selector',
                                                                                         NumberSelector(key='length')),
                                                                                        ('stan

In [24]:
clf.best_params_  # display the best parameter for cv = 2

{'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}

In [25]:
#refitting on entire training data using best settings
clf.refit

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)

0.7793252862890746

In [26]:
# model performance on test set for the cv = 2
submission = pd.read_csv('./test.csv')

#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.239048,0.064894,0.696058
id24541,0.830118,0.03869,0.131193
id00134,0.236574,0.700423,0.063003
id27757,0.696089,0.197475,0.106436
id04081,0.684022,0.224643,0.091335


In [91]:
# import numpy as np
# coefs=logModel.coef_[0]
# feat_name = logModel.intercept_[0]
# top_three = np.argpartition(coefs, -3)[-3:]
# top_three
# top_three = np.argpartition(coefs, -3)[-3:]
    
    

array([ 8133, 12361, 11427], dtype=int64)

In [27]:
clf = GridSearchCV(pipeline, hyperparameters, cv = 10) # specify the cv for 10 fold and hyper parameters


In [28]:
best_clf = clf.fit(X_train, y_train) # fit the model with cv = 10

In [29]:
print(clf.best_params_) # display the best parameter for cv = 10
 

{'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}


In [30]:
# refitting on entire training data using best settings
clf.refit

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)


0.7793252862890746

In [31]:
# model performance on test set for the cv = 10
submission = pd.read_csv('./test.csv')

#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.239048,0.064894,0.696058
id24541,0.830118,0.03869,0.131193
id00134,0.236574,0.700423,0.063003
id27757,0.696089,0.197475,0.106436
id04081,0.684022,0.224643,0.091335


In [32]:
clf = GridSearchCV(pipeline, hyperparameters, cv = 20, verbose = 20) # specify the cv for 20 fold and hyper parameters

In [33]:
# fit the model with cv = 20
clf.fit(X_train, y_train)

Fitting 20 folds for each of 4 candidates, totalling 80 fits
[CV 1/20; 1/4] START classifier__penalty=l2, classifier__solver=newton-cg.......
[CV 1/20; 1/4] END classifier__penalty=l2, classifier__solver=newton-cg;, score=0.809 total time=   2.2s
[CV 2/20; 1/4] START classifier__penalty=l2, classifier__solver=newton-cg.......
[CV 2/20; 1/4] END classifier__penalty=l2, classifier__solver=newton-cg;, score=0.785 total time=   2.6s
[CV 3/20; 1/4] START classifier__penalty=l2, classifier__solver=newton-cg.......
[CV 3/20; 1/4] END classifier__penalty=l2, classifier__solver=newton-cg;, score=0.790 total time=   2.4s
[CV 4/20; 1/4] START classifier__penalty=l2, classifier__solver=newton-cg.......
[CV 4/20; 1/4] END classifier__penalty=l2, classifier__solver=newton-cg;, score=0.773 total time=   2.6s
[CV 5/20; 1/4] START classifier__penalty=l2, classifier__solver=newton-cg.......
[CV 5/20; 1/4] END classifier__penalty=l2, classifier__solver=newton-cg;, score=0.777 total time=   2.9s
[CV 6/20;

[CV 5/20; 3/4] END classifier__penalty=none, classifier__solver=newton-cg;, score=0.623 total time= 1.2min
[CV 6/20; 3/4] START classifier__penalty=none, classifier__solver=newton-cg.....
[CV 6/20; 3/4] END classifier__penalty=none, classifier__solver=newton-cg;, score=0.639 total time=  50.5s
[CV 7/20; 3/4] START classifier__penalty=none, classifier__solver=newton-cg.....
[CV 7/20; 3/4] END classifier__penalty=none, classifier__solver=newton-cg;, score=0.648 total time=  37.5s
[CV 8/20; 3/4] START classifier__penalty=none, classifier__solver=newton-cg.....
[CV 8/20; 3/4] END classifier__penalty=none, classifier__solver=newton-cg;, score=0.651 total time=  45.1s
[CV 9/20; 3/4] START classifier__penalty=none, classifier__solver=newton-cg.....
[CV 9/20; 3/4] END classifier__penalty=none, classifier__solver=newton-cg;, score=0.559 total time=  47.8s
[CV 10/20; 3/4] START classifier__penalty=none, classifier__solver=newton-cg....
[CV 10/20; 3/4] END classifier__penalty=none, classifier__so

GridSearchCV(cv=20,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text',
                                                                        Pipeline(steps=[('selector',
                                                                                         TextSelector(key='processed')),
                                                                                        ('tfidf',
                                                                                         TfidfVectorizer(stop_words='english'))])),
                                                                       ('length',
                                                                        Pipeline(steps=[('selector',
                                                                                         NumberSelector(key='length')),
                                                                                        ('sta

In [34]:
clf.best_params_ # display the best parameter for cv = 20

{'classifier__penalty': 'l2', 'classifier__solver': 'newton-cg'}

In [35]:
#refitting on entire training data using best settings
clf.refit

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)

0.7793252862890746

In [36]:
# model performance on test set for the cv = 20
submission = pd.read_csv('./test.csv')

#preprocessing
submission = processing(submission)
predictions = clf.predict_proba(submission)

preds = pd.DataFrame(data=predictions, columns = clf.best_estimator_.named_steps['classifier'].classes_)

#generating a submission file
result = pd.concat([submission[['id']], preds], axis=1)
result.set_index('id', inplace = True)
result.head()

Unnamed: 0_level_0,EAP,HPL,MWS
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id02310,0.239048,0.064894,0.696058
id24541,0.830118,0.03869,0.131193
id00134,0.236574,0.700423,0.063003
id27757,0.696089,0.197475,0.106436
id04081,0.684022,0.224643,0.091335


# Feature Analysis


In [32]:
X_train_feature_analysis = X_train[numeric_features]
# mapping the labels to 0, 1, 2 to segregate the classes

y_train_feature_analysis = y_train.map({'EAP':0, 'HPL':1, 'MWS':2})

In [33]:
newModel = LogisticRegression()

In [34]:
newModel.fit(X_train_feature_analysis, y_train_feature_analysis)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [35]:
abs_coef = np.absolute(newModel.coef_[0]) # take absolute value of all  feature weights


In [36]:
d = {'feature_n': pd.Series(X_train_feature_analysis.columns.values), 'weights': abs_coef}
new_df = pd.DataFrame(data=d)

In [37]:
new_df.sort_values(by= 'weights' , axis = 0, ascending=False) # sort the feature weights in descending order

Unnamed: 0,feature_n,weights
4,commas,0.252792
7,verbs,0.182029
2,words_not_stopword,0.082888
3,avg_word_length,0.081358
6,nouns,0.080528
1,words,0.075052
5,adjectives,0.063915
0,length,0.001379


In [38]:
new_df.iloc[5:8] # display the weights learnt for the three features added in HW#9. 


Unnamed: 0,feature_n,weights
5,adjectives,0.063915
6,nouns,0.080528
7,verbs,0.182029


In [39]:
# importance = clf.coef_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
for i in range(0, logModel.coef_.shape[0]):
    top10_indices = np.argsort(logModel.coef_[i])[:10]

print(top10_indices)

[19047  2903 19048 16537 12361  8836 18515 10679 16801  5702]


In [40]:
# get importance
# importance = newModel.coef_[0]
# summarize feature importance
# for i,v in enumerate(importance):
#     print('Feature: %0d, Importance: (%.5f)' % (i,v))
#     list.append(v)
#     print(i)
#     print(v)


In [41]:
from sklearn.metrics import confusion_matrix

In [42]:
cm = confusion_matrix(y_test, preds)

In [43]:
cm #confusion matrix for 3 classes

array([[2162,  209,  216],
       [ 354, 1376,  122],
       [ 407,  123, 1493]], dtype=int64)

In [44]:
# get data-points for which the model prediction is wrong
X_test["actual"] = y_test
X_test["predicted"] = preds

incorrect = X_test[X_test["actual"] != X_test["predicted"]]

In [45]:
# get 10 data-points for which the model prediction is wrong
incorrect[:10]

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,adjectives,nouns,verbs,actual,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
id16303,he had seen so many customs and witnessed so g...,399,69,33,6.909091,1,8,17,11,MWS,HPL
id14743,she listened to me as she had done to the narr...,279,56,18,6.722222,5,2,10,11,MWS,EAP
id07281,his chief amusements were gunning and fishing ...,214,35,17,7.470588,2,1,11,6,EAP,MWS
id23995,i will content myself with saying in addition ...,166,30,12,7.083333,5,3,9,5,EAP,MWS
id18564,johns i bade the knocker enter but was answere...,70,14,7,5.714286,2,0,6,3,HPL,EAP
id13058,at fifteen or even at twenty one for i had now...,133,27,13,5.461538,1,3,5,4,EAP,MWS
id15700,though not as yet licenced physicians we now h...,145,25,12,7.0,2,1,4,5,HPL,EAP
id18277,the tide had turned and was coming in now and ...,89,19,7,5.428571,1,0,3,6,HPL,MWS
id13971,burkes reflections on the french revolution,43,6,4,8.25,0,1,3,0,MWS,EAP
id27104,my best girl he had said relieves me from thes...,58,11,5,6.0,2,1,3,2,MWS,EAP


In [46]:
# display all wrong predictions
incorrect

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,adjectives,nouns,verbs,actual,predicted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
id16303,he had seen so many customs and witnessed so g...,399,69,33,6.909091,1,8,17,11,MWS,HPL
id14743,she listened to me as she had done to the narr...,279,56,18,6.722222,5,2,10,11,MWS,EAP
id07281,his chief amusements were gunning and fishing ...,214,35,17,7.470588,2,1,11,6,EAP,MWS
id23995,i will content myself with saying in addition ...,166,30,12,7.083333,5,3,9,5,EAP,MWS
id18564,johns i bade the knocker enter but was answere...,70,14,7,5.714286,2,0,6,3,HPL,EAP
...,...,...,...,...,...,...,...,...,...,...,...
id04979,in these various brochures the aim is always s...,126,21,10,7.300000,0,3,5,4,EAP,MWS
id16577,at the same time that he taught me by their me...,209,41,17,6.176471,2,5,9,6,MWS,EAP
id15067,alas i even now look back with disgust at my a...,131,23,12,6.583333,2,2,7,2,MWS,EAP
id02639,the assassins must have escaped through the ot...,56,9,4,6.500000,0,1,2,2,EAP,HPL
