Import modules

In [46]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

Load the train / validation feature vectors

In [3]:
# Read the pre-scaled train and validation feature tables; the first CSV
# column is used as the row index of each frame.
train, val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/train_features_scaled_standard.csv',
        '../data/validation_features_scaled_standard.csv',
    )
)

In [4]:
# One column is set aside (presumably the `different_author` label - confirm)
# and the remaining columns are halved to get the per-document feature count.
number_of_features = int((len(train.columns) - 1) / 2)

# Sanity check against the expected size of the feature set.
assert number_of_features == 938

In [5]:
# Column labels for the two feature halves: A_0 .. A_{n-1} and B_0 .. B_{n-1}.
a_columns = list(map('A_{}'.format, range(number_of_features)))
b_columns = list(map('B_{}'.format, range(number_of_features)))

In [10]:
# Split every dataset into its A half and its B half, then relabel the columns
# of each half as 0 .. number_of_features - 1. With identical labels on both
# halves, arithmetic between an A frame and a B frame pairs up feature i of A
# with feature i of B.

# --- training set ---
train_A = train[a_columns]
train_A.columns = range(number_of_features)

train_B = train[b_columns]
train_B.columns = range(number_of_features)

# --- validation set ---
val_A = val[a_columns]
val_A.columns = range(number_of_features)

val_B = val[b_columns]
val_B.columns = range(number_of_features)

Create train / validation datasets - each observation is the element-wise absolute difference between the A and B feature vectors

In [19]:
# Inputs are the element-wise |A - B| of the two feature halves; the target is
# the `different_author` column of the corresponding source frame.

# training split
train_X = (train_A - train_B).abs()
train_y = train['different_author']

# validation split
val_X = (val_A - val_B).abs()
val_y = val['different_author']

Fit a randomized-tree ensemble classifier (ExtraTreesClassifier, a Random Forest variant)

No pruning:

In [59]:
# Sweep the ensemble size for unconstrained (fully grown) extra-trees and
# report train, validation and out-of-bag accuracy for each setting.
# random_state is pinned so that re-running the notebook reproduces the same
# scores; without it every run draws different trees and the numbers drift.
for n in [100, 150, 200, 250, 275]:
    model = ExtraTreesClassifier(
        n_estimators=n,
        n_jobs=-1,          # use all available cores
        oob_score=True,     # OOB estimate is only available with bootstrap=True
        bootstrap=True,
        random_state=0,
    )
    model.fit(train_X, train_y)
    print('\nn_estimators: ', n)
    print('train score: ', model.score(train_X, train_y))
    print('val score: ', model.score(val_X, val_y))
    print('oob score: ', model.oob_score_)


n_estimators:  100
train score:  1.0
val score:  0.679245283019
oob score:  0.719099221131

n_estimators:  150
train score:  1.0
val score:  0.711492281304
oob score:  0.739417541483

n_estimators:  200
train score:  1.0
val score:  0.714579759863
oob score:  0.750253979004

n_estimators:  250
train score:  1.0
val score:  0.720068610635
oob score:  0.764476803251

n_estimators:  275
train score:  1.0
val score:  0.710120068611
oob score:  0.758211987809


In [60]:
# Same ensemble-size sweep, but with every tree capped at depth 3 to limit
# overfitting. random_state is pinned so that re-running the notebook
# reproduces the same scores.
# NOTE: `model` is left bound to the last fitted estimator (n_estimators=275)
# once the loop finishes.
for n in [100, 150, 200, 250, 275]:
    model = ExtraTreesClassifier(
        n_estimators=n,
        n_jobs=-1,          # use all available cores
        oob_score=True,     # OOB estimate is only available with bootstrap=True
        bootstrap=True,
        max_depth=3,
        random_state=0,
    )
    model.fit(train_X, train_y)
    print('\nn_estimators: ', n)
    print('train score: ', model.score(train_X, train_y))
    print('val score: ', model.score(val_X, val_y))
    print('oob score: ', model.oob_score_)


n_estimators:  100
train score:  0.714866237724
val score:  0.684391080617
oob score:  0.668134100914

n_estimators:  150
train score:  0.726718591263
val score:  0.677186963979
oob score:  0.668642058923

n_estimators:  200
train score:  0.729427700643
val score:  0.68576329331
oob score:  0.663223840163

n_estimators:  250
train score:  0.732814087369
val score:  0.682675814751
oob score:  0.676430748391

n_estimators:  275
train score:  0.732136810024
val score:  0.682675814751
oob score:  0.683711479851


In [30]:
# Table of feature names; the first CSV column becomes the row index.
features = pd.read_csv(
    filepath_or_buffer='../data/features/feature_names.csv',
    index_col=0,
)

In [33]:
# Attach the fitted estimator's per-feature importances as a new column.
# NOTE(review): `model` is whatever estimator was fitted last in the kernel
# (on a top-to-bottom run, the final iteration of the preceding sweep loop);
# this assumes the rows of `features` are in the same order as the model's
# input columns - confirm.
features = features.assign(importance=model.feature_importances_)

In [35]:
# Rank features from most to least important and display only the top ten,
# rather than rendering the whole table.
features.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
203,Spaces,0.026237
123,Mean paragraph Length,0.012651
112,Lexical diversity,0.005444
121,Mean Sentence Length,0.004961
1,Ari Index,0.004742
46,Dale Chall Known Fraction,0.004519
56,Gunning Fog Index,0.004378
6,Colons,0.004320
7,Commas,0.004218
53,Flesch Kincaid Grade,0.004066
