Import modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score


Import train / val as vectors

In [2]:
train = pd.read_csv('../data/train_features_scaled_standard.csv', index_col=0)
val = pd.read_csv('../data/validation_features_scaled_standard.csv', index_col=0)

In [3]:
number_of_features = int((train.shape[1] - 1) / 2)
assert number_of_features == 938

Import features index to name mapping

In [57]:
features = pd.read_csv('../data/features/feature_names.csv', index_col=0)

assert features.shape[0] == 938

In [4]:
a_columns = ['A_{}'.format(i) for i in range(number_of_features)]
b_columns = ['B_{}'.format(i) for i in range(number_of_features)]

In [5]:
train_A = train[a_columns]
train_B = train[b_columns]

val_A = val[a_columns]
val_B = val[b_columns]

# rename columns so that both datasets have the same columns
train_A.columns = range(number_of_features)
train_B.columns = range(number_of_features)

val_A.columns = range(number_of_features)
val_B.columns = range(number_of_features)

Create train datasets - each observation is the absolute difference in A and B

In [6]:
train_X = abs(train_A - train_B)
val_X = abs(val_A - val_B)

train_y = train['different_author']
val_y = val['different_author']

Fit a Random Forest classifier

No prunning:

In [8]:
# for n in [50, 100, 150, 200, 250, 275]:
n=150
model = RandomForestClassifier(n_estimators=n, n_jobs=-1, oob_score=True, bootstrap=True)
model.fit(train_X, train_y)
print('\nn_estimators: ', n)
print('train score: ', model.score(train_X, train_y))
print('val score: ', model.score(val_X, val_y))
print('oob score: ', model.oob_score_)


n_estimators:  150
train score:  1.0
val score:  0.720411663808
oob score:  0.738232306129


In [16]:
features['model'] = model.feature_importances_
features.sort_values('model', ascending=False)

Unnamed: 0,feature,model
123,Mean paragraph Length,0.012921
112,Lexical diversity,0.005720
46,Dale Chall Known Fraction,0.005501
47,Dale Chall Score,0.004421
7,Commas,0.004387
204,Special characters,0.004142
6,Colons,0.003914
113,Lix Index,0.003844
53,Flesch Kincaid Grade,0.003830
50,Even more special characters,0.003652


With prunning based on:
    - min leaf samples
    - max depth

In [17]:
for depth in [2, 3, 4, 5, 6, 7]:
    model = RandomForestClassifier(n_estimators=200, n_jobs=-1, oob_score=True, bootstrap=True, max_depth=depth)
    model.fit(train_X, train_y)
    print('\ndepth: ', depth)
    print('train score: ', model.score(train_X, train_y))
    print('val score: ', model.score(val_X, val_y))
    print('oob score: ', model.oob_score_)


depth:  2
train score:  0.722146969184
val score:  0.682332761578
oob score:  0.678123941754

depth:  3
train score:  0.757534710464
val score:  0.68576329331
oob score:  0.692516085337

depth:  4
train score:  0.793430409753
val score:  0.700857632933
oob score:  0.708770741619

depth:  5
train score:  0.843379613952
val score:  0.703945111492
oob score:  0.718083305113


KeyboardInterrupt: 

In [18]:
for samples in [100, 125, 150, 170, 175, 200, 225, 250, 275, 300]:
    model = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True,
                                   max_depth=4, min_samples_leaf=samples)
    model.fit(train_X, train_y)
    print('\nmin_leaf_samples: ', samples)
    print('train score: ', model.score(train_X, train_y))
    print('val score: ', model.score(val_X, val_y))
    print('oob score: ', model.oob_score_)


min_leaf_samples:  100
train score:  0.771249576702
val score:  0.696397941681
oob score:  0.692346766001

min_leaf_samples:  125
train score:  0.768201828649
val score:  0.695025728988
oob score:  0.695225194717

min_leaf_samples:  150
train score:  0.760751777853
val score:  0.70154373928
oob score:  0.691500169319

min_leaf_samples:  170
train score:  0.751608533695
val score:  0.690222984563
oob score:  0.685235353877

min_leaf_samples:  175
train score:  0.761598374534
val score:  0.70051457976
oob score:  0.688621740603

min_leaf_samples:  200
train score:  0.752455130376
val score:  0.693996569468
oob score:  0.687436505249

min_leaf_samples:  225
train score:  0.747544869624
val score:  0.689536878216
oob score:  0.680663731798

min_leaf_samples:  250
train score:  0.741110734846
val score:  0.68782161235
oob score:  0.684727395869

min_leaf_samples:  275
train score:  0.744497121571
val score:  0.695711835334
oob score:  0.678631899763

min_leaf_samples:  300
train score:  0.

In [50]:
model_prunned = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True,
                               max_depth=4, min_samples_leaf=275)
model_prunned.fit(train_X, train_y)
print('\nmax depth: ', 4)
print('\nmin_leaf_samples: ', 150)
print('train score: ', model_prunned.score(train_X, train_y))
print('val score: ', model_prunned.score(val_X, val_y))
print('oob score: ', model_prunned.oob_score_)


max depth:  4

min_leaf_samples:  150
train score:  0.74365052489
val score:  0.695025728988
oob score:  0.686420589231


In [58]:
features.head()

Unnamed: 0,feature
0,A
1,Ari Index
2,B
3,C
4,CC_DT_NN


In [59]:
features['importance'] = model_prunned.feature_importances_

In [60]:
# features = features[['feature', 'importance']].sort_values('importance', ascending=False)
features.to_csv('../data/features/prunne')
features.head()

Unnamed: 0,feature,importance_prunned
123,Mean paragraph Length,0.051432
112,Lexical diversity,0.036576
787,some,0.027229
46,Dale Chall Known Fraction,0.022091
675,other,0.021045


In [26]:
features['rank'] = features['model'].rank(ascending=False)
features['rank_prunned'] = features['model_prunned'].rank(ascending=False)

In [48]:
features['model'].sum()

0.99999999999999989

In [28]:
features.sort_values('model', ascending=False).to_csv('bla.csv')

In [29]:
features.sort_values('model_prunned', ascending=False).to_csv('bla_prunned.csv')

In [39]:
sorted_ = model_prunned.feature_importances_.argsort()[::-1]

In [49]:
for n in [5,10,20, 25, 30,35, 40, 50, 70, 90, 100]:
    model = RandomForestClassifier(n_estimators=n, n_jobs=-1, min_samples_leaf=50)
    model.fit(train_X[sorted_[:n]], train_y)
    print('\nn: ', n)
    print('train score: ', model.score(train_X[sorted_[:n]], train_y))
    print('val score: ', model.score(val_X[sorted_[:n]], val_y))


n:  5
train score:  0.66288520149
val score:  0.595197255575

n:  10
train score:  0.706400270911
val score:  0.628816466552

n:  20
train score:  0.736539112767
val score:  0.669982847341

n:  25
train score:  0.753979004402
val score:  0.669296740995

n:  30
train score:  0.761429055198
val score:  0.685420240137

n:  35
train score:  0.77175753471
val score:  0.683018867925

n:  40
train score:  0.775313240772
val score:  0.68782161235

n:  50
train score:  0.791398577718
val score:  0.684391080617

n:  70
train score:  0.808669150017
val score:  0.695711835334

n:  90
train score:  0.812394175415
val score:  0.69845626072

n:  100
train score:  0.825431764307
val score:  0.709090909091


Grid search:

In [65]:
rfc = RandomForestClassifier(n_jobs=-1, bootstrap=True, oob_score=True) 

param_grid = { 
    'n_estimators': [50, 75, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6],
    'criterion': ['entropy', 'gini'],
}

model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
model.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50, 75, 100], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [3, 4, 5, 6], 'criterion': ['entropy', 'gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [73]:
model.score(train_X, train_y)

0.87588892651540806

In [70]:
model.score(val_X, val_y)

0.71526586620926247

In [69]:
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,rank_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.472406,0.108852,0.672367,0.754149,entropy,3,auto,50,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",55,...,0.676545,0.745397,0.651143,0.748148,0.686706,0.763810,0.029418,0.002379,0.011671,0.006891
1,0.711917,0.109194,0.681510,0.758127,entropy,3,auto,75,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",38,...,0.678239,0.750476,0.665538,0.756825,0.720576,0.771217,0.049149,0.001134,0.021882,0.007984
2,0.983586,0.109236,0.665256,0.758508,entropy,3,auto,100,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",63,...,0.655377,0.754074,0.654530,0.764021,0.674852,0.753439,0.055501,0.001649,0.008680,0.007916
3,0.540163,0.107169,0.673552,0.758127,entropy,3,sqrt,50,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",51,...,0.660457,0.744974,0.670618,0.768677,0.689246,0.754074,0.079387,0.001290,0.012357,0.008874
4,0.708900,0.107333,0.686759,0.757196,entropy,3,sqrt,75,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",30,...,0.676545,0.751323,0.670618,0.765714,0.701101,0.758942,0.080133,0.001711,0.012148,0.007722
5,0.978851,0.109036,0.677955,0.761768,entropy,3,sqrt,100,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",44,...,0.670618,0.757672,0.661304,0.765714,0.703641,0.755767,0.086849,0.001612,0.014199,0.005995
6,0.378157,0.108821,0.656620,0.743438,entropy,3,log2,50,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",72,...,0.662151,0.744127,0.638442,0.759788,0.674005,0.745397,0.041386,0.001831,0.017853,0.010922
7,0.590269,0.107226,0.662547,0.748899,entropy,3,log2,75,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",66,...,0.660457,0.745185,0.653683,0.754497,0.675699,0.750265,0.039103,0.001684,0.008320,0.006970
8,0.729171,0.109402,0.662547,0.751651,entropy,3,log2,100,"{'criterion': 'entropy', 'max_depth': 3, 'max_...",66,...,0.646909,0.740952,0.652837,0.757884,0.667231,0.744974,0.045068,0.001568,0.016869,0.010317
9,0.575485,0.109377,0.680494,0.788097,entropy,4,auto,50,"{'criterion': 'entropy', 'max_depth': 4, 'max_...",39,...,0.693480,0.788571,0.673158,0.779259,0.682472,0.792381,0.033159,0.000789,0.007436,0.005349


In [107]:
model2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True, bootstrap=True,
                                   max_depth=4, min_samples_leaf=200)
model2.fit(train_X, train_y)
print('\nmin_leaf_samples: ', samples)
print('train score: ', model2.score(train_X, train_y))
print('val score: ', model2.score(val_X, val_y))
print('oob score: ', model2.oob_score_)


min_leaf_samples:  300
train score:  0.751269895022
val score:  0.704288164666
oob score:  0.699458178124
