In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
df_train = pd.read_csv('../data/500_ngrams/train_features_scaled_standard.csv', index_col=0)
df_val = pd.read_csv('../data/500_ngrams/validation_features_scaled_standard.csv', index_col=0)

In [3]:
y_train = list(map(lambda y: int(y), df_train['different_author']))
y_val = list(map(lambda y: int(y), df_val['different_author']))

In [4]:
N = int((len(df_train.columns) - 1) / 2)
N

935

In [5]:
df_train_a = df_train.iloc[:, :N]
df_train_b = df_train.iloc[:, N:-1]

df_train_a = df_train_a.rename(columns={x:y for x,y in zip(df_train_a.columns,range(0,len(df_train_a.columns)))})
df_train_b = df_train_b.rename(columns={x:y for x,y in zip(df_train_b.columns,range(0,len(df_train_b.columns)))})


df_val_a = df_val.iloc[:, :N]
df_val_b = df_val.iloc[:, N:-1]

df_val_a = df_val_a.rename(columns={x:y for x,y in zip(df_val_a.columns,range(0,len(df_val_a.columns)))})
df_val_b = df_val_b.rename(columns={x:y for x,y in zip(df_val_b.columns,range(0,len(df_val_b.columns)))})

In [6]:
df_train_sqdiff = (df_train_a - df_train_b) ** 2
df_train_absdiff = abs(df_train_a - df_train_b)

df_val_sqdiff = (df_val_a - df_val_b) ** 2
df_val_absdiff = abs(df_val_a - df_val_b)

In [7]:
del df_train
del df_val
del df_train_a
del df_train_b
del df_val_a
del df_val_b

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    pd.concat([df_train_absdiff, df_val_absdiff]),
    y_train + y_val,
    test_size=0.4, random_state=42
)

In [9]:
clf = RandomForestClassifier(n_estimators=175)

In [10]:
cv = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42)

In [11]:
scores = cross_val_score(clf,
                         X_train,
                         y_train,
                         cv=cv,
                         n_jobs=-1,
                         verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.737837, total=  13.6s
[CV]  ................................................................
[CV] ................................. , score=0.743505, total=  13.8s
[CV] ................................. , score=0.741143, total=  13.9s
[CV]  ................................................................


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   14.2s remaining:   33.1s


[CV] ................................. , score=0.738781, total=  14.1s
[CV] ................................. , score=0.735947, total=  14.0s
[CV] ................................. , score=0.739726, total=  14.0s


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   14.5s remaining:   14.5s


[CV] ................................. , score=0.752952, total=  14.0s
[CV] ................................. , score=0.732641, total=  13.9s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   14.7s remaining:    6.3s


[CV] ................................. , score=0.752952, total=   7.3s
[CV] ................................. , score=0.745867, total=   7.2s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.4s finished


In [12]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.74 (+/- 0.01)


In [13]:
clf = RandomForestClassifier(n_estimators=175)

In [14]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=175, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [15]:
clf.score(X_val, y_val)

0.76792292434117315

In [23]:
list(clf.feature_importances_)

[0.0015984912209194744,
 0.0032777201781549058,
 0.0015597507643465554,
 0.0017781662620120226,
 0.0012808527812103322,
 0.0026028349896386749,
 0.0038629845855526015,
 0.0034248937959581826,
 0.0016681324044153494,
 0.00077439990331011372,
 0.00069022781573498419,
 0.0019326708415848051,
 0.0010819455726828787,
 0.00063619877007957501,
 0.0010483968380693275,
 0.0017266866475832982,
 0.0015629017692027076,
 0.00068057994391151834,
 0.0010882980148942789,
 0.0015236060713677158,
 0.0012623419602479089,
 0.0014142749006712402,
 0.0011994859421615026,
 0.00079886773967824561,
 0.0014633401578718331,
 0.0011638549518805104,
 0.0016947067266811975,
 0.0012305348541535492,
 0.0015280455118522043,
 0.0010528679762260617,
 0.00092015298726326824,
 0.00096623928994540452,
 0.00071174069725535294,
 0.00086235745243225226,
 0.0010766901399292415,
 0.0022667811084764603,
 0.00090171519377541941,
 0.00076801185302324728,
 0.0012232089905634441,
 0.0012970220980561579,
 0.0014123742509773252,
 0.00

In [21]:
df_feature_names = pd.read_csv('../data/500_ngrams/feature_names.csv', index_col=0)

In [25]:
df_feature_names['importances'] = clf.feature_importances_

In [28]:
df_feature_names.sort_values('importances', ascending=False)

Unnamed: 0,0,importances
173,Number of letters,0.024757
121,Mean paragraph Length,0.009843
52,Flesch Kincaid Grade,0.004767
55,Gunning Fog Index,0.004606
111,Lix Index,0.004596
110,Lexical diversity,0.004559
47,Dale Chall Score,0.004454
119,Mean Sentence Length,0.004432
53,Flesch Reading Ease,0.004251
201,Smog Index,0.004176


In [43]:
forest_150 = set(list(df_feature_names.sort_values('importances', ascending=False).index)[:150])

In [44]:
selector_150 = set([  1,   3,   5,   6,   7,  11,  46,  47,  52,  53,  55,  56,  57,
        59,  95,  99, 110, 111, 119, 120, 121, 122, 139, 152, 173, 174,
       186, 198, 199, 201, 202, 236, 241, 263, 279, 283, 290, 294, 310,
       325, 328, 337, 341, 342, 344, 349, 350, 352, 354, 362, 364, 366,
       368, 402, 403, 409, 410, 416, 417, 425, 426, 428, 440, 446, 451,
       455, 457, 471, 482, 492, 496, 499, 501, 502, 503, 507, 508, 514,
       515, 516, 524, 531, 544, 552, 561, 562, 563, 571, 572, 589, 592,
       593, 594, 603, 608, 613, 614, 631, 635, 639, 641, 658, 659, 665,
       676, 683, 684, 690, 694, 699, 704, 707, 710, 712, 722, 725, 726,
       729, 732, 734, 750, 752, 753, 754, 758, 766, 772, 773, 775, 789,
       793, 797, 808, 812, 813, 839, 846, 850, 866, 870, 882, 884, 894,
       898, 902, 908, 914, 918, 919, 925])

In [45]:
len(forest_150)

150

In [46]:
len(selector_150)

150

In [51]:
df_feature_names.iloc[list(forest_150 & selector_150)]

Unnamed: 0,0,importances
1,Ari Index,0.003278
3,C,0.001778
5,Coleman Liau Index,0.002603
6,Colons,0.003863
7,Commas,0.003425
263,acti,0.00177
775,side,0.001886
902,which,0.0021
11,DT_JJ_NN,0.001933
139,NN_IN_DT_JJ,0.001754


In [55]:
list(forest_150 | selector_150)

[0,
 1,
 2,
 3,
 5,
 6,
 7,
 8,
 11,
 15,
 16,
 26,
 35,
 42,
 45,
 46,
 47,
 49,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 63,
 66,
 68,
 69,
 81,
 84,
 88,
 91,
 95,
 97,
 98,
 99,
 103,
 108,
 109,
 110,
 111,
 112,
 114,
 119,
 120,
 121,
 122,
 123,
 128,
 130,
 133,
 136,
 138,
 139,
 140,
 146,
 149,
 152,
 154,
 160,
 163,
 166,
 168,
 170,
 173,
 174,
 175,
 176,
 186,
 187,
 189,
 193,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 212,
 213,
 219,
 225,
 227,
 232,
 233,
 234,
 236,
 240,
 241,
 242,
 243,
 244,
 248,
 251,
 256,
 263,
 273,
 276,
 279,
 282,
 283,
 288,
 290,
 292,
 294,
 299,
 310,
 317,
 325,
 328,
 330,
 331,
 333,
 337,
 341,
 342,
 344,
 349,
 350,
 352,
 354,
 357,
 362,
 364,
 366,
 368,
 379,
 402,
 403,
 409,
 410,
 416,
 417,
 425,
 426,
 428,
 439,
 440,
 446,
 451,
 455,
 456,
 457,
 458,
 471,
 473,
 476,
 482,
 492,
 496,
 499,
 501,
 502,
 503,
 507,
 508,
 512,
 514,
 515,
 516,
 521,
 524,
 531,
 537,
 543,
 544,
 552,
 561,
 562