In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
df_train = pd.read_csv('../data/500_ngrams/train_features_scaled_standard.csv', index_col=0)
df_val = pd.read_csv('../data/500_ngrams/validation_features_scaled_standard.csv', index_col=0)

In [3]:
y_train = list(map(lambda y: int(y), df_train['different_author']))
y_val = list(map(lambda y: int(y), df_val['different_author']))

In [4]:
N = int((len(df_train.columns) - 1) / 2)
N

935

In [5]:
df_train_a = df_train.iloc[:, :N]
df_train_b = df_train.iloc[:, N:-1]

df_train_a = df_train_a.rename(columns={x:y for x,y in zip(df_train_a.columns,range(0,len(df_train_a.columns)))})
df_train_b = df_train_b.rename(columns={x:y for x,y in zip(df_train_b.columns,range(0,len(df_train_b.columns)))})


df_val_a = df_val.iloc[:, :N]
df_val_b = df_val.iloc[:, N:-1]

df_val_a = df_val_a.rename(columns={x:y for x,y in zip(df_val_a.columns,range(0,len(df_val_a.columns)))})
df_val_b = df_val_b.rename(columns={x:y for x,y in zip(df_val_b.columns,range(0,len(df_val_b.columns)))})

In [6]:
df_train_sqdiff = (df_train_a - df_train_b) ** 2
df_train_absdiff = abs(df_train_a - df_train_b)

df_val_sqdiff = (df_val_a - df_val_b) ** 2
df_val_absdiff = abs(df_val_a - df_val_b)

In [7]:
del df_train
del df_val
del df_train_a
del df_train_b
del df_val_a
del df_val_b

In [8]:
X_train, X_val, y_train, y_val = train_test_split(
    pd.concat([df_train_absdiff, df_val_absdiff]),
    y_train + y_val,
    test_size=0.4, random_state=42
)

In [9]:
clf = RandomForestClassifier(n_estimators=175)

In [10]:
cv = ShuffleSplit(n_splits=10, test_size=0.4, random_state=42)

In [11]:
scores = cross_val_score(clf,
                         X_train,
                         y_train,
                         cv=cv,
                         n_jobs=-1,
                         verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.7227208313651393, total=  10.6s
[CV]  ................................................................
[CV] ....................... , score=0.7383089277279169, total=  10.5s
[CV]  ................................................................
[CV] ....................... , score=0.7529522909777988, total=  10.7s
[CV]  ................................................................
[CV] ....................... , score=0.7392536608408125, total=  10.6s
[CV]  ................................................................
[CV] ....................... , score=0.7335852621634388, total=  10.5s
[CV]  ................................................................


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   21.5s remaining:   21.5s


[CV] ....................... , score=0.7435049598488427, total=  10.5s
[CV] ....................... , score=0.7463391591875296, total=  10.8s
[CV]  ................................................................
[CV] ....................... , score=0.7553141237600378, total=  10.6s


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   21.7s remaining:    9.3s


[CV] ....................... , score=0.7416154936230515, total=   7.1s
[CV] ....................... , score=0.7704298535663675, total=   7.0s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   28.7s finished


In [12]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.74 (+/- 0.02)


In [13]:
clf = RandomForestClassifier(n_estimators=175)

In [14]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=175, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
clf.score(X_val, y_val)

0.7605553981297818