Import modules

In [6]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, braycurtis, canberra, cityblock, chebyshev, minkowski
from sklearn.linear_model import LogisticRegression

Load the train / validation feature vectors (standard-scaled and min-max-scaled)

In [3]:
# Train / validation pair features read from the "scaled_standard" CSV files;
# the first CSV column becomes the row index.
train = pd.read_csv('../data/train_features_scaled_standard.csv', index_col=0)
val = pd.read_csv('../data/validation_features_scaled_standard.csv', index_col=0)

In [33]:
# Train / validation pair features read from the "scaled_minmax" CSV files;
# the first CSV column becomes the row index.
train_minmax = pd.read_csv('../data/train_features_scaled_minmax.csv', index_col=0)
val_minmax = pd.read_csv('../data/validation_features_scaled_minmax.csv', index_col=0)

In [4]:
# Presumably every feature contributes an A_* and a B_* column and one extra
# column holds the label, so the feature count is half of the remaining columns.
number_of_features = (train.shape[1] - 1) // 2
assert number_of_features == 938

In [5]:
# Feature table; its index supplies the ids that are formatted into the
# 'A_<id>' / 'B_<id>' column names when the measures are computed.
features = pd.read_csv('../data/features/prunned_RF_scores.csv', index_col=0)
# Sanity check: the table must contain exactly 938 rows.
assert features.shape[0] == 938

Define similarity/distance measures

In [7]:
def minmax(a, b):
    """Return the min-max (Ruzicka) similarity of two equal-length vectors.

    The score is ``sum(min(a_i, b_i)) / sum(max(a_i, b_i))``. It lies in
    [0, 1] (1 meaning identical vectors) only when every component is
    non-negative; inputs that contain negative values can produce scores
    outside that range.

    Parameters
    ----------
    a, b : array-like of numbers
        Vectors to compare, element by element.

    Returns
    -------
    numpy float
        Summed element-wise minima divided by summed element-wise maxima.

    Raises
    ------
    ZeroDivisionError
        If the vectors are empty, for which the ratio is undefined.
    """
    lower = np.minimum(a, b)
    upper = np.maximum(a, b)
    if lower.size == 0:
        # Keep the historical failure mode for empty input instead of
        # silently returning nan from 0.0 / 0.0.
        raise ZeroDivisionError('minmax similarity of empty vectors is undefined')
    # Reduce inside NumPy: the built-in sum() walks the array one Python
    # object at a time, which is needlessly slow on long feature vectors.
    return lower.sum(axis=0) / upper.sum(axis=0)

def similarities(vectors, features):
    """Compute five pairwise measures for one row describing a document pair.

    For every id in ``features`` the value of the first document is read from
    ``vectors['A_<id>']`` and the value of the second from ``vectors['B_<id>']``.

    Returns a 5-tuple in this order: min-max similarity, cosine distance,
    Bray-Curtis distance, Canberra distance, city-block distance.
    """
    def column_values(template):
        # One document's values, in the order given by `features`.
        return [vectors[template.format(feature_id)] for feature_id in features]

    doc_a = column_values('A_{}')
    doc_b = column_values('B_{}')

    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(doc_a, doc_b) for measure in measures)

## all similarity measures, all features, no weights, standard scaling

In [9]:
# Feature ids to compare: every entry of the feature table.
best_features = features.index
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (standard-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [10]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.055474,0.912823,0.71696,398.480153,605.604589,True
1,-1.053584,0.916162,0.721877,400.251424,608.193625,True
2,-1.205837,0.937646,0.647488,289.809889,619.982495,True
3,-1.049284,0.763965,0.597759,326.068801,566.108841,False
4,-0.71436,0.733612,0.650287,393.313379,560.358946,False


In [11]:
# Inputs are all measure columns; the target is the `different_author` flag.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [12]:
# Logistic regression with default settings, fitted on the train measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x) == t_y).mean())
print('val accuracy: ', (model.predict(v_x) == v_y).mean())

train accuracy:  0.756688113783
val accuracy:  0.755060034305


In [13]:
# Learned weights, one per input column, in the order of `similarity_measures`.
model.coef_

array([[ -8.70601118e-01,   8.02077120e+00,   3.69531677e+00,
          1.65867040e-02,  -4.09080968e-03]])

## all similarity measures, top 250 features, no weights, standard scaling

In [18]:
# Feature ids to compare: the first 250 entries of the feature table
# (presumably the highest-ranked ones; confirm the file's ordering).
best_features = features.index[:250]
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (standard-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [19]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.009311,0.917542,0.763281,133.515881,193.086606,True
1,-1.007293,0.926835,0.775934,134.477952,195.210343,True
2,-1.141144,1.011542,0.739324,103.656973,210.593952,True
3,-1.2026,0.83081,0.682978,118.203572,187.267711,False
4,-0.966087,0.84515,0.739493,131.710383,175.077734,False


In [20]:
# Inputs are all measure columns; the target is the `different_author` flag.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [21]:
# Logistic regression with default settings, fitted on the train measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x) == t_y).mean())
print('val accuracy: ', (model.predict(v_x) == v_y).mean())

train accuracy:  0.725194717237
val accuracy:  0.708061749571


In [22]:
# Learned weights, one per input column, in the order of `similarity_measures`.
model.coef_

array([[-0.11353554,  3.41691025, -0.60498984,  0.05530624,  0.00701974]])

## all similarity measures, top 50 features, no weights, standard scaling

In [23]:
# Feature ids to compare: the first 50 entries of the feature table
# (presumably the highest-ranked ones; confirm the file's ordering).
best_features = features.index[:50]
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (standard-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [24]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.535213,0.817837,0.767573,31.320675,38.099962,True
1,-1.506314,0.862629,0.826922,32.111646,40.167714,True
2,-3.830221,0.902731,0.68845,22.320092,34.840475,True
3,-2.149088,0.724622,0.54648,23.526565,31.120812,False
4,-0.765444,0.789979,0.72777,30.466596,41.722644,False


In [25]:
# Inputs are all measure columns; the target is the `different_author` flag.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [26]:
# Logistic regression with default settings, fitted on the train measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x) == t_y).mean())
print('val accuracy: ', (model.predict(v_x) == v_y).mean())

train accuracy:  0.698442262106
val accuracy:  0.678559176672


In [27]:
# Learned weights, one per input column, in the order of `similarity_measures`.
model.coef_

array([[ -2.16018374e-04,   1.87326891e+00,  -1.93321498e+00,
          2.25325872e-01,  -6.50850105e-03]])

## all similarity measures, top 500 features, no weights, standard scaling

In [28]:
# Feature ids to compare: the first 500 entries of the feature table
# (presumably the highest-ranked ones; confirm the file's ordering).
best_features = features.index[:500]
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (standard-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [29]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.172092,0.863151,0.705553,220.744873,324.355065,True
1,-1.170327,0.867921,0.712047,221.701823,326.411914,True
2,-1.219706,0.941539,0.663145,165.698371,352.899886,True
3,-0.856537,0.78079,0.651156,197.052317,350.026051,False
4,-0.680357,0.750625,0.677364,227.910023,320.45641,False


In [30]:
# Inputs are all measure columns; the target is the `different_author` flag.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [31]:
# Logistic regression with default settings, fitted on the train measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x) == t_y).mean())
print('val accuracy: ', (model.predict(v_x) == v_y).mean())

train accuracy:  0.745174398916
val accuracy:  0.741680960549


In [32]:
# Learned weights, one per input column, in the order of `similarity_measures`.
model.coef_

array([[ -4.85691250e-01,   4.74615266e+00,   2.85469318e+00,
          2.78374324e-02,   1.43687516e-03]])

## all similarity measures, all features, no weights, minmax scaling

In [36]:
# Feature ids to compare: every entry of the feature table.
best_features = features.index
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (min-max-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train_minmax
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train_minmax['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val_minmax
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val_minmax['different_author']

In [37]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,0.32387,0.37091,0.510722,350.857241,60.335068,True
1,0.321832,0.37234,0.513051,351.620079,60.596062,True
2,0.291532,0.43431,0.548549,244.395348,62.421801,True
3,0.351154,0.325159,0.480216,288.902834,56.24509,False
4,0.406145,0.230983,0.422329,343.745518,54.321482,False


In [38]:
# Inputs are all measure columns; the target is the `different_author` flag.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [39]:
# Logistic regression with default settings, fitted on the train measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x) == t_y).mean())
print('val accuracy: ', (model.predict(v_x) == v_y).mean())

train accuracy:  0.755164239756
val accuracy:  0.755060034305


In [40]:
# Learned weights, one per input column, in the order of `similarity_measures`.
model.coef_

array([[-9.97205012,  8.58506587,  5.33226986,  0.03324554, -0.09349579]])

## canberra distance, all features, no weights, minmax scaling

In [41]:
# Feature ids to compare: every entry of the feature table.
best_features = features.index
# All five measures are computed here, although the cells that follow use
# only `canberra_distance` as model input.
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (min-max-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train_minmax
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train_minmax['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val_minmax
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val_minmax['different_author']

In [42]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,0.32387,0.37091,0.510722,350.857241,60.335068,True
1,0.321832,0.37234,0.513051,351.620079,60.596062,True
2,0.291532,0.43431,0.548549,244.395348,62.421801,True
3,0.351154,0.325159,0.480216,288.902834,56.24509,False
4,0.406145,0.230983,0.422329,343.745518,54.321482,False


In [43]:
# Single-feature experiment: the Canberra distance is the only input;
# the target is the `different_author` flag.
t_x, t_y = train_similarities['canberra_distance'], train_similarities['different_author']
v_x, v_y = val_similarities['canberra_distance'], val_similarities['different_author']

In [47]:
# The single input column is reshaped to (n_rows, 1) because the estimator
# is given a 2-D feature matrix.
model = LogisticRegression().fit(t_x.values.reshape(-1, 1), t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x.values.reshape(-1, 1)) == t_y).mean())
print('val accuracy: ', (model.predict(v_x.values.reshape(-1, 1)) == v_y).mean())

train accuracy:  0.652726041314
val accuracy:  0.681646655232


In [48]:
# Learned weight of the single input column (the Canberra distance).
model.coef_

array([[ 0.01961463]])

## canberra distance, all features, no weights, standard scaling

In [49]:
# Feature ids to compare: every entry of the feature table.
best_features = features.index
# All five measures are computed here, although the cells that follow use
# only `canberra_distance` as model input.
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train pairs (standard-scaled frame): one row of measures per pair, plus the label.
train_similarities = (
    train
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
train_similarities.columns = similarity_measures
train_similarities['different_author'] = train['different_author']

# Validation pairs: same measures and label, computed the same way.
val_similarities = (
    val
    .apply(lambda pair_row: similarities(pair_row, best_features), axis=1)
    .apply(pd.Series)
)
val_similarities.columns = similarity_measures
val_similarities['different_author'] = val['different_author']

In [50]:
# Preview the first rows of the train measures as a quick sanity check.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,-1.055474,0.912823,0.71696,398.480153,605.604589,True
1,-1.053584,0.916162,0.721877,400.251424,608.193625,True
2,-1.205837,0.937646,0.647488,289.809889,619.982495,True
3,-1.049284,0.763965,0.597759,326.068801,566.108841,False
4,-0.71436,0.733612,0.650287,393.313379,560.358946,False


In [51]:
# Single-feature experiment: the Canberra distance is the only input;
# the target is the `different_author` flag.
t_x, t_y = train_similarities['canberra_distance'], train_similarities['different_author']
v_x, v_y = val_similarities['canberra_distance'], val_similarities['different_author']

In [52]:
# The single input column is reshaped to (n_rows, 1) because the estimator
# is given a 2-D feature matrix.
model = LogisticRegression().fit(t_x.values.reshape(-1, 1), t_y)

# Accuracy: share of rows whose predicted label equals the true label.
print('train accuracy: ', (model.predict(t_x.values.reshape(-1, 1)) == t_y).mean())
print('val accuracy: ', (model.predict(v_x.values.reshape(-1, 1)) == v_y).mean())

train accuracy:  0.653403318659
val accuracy:  0.678216123499


In [53]:
# Learned weight of the single input column (the Canberra distance).
model.coef_

array([[ 0.01657386]])