In [3]:
import sys
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine, braycurtis, canberra, cityblock, chebyshev, minkowski


sys.path.append("..")

In [4]:
# NOTE(review): this constant is commented out, yet the similarity helpers defined
# later in the notebook reference NUMBER_OF_FEATURES. The data previews also show
# far more than 83 A_/B_ columns — confirm the intended value.
# NUMBER_OF_FEATURES = 83

In [5]:
# Load the train / validation feature tables; the first CSV column is the row index.
train_csv = '../data/500_ngrams/train_features_scaled_minmax.csv'
val_csv = '../data/500_ngrams/validation_features_scaled_minmax.csv'

train = pd.read_csv(train_csv, index_col=0)
val = pd.read_csv(val_csv, index_col=0)

In [5]:
# Preview the first rows of the freshly loaded training table.
train.head()

Unnamed: 0,A_0,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,...,B_926,B_927,B_928,B_929,B_930,B_931,B_932,B_933,B_934,different_author
0,0.591167,0.238369,0.393659,0.654621,0.23,0.544193,0.0,0.184063,0.470222,0.0,...,0.0,0.0,0.0,0.0,0.095192,0.0,0.0,0.0,0.072784,True
1,0.500083,0.186608,0.325804,0.428397,0.0,0.448047,0.121928,0.203754,0.409647,0.069231,...,0.0,0.0,0.0,0.0,0.7425,0.0,0.0,0.0,0.0,True
2,0.58996,0.25575,0.393005,0.653535,0.23,0.55254,0.0,0.184063,0.469346,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,0.519795,0.224352,0.295311,0.506321,0.0,0.55302,0.0,0.11405,0.482341,0.0,...,0.0,0.0,0.0,0.0,0.247191,0.0,0.0,0.0,0.0,False
4,0.616025,0.30425,0.248293,0.63909,0.0,0.62905,0.048339,0.173098,0.417508,0.134831,...,0.0,0.0,0.0,0.0,0.081481,0.0,0.0,0.0,0.0,False


In [56]:
def _paired_features(vectors, n_features=None):
    """Return the A_i / B_i values of one row as two float arrays.

    `n_features` is resolved in this order: the explicit argument, a
    notebook-level NUMBER_OF_FEATURES if one is defined, otherwise the number
    of labels in `vectors` that start with 'A_' (assumes they are numbered
    0..n-1 without gaps).
    """
    if n_features is None:
        # Keep honouring the notebook constant when it exists, but do not
        # crash with NameError when it is left commented out.
        n_features = globals().get('NUMBER_OF_FEATURES')
    if n_features is None:
        n_features = sum(1 for label in vectors.keys() if str(label).startswith('A_'))

    a = np.array([vectors['A_{}'.format(i)] for i in range(n_features)], dtype=float)
    b = np.array([vectors['B_{}'.format(i)] for i in range(n_features)], dtype=float)
    return a, b


def min_max_similarity(vectors, n_features=None):
    """Min-max similarity: sum_i min(A_i, B_i) / sum_i max(A_i, B_i).

    Parameters
    ----------
    vectors : mapping-like (e.g. a DataFrame row) with labels 'A_i' and 'B_i'.
    n_features : int, optional
        Number of feature pairs to use; see `_paired_features` for the default.

    Returns
    -------
    float
        The similarity, or NaN when the denominator is zero.
    """
    a, b = _paired_features(vectors, n_features)
    denominator = np.maximum(a, b).sum()
    if denominator == 0:
        # Both vectors are all zero (or empty): the ratio is undefined.
        return float('nan')
    return np.minimum(a, b).sum() / denominator


def cosine_similarity(vectors, n_features=None):
    """Cosine similarity (1 - scipy cosine distance) between the A and B features.

    Takes the same arguments as `min_max_similarity`.
    """
    a, b = _paired_features(vectors, n_features)
    return 1 - cosine(a, b)
    
# Add one column per similarity helper to both splits. Each row is handed to the
# helper without its last entry (`row[:-1]`).
for column, measure in (('min_max_similarity', min_max_similarity),
                        ('cos_similarity', cosine_similarity)):
    for frame in (train, val):
        frame[column] = frame.apply(lambda row, measure=measure: measure(row[:-1]), axis=1)

In [57]:
# Check that the two similarity columns were appended.
train.head()

Unnamed: 0,A_0,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,...,B_826,B_827,B_828,B_829,B_830,B_831,B_832,different_author,min_max_similarity,cos_similarity
0,0.591167,0.238369,0.393659,0.654621,0.23,0.544193,0.0,0.0,0.470222,0.0,...,0.0,0.0,0.095192,0.0,0.0,0.0,0.072784,True,0.341655,0.670623
1,0.500083,0.186608,0.325804,0.428397,0.0,0.448047,0.0,0.0,0.409647,0.069231,...,0.0,0.0,0.7425,0.0,0.0,0.0,0.0,True,0.339214,0.668914
2,0.58996,0.25575,0.393005,0.653535,0.23,0.55254,0.0,0.0,0.469346,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.319101,0.626929
3,0.519795,0.224352,0.295311,0.506321,0.0,0.55302,0.0,0.0,0.482341,0.0,...,0.0,0.0,0.247191,0.0,0.0,0.0,0.0,False,0.366793,0.708512
4,0.616025,0.30425,0.248293,0.63909,0.0,0.62905,0.0,0.0,0.417508,0.134831,...,0.0,0.0,0.081481,0.0,0.0,0.0,0.0,False,0.416457,0.773555


In [6]:
# Mean min-max similarity per class. Selecting the column before aggregating
# avoids averaging every feature column only to discard the result.
train.groupby('different_author')['min_max_similarity'].mean()

different_author
False    0.338874
True     0.319502
Name: min_max_similarity, dtype: float64

In [7]:
# Same per-class mean on the validation split (column selected before aggregating).
val.groupby('different_author')['min_max_similarity'].mean()

different_author
False    0.337823
True     0.321173
Name: min_max_similarity, dtype: float64

In [74]:
# Scan candidate cut-offs on the training split: a pair is predicted
# "different author" when its min-max similarity is below the threshold.
results = {}

for threshold in np.arange(0.28, 0.38, step=0.0005):
    # Vectorised comparison instead of a per-element Python lambda.
    prediction = train['min_max_similarity'] < threshold
    results[threshold] = (train['different_author'] == prediction).mean()

results

{0.28000000000000003: 0.50846596681341005,
 0.28050000000000003: 0.50711141212326449,
 0.28100000000000003: 0.50677277345072802,
 0.28150000000000003: 0.50812732814087369,
 0.28200000000000003: 0.50880460548594653,
 0.28250000000000003: 0.5099898408398239,
 0.28300000000000003: 0.50982052150355572,
 0.28350000000000003: 0.51083643752116492,
 0.28400000000000003: 0.51236031154757877,
 0.28450000000000003: 0.5130375888926515,
 0.28500000000000003: 0.51286826955638332,
 0.28550000000000003: 0.51523874026413818,
 0.28600000000000003: 0.51557737893667455,
 0.28650000000000003: 0.51642397561801556,
 0.28700000000000003: 0.51693193362682022,
 0.28750000000000003: 0.5171012529630884,
 0.28800000000000003: 0.51862512698950225,
 0.28850000000000003: 0.51964104300711145,
 0.28900000000000003: 0.52065695902472064,
 0.28950000000000004: 0.52099559769725701,
 0.29000000000000004: 0.52218083305113439,
 0.29050000000000004: 0.52218083305113439,
 0.29100000000000004: 0.52336606840501187,
 0.29150000000

In [75]:
# Hand-picked cut-off for the min-max similarity rule.
# NOTE(review): presumably read off the `results` scan above — confirm.
opt_threshold = 0.3475

In [76]:
# Validation accuracy of the thresholded min-max similarity rule
# (vectorised comparison instead of a per-element lambda).
prediction = val['min_max_similarity'] < opt_threshold
(val['different_author'] == prediction).mean()

0.60891938250428812

---

In [59]:
# Per-class mean cosine similarity on both splits; the column is selected before
# aggregating so only that column is averaged.
print(train.groupby('different_author')['cos_similarity'].mean())
print(val.groupby('different_author')['cos_similarity'].mean())

different_author
False    0.636205
True     0.620226
Name: cos_similarity, dtype: float64
different_author
False    0.633924
True     0.622803
Name: cos_similarity, dtype: float64


In [79]:
# Training accuracy of "cosine similarity below threshold => different author"
# for each candidate threshold.
cos_results = {}

for threshold in np.arange(0.6, 0.7, step=0.0005):
    # Vectorised comparison instead of a per-element Python lambda.
    prediction = train['cos_similarity'] < threshold
    cos_results[threshold] = (train['different_author'] == prediction).mean()

cos_results

{0.59999999999999998: 0.53081611920081273,
 0.60049999999999992: 0.53166271588215375,
 0.60099999999999987: 0.53098543853708091,
 0.60149999999999981: 0.53200135455469011,
 0.60199999999999976: 0.5321706738909584,
 0.6024999999999997: 0.53250931256349476,
 0.60299999999999965: 0.53403318658990862,
 0.60349999999999959: 0.53403318658990862,
 0.60399999999999954: 0.53521842194378599,
 0.60449999999999948: 0.53640365729766337,
 0.60499999999999943: 0.53674229597019985,
 0.60549999999999937: 0.53725025397900439,
 0.60599999999999932: 0.53741957331527257,
 0.60649999999999926: 0.53741957331527257,
 0.60699999999999921: 0.53843548933288177,
 0.60749999999999915: 0.53995936335929562,
 0.6079999999999991: 0.54063664070436845,
 0.60849999999999904: 0.54097527937690482,
 0.60899999999999899: 0.54232983406705049,
 0.60949999999999893: 0.54402302742973252,
 0.60999999999999888: 0.54351506942092787,
 0.61049999999999882: 0.54368438875719605,
 0.61099999999999877: 0.54470030477480524,
 0.61149999999

In [86]:
# Hand-picked cut-off for the cosine rule, evaluated on the validation split.
opt_cos_threshold = 0.66

prediction = val['cos_similarity'] < opt_cos_threshold
(val['different_author'] == prediction).mean()

0.57255574614065186

In [87]:
from sklearn.preprocessing import MinMaxScaler

In [105]:
# Fit the scaler on the training min-max similarities, then apply it to both splits.
train_min_max_column = train['min_max_similarity'].values.reshape(-1, 1)
val_min_max_column = val['min_max_similarity'].values.reshape(-1, 1)

scaler_minmax_sim = MinMaxScaler()
scaler_minmax_sim.fit(train_min_max_column)

train['min_max_similarity_scaled'] = scaler_minmax_sim.transform(train_min_max_column)
val['min_max_similarity_scaled'] = scaler_minmax_sim.transform(val_min_max_column)

In [106]:
# Fit the scaler on the training cosine similarities, then apply it to both splits.
train_cos_column = train['cos_similarity'].values.reshape(-1, 1)
val_cos_column = val['cos_similarity'].values.reshape(-1, 1)

scaler_cos_sim = MinMaxScaler()
scaler_cos_sim.fit(train_cos_column)

train['cos_similarity_scaled'] = scaler_cos_sim.transform(train_cos_column)
val['cos_similarity_scaled'] = scaler_cos_sim.transform(val_cos_column)

In [107]:
# Check the scaled similarity columns on the training split.
train.head()

Unnamed: 0,A_0,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,...,B_828,B_829,B_830,B_831,B_832,different_author,min_max_similarity,cos_similarity,min_max_similarity_scaled,cos_similarity_scaled
0,0.591167,0.238369,0.393659,0.654621,0.23,0.544193,0.0,0.0,0.470222,0.0,...,0.095192,0.0,0.0,0.0,0.072784,True,0.341655,0.670623,0.639588,0.716054
1,0.500083,0.186608,0.325804,0.428397,0.0,0.448047,0.0,0.0,0.409647,0.069231,...,0.7425,0.0,0.0,0.0,0.0,True,0.339214,0.668914,0.633601,0.713424
2,0.58996,0.25575,0.393005,0.653535,0.23,0.55254,0.0,0.0,0.469346,0.0,...,0.0,0.0,0.0,0.0,0.0,True,0.319101,0.626929,0.584265,0.648829
3,0.519795,0.224352,0.295311,0.506321,0.0,0.55302,0.0,0.0,0.482341,0.0,...,0.247191,0.0,0.0,0.0,0.0,False,0.366793,0.708512,0.701247,0.774349
4,0.616025,0.30425,0.248293,0.63909,0.0,0.62905,0.0,0.0,0.417508,0.134831,...,0.081481,0.0,0.0,0.0,0.0,False,0.416457,0.773555,0.823062,0.874422


In [112]:
# Check the scaled similarity columns on the validation split.
val.head()

Unnamed: 0,A_0,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,...,B_828,B_829,B_830,B_831,B_832,different_author,min_max_similarity,cos_similarity,min_max_similarity_scaled,cos_similarity_scaled
0,0.530822,0.224139,0.13064,0.556357,0.0,0.394825,0.0,0.0,0.426404,0.0,...,0.166667,0.0,0.0,0.0,0.0,True,0.319091,0.652089,0.584242,0.687538
1,0.550263,0.219075,0.114079,0.433775,0.232323,0.390259,0.0,0.0,0.41033,0.0,...,0.119855,0.0,0.0,0.0,0.0,True,0.331378,0.646179,0.61438,0.678446
2,0.647249,0.272249,0.18415,0.398342,0.0,0.334283,0.0,0.0,0.364741,0.0,...,0.147321,0.0,0.0,0.0,0.0,False,0.329187,0.632442,0.609007,0.657311
3,0.440422,0.268487,0.306748,0.674705,0.205357,0.524594,0.0,0.0,0.32954,0.0,...,0.0,0.0,0.0,0.0,0.0,False,0.321452,0.629388,0.590034,0.652612
4,0.472994,0.267232,0.27382,0.416469,0.0,0.451413,0.0,0.0,0.551084,0.0,...,0.095376,0.0,0.0,0.0,0.0,True,0.333545,0.657252,0.619694,0.695483


In [113]:
# Average of the two scaled similarities, computed column-wise instead of with a
# row-by-row apply over the whole (very wide) frame.
train['avg_similarity_scaled'] = (train['min_max_similarity_scaled'] + train['cos_similarity_scaled']) / 2
val['avg_similarity_scaled'] = (val['min_max_similarity_scaled'] + val['cos_similarity_scaled']) / 2

In [114]:
# Check the averaged similarity column.
train.head()

Unnamed: 0,A_0,A_1,A_2,A_3,A_4,A_5,A_6,A_7,A_8,A_9,...,B_829,B_830,B_831,B_832,different_author,min_max_similarity,cos_similarity,min_max_similarity_scaled,cos_similarity_scaled,avg_similarity_scaled
0,0.591167,0.238369,0.393659,0.654621,0.23,0.544193,0.0,0.0,0.470222,0.0,...,0.0,0.0,0.0,0.072784,True,0.341655,0.670623,0.639588,0.716054,0.677821
1,0.500083,0.186608,0.325804,0.428397,0.0,0.448047,0.0,0.0,0.409647,0.069231,...,0.0,0.0,0.0,0.0,True,0.339214,0.668914,0.633601,0.713424,0.673512
2,0.58996,0.25575,0.393005,0.653535,0.23,0.55254,0.0,0.0,0.469346,0.0,...,0.0,0.0,0.0,0.0,True,0.319101,0.626929,0.584265,0.648829,0.616547
3,0.519795,0.224352,0.295311,0.506321,0.0,0.55302,0.0,0.0,0.482341,0.0,...,0.0,0.0,0.0,0.0,False,0.366793,0.708512,0.701247,0.774349,0.737798
4,0.616025,0.30425,0.248293,0.63909,0.0,0.62905,0.0,0.0,0.417508,0.134831,...,0.0,0.0,0.0,0.0,False,0.416457,0.773555,0.823062,0.874422,0.848742


In [115]:
# Per-class mean of the averaged similarity on both splits (column selected
# before aggregating).
print(train.groupby('different_author')['avg_similarity_scaled'].mean())
print(val.groupby('different_author')['avg_similarity_scaled'].mean())

different_author
False    0.647932
True     0.611883
Name: avg_similarity_scaled, dtype: float64
different_author
False    0.644889
True     0.615914
Name: avg_similarity_scaled, dtype: float64


In [118]:
# Training accuracy of "averaged similarity below threshold => different author"
# for each candidate threshold.
avg_results = {}

for threshold in np.arange(0.60, 0.69, step=0.0005):
    # Vectorised comparison instead of a per-element Python lambda.
    prediction = train['avg_similarity_scaled'] < threshold
    avg_results[threshold] = (train['different_author'] == prediction).mean()

avg_results

{0.59999999999999998: 0.56738909583474428,
 0.60049999999999992: 0.56789705384354894,
 0.60099999999999987: 0.56806637317981712,
 0.60149999999999981: 0.56959024720623097,
 0.60199999999999976: 0.57094480189637653,
 0.6024999999999997: 0.57162207924144937,
 0.60299999999999965: 0.57246867592279038,
 0.60349999999999959: 0.57246867592279038,
 0.60399999999999954: 0.57314595326786322,
 0.60449999999999948: 0.57246867592279038,
 0.60499999999999943: 0.57314595326786322,
 0.60549999999999937: 0.57382323061293605,
 0.60599999999999932: 0.57348459194039958,
 0.60649999999999926: 0.57399254994920423,
 0.60699999999999921: 0.57517778530308161,
 0.60749999999999915: 0.57568574331188627,
 0.6079999999999991: 0.57653233999322717,
 0.60849999999999904: 0.57619370132069081,
 0.60899999999999899: 0.57602438198442263,
 0.60949999999999893: 0.57534710463934979,
 0.60999999999999888: 0.57568574331188627,
 0.61049999999999882: 0.57534710463934979,
 0.61099999999999877: 0.57517778530308161,
 0.6114999999

In [128]:
# Hand-picked cut-off for the averaged-similarity rule, evaluated on validation.
opt_avg_threshold = 0.6685

prediction = val['avg_similarity_scaled'] < opt_avg_threshold
(val['different_author'] == prediction).mean()

0.59759862778730699

-----

## Use only BEST k features

In [6]:
# Indices i of the feature pairs (columns 'A_i' / 'B_i') used by `similarities`.
# NOTE(review): how these indices were selected is not shown in this notebook.
BEST = [ 1,   3,   5,   6,   7,  11,  46,  47,  52,  53,  55,  56,  57,
        59,  95,  99, 110, 111, 119, 120, 121, 122, 139, 152, 173, 174,
       186, 198, 199, 201, 202, 236, 241, 263, 279, 283, 290, 294, 310,
       325, 328, 337, 341, 342, 344, 349, 350, 352, 354, 362, 364, 366,
       368, 402, 403, 409, 410, 416, 417, 425, 426, 428, 440, 446, 451,
       455, 457, 471, 482, 492, 496, 499, 501, 502, 503, 507, 508, 514,
       515, 516, 524, 531, 544, 552, 561, 562, 563, 571, 572, 589, 592,
       593, 594, 603, 608, 613, 614, 631, 635, 639, 641, 658, 659, 665,
       676, 683, 684, 690, 694, 699, 704, 707, 710, 712, 722, 725, 726,
       729, 732, 734, 750, 752, 753, 754, 758, 766, 772, 773, 775, 789,
       793, 797, 808, 812, 813, 839, 846, 850, 866, 870, 882, 884, 894,
       898, 902, 908, 914, 918, 919, 925]

In [174]:
# def min_max_similarity_best(vectors):
#     numerator = 0
#     denominator = 0
    
#     for i in BEST:
#         a_feature = vectors['A_{}'.format(i)]
#         b_feature = vectors['B_{}'.format(i)]
#         numerator += min(a_feature, b_feature)
#         denominator += max(a_feature, b_feature)
    
#     return numerator / denominator

# def cosine_similarity_best(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return 1 - cosine(a, b)

# def braycurtis_similarity(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return 1 - braycurtis(a, b)

# def canberra_distance(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return canberra(a, b)

# def cityblock_distance(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return cityblock(a, b) 

# def chebyshev_distance(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return chebyshev(a, b)  

# def minkowski_2_distance(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return minkowski(a, b, p=2)

# def minkowski_3_distance(vectors):
#     a = [vectors['A_{}'.format(i)] for i in BEST]
#     b = [vectors['B_{}'.format(i)] for i in BEST]

#     return minkowski(a, b, p=3)


    
# train['min_max_similarity_BEST'] = train.apply(lambda vectors: min_max_similarity_best(vectors[:-1]), axis=1)
# val['min_max_similarity_BEST'] = val.apply(lambda vectors: min_max_similarity_best(vectors[:-1]), axis=1)

# train['cos_similarity_BEST'] = train.apply(lambda vectors: cosine_similarity_best(vectors[:-1]), axis=1)
# val['cos_similarity_BEST'] = val.apply(lambda vectors: cosine_similarity_best(vectors[:-1]), axis=1)

# train['bray_similarity_BEST'] = train.apply(lambda vectors: braycurtis_similarity(vectors[:-1]), axis=1)
# val['bray_similarity_BEST'] = val.apply(lambda vectors: braycurtis_similarity(vectors[:-1]), axis=1)

# train['canberra_distance_BEST'] = train.apply(lambda vectors: canberra_distance(vectors[:-1]), axis=1)
# val['canberra_distance_BEST'] = val.apply(lambda vectors: canberra_distance(vectors[:-1]), axis=1)

# train['cityblock_distance_BEST'] = train.apply(lambda vectors: cityblock_distance(vectors[:-1]), axis=1)
# val['cityblock_distance_BEST'] = val.apply(lambda vectors: cityblock_distance(vectors[:-1]), axis=1)

# train['chebyshev_distance_BEST'] = train.apply(lambda vectors: chebyshev_distance(vectors[:-1]), axis=1)
# val['chebyshev_distance_BEST'] = val.apply(lambda vectors: chebyshev_distance(vectors[:-1]), axis=1)

# train['minkowski_2_distance_BEST'] = train.apply(lambda vectors: minkowski_2_distance(vectors[:-1]), axis=1)
# val['minkowski_2_distance_BEST'] = val.apply(lambda vectors: minkowski_2_distance(vectors[:-1]), axis=1)

# train['minkowski_3_distance_BEST'] = train.apply(lambda vectors: minkowski_3_distance(vectors[:-1]), axis=1)
# val['minkowski_3_distance_BEST'] = val.apply(lambda vectors: minkowski_3_distance(vectors[:-1]), axis=1)

SyntaxError: invalid syntax (<ipython-input-174-bfed4356fb10>, line 58)

In [7]:
def minmax(a, b):
    """Min-max similarity of two equal-length vectors.

    Returns sum(min(a_i, b_i)) / sum(max(a_i, b_i)). The sums use NumPy's
    reduction along the first axis rather than the builtin `sum`, which would
    iterate over the arrays element by element in Python.
    """
    return np.sum(np.minimum(a, b), axis=0) / np.sum(np.maximum(a, b), axis=0)

def similarities(vectors):
    """Compute the five pairwise measures between the BEST A_ and B_ features.

    Returns a tuple in this order: min-max similarity, cosine distance,
    Bray-Curtis distance, Canberra distance, city-block distance.
    """
    first = [vectors['A_{}'.format(idx)] for idx in BEST]
    second = [vectors['B_{}'.format(idx)] for idx in BEST]

    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(first, second) for measure in measures)

In [10]:
similarity_measures = ['minmax_similarity', 'cosine_distance', 'braycurtis_distance',
                       'canberra_distance', 'cityblock_distance']


def _similarity_frame(frame):
    """Build a table of similarity measures (plus the label) for every row of `frame`.

    The per-row tuples returned by `similarities` are collected into one
    DataFrame in a single constructor call, instead of expanding them with
    `.apply(pd.Series)`, which creates a Series object per row.
    """
    rows = [similarities(row) for _, row in frame.iterrows()]
    measures = pd.DataFrame(rows, index=frame.index, columns=similarity_measures)
    measures['different_author'] = frame['different_author']
    return measures


# computing train / val similarity measures
train_similarities = _similarity_frame(train)
val_similarities = _similarity_frame(val)

# The wide feature tables are dropped here; only the *_similarities tables remain.
del train, val

In [11]:
# Preview the per-pair similarity table for the training split.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,0.462521,0.2125,0.367502,50.275361,9.875315,True
1,0.451114,0.217858,0.378251,51.071265,10.169369,True
2,0.453668,0.194304,0.37583,40.643491,9.433641,True
3,0.444428,0.201608,0.384631,57.348183,11.515821,False
4,0.501943,0.164008,0.331609,61.360097,11.091895,False


In [12]:
# Preview the per-pair similarity table for the validation split.
val_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,0.456671,0.212586,0.372993,50.842291,10.248057,True
1,0.434174,0.278934,0.394531,49.250785,11.453147,True
2,0.418594,0.22741,0.409847,50.624078,10.600369,False
3,0.365536,0.280048,0.464626,53.736054,12.144423,False
4,0.46147,0.19396,0.368486,50.968624,10.488887,True


### BEST minmax sim

In [12]:
# Per-class mean min-max similarity (training split). The column created for
# train_similarities is named 'minmax_similarity'.
train_similarities.groupby('different_author')['minmax_similarity'].mean()

different_author
False    0.478304
True     0.433523
Name: min_max_similarity_BEST, dtype: float64

In [13]:
# Per-class mean min-max similarity (validation split). This cell previously
# repeated the training query verbatim.
val_similarities.groupby('different_author')['minmax_similarity'].mean()

different_author
False    0.472529
True     0.436617
Name: min_max_similarity_BEST, dtype: float64

In [17]:
# Training accuracy of "min-max similarity below threshold => different author".
results_BEST = {}

for threshold in np.arange(0.42, 0.50, step=0.0025):
    # 'minmax_similarity' is the column name created for train_similarities.
    prediction = train_similarities['minmax_similarity'] < threshold
    results_BEST[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST

{0.41999999999999998: 0.61378259397223167,
 0.42249999999999999: 0.61716898069759563,
 0.42499999999999999: 0.62428039282086012,
 0.42749999999999999: 0.63037588892651542,
 0.42999999999999999: 0.63410091432441584,
 0.4325: 0.63562478835082969,
 0.435: 0.63714866237724344,
 0.4375: 0.63918049441246194,
 0.44: 0.64696918388079916,
 0.4425: 0.65086352861496788,
 0.44500000000000001: 0.6525567219776498,
 0.44750000000000001: 0.65289536065018627,
 0.45000000000000001: 0.65306467998645445,
 0.45250000000000001: 0.65509651202167285,
 0.45500000000000002: 0.6556044700304775,
 0.45750000000000002: 0.65780562140196408,
 0.46000000000000002: 0.65729766339315954,
 0.46250000000000002: 0.65916017609210975,
 0.46500000000000002: 0.66153064679986451,
 0.46750000000000003: 0.66153064679986451,
 0.47000000000000003: 0.65831357941076873,
 0.47250000000000003: 0.65475787334913649,
 0.47500000000000003: 0.65424991534033183,
 0.47750000000000004: 0.65204876396884526,
 0.48000000000000004: 0.65069420927869

In [19]:
# Hand-picked cut-off evaluated on validation. `val` is deleted once the
# similarity tables are built, so the validation table is used here.
opt_BEST_threshold = 0.465

prediction = val_similarities['minmax_similarity'] < opt_BEST_threshold
(val_similarities['different_author'] == prediction).mean()

0.62572898799313892

### BEST cos sim

In [27]:
# Per-class mean cosine similarity (training split). The table stores the cosine
# *distance*, so similarity is recovered as 1 - distance.
(1 - train_similarities['cosine_distance']).rename('cos_similarity_BEST').groupby(
    train_similarities['different_author']).mean()

different_author
False    0.780522
True     0.743643
Name: cos_similarity_BEST, dtype: float64

In [28]:
# Per-class mean cosine similarity (validation split); similarity = 1 - cosine
# distance. This cell previously repeated the training query verbatim.
(1 - val_similarities['cosine_distance']).rename('cos_similarity_BEST').groupby(
    val_similarities['different_author']).mean()

different_author
False    0.774388
True     0.750337
Name: cos_similarity_BEST, dtype: float64

In [29]:
# Training accuracy of "cosine similarity below threshold => different author".
results_BEST_cos = {}

# The table stores cosine distance; the thresholds below are in similarity units.
train_cos_similarity = 1 - train_similarities['cosine_distance']

for threshold in np.arange(0.74, 0.82, step=0.0005):
    prediction = train_cos_similarity < threshold
    results_BEST_cos[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST_cos

{0.73999999999999999: 0.60497798848628515,
 0.74049999999999994: 0.60582458516762616,
 0.74099999999999988: 0.60667118184896718,
 0.74149999999999983: 0.606501862512699,
 0.74199999999999977: 0.6063325431764307,
 0.74249999999999972: 0.606501862512699,
 0.74299999999999966: 0.60734845919404001,
 0.74349999999999961: 0.60700982052150354,
 0.74399999999999955: 0.60717913985777172,
 0.7444999999999995: 0.60751777853030819,
 0.74499999999999944: 0.60853369454791739,
 0.74549999999999939: 0.60836437521164921,
 0.74599999999999933: 0.60887233322045375,
 0.74649999999999928: 0.60870301388418557,
 0.74699999999999922: 0.60921097189299023,
 0.74749999999999917: 0.60887233322045375,
 0.74799999999999911: 0.60904165255672194,
 0.74849999999999905: 0.60921097189299023,
 0.748999999999999: 0.60938029122925841,
 0.74949999999999894: 0.60921097189299023,
 0.74999999999999889: 0.60971892990179477,
 0.75049999999999883: 0.61056552658313579,
 0.75099999999999878: 0.60988824923806295,
 0.7514999999999987

In [30]:
# Hand-picked cosine-similarity cut-off evaluated on the validation table
# (`val` itself is deleted once the similarity tables are built).
opt_BEST_cos_threshold = 0.795

prediction = (1 - val_similarities['cosine_distance']) < opt_BEST_cos_threshold
(val_similarities['different_author'] == prediction).mean()

0.57838765008576332

### BEST braycurtis sim

In [31]:
# Per-class mean Bray-Curtis similarity (training split). The table stores the
# Bray-Curtis *distance*, so similarity is recovered as 1 - distance.
(1 - train_similarities['braycurtis_distance']).rename('bray_similarity_BEST').groupby(
    train_similarities['different_author']).mean()

different_author
False    0.644707
True     0.602949
Name: bray_similarity_BEST, dtype: float64

In [32]:
# Per-class mean Bray-Curtis similarity (validation split); similarity = 1 -
# distance. This cell previously repeated the training query verbatim.
(1 - val_similarities['braycurtis_distance']).rename('bray_similarity_BEST').groupby(
    val_similarities['different_author']).mean()

different_author
False    0.63911
True     0.60586
Name: bray_similarity_BEST, dtype: float64

In [34]:
# Training accuracy of "Bray-Curtis similarity below threshold => different author".
results_BEST_braycurtis = {}

# The table stores Bray-Curtis distance; the thresholds are in similarity units.
train_bray_similarity = 1 - train_similarities['braycurtis_distance']

for threshold in np.arange(0.60, 0.66, step=0.0025):
    prediction = train_bray_similarity < threshold
    results_BEST_braycurtis[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST_braycurtis

{0.59999999999999998: 0.63342363697934301,
 0.60249999999999992: 0.63427023366068402,
 0.60499999999999987: 0.63748730104977991,
 0.60749999999999982: 0.63934981374873012,
 0.60999999999999976: 0.64307483914663055,
 0.61249999999999971: 0.65187944463257708,
 0.61499999999999966: 0.6515408059600406,
 0.6174999999999996: 0.65408059600406365,
 0.61999999999999955: 0.65408059600406365,
 0.6224999999999995: 0.65594310870301387,
 0.62499999999999944: 0.65458855401286831,
 0.62749999999999939: 0.65797494073823226,
 0.62999999999999934: 0.65729766339315954,
 0.63249999999999929: 0.65932949542837793,
 0.63499999999999923: 0.66203860480866916,
 0.63749999999999918: 0.66085336945479178,
 0.63999999999999913: 0.65695902472062306,
 0.64249999999999907: 0.65509651202167285,
 0.64499999999999902: 0.65408059600406365,
 0.64749999999999897: 0.65221808330511344,
 0.64999999999999891: 0.64950897392482221,
 0.65249999999999886: 0.64900101591601755,
 0.65499999999999881: 0.64781578056214018,
 0.65749999999

In [40]:
# Hand-picked Bray-Curtis similarity cut-off evaluated on the validation table
# (`val` itself is deleted once the similarity tables are built).
opt_BEST_threshold_braycurtis = 0.645

prediction = (1 - val_similarities['braycurtis_distance']) < opt_BEST_threshold_braycurtis
(val_similarities['different_author'] == prediction).mean()

0.62744425385934821

### BEST canberra dist

In [57]:
# Per-class mean Canberra distance (training split). The column created for
# train_similarities is named 'canberra_distance'.
train_similarities.groupby('different_author')['canberra_distance'].mean()

different_author
False    40.764737
True     50.644562
Name: canberra_similarity_BEST, dtype: float64

In [58]:
# Per-class mean Canberra distance (validation split). This cell previously
# repeated the training query verbatim.
val_similarities.groupby('different_author')['canberra_distance'].mean()

different_author
False    41.758544
True     50.619321
Name: canberra_similarity_BEST, dtype: float64

In [59]:
# Training accuracy of "Canberra distance above threshold => different author".
results_BEST_canberra = {}

for threshold in np.arange(40, 52, step=0.5):
    # 'canberra_distance' is the column name created for train_similarities.
    prediction = train_similarities['canberra_distance'] > threshold
    results_BEST_canberra[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST_canberra

{40.0: 0.66220792414493734,
 40.5: 0.66813410091432446,
 41.0: 0.67033525228581103,
 41.5: 0.6737216390111751,
 42.0: 0.67592279038266168,
 42.5: 0.67829326109041654,
 43.0: 0.68235692516085333,
 43.5: 0.683711479850999,
 44.0: 0.68252624449712163,
 44.5: 0.68523535387741275,
 45.0: 0.68506603454114456,
 45.5: 0.68320352184219435,
 46.0: 0.68269556383338981,
 46.5: 0.68591263122248558,
 47.0: 0.68777514392143579,
 47.5: 0.6857433118862174,
 48.0: 0.68455807653234002,
 48.5: 0.68269556383338981,
 49.0: 0.67846258042668472,
 49.5: 0.67389095834744328,
 50.0: 0.67236708432102943,
 50.5: 0.66694886556044697,
 51.0: 0.6616999661361328,
 51.5: 0.65628174737555034}

In [71]:
# Hand-picked Canberra cut-off evaluated on the validation table (`val` itself
# is deleted once the similarity tables are built).
opt_BEST_threshold_canberra = 46

prediction = val_similarities['canberra_distance'] > opt_BEST_threshold_canberra
(val_similarities['different_author'] == prediction).mean()

0.66929674099485426

### BEST cityblock dist

In [76]:
# Per-class mean city-block distance (training split). The column created for
# train_similarities is named 'cityblock_distance'.
train_similarities.groupby('different_author')['cityblock_distance'].mean()

different_author
False     9.807928
True     11.533602
Name: cityblock_distance_BEST, dtype: float64

In [77]:
# Per-class mean city-block distance (validation split). `val` is deleted once
# the similarity tables are built, so the validation table is queried instead.
val_similarities.groupby('different_author')['cityblock_distance'].mean()

different_author
False    10.160012
True     11.401366
Name: cityblock_distance_BEST, dtype: float64

In [79]:
# Training accuracy of "city-block distance above threshold => different author".
results_BEST_cityblock = {}

for threshold in np.arange(9, 12, step=0.125):
    # 'cityblock_distance' is the column name created for train_similarities.
    prediction = train_similarities['cityblock_distance'] > threshold
    results_BEST_cityblock[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST_cityblock

{9.0: 0.62292583813071456,
 9.125: 0.6305452082627836,
 9.25: 0.63664070436843889,
 9.375: 0.64442939383677611,
 9.5: 0.65069420927869959,
 9.625: 0.6576363020656959,
 9.75: 0.66085336945479178,
 9.875: 0.66644090755164243,
 10.0: 0.66694886556044697,
 10.125: 0.6716898069759567,
 10.25: 0.67202844564849307,
 10.375: 0.67490687436505248,
 10.5: 0.67219776498476125,
 10.625: 0.66931933626820184,
 10.75: 0.66711818489671515,
 10.875: 0.66491703352522857,
 11.0: 0.66474771418896039,
 11.125: 0.662715882153742,
 11.25: 0.66017609210971895,
 11.375: 0.65780562140196408,
 11.5: 0.65035557060616322,
 11.625: 0.64595326786318996,
 11.75: 0.64121232644768034,
 11.875: 0.63596342702336606}

In [87]:
# Hand-picked city-block cut-off evaluated on the validation table (`val` itself
# is deleted once the similarity tables are built).
opt_BEST_threshold_cityblock = 10.5

prediction = val_similarities['cityblock_distance'] > opt_BEST_threshold_cityblock
(val_similarities['different_author'] == prediction).mean()

0.62058319039451115

---

### BEST chebyshev dist

In [125]:
# Per-class mean Chebyshev distance on the training split.
# NOTE(review): no live cell adds a 'chebyshev_distance_BEST' column to
# train_similarities (the only code creating that name is commented out) — confirm
# before re-running.
train_similarities.groupby('different_author').mean()['chebyshev_distance_BEST']

different_author
False    0.543820
True     0.584682
Name: chebyshev_distance_BEST, dtype: float64

In [126]:
# Per-class mean Chebyshev distance on the validation split.
# NOTE(review): `val` is deleted in the cell that builds val_similarities, and no
# live cell creates 'chebyshev_distance_BEST' — confirm before re-running.
val.groupby('different_author').mean()['chebyshev_distance_BEST']

different_author
False    0.551264
True     0.571469
Name: chebyshev_distance_BEST, dtype: float64

In [132]:
# Training accuracy of "Chebyshev distance above threshold => different author"
# for each candidate threshold.
# NOTE(review): depends on a 'chebyshev_distance_BEST' column that no live cell
# creates — confirm before re-running.
results_BEST_chebyshev = {}

for threshold in np.arange(0.5, 0.59, step=0.0025):
    prediction = train_similarities['chebyshev_distance_BEST'].apply(lambda x: x > threshold)
    results_BEST_chebyshev[threshold] = (train_similarities['different_author'] == prediction).mean()
    
results_BEST_chebyshev

{0.5: 0.55231967490687439,
 0.50249999999999995: 0.55299695225194723,
 0.50499999999999989: 0.55536742295970198,
 0.50749999999999984: 0.55604470030477482,
 0.50999999999999979: 0.55672197764984765,
 0.51249999999999973: 0.55756857433118867,
 0.51499999999999968: 0.55875380968506605,
 0.51749999999999963: 0.56095496105655263,
 0.51999999999999957: 0.56044700304774808,
 0.52249999999999952: 0.55858449034879787,
 0.52499999999999947: 0.55926176769387059,
 0.52749999999999941: 0.5582458516762614,
 0.52999999999999936: 0.55993904503894343,
 0.53249999999999931: 0.56112428039282081,
 0.53499999999999925: 0.56061632238401626,
 0.5374999999999992: 0.55909244835760241,
 0.53999999999999915: 0.55993904503894343,
 0.54249999999999909: 0.55943108703013888,
 0.54499999999999904: 0.55739925499492038,
 0.54749999999999899: 0.55672197764984765,
 0.54999999999999893: 0.55384354893328813,
 0.55249999999999888: 0.55265831357941075,
 0.55499999999999883: 0.55231967490687439,
 0.55749999999999877: 0.55248

In [133]:
# Hand-picked Chebyshev cut-off and its validation accuracy.
# NOTE(review): `val` is deleted earlier and the column is not created by any
# live cell — confirm before re-running.
opt_BEST_threshold_chebyshev = 0.53

prediction = val['chebyshev_distance_BEST'].apply(lambda x: x > opt_BEST_threshold_chebyshev)
(val['different_author'] == prediction).mean()

0.53138936535162951

---

### BEST minkowski_2 dist

In [134]:
# Per-class mean Minkowski (p=2) distance on the training split.
# NOTE(review): no live cell adds a 'minkowski_2_distance_BEST' column to
# train_similarities — confirm before re-running.
train_similarities.groupby('different_author').mean()['minkowski_2_distance_BEST']

different_author
False    1.501683
True     1.644552
Name: minkowski_2_distance_BEST, dtype: float64

In [135]:
# Per-class mean Minkowski (p=2) distance on the validation split.
# NOTE(review): `val` is deleted in the cell that builds val_similarities, and no
# live cell creates 'minkowski_2_distance_BEST' — confirm before re-running.
val.groupby('different_author').mean()['minkowski_2_distance_BEST']

different_author
False    1.538390
True     1.617354
Name: minkowski_2_distance_BEST, dtype: float64

In [137]:
# Training accuracy of "Minkowski (p=2) distance above threshold => different
# author" for each candidate threshold.
# NOTE(review): depends on a 'minkowski_2_distance_BEST' column that no live cell
# creates — confirm before re-running.
results_BEST_minkowski_2 = {}

for threshold in np.arange(1.48, 1.65, step=0.005):
    prediction = train_similarities['minkowski_2_distance_BEST'].apply(lambda x: x > threshold)
    results_BEST_minkowski_2[threshold] = (train_similarities['different_author'] == prediction).mean()
    
results_BEST_minkowski_2

{1.48: 0.62038604808669151,
 1.4849999999999999: 0.62004740941415515,
 1.4899999999999998: 0.62089400609549605,
 1.4949999999999997: 0.61818489671520482,
 1.4999999999999996: 0.61767693870640028,
 1.5049999999999994: 0.61699966136132744,
 1.5099999999999993: 0.61632238401625461,
 1.5149999999999992: 0.61581442600745007,
 1.5199999999999991: 0.61716898069759563,
 1.524999999999999: 0.6164917033525229,
 1.5299999999999989: 0.61429055198103621,
 1.5349999999999988: 0.6144598713173044,
 1.5399999999999987: 0.61462919065357269,
 1.5449999999999986: 0.61429055198103621,
 1.5499999999999985: 0.61479850998984087,
 1.5549999999999984: 0.61412123264476803,
 1.5599999999999983: 0.6144598713173044,
 1.5649999999999982: 0.612428039282086,
 1.5699999999999981: 0.61327463596342702,
 1.574999999999998: 0.61479850998984087,
 1.5799999999999979: 0.61310531662715884,
 1.5849999999999977: 0.61073484591940397,
 1.5899999999999976: 0.60954961056552659,
 1.5949999999999975: 0.60921097189299023,
 1.5999999999

In [147]:
# Hand-picked Minkowski (p=2) cut-off and its validation accuracy.
# NOTE(review): 1.2 lies outside the 1.48-1.65 range scanned on the training
# split; `val` is also deleted earlier — confirm both before re-running.
opt_BEST_threshold_minkowski_2 = 1.2

prediction = val['minkowski_2_distance_BEST'].apply(lambda x: x > opt_BEST_threshold_minkowski_2)
(val['different_author'] == prediction).mean()

0.516295025728988

---

### BEST minkowski_3 dist

In [148]:
# Per-class mean Minkowski (p=3) distance on the training split.
# NOTE(review): no live cell adds a 'minkowski_3_distance_BEST' column to
# train_similarities — confirm before re-running.
train_similarities.groupby('different_author').mean()['minkowski_3_distance_BEST']

different_author
False    0.889298
True     0.960285
Name: minkowski_3_distance_BEST, dtype: float64

In [149]:
# Per-class mean Minkowski (p=3) distance on the validation split.
# NOTE(review): `val` is deleted in the cell that builds val_similarities, and no
# live cell creates 'minkowski_3_distance_BEST' — confirm before re-running.
val.groupby('different_author').mean()['minkowski_3_distance_BEST']

different_author
False    0.907721
True     0.941107
Name: minkowski_3_distance_BEST, dtype: float64

In [151]:
# Training accuracy of "Minkowski (p=3) distance above threshold => different
# author" for each candidate threshold.
# NOTE(review): depends on a 'minkowski_3_distance_BEST' column that no live cell
# creates — confirm before re-running.
results_BEST_minkowski_3 = {}

for threshold in np.arange(0.86, 0.98, step=0.005):
    prediction = train_similarities['minkowski_3_distance_BEST'].apply(lambda x: x > threshold)
    results_BEST_minkowski_3[threshold] = (train_similarities['different_author'] == prediction).mean()
    
results_BEST_minkowski_3

{0.85999999999999999: 0.59210971892990183,
 0.86499999999999999: 0.59143244158482899,
 0.87: 0.59244835760243819,
 0.875: 0.59092448357602434,
 0.88: 0.59160176092109718,
 0.88500000000000001: 0.59210971892990183,
 0.89000000000000001: 0.59278699627497455,
 0.89500000000000002: 0.59143244158482899,
 0.90000000000000002: 0.58973924822214696,
 0.90500000000000003: 0.58804605485946493,
 0.91000000000000003: 0.5843210294615645,
 0.91500000000000004: 0.58449034879783268,
 0.92000000000000004: 0.58449034879783268,
 0.92500000000000004: 0.58211987809007792,
 0.93000000000000005: 0.58025736539112771,
 0.93500000000000005: 0.57737893667456819,
 0.94000000000000006: 0.57619370132069081,
 0.94500000000000006: 0.57348459194039958,
 0.95000000000000007: 0.57450050795800878,
 0.95500000000000007: 0.5743311886217406,
 0.96000000000000008: 0.57466982729427696,
 0.96500000000000008: 0.5733152726041314,
 0.97000000000000008: 0.57043684388757199,
 0.97500000000000009: 0.57009820521503551}

In [152]:
# Hand-picked Minkowski (p=3) cut-off and its validation accuracy.
# NOTE(review): `val` is deleted earlier and the column is not created by any
# live cell — confirm before re-running.
opt_BEST_threshold_minkowski_3 = 0.89

prediction = val['minkowski_3_distance_BEST'].apply(lambda x: x > opt_BEST_threshold_minkowski_3)
(val['different_author'] == prediction).mean()

0.55025728987993139

In [40]:
# Classifier inputs: the five measures kept for modelling (chebyshev and
# minkowski are not included as they perform poorly).
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [41]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler

# standard_scaler = StandardScaler()
# standard_scaler.fit(t_x)

# minmax_scaler = MinMaxScaler()
# minmax_scaler.fit(t_x)

# t_x = minmax_scaler.transform(t_x)
# v_x = minmax_scaler.transform(v_x)

In [51]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the similarity measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy = fraction of rows whose predicted label equals the true label.
print('train accuracy: ', t_y.eq(model.predict(t_x)).mean())
print('val accuracy: ', v_y.eq(model.predict(v_x)).mean())

train accuracy:  0.730782255334
val accuracy:  0.69845626072


In [44]:
# Display the coefficients learned by the logistic regression fitted on t_x.
model.coef_

array([[-5.67519843,  5.48266274,  2.52341968,  0.12660608, -0.24286138]])

In [45]:
# Peek at the first rows of the train inputs to see the scale of each measure.
t_x.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance
0,0.462521,0.2125,0.367502,50.275361,9.875315
1,0.451114,0.217858,0.378251,51.071265,10.169369
2,0.453668,0.194304,0.37583,40.643491,9.433641
3,0.444428,0.201608,0.384631,57.348183,11.515821
4,0.501943,0.164008,0.331609,61.360097,11.091895


In [46]:
from sklearn.svm import SVC

In [47]:
# Support vector classifier with default settings, trained on the same inputs.
model_svc = SVC().fit(t_x, t_y)

# Accuracy = fraction of rows whose predicted label equals the true label.
print('train accuracy: ', t_y.eq(model_svc.predict(t_x)).mean())
print('val accuracy: ', v_y.eq(model_svc.predict(v_x)).mean())

train accuracy:  0.712834405689
val accuracy:  0.670668953688


## Use the top 150 features from the selector + the top 150 from the random forest (259 feature indices in total, see `len(BEST)` below)

In [1]:
# Feature indices used for the BEST-subset experiments: `similarities` reads the
# columns 'A_<i>' and 'B_<i>' for every index i listed here.
# NOTE(review): per the section heading these come from the top 150 of the
# selector and of the random forest; the selection step itself is not shown
# in this part of the notebook -- confirm where the list was generated.
BEST = [0, 1, 2, 3, 5, 6, 7, 8, 11, 15, 16, 26, 35, 42, 45, 46, 47, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59,
        60, 63, 66, 68, 69, 81, 84, 88, 91, 95, 97, 98, 99, 103, 108, 109, 110, 111, 112, 114, 119, 120, 121,
        122, 123, 128, 130, 133, 136, 138, 139, 140, 146, 149, 152, 154, 160, 163, 166, 168, 170, 173, 174, 175,
        176, 186, 187, 189, 193, 197, 198, 199, 200, 201, 202, 203, 204, 212, 213, 219, 225, 227, 232, 233, 234,
        236, 240, 241, 242, 243, 244, 248, 251, 256, 263, 273, 276, 279, 282, 283, 288, 290, 292, 294, 299, 310,
        317, 325, 328, 330, 331, 333, 337, 341, 342, 344, 349, 350, 352, 354, 357, 362, 364, 366, 368, 379, 402,
        403, 409, 410, 416, 417, 425, 426, 428, 439, 440, 446, 451, 455, 456, 457, 458, 471, 473, 476, 482, 492,
        496, 499, 501, 502, 503, 507, 508, 512, 514, 515, 516, 521, 524, 531, 537, 543, 544, 552, 561, 562, 563,
        568, 571, 572, 589, 592, 593, 594, 603, 608, 613, 614, 627, 631, 635, 639, 641, 644, 654, 658, 659, 663,
        665, 672, 673, 674, 675, 676, 683, 684, 690, 694, 699, 704, 707, 710, 712, 722, 725, 726, 729, 732, 734,
        750, 752, 753, 754, 758, 766, 772, 773, 775, 783, 784, 789, 793, 797, 808, 812, 813, 826, 828, 835, 837,
        839, 840, 846, 849, 850, 866, 870, 882, 884, 894, 898, 899, 902, 908, 909, 914, 915, 918, 919, 925]

In [2]:
# Number of feature indices kept in BEST.
len(BEST)

259

In [6]:
def minmax(a, b):
    """Return the min-max similarity of two equally long numeric vectors.

    The score is ``sum(min(a_i, b_i)) / sum(max(a_i, b_i))``. For identical
    vectors with a non-zero sum it is exactly 1.

    Parameters
    ----------
    a, b : array-like of numbers
        The two vectors to compare; they must be broadcastable against each
        other (in this notebook they are lists of equal length).

    Returns
    -------
    float
        The similarity score, or ``nan`` when the denominator is zero
        (e.g. both vectors are empty or contain only zeros), because the
        ratio is undefined in that case.
    """
    # np.sum reduces the whole array in one vectorised call; the built-in
    # sum() would iterate over the numpy array element by element.
    numerator = np.sum(np.minimum(a, b))
    denominator = np.sum(np.maximum(a, b))

    # Make the undefined 0/0 case explicit and uniform: previously empty
    # input raised ZeroDivisionError while all-zero input produced nan
    # together with a RuntimeWarning.
    if denominator == 0:
        return np.nan

    return numerator / denominator

def similarities(vectors):
    """Compute the five similarity / distance measures for one row.

    ``vectors`` is looked up by label: for every index i in BEST the entries
    'A_<i>' and 'B_<i>' are read, giving the two vectors that are compared.

    Returns a 5-tuple in this order: minmax similarity, cosine distance,
    Bray-Curtis distance, Canberra distance, city-block distance.
    """
    labels_a = ['A_{}'.format(idx) for idx in BEST]
    labels_b = ['B_{}'.format(idx) for idx in BEST]

    side_a = [vectors[label] for label in labels_a]
    side_b = [vectors[label] for label in labels_b]

    # Order matters: callers assign column names by position.
    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(side_a, side_b) for measure in measures)

similarity_measures = ['minmax_similarity', 'cosine_distance', 'braycurtis_distance',
                       'canberra_distance', 'cityblock_distance']


def _similarity_frame(features):
    """Build the similarity-measure table for one data split.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per pair, holding the 'A_<i>' / 'B_<i>' columns read by
        `similarities` and the 'different_author' label column.

    Returns
    -------
    pandas.DataFrame
        Same index as ``features``; one column per entry of
        `similarity_measures` plus the 'different_author' label.
    """
    # `similarities` returns one tuple per row; passing it directly avoids the
    # redundant lambda wrapper.
    per_row = features.apply(similarities, axis=1)
    # Building the frame once from the list of tuples avoids creating a
    # separate pd.Series object for every row (`.apply(pd.Series)`).
    frame = pd.DataFrame(per_row.tolist(), index=features.index, columns=similarity_measures)
    frame['different_author'] = features['different_author']
    return frame


# computing train similarity measures
train_similarities = _similarity_frame(train)

# computing val similarity measures
val_similarities = _similarity_frame(val)

# The wide feature frames are dropped once the measures are computed.
# Re-running this cell afterwards requires reloading `train` and `val` first.
del train, val

In [7]:
# Peek at the first rows of the computed similarity measures and the label.
train_similarities.head()

Unnamed: 0,minmax_similarity,cosine_distance,braycurtis_distance,canberra_distance,cityblock_distance,different_author
0,0.517347,0.166647,0.31809,104.640771,20.850093,True
1,0.51232,0.168996,0.322472,105.428127,21.13288,True
2,0.455313,0.241103,0.374274,87.602779,24.214927,True
3,0.518682,0.155061,0.316932,103.651134,20.392671,False
4,0.573467,0.120926,0.271079,103.058471,19.18213,False


In [8]:
similarity_measures = [
    'minmax_similarity',
    'cosine_distance',
    'braycurtis_distance',
    'canberra_distance',
    'cityblock_distance',
]

# Train split: inputs are the selected measures, target is the author label.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
# Validation split, same layout.
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings on the BEST-subset measures.
model = LogisticRegression().fit(t_x, t_y)

# Accuracy = fraction of rows whose predicted label equals the true label.
print('train accuracy: ', t_y.eq(model.predict(t_x)).mean())
print('val accuracy: ', v_y.eq(model.predict(v_x)).mean())

train accuracy:  0.738232306129
val accuracy:  0.714922813036


In [10]:
# Display the coefficients learned by the logistic regression fitted on t_x.
model.coef_

array([[-6.97790263,  4.05926716,  1.44823056,  0.11080992, -0.12487768]])

### Canberra distance on the BEST features: single-threshold classifier

In [11]:
# Mean Canberra distance per class on the train split. The column is selected
# before aggregating so only that column is averaged, not every column.
train_similarities.groupby('different_author')['canberra_distance'].mean()

different_author
False    84.338491
True     97.998915
Name: canberra_distance, dtype: float64

In [13]:
# Mean Canberra distance per class on the validation split. The column is
# selected before aggregating so only that column is averaged.
val_similarities.groupby('different_author')['canberra_distance'].mean()

different_author
False    85.601155
True     97.913699
Name: canberra_distance, dtype: float64

In [14]:
# Train accuracy of the rule "different author if canberra_distance > threshold"
# for every candidate threshold from 83 up to (but excluding) 99, in steps of 0.5.
results_BEST_canberra = {}

for threshold in np.arange(83, 99, step=0.5):
    # Vectorised comparison over the whole column; yields the same boolean
    # Series as applying a lambda to each element, without the Python loop.
    prediction = train_similarities['canberra_distance'] > threshold
    results_BEST_canberra[threshold] = (train_similarities['different_author'] == prediction).mean()

results_BEST_canberra

{83.0: 0.67795462241788007,
 83.5: 0.67778530308161189,
 84.0: 0.68405011852353537,
 84.5: 0.68912969861158146,
 85.0: 0.69268540467321371,
 85.5: 0.69844226210633253,
 86.0: 0.70149001015916013,
 86.5: 0.70098205215035558,
 87.0: 0.70470707754825601,
 87.5: 0.70944801896376564,
 88.0: 0.70978665763630211,
 88.5: 0.7138503217067389,
 89.0: 0.71554351506942093,
 89.5: 0.71605147307822559,
 90.0: 0.71655943108703013,
 90.5: 0.71757534710463933,
 91.0: 0.71808330511344398,
 91.5: 0.71706738909583478,
 92.0: 0.71622079241449377,
 92.5: 0.71672875042329831,
 93.0: 0.71808330511344398,
 93.5: 0.71435827971554355,
 94.0: 0.71097189299017949,
 94.5: 0.70860142228242462,
 95.0: 0.70606163223840157,
 95.5: 0.7028445648493058,
 96.0: 0.69641043007111414,
 96.5: 0.69437859803589574,
 97.0: 0.69319336268201825,
 97.5: 0.69116153064679986,
 98.0: 0.68625126989502205,
 98.5: 0.68303420250592617}

In [21]:
# 91 ties with 93 for the highest train accuracy in the sweep printed above.
opt_BEST_threshold_canberra = 91

# Predict "different author" whenever the distance exceeds the threshold.
# The comparison is vectorised over the whole column instead of going through
# a Python-level lambda per element; the resulting boolean Series is the same.
prediction = val_similarities['canberra_distance'] > opt_BEST_threshold_canberra
# Validation accuracy: share of rows where the prediction matches the label.
(val_similarities['different_author'] == prediction).mean()

0.69433962264150939