In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, braycurtis, canberra, cityblock, chebyshev, minkowski
import sklearn
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest

In [2]:
# Standard-scaled feature tables for the training and validation splits;
# the first CSV column is used as the row index.
train, val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/train_features_scaled_standard.csv',
        '../data/validation_features_scaled_standard.csv',
    )
)

In [3]:
# Min-max-scaled counterparts of the same two splits, loaded the same way.
train_minmax, val_minmax = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/train_features_scaled_minmax.csv',
        '../data/validation_features_scaled_minmax.csv',
    )
)

In [4]:
# One column is set aside (presumably the `different_author` label) and the
# remaining columns are expected to split evenly into an A half and a B half.
# divmod keeps the remainder so an odd column count is reported instead of
# being silently truncated by int(x / 2).
number_of_features, unpaired_columns = divmod(train.shape[1] - 1, 2)
assert unpaired_columns == 0, 'feature columns do not split evenly into A/B halves'
assert number_of_features == 938

In [5]:
# Feature table indexed by feature identifier (first CSV column); it must
# list exactly the 938 features expected elsewhere in the notebook.
features = pd.read_csv(
    '../data/features/prunned_RF_scores.csv',
    index_col=0,
)
assert len(features.index) == 938

In [6]:
def minmax(a, b):
    """Min-max similarity of two equal-length vectors.

    Computes sum(min(a_i, b_i)) / sum(max(a_i, b_i)).

    Parameters
    ----------
    a, b : array-like
        Numeric vectors of the same length (lists, Series or arrays).

    Returns
    -------
    The ratio of the summed element-wise minima to the summed element-wise
    maxima. When the maxima sum to zero the ratio is undefined and NumPy's
    division semantics apply.
    """
    # Reduce with NumPy instead of the builtin `sum`, which would iterate the
    # arrays element by element in the interpreter. axis=0 keeps the same
    # reduction axis the builtin used.
    lower = np.sum(np.minimum(a, b), axis=0)
    upper = np.sum(np.maximum(a, b), axis=0)
    return lower / upper

def similarities(vectors, features):
    """Compare the A and B halves of one sample over the given features.

    `vectors` is looked up by labels of the form 'A_<feature>' and
    'B_<feature>'; `features` is the collection of feature identifiers
    to include.

    Returns a 5-tuple, in this order: min-max similarity, cosine distance,
    Bray-Curtis distance, Canberra distance and city-block distance.
    """
    a = [vectors['A_{}'.format(name)] for name in features]
    b = [vectors['B_{}'.format(name)] for name in features]

    # Order here fixes the order of the returned tuple.
    measures = (minmax, cosine, braycurtis, canberra, cityblock)
    return tuple(measure(a, b) for measure in measures)

## all features, standard scaling

In [7]:
best_features = features.index
similarity_measures = ['minmax_similarity', 'cosine_distance', 'braycurtis_distance',
                       'canberra_distance', 'cityblock_distance']


def _similarity_frame(pairs, selected_features):
    """Build the similarity-measure table for one split.

    Each row of `pairs` is passed to `similarities` together with
    `selected_features`; the resulting tuples become the columns named in
    `similarity_measures`, and the `different_author` column of `pairs`
    is copied across. The result keeps the index of `pairs`.
    """
    # Collect the per-row tuples first and build the DataFrame once; this
    # avoids constructing a separate Series per row via `.apply(pd.Series)`.
    rows = [similarities(vectors, selected_features) for _, vectors in pairs.iterrows()]
    frame = pd.DataFrame(rows, index=pairs.index, columns=similarity_measures)
    frame['different_author'] = pairs['different_author']
    return frame


# Same computation for both splits, so it goes through one helper.
train_similarities = _similarity_frame(train, best_features)
val_similarities = _similarity_frame(val, best_features)

In [8]:
# Inputs are the similarity-measure columns; the target is `different_author`.
t_x, t_y = train_similarities[similarity_measures], train_similarities['different_author']
v_x, v_y = val_similarities[similarity_measures], val_similarities['different_author']

In [9]:
# Two-step model: standardise the similarity measures, then an SVC with
# probability estimates switched on.
# The optional 'var_filter' (VarianceThreshold) and 'selector' (SelectKBest)
# steps are currently left out of the pipeline.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', SVC(probability=True)),
])

In [10]:
# Fit on the training and validation rows stacked together.
pipe.fit(X=pd.concat([t_x, v_x]), y=pd.concat([t_y, v_y]))

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [11]:
# Mean accuracy on the training rows. The pipeline was fit on train plus
# validation, so this is an in-sample score, not a held-out estimate.
pipe.score(X=t_x, y=t_y)

0.7814087368777515

In [12]:
# Import joblib explicitly: `sklearn.externals.joblib` is only reachable by
# attribute access if some other import happened to load that submodule.
# The vendored copy is tried first so the notebook behaves as before where it
# exists; scikit-learn releases that no longer ship it fall back to the
# standalone `joblib` package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Persist the fitted pipeline; the call returns the list of written file names.
joblib.dump(pipe, '../data/models/svm.pk')

['../data/models/svm.pk']