Import modules

In [1]:
import sys
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Put the parent directory on the module search path. The entry is relative,
# so it is resolved against the kernel's current working directory.
# NOTE(review): no project-local import is visible in this part of the
# notebook — confirm this line is still needed.
sys.path.append("..")

Load train / val / imposters as vectors

In [2]:
# Read the three feature tables; the first CSV column becomes the index of each.
train, val, imposters = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/train_features.csv',
        '../data/validation_features.csv',
        '../data/imposters/imposters_subset_features.csv',
    )
)

Check feature counts, then split train and val into A and B vectors

In [6]:
# Length of a single feature vector.
number_of_features = 938

# Pair tables hold two vectors per row plus one extra column; the imposters
# table holds a single vector per row.
paired_width = 2 * number_of_features + 1
for paired_frame in (train, val):
    assert paired_frame.shape[1] == paired_width
assert imposters.shape[1] == number_of_features

In [9]:
# Column labels for the A side, the B side and the imposters vectors.
a_columns = ['A_' + str(idx) for idx in range(number_of_features)]
b_columns = ['B_' + str(idx) for idx in range(number_of_features)]
imposters_columns = ['I_' + str(idx) for idx in range(number_of_features)]

# Slice each pair table into its two halves.
train_A, train_B = train[a_columns], train[b_columns]
val_A, val_B = val[a_columns], val[b_columns]

Fit standard and min-max scalers on the train A vectors

In [8]:
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the raw array rather than on the DataFrame: these scalers are also
# applied to B-side and imposter vectors whose column labels differ from A_*,
# and scikit-learn releases that record feature names at fit time reject
# mismatching labels on transform.
# NOTE(review): only the A-side train vectors feed the fit — confirm that
# leaving train_B out is intentional.
train_A_values = train_A.values

standard_scaler.fit(train_A_values)
min_max_scaler.fit(train_A_values)

MinMaxScaler(copy=True, feature_range=(0, 1))

Transform train / val / imposters using fitted scalers

In [10]:
def _scale_pair_frame(scaler, side_a, side_b, labels):
    """Scale both sides of a pair table with ``scaler`` and re-attach the labels.

    scaler: fitted object exposing ``transform``.
    side_a, side_b: DataFrames holding the A-side and B-side vectors, row for row.
    labels: Series appended as the last column; it must share its index with
        ``side_a`` / ``side_b`` (true when all three come from the same table).

    Returns a DataFrame with ``a_columns`` + ``b_columns`` + the label column,
    indexed like the inputs.
    """
    # Raw arrays are handed to the scaler so that the A_* / B_* labels cannot
    # clash with feature names the scaler may have recorded while fitting.
    scaled_a = pd.DataFrame(scaler.transform(side_a.values),
                            columns=a_columns, index=side_a.index)
    scaled_b = pd.DataFrame(scaler.transform(side_b.values),
                            columns=b_columns, index=side_b.index)
    # Reusing the source index keeps the axis=1 concat row-aligned with the
    # labels even when the CSV index is not a plain 0..n-1 range; a fresh
    # default index would misalign the rows and introduce NaNs in that case.
    return pd.concat([scaled_a, scaled_b, labels], axis=1)


def _scale_imposters(scaler, vectors):
    """Scale imposter vectors; columns become ``imposters_columns``, index is a fresh default range."""
    return pd.DataFrame(scaler.transform(vectors.values), columns=imposters_columns)


# Standard Scaler

train_standard_scaled = _scale_pair_frame(standard_scaler, train_A, train_B,
                                          train['different_author'])
val_standard_scaled = _scale_pair_frame(standard_scaler, val_A, val_B,
                                        val['different_author'])
imposters_standard_scaled = _scale_imposters(standard_scaler, imposters)


# MinMax Scaler

train_minmax_scaled = _scale_pair_frame(min_max_scaler, train_A, train_B,
                                        train['different_author'])
val_minmax_scaled = _scale_pair_frame(min_max_scaler, val_A, val_B,
                                      val['different_author'])
imposters_minmax_scaled = _scale_imposters(min_max_scaler, imposters)

Write scaled data

In [11]:
# (frame, destination) pairs, written in the listed order.
scaled_outputs = [
    (train_standard_scaled, '../data/train_features_scaled_standard.csv'),
    (val_standard_scaled, '../data/validation_features_scaled_standard.csv'),
    (imposters_standard_scaled, '../data/imposters/imposters_scaled_standard.csv'),
    (train_minmax_scaled, '../data/train_features_scaled_minmax.csv'),
    (val_minmax_scaled, '../data/validation_features_scaled_minmax.csv'),
    (imposters_minmax_scaled, '../data/imposters/imposters_scaled_minmax.csv'),
]

for scaled_frame, csv_path in scaled_outputs:
    scaled_frame.to_csv(csv_path)

Serialize scalers

In [12]:
import dill

# Persist both fitted scalers, one binary file per scaler.
for scaler_path, fitted_scaler in (
    ('../data/standard_scaler.pk', standard_scaler),
    ('../data/minmax_scaler.pk', min_max_scaler),
):
    with open(scaler_path, 'wb') as scaler_file:
        dill.dump(fitted_scaler, scaler_file)