In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import h5py
from scipy.spatial.distance import euclidean
from sklearn.metrics import pairwise_distances
from imblearn.under_sampling import ClusterCentroids
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier as XGB
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.layers.advanced_activations import LeakyReLU

Using TensorFlow backend.


### Open file

Any of the files saved by 'decomposition.ipynb' can be opened here.

In [2]:
# Load the decomposed feature matrix and its labels from the HDF5 file that
# lives under <current working directory>/data.
path = os.getcwd()
filename = '/data/decomposed_tfidf_GMM.h5'  # decomposed_BOW_{decomposition}.h5 if BOW encoding

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed inside it.
with h5py.File(path + filename, 'r') as h5file:
    X_train = h5file['train'][:]   # features
    Y_train = h5file['target'][:]  # labels

print(X_train.shape)
print(Y_train.shape)

(10868, 402)
(10868,)


### Downsample the dataset

The original idea was to iteratively remove items that are too similar to ones already in the dataset (i.e. whose similarity exceeds some threshold).

In [3]:
# NOTE: abandoned first attempt, kept for reference only -- nothing in this cell runs.
# NOTE(review): the two comprehensions below iterate over the label values in
# Y_train and use each label as a row index into X_train, so they would not
# select the minority/majority rows as intended; `remove_similar` is only a stub.
# minority = np.array([X_train[row, :] for row in Y_train if row == 1])
# majority = np.array([X_train[row, :] for row in Y_train if row == 0])
# minority.shape, majority.shape

# threshold = 0.9  # how similar can be two records between each other

# def remove_similar(sim_matrix, threshold):
#     pass

# similarity_matrix = 1 - pairwise_distances(X_train, metric="cosine")
# truncated = remove_similar(similarity_matrix, threshold)

In the end, I decided to use a function from 'imblearn' that clusters similar items in the majority class.

In [4]:
# Under-sample with imblearn's ClusterCentroids; ratio='majority' targets the
# majority class only, and random_state=42 keeps the result repeatable.
# NOTE(review): `ratio` and `fit_sample` look like the older imblearn spellings;
# newer releases use `sampling_strategy` and `fit_resample` -- confirm against
# the installed version before upgrading.
cc = ClusterCentroids(ratio='majority', random_state=42)
X_down, Y_down = cc.fit_sample(X_train, Y_train)
# Display the shapes of the down-sampled features and labels.
X_down.shape, Y_down.shape

((5202, 402), (5202,))

### Normalize the data

In [5]:
# Standardize the down-sampled features with StandardScaler.
# NOTE(review): the scaler is fitted on all of X_down, i.e. before train_model
# performs its train/test split, so held-out rows contribute to the scaling
# statistics.
ss = StandardScaler()
X_norm = ss.fit_transform(X_down)

### Classification

In [6]:
def train_model(X, Y, model):
    """Fit ``model`` on a train split of (X, Y) and print hold-out metrics.

    The data is split 85% / 15% with a fixed ``random_state`` of 42. After
    fitting, two lines are printed for the 15% hold-out part: the ROC AUC
    computed from the second column of ``predict_proba`` and the accuracy
    of ``predict``.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and ``model.fit``.
    Y : labels aligned with the rows of ``X``.
    model : estimator exposing ``fit``, ``predict_proba`` and ``predict``.

    Returns
    -------
    None; the estimator passed in is fitted in place.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, Y, test_size=0.15, random_state=42
    )
    model.fit(features_fit, labels_fit)

    # Column 1 of predict_proba is used as the score for the AUC.
    positive_scores = model.predict_proba(features_eval)[:, 1]
    print('AUC', roc_auc_score(labels_eval, positive_scores))

    hard_labels = model.predict(features_eval)
    print('Accuracy', accuracy_score(labels_eval, hard_labels))
    
    
def train_all_models(X, Y):
    """Train and report every classifier in the comparison on (X, Y).

    For each classifier its name is printed, the model is built and handed to
    ``train_model`` (which prints AUC and accuracy), and a separator line is
    printed. The Keras MLP is trained last through ``train_MLP``.

    Parameters
    ----------
    X : feature matrix forwarded unchanged to the training helpers.
    Y : labels aligned with the rows of ``X``.
    """
    separator = '---------\n'

    # (printed label, zero-argument factory). Factories are used so that each
    # estimator is constructed right before it is trained.
    candidates = (
        ('Logistic Regression', lambda: LR()),
        ('Linear Discriminant Analysis', lambda: LDA()),
        ('AdaBoost', lambda: ABC(n_estimators=300)),
        ('XGBoost', lambda: XGB(n_estimators=300)),
        ('Random Forest', lambda: RF(n_estimators=300)),
    )

    for label, build in candidates:
        print(label)
        train_model(X, Y, build())
        print(separator)

    # The neural network has its own training routine.
    print('MLP')
    train_MLP(X, Y)
    print(separator)

In [7]:
def train_MLP(X_train, Y_train):
    """Train a small Keras binary classifier on the given data.

    The network is Dense(32) -> LeakyReLU(0.01) -> Dropout(0.05) -> Dense(1)
    -> sigmoid, compiled with binary cross-entropy and the Adam optimizer.
    Training runs for at most 10 epochs with batch size 16; the last 15% of
    the samples are held out via ``validation_split`` and training stops early
    when ``val_loss`` fails to improve by at least 0.01 for 2 epochs.

    Parameters
    ----------
    X_train : 2-D array of features; its second dimension sets the width of
        the network input.
    Y_train : labels aligned with the rows of ``X_train``.

    Returns
    -------
    None; progress is reported through Keras' own verbose output.
    """
    np.random.seed(14)  # fix the random numbers generator state

    batch_size = 16
    # Take the input width from the data actually passed in. Reading it from
    # the notebook-level X_down only worked while every dataset happened to
    # have the same number of columns, and failed if X_down was not defined.
    input_shape = X_train.shape[1]
    nb_epochs = 10
    nb_classes = 1  # single sigmoid output unit for the binary target
    dropout = 0.05
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=2, verbose=1)

    model = Sequential()
    model.add(Dense(32, input_dim=input_shape))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(dropout))
    model.add(Dense(nb_classes))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['binary_accuracy'], optimizer='adam')

    model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epochs, verbose=1,
              callbacks=[early_stopping], validation_split=0.15)

In [8]:
# Evaluate every model on the ClusterCentroids down-sampled data (not scaled).
train_all_models(X_down, Y_down)

Logistic Regression
AUC 0.9540232146370253
Accuracy 0.882202304737516
---------

Linear Discriminant Analysis




AUC 0.9586136795855466
Accuracy 0.9026888604353394
---------

AdaBoost
AUC 0.9599383566135484
Accuracy 0.9065300896286812
---------

XGBoost
AUC 0.9807987409010426


  if diff:


Accuracy 0.9295774647887324
---------

Random Forest
AUC 0.9751524690143616
Accuracy 0.9180537772087067
---------

MLP
Train on 4421 samples, validate on 781 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 00008: early stopping
---------



In [9]:
# Evaluate every model on the whole dataset as loaded from the HDF5 file
# (no down-sampling, no scaling).
train_all_models(X_train, Y_train)

Logistic Regression
AUC 0.9063779143498196
Accuracy 0.8393623543838136
---------

Linear Discriminant Analysis




AUC 0.9080772607550482
Accuracy 0.8583690987124464
---------

AdaBoost
AUC 0.8780119012779241
Accuracy 0.8479460453709381
---------

XGBoost
AUC 0.9103267973856208
Accuracy 0.8632740649908032
---------

Random Forest


  if diff:


AUC 0.8823587942639743
Accuracy 0.8362967504598406
---------

MLP
Train on 9237 samples, validate on 1631 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 00005: early stopping
---------



In [10]:
# Evaluate every model on the down-sampled data after StandardScaler normalization.
train_all_models(X_norm, Y_down)

Logistic Regression
AUC 0.9397468686471244
Accuracy 0.8758002560819462
---------

Linear Discriminant Analysis




AUC 0.9586136795855466
Accuracy 0.9026888604353394
---------

AdaBoost
AUC 0.9624696701423043
Accuracy 0.9039692701664532
---------

XGBoost
AUC 0.9747262115548561
Accuracy 0.9206145966709347
---------

Random Forest


  if diff:


AUC 0.9723326119745557
Accuracy 0.9090909090909091
---------

MLP
Train on 4421 samples, validate on 781 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 00004: early stopping
---------

