In [1]:
import numpy as np
import pandas as pd
import time
import sys
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('../feature/train_nn.csv')
test = pd.read_csv('../feature/test_nn.csv')

In [3]:
train.shape,test.shape

((73147, 315), (31349, 314))

In [4]:
y = train.is_pass.values
train.drop(['id', 'is_pass'], inplace=True, axis=1)
X = train.values

x, x_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout
from keras.regularizers import l2

import tensorflow as tf

from sklearn.metrics import roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
def get_model(input_shape, layers, dropout = 0.2, regularization = 1e-4):
    input = Input(shape=(input_shape,))

    layer = input
    for layer_dim in layers:
        layer = Dense(layer_dim, activation = 'relu',
                      W_regularizer=l2(regularization),
                      b_regularizer=l2(regularization))(layer)
        #layer = Dropout(dropout)(layer)
    
    layer = Dropout(dropout)(layer)
    result = Dense(1, activation = 'sigmoid')(layer)


    model = Model(input=input, output=result)
    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=[auc])

    return model

In [7]:
nn_layers = [128,64,32]
train_epoches = 200
batch_size = 512

In [8]:
from keras import backend as K  
def auc(y_true, y_pred):  
    ptas = tf.stack([binary_PTA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)  
    pfas = tf.stack([binary_PFA(y_true,y_pred,k) for k in np.linspace(0, 1, 1000)],axis=0)  
    pfas = tf.concat([tf.ones((1,)) ,pfas],axis=0)  
    binSizes = -(pfas[1:]-pfas[:-1])  
    s = ptas*binSizes  
    return K.sum(s, axis=0)  

def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):  
    y_pred = K.cast(y_pred >= threshold, 'float32')  
    # N = total number of negative labels  
    N = K.sum(1 - y_true)  
    # FP = total number of false alerts, alerts from the negative class labels  
    FP = K.sum(y_pred - y_pred * y_true)  
    return FP/N  

def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):  
    y_pred = K.cast(y_pred >= threshold, 'float32')  
    # P = total number of positive labels  
    P = K.sum(y_true)  
    # TP = total number of correct alerts, alerts from the positive class labels  
    TP = K.sum(y_pred * y_true)  
    return TP/P  

In [9]:
model = get_model(x.shape[1], nn_layers)
model.summary()

  
  
  
  
  from ipykernel import kernelapp as app


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 313)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               80384     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
__________

In [10]:
callback = EarlyStopping("auc", patience=10, verbose=0, mode='auto')
model.fit(
    x, y,
    nb_epoch=train_epoches, batch_size=batch_size,
    validation_data=(x_test, y_test),
    callbacks=[callback], verbose = 1)

# Calculate total roc auc score
score = roc_auc_score(y_test, model.predict(x_test))
print("Total roc auc score = {0:0.4f}".format(score))

  


Train on 58517 samples, validate on 14630 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Total roc auc score = 0.7457


In [11]:
ids = test['id'].values
test.drop('id', inplace=True, axis=1)

In [12]:
def make_submission(model, ids, X_test,
                    submission_file_template = "../output/submission_nn_{}.csv"):
    submission = pd.DataFrame()
    submission["id"] = ids
    submission["is_pass"] = model.predict(X_test)
    filename = submission_file_template.format(time.strftime("%Y-%m-%d_{0:0.4f}".format(score)))
    submission.to_csv(filename, index=None)

In [13]:
make_submission(model,ids,test)