In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Load the input table from a CSV file in the working directory.
# Every later cell reads from (and rebinds) this `df`.
df=pd.read_csv('df_1000.csv')

In [3]:
# Replace each of the two flag columns with the integer codes produced by
# pd.factorize; the matching unique values are kept in `uniques`.
# NOTE(review): pd.factorize marks missing values with -1 by default -
# confirm these columns contain no NaNs.
for bin_feature in ('FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    codes, uniques = pd.factorize(df[bin_feature])
    df[bin_feature] = codes

In [4]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; ``pd.get_dummies`` returns a new one.
    nan_as_category : bool, default True
        Passed to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input frame, in the order they appear in the result.
    """
    columns_before = set(df.columns)

    # Only columns whose dtype compares equal to 'object' are encoded.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

In [5]:
# One-hot encode the object-dtype columns (no extra NaN indicator columns);
# `cat_cols` receives the names of the newly created dummy columns.
df, cat_cols = one_hot_encoder(df=df, nan_as_category=False)

In [6]:
# Names of the FLAG_DOCUMENT_<n> columns to discard, generated from the
# document numbers (order is kept exactly as listed here).
USELESS_COLUMNS = list(map('FLAG_DOCUMENT_{}'.format,
                           (10, 12, 13, 14, 15, 16, 17, 19, 2, 20, 21)))

In [7]:
# Remove the columns listed in USELESS_COLUMNS from the frame.
df = df.drop(labels=USELESS_COLUMNS, axis=1)

In [8]:
# Turn +/- infinity into NaN so they are counted and imputed as missing values.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [9]:
# Display (rows, columns) of the frame after encoding and column removal.
df.shape

(356255, 1130)

In [10]:
# Per-column missing-value summary: absolute count and percentage of rows,
# each sorted from most to least missing, combined side by side.
total = df.isnull().sum().sort_values(ascending=False)
percent = (
    df.isnull().sum()
    .div(df.isnull().count())
    .mul(100)
    .sort_values(ascending=False)
)
missing_df = pd.concat(objs=[total, percent], axis=1, keys=['Total', 'Percent'])

In [12]:
# Show the first ten rows of the missing-value summary.
missing_df.head(n=10)

Unnamed: 0,Total,Percent
last_1_instalment_paid_late_in_days_std_y,356255,100.0
last_1_NUM_INSTALMENT_VERSION_std_y,356255,100.0
last_1_instalment_paid_over_amount_std_x,356255,100.0
last_1_instalment_paid_late_in_days_std_x,356255,100.0
last_1_NUM_INSTALMENT_VERSION_std_x,356255,100.0
last_1_instalment_paid_over_amount_std_y,356255,100.0
previous_application_days_first_drawing_last_1_credits_mean,344988,96.837378
previous_application_days_first_drawing_last_3_credits_mean,321799,90.328276
previous_application_days_first_drawing_last_5_credits_mean,309272,86.811975
credit_card_total_instalments,301644,84.670812


In [13]:
# Target vector.
y = df['TARGET']

# Feature columns: everything except the target, the id/index columns and
# the six `last_1_*_std_*` columns excluded by name below.
feats = [
    col for col in df.columns
    if col not in (
        'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
        'last_1_instalment_paid_late_in_days_std_y',
        'last_1_NUM_INSTALMENT_VERSION_std_y',
        'last_1_instalment_paid_over_amount_std_x',
        'last_1_instalment_paid_late_in_days_std_x',
        'last_1_NUM_INSTALMENT_VERSION_std_x',
        'last_1_instalment_paid_over_amount_std_y',
    )
]
X = df[feats]
print("X shape:", X.shape, "y shape:", y.shape)

('X shape:', (356255, 1121), 'y shape:', (356255,))


In [None]:
print("\nPreparing data...")

# Fill gaps with each column's median, then bound extreme magnitudes.
X = X.fillna(X.median()).clip(lower=-1e11, upper=1e11)

# NOTE(review): the medians and the scaler are both fitted on every row,
# including the rows without a target - confirm this is intended.
scaler = MinMaxScaler().fit(X)

# Rows with a target value are used for training; rows without one are
# the ones to predict.
training = y.notnull()
testing = ~training

X_train = scaler.transform(X.loc[training])
X_test = scaler.transform(X.loc[testing])
y_train = np.array(y.loc[training])
print( X_train.shape, X_test.shape, y_train.shape )


Preparing data...


In [None]:
print( 'Setting up neural network...' )

# Fully connected binary classifier:
#   input -> 400 -> 160 -> 64 -> 26 -> 12 -> 1 (sigmoid)
# Every hidden layer uses a PReLU activation followed by 30% dropout; all
# hidden layers except the first also apply batch normalization before dropout.
nn = Sequential()

# The input width is read from the prepared training matrix instead of being
# hard-coded, so the model keeps working when the feature list changes.
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = X_train.shape[1]))
nn.add(PReLU())
nn.add(Dropout(.3))

# Remaining hidden layers share one Dense/PReLU/BatchNorm/Dropout pattern.
for hidden_units in (160, 64, 26, 12):
    nn.add(Dense(units = hidden_units, kernel_initializer = 'normal'))
    nn.add(PReLU())
    nn.add(BatchNormalization())
    nn.add(Dropout(.3))

# Single sigmoid unit, trained with binary cross-entropy.
nn.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
nn.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
print( 'Fitting neural network...' )
# Train for 50 epochs, holding out 10% of the training rows for validation.
nn.fit(x=X_train, y=y_train, epochs=50, verbose=2, validation_split=0.1)

print( 'Predicting...' )
# Flatten the predictions to one dimension and bound them to [0, 1].
y_pred = np.clip(nn.predict(X_test).flatten(), 0, 1)

In [None]:
print( 'Saving results...' )
# Pair the ids of the rows without a target with their predicted scores.
sub = pd.DataFrame({'SK_ID_CURR': df.loc[testing, 'SK_ID_CURR']})
sub['TARGET'] = y_pred
sub.to_csv('sub_nn.csv', columns=['SK_ID_CURR', 'TARGET'], index=False)

print( sub.head() )