In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

import xgboost as xgb

from multiprocessing import Pool, cpu_count


from keras.layers import Dense, Input, BatchNormalization, Embedding, Concatenate, Flatten, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam

# from tqdm import tqdm

# !python -c "import keras; print(keras.__version__)"
!python -c "import torch; print(torch.__version__) "

# Read/Merge Datasets

In [None]:
%%time

def get_datasets(data_dir='../input'):
    """Read the transaction/identity CSVs and return the train/test frames.

    Each transaction table is left-joined with its identity table on the
    ``TransactionID`` index, so transactions without identity data are kept
    (their identity columns are NaN). No NaN filling happens here.

    Args:
        data_dir: Directory holding ``train_transaction.csv``,
            ``train_identity.csv``, ``test_transaction.csv`` and
            ``test_identity.csv``. Defaults to ``'../input'``.

    Returns:
        tuple: ``(X_train, y_train, X_test)`` where ``X_train`` is the merged
        training frame without the ``isFraud`` column, ``y_train`` is that
        column as a Series, and ``X_test`` is the merged test frame.
    """
    def read_table(file_name):
        # Every table is keyed by TransactionID.
        return pd.read_csv(os.path.join(data_dir, file_name), index_col='TransactionID')

    train = read_table('train_transaction.csv').merge(
        read_table('train_identity.csv'), how='left', left_index=True, right_index=True)
    test = read_table('test_transaction.csv').merge(
        read_table('test_identity.csv'), how='left', left_index=True, right_index=True)

    # Shapes are reported before the target is split off, so the train shape
    # still counts the isFraud column.
    print(train.shape)
    print(test.shape)

    # pop() removes the target column in place, which avoids building a second
    # copy of the (wide) training frame just to drop one column.
    y_train = train.pop('isFraud')

    return train, y_train, test


# Load the merged frames once; the cells below reorder, fill and encode X_train / X_test.
X_train, y_train, X_test = get_datasets()

# Move Categorical Columns to the Left

In [None]:
# Names of the columns handled as categorical features:
# product/card/address/email/device fields, id_01..id_38 and M1..M9.
categorical = [
    "ProductCD",
    "card1", "card2", "card3", "card4", "card5", "card6",
    "addr1", "addr2",
    "P_emaildomain", "R_emaildomain",
    "DeviceInfo", "DeviceType",
    # id columns are zero-padded to two digits: id_01 ... id_38
    *("id_{:02d}".format(i) for i in range(1, 39)),
    *("M{}".format(i) for i in range(1, 10)),
]

print(categorical)

In [None]:
def get_numerical_columns(categorical_columns, df):
    """Return the column names of ``df`` that are not in ``categorical_columns``.

    The result is a list that keeps the column order of ``df``.
    """
    numerical_columns = []
    for column in df.columns.values:
        if column in categorical_columns:
            continue
        numerical_columns.append(column)
    return numerical_columns


# Every X_train column that is not listed in `categorical` is treated as numerical.
numerical = get_numerical_columns(categorical, X_train)

print(numerical)

In [None]:
%%time
def move_columns_to_left(left_columns, df):
    """Return ``df`` with ``left_columns`` first, followed by all other columns.

    ``left_columns`` must be a list of existing column names; the remaining
    columns keep their original relative order. ``df`` itself is not modified.
    """
    remaining = []
    for column in df.columns.values:
        if column not in left_columns:
            remaining.append(column)

    reordered = left_columns + remaining
    return df[reordered]

    
# Apply the same reordering to both frames: the `categorical` columns come first.
X_train = move_columns_to_left(categorical, X_train)
X_test = move_columns_to_left(categorical, X_test)

# Fill NaN

In [None]:
%%time
def fill_nan_categorical(categorical_columns, df):
    """Replace missing values in the given columns of ``df`` with -999, in place.

    Each listed column is reassigned on ``df``; nothing is returned.
    """
    sentinel = -999
    for column in categorical_columns:
        filled = df[column].fillna(sentinel)
        df[column] = filled
        
        
def fill_nan_numerical(numerical_columns, df, default=0):
    """Fill missing values in the given columns of ``df`` with the column mean, in place.

    The mean is truncated to an integer with ``int()`` before filling.

    Args:
        numerical_columns: Names of the columns to fill.
        df: DataFrame to modify; each listed column is reassigned on it.
        default: Fill value used for a column whose mean is NaN (the column
            is empty or entirely missing).

    Returns:
        None.
    """
    for column in numerical_columns:
        column_mean = df[column].mean()
        # A column with no observed values has a NaN mean, and int(NaN) raises
        # ValueError; fall back to `default` instead of crashing.
        fill_value = default if pd.isna(column_mean) else int(column_mean)
        df[column] = df[column].fillna(fill_value)
        
        
def fill_nan(categorical_columns, numerical_columns, df):
    """Fill missing values of ``df`` in place: categorical columns first, then numerical.

    Delegates to ``fill_nan_categorical`` and ``fill_nan_numerical``; returns None.
    """
    fill_nan_categorical(categorical_columns=categorical_columns, df=df)
    fill_nan_numerical(numerical_columns=numerical_columns, df=df)
    
    
# Fill missing values in place. Each frame is processed independently, so the
# numerical fill values are the means of the frame being filled.
fill_nan(categorical, numerical, X_train)
fill_nan(categorical, numerical, X_test)

In [None]:
# Preview the first rows of X_train.
X_train.head()

In [None]:
%%time
# train_transaction = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
# test_transaction = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')

# train_identity = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
# test_identity = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')

# sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

# train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
# test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

# print(train.shape)
# print(test.shape)

# y_train = train['isFraud'].copy()
# del train_transaction, train_identity, test_transaction, test_identity

# # Drop target, fill in NaNs
# X_train = train.drop('isFraud', axis=1)
# X_test = test.copy()

# del train, test

# X_train = X_train.fillna(-999)
# X_test = X_test.fillna(-999)

# Label Encoding
# Maps each encoded column name to its number of distinct classes.
category_size_dict = {}

# Iterates X_train's columns and indexes X_test with the same names, so both
# frames are expected to share these columns.
for f in X_train.columns:
    # Encode a column when it has object dtype in either frame or is listed in `categorical`.
    if X_train[f].dtype=='object' or X_test[f].dtype=='object' or f in categorical:
        
        lbl = preprocessing.LabelEncoder()
        # Fit on the train and test values together so transform() knows every
        # value that occurs in either frame.
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))  
        
        # Number of classes seen across both frames for this column.
        category_size_dict[f] = len(lbl.classes_)
        

In [None]:
# clf = xgb.XGBClassifier(
#     n_estimators=500,
#     max_depth=9,
#     learning_rate=0.05,
#     subsample=0.9,
#     colsample_bytree=0.9,
#     missing=-999,
#     random_state=2019,
#     tree_method='gpu_hist'  # THE MAGICAL PARAMETER
# )

# %time clf.fit(X_train, y_train)

In [None]:
# del X_test

# import gc
# gc.collect()

In [None]:
# def prediction_score(X, y, model):
#     pred_prob = model.predict_proba(X)
#     score = roc_auc_score(y, pred_prob[:, 1])
    
#     return score

# pred_prob = clf.predict_proba(X_train)

# score = roc_auc_score(y_train, pred_prob[:, 1])

# print(score)

# Get Sizes of each categorical feature

In [None]:
def get_category_sizes(categorical, category_sizes):
    """Look up the size of every feature in ``categorical``.

    Args:
        categorical: Iterable of feature names.
        category_sizes: Mapping from feature name to its size.

    Returns:
        list: The sizes, in the same order as ``categorical``.

    Raises:
        KeyError: If a feature name is missing from ``category_sizes``.
    """
    return [category_sizes[name] for name in categorical]
    
    
# Class counts of the categorical features, in the order of `categorical`.
category_sizes = get_category_sizes(categorical, category_size_dict)

print(category_sizes)
print(len(category_sizes))

# Build Model

In [None]:
def build_embedding_layers(vocab_sizes, input_length=1):
    """Create one single-value Input and one Embedding per categorical feature.

    Args:
        vocab_sizes (list): number of classes of each categorical feature;
            one Input/Embedding pair is created per entry, in order.
        input_length: accepted but not used by the body.
    Return:
        tuple: ``(flattened, inputs)`` -- embeddings first, inputs second.
            ``flattened`` is Flatten() applied to all embedding outputs
            concatenated on the last axis; ``inputs`` is the per-feature
            Input tensors concatenated on the last axis.
    """
    # (lower bound, divisor): the first rule whose bound is exceeded shrinks the
    # embedding to (vocab_size + 1) // divisor.
    shrink_rules = ((10000, 50), (1000, 10), (10, 4))

    category_inputs = []
    embedded = []
    for vocab_size in vocab_sizes:
        category_input = Input(shape=[1])

        for lower_bound, divisor in shrink_rules:
            if vocab_size > lower_bound:
                embed_size = (vocab_size + 1) // divisor
                break
        else:
            # Vocabularies of at most 10 classes keep their full size.
            embed_size = vocab_size

        embedding = Embedding(input_dim=vocab_size,
                              output_dim=embed_size)(category_input)

        print(category_input.shape)
        print(embedding.shape)

        category_inputs.append(category_input)
        embedded.append(embedding)

    inputs = Concatenate(axis=-1)(category_inputs)
    outputs = Concatenate(axis=-1)(embedded)

    return Flatten()(outputs), inputs


# Build the embedding layers once to inspect the resulting tensor shapes.
outputs_embed, inputs_embed = build_embedding_layers(category_sizes)

print(inputs_embed.shape)
print(outputs_embed.shape)

In [None]:
import keras.backend as k

def build_mixed_data_model(category_sizes, numeric_size, lr=1e-4):
    """Build and compile a Keras model over the categorical and numeric inputs.

    Clears the Keras session, creates a numeric Input of width
    ``numeric_size`` and the categorical inputs via
    ``build_embedding_layers``, concatenates both and applies a single
    sigmoid Dense unit. The model is compiled with binary cross-entropy,
    ``Adam(lr)`` and the accuracy metric.

    Args:
        category_sizes: passed to ``build_embedding_layers``.
        numeric_size: width of the numeric Input.
        lr: passed as the first positional argument to ``Adam``.

    Returns:
        The compiled ``Model``.
    """
    
    k.clear_session()
    
    # Build input layers
    inputs_numeric = Input(shape=[numeric_size])
    # NOTE(review): outputs_embed is not used below -- the embedding branch is
    # commented out, so the Dense layer sees the raw concatenated inputs.
    outputs_embed, inputs_category = build_embedding_layers(category_sizes)
    
    
    # Categorical inputs first, numeric inputs after them.
    inputs = Concatenate()([inputs_category, inputs_numeric])
    
#     x = Concatenate()([outputs_embed, inputs_numeric])
    x = inputs
    
#     x = Dense(256, activation='relu')(x)
#     x = BatchNormalization()(x)
    
#     x = Dense(128, activation='relu')(x)
# #     x = BatchNormalization()(x)
    
#     x = Dense(64, activation='relu')(x)
# #     x = BatchNormalization()(x)
    
#     x = Dense(16, activation='relu')(x)
# #     x = BatchNormalization()(x)
    
    outputs = Dense(1, activation='sigmoid')(x)
    
    # NOTE(review): `inputs` is the output of a Concatenate layer rather than an
    # Input tensor -- confirm the installed Keras accepts this as Model inputs.
    model = Model(inputs=inputs,
                 outputs=outputs)
    
    optimizer = Adam(lr)
    
    model.compile(loss='binary_crossentropy',
                 optimizer=optimizer,
                 metrics=['accuracy'])
    
    return model


# Numeric input width = total number of X_train columns minus the categorical ones.
size_numerics = X_train.shape[1] - len(category_sizes)
model = build_mixed_data_model(category_sizes, size_numerics)

model.summary()

In [None]:
def prediction_score(X, y, model, batch_size=128):
    """Return the ROC-AUC of ``model``'s predictions on ``X`` against labels ``y``.

    ``batch_size`` is forwarded as the second positional argument of
    ``model.predict``.
    """
    return roc_auc_score(y, model.predict(X, batch_size))

In [None]:
%%time
epochs = 20
batch_size = 512
print('memory for batch_size {}: {:,}'.format(batch_size,
                                              int(X_train.memory_usage().sum() / X_train.shape[0] * batch_size)))


# fit() is called once per epoch so the training ROC-AUC can be reported after
# each one. A plain range() is used because tqdm is never imported in this
# notebook (its import is commented out), so tqdm(...) raised NameError here.
for i_epoch in range(epochs):
    print('epoch {}/{}'.format(i_epoch + 1, epochs))
    model.fit(X_train, y_train,
              epochs=1,
              batch_size=batch_size)

    print('roc-auc score: {}'.format(prediction_score(X_train, y_train, model)))
    

# Evaluate ROC-AUC Score

In [None]:
# ROC-AUC of the model on the training data (X_train, y_train).
print(prediction_score(X_train, y_train, model))