# Baseline Model

In here I'll create a baseline model. 

Kernel https://www.kaggle.com/artkulak/ieee-fraud-simple-baseline-0-9383-lb

I just encode all non-numerical labels 

## Imports

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

## Data

In [2]:
df_train_ident = pd.read_csv('../data/raw/train_identity.csv', index_col='TransactionID')
df_test_ident = pd.read_csv('../data/raw/test_identity.csv', index_col='TransactionID')

df_train_trans = pd.read_csv('../data/raw/train_transaction.csv', index_col='TransactionID')
df_test_trans = pd.read_csv('../data/raw/test_transaction.csv', index_col='TransactionID')

df_sample_submission = pd.read_csv('../data/raw/sample_submission.csv', index_col='TransactionID')

In [3]:
df_train = df_train_trans.merge(df_train_ident, how='left', left_index=True, right_index=True)
df_test = df_test_trans.merge(df_test_ident, how='left', left_index=True, right_index=True)


In [4]:
print(df_train.shape)
print(df_test.shape)

(590540, 433)
(506691, 432)


## Preprocessing

In [5]:
# Copy label
y_train = df_train['isFraud'].copy()
del df_train_trans, df_train_ident, df_test_ident, df_test_trans

# Delete label
X_train = df_train.drop('isFraud', axis=1)

# Create X
X_test = df_test.copy()

# Delete train and test

del df_train, df_test

In [6]:
# Encode all non numerical values

for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))  

## Reduce Memory

Kernel: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

In [7]:

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 1970.87 MB
Memory usage after optimization is: 547.14 MB
Decreased by 72.2%
Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 460.02 MB
Decreased by 72.5%


## Model

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
EPOCHS = 3
kf = KFold(n_splits = EPOCHS, shuffle = True)
y_preds = np.zeros(df_sample_submission.shape[0])
y_oof = np.zeros(X_train.shape[0])
for tr_idx, val_idx in kf.split(X_train, y_train):
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=9,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method='auto'
    )
    
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print('ROC AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
    
    y_preds+= clf.predict_proba(X_test)[:,1] / EPOCHS