In [None]:
import numpy as np
import pandas as pd
import os

from sklearn import preprocessing
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

### Read Data

In [None]:
# Training tables and the submission template, each indexed by TransactionID.
# (Pass nrows=100 to the first call for a quick smoke run.)
train_transaction = pd.read_csv(
    '../input/train_transaction.csv',
    index_col='TransactionID',
)
train_identity = pd.read_csv(
    '../input/train_identity.csv',
    index_col='TransactionID',
)
sample_submission = pd.read_csv(
    '../input/sample_submission.csv',
    index_col='TransactionID',
)

In [None]:
# Test tables, each indexed by TransactionID.
# (Pass nrows=100 to the first call for a quick smoke run.)
test_transaction = pd.read_csv(
    '../input/test_transaction.csv',
    index_col='TransactionID',
)
test_identity = pd.read_csv(
    '../input/test_identity.csv',
    index_col='TransactionID',
)

### Merge Data

In [None]:
# Left-join the identity table onto the transaction table on their shared
# index, so every transaction row is kept even when it has no identity record.
train = pd.merge(
    train_transaction,
    train_identity,
    how='left',
    left_index=True,
    right_index=True,
)
test = pd.merge(
    test_transaction,
    test_identity,
    how='left',
    left_index=True,
    right_index=True,
)

### Separate target from features, fill NaNs

In [None]:
# Report the size of both merged frames, one shape per line.
print(train.shape, test.shape, sep='\n')

# Target vector, copied so it is independent of `train`.
y_train = train['isFraud'].copy()

# Feature matrices: the label column is removed from the training frame and
# every missing value in either frame is replaced by the sentinel -999.
X_train = train.drop(columns='isFraud').fillna(-999)
X_test = test.fillna(-999)

In [None]:
# A util method. Helps lower memory in the DF columns
def downcast_dtypes(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    * float64 columns become float32.
    * int64 / int32 columns become int16 when every value fits in int16,
      otherwise int32 when every value fits in int32, otherwise they are left
      untouched. Casting blindly to int16 would silently wrap any value
      outside [-32768, 32767].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for c in float_cols:
        df[c] = df[c].astype(np.float32)

    for c in int_cols:
        if df[c].empty:
            # No values to overflow; use the narrowest target.
            df[c] = df[c].astype(np.int16)
            continue

        lowest, highest = df[c].min(), df[c].max()
        # Try the narrowest type first and stop at the first one that holds
        # the whole observed range.
        for candidate in (np.int16, np.int32):
            bounds = np.iinfo(candidate)
            if bounds.min <= lowest and highest <= bounds.max:
                df[c] = df[c].astype(candidate)
                break

    return df

# Shrink both feature matrices; the helper modifies its argument and returns it.
X_train, X_test = downcast_dtypes(X_train), downcast_dtypes(X_test)

In [None]:
# Release the raw and merged frames; none of these names is referenced below.
del train_transaction, train_identity
del test_transaction, test_identity
del train, test

In [None]:
# Columns with fewer than this many distinct values are treated as categorical.
MAX_CATEGORICAL_LEVELS = 30

arr_categorical_columns = [
    col for col in X_train
    if X_train[col].nunique(dropna=False) < MAX_CATEGORICAL_LEVELS
]

# A bare expression is only displayed when it is the last line of a cell, so
# the count is printed explicitly and the list itself is left as the output.
print(len(arr_categorical_columns))
arr_categorical_columns

In [None]:
def label_race(row):
    """Flag a row whose ``addr2`` equals 87.

    Per the original author's note, 87 is the value most rows carry in
    ``addr2`` and is assumed to be some default, so the feature is reduced
    to a boolean-like indicator.

    Returns 1 when ``row['addr2'] == 87`` and 0 otherwise.
    """
    return 1 if row['addr2'] == 87 else 0

In [None]:
def bucket_value(value, row):
    """Return True when ``row['TransactionAmt']`` is strictly below ``value``, else False."""
    return bool(row['TransactionAmt'] < value)
    
# Flag transactions whose amount is strictly below each threshold.
# The comparison is done on the whole TransactionAmt column at once; it yields
# the same boolean column as calling bucket_value() on every row, without
# materialising a Series per row.
for value in (10, 1):
    bucket_col = 'amntBucket{}'.format(value)
    X_train[bucket_col] = X_train['TransactionAmt'] < value
    X_test[bucket_col] = X_test['TransactionAmt'] < value

In [None]:
# Remove both amount-bucket flag columns from the two feature matrices,
# modifying the frames in place.
bucket_columns = ['amntBucket10', 'amntBucket1']
X_train.drop(columns=bucket_columns, inplace=True)
X_test.drop(columns=bucket_columns, inplace=True)

In [None]:
def _add_calendar_features(frame):
    """Derive day/hour features from ``TransactionDT`` in place, then drop it.

    Adds, in this order:
      * dayBack   - number of whole 86400-unit blocks contained in TransactionDT
      * weekday   - dayBack modulo 7
      * HoursBack - number of whole 3600-unit blocks contained in TransactionDT
      * hourOfDay - HoursBack modulo 24

    NOTE(review): 86400 and 3600 suggest TransactionDT is an offset in
    seconds - confirm against the data description.
    """
    stamp = frame['TransactionDT']

    whole_days = (stamp - (stamp % 86400)) / 86400
    frame['dayBack'] = whole_days
    frame['weekday'] = frame['dayBack'] % 7

    whole_hours = (stamp - (stamp % 3600)) / 3600
    frame['HoursBack'] = whole_hours
    frame['hourOfDay'] = frame['HoursBack'] % 24

    frame.drop('TransactionDT', axis=1, inplace=True)


_add_calendar_features(X_train)
_add_calendar_features(X_test)

In [None]:
def calculate_metrics(df, col):
    """Append per-group ``TransactionAmt`` sum and count features to ``df``.

    Rows are grouped by ``col``; every row then receives the total amount and
    the number of transactions of its own group. The statistics are computed
    from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and ``'TransactionAmt'``. It is not modified.
    col : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, row order and index as ``df`` plus
        two float32 columns, ``TransactionAmt_sum_<col>`` and
        ``TransactionAmt_count_<col>``. Rows whose key has no group get 0.
    """
    sum_name = 'TransactionAmt_sum_{}'.format(col)
    count_name = 'TransactionAmt_count_{}'.format(col)

    group_stats = df.groupby(col).agg({'TransactionAmt': ['sum', 'count']})
    group_stats.columns = [sum_name, count_name]

    # join(on=col) looks each row's key up in the index of group_stats and
    # keeps the caller's index; a plain pd.merge would replace it with 0..n-1.
    df = df.join(group_stats, on=col, how='left')

    # float32 rather than float16: float16 overflows to inf above 65504 and
    # cannot represent every integer above 2048, which large groups exceed.
    df[sum_name] = df[sum_name].fillna(0).astype(np.float32)
    df[count_name] = df[count_name].fillna(0).astype(np.float32)

    return df


In [None]:
# Grouping keys for the per-group amount aggregates. Each frame is aggregated
# on its own rows, one key at a time.
amount_group_keys = (
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'hourOfDay', 'weekday', 'addr1', 'addr2',
)
for col in amount_group_keys:
    X_train = calculate_metrics(X_train, col)
    X_test = calculate_metrics(X_test, col)

In [None]:
# Collapse addr2 into a single 0/1 indicator (1 where addr2 == 87) and drop the
# raw column. The equality is evaluated on the whole column at once, which
# gives the same int64 flags as applying label_race() row by row.
X_train['isAddr2_87'] = (X_train['addr2'] == 87).astype('int64')
X_train.drop('addr2', axis=1, inplace=True)

X_test['isAddr2_87'] = (X_test['addr2'] == 87).astype('int64')
X_test.drop('addr2', axis=1, inplace=True)



### Model Preparation

In [None]:
# One-hot encode every object-typed column and every column listed in
# arr_categorical_columns.
#
# DataFrame.join returns a new frame, so its result has to be kept; otherwise
# the encoded columns are dropped without any replacement. The indicator
# blocks are collected first and attached with a single concat per frame.
#
# NOTE(review): object columns with many distinct values produce one indicator
# column per value, so the resulting matrices can become very wide - check
# memory before running on the full data.
n_train_rows = len(X_train)
train_dummy_blocks = []
test_dummy_blocks = []
encoded_columns = []

for f in X_train.columns:
    if X_train[f].dtype == 'object' or X_test[f].dtype == 'object' or f in arr_categorical_columns:

        print(f, end='')
        print('... ', end='')

        # Encode train and test together so both frames receive exactly the
        # same indicator columns in the same order, even when a level occurs
        # in only one of them or the column dtypes differ between the frames.
        combined = pd.concat([X_train[f], X_test[f]], axis=0)
        dummies = pd.get_dummies(combined, prefix=f, dtype=np.uint8)

        # Rows keep their concat order: train first, then test.
        train_dummy_blocks.append(dummies.iloc[:n_train_rows])
        test_dummy_blocks.append(dummies.iloc[n_train_rows:])
        encoded_columns.append(f)

        print("done")

X_train = pd.concat([X_train.drop(columns=encoded_columns)] + train_dummy_blocks, axis=1)
X_test = pd.concat([X_test.drop(columns=encoded_columns)] + test_dummy_blocks, axis=1)

# The per-column blocks are no longer needed once they are attached.
del train_dummy_blocks, test_dummy_blocks

In [None]:
# Hold out 5% of the training rows for evaluation; the fixed seed keeps the
# split reproducible.
seed, test_size = 7, 0.05
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Hyper-parameters of the gradient-boosted classifier. `missing=-999` matches
# the sentinel that replaced NaNs in the feature matrices.
xgb_params = {
    'tree_method': 'hist',
    'n_estimators': 100,
    'n_jobs': 4,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'missing': -999,
    'reg_alpha': 0.7726783188295172,
    'objective': 'binary:logistic',
    'max_leaves': 72,
    'eval_metric': 'auc',
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out evaluation rows; the result feeds the metric
# calculations further down.
predictions = clf.predict(X_eval)

In [None]:
# Sanity check: display the shape of the prediction array.
predictions.shape

### Hold-out Evaluation and Submission

In [None]:
# Threshold-based metrics are computed from the class labels in `predictions`.
accuracy = accuracy_score(y_eval, predictions)
precision = precision_score(y_eval, predictions)
recall = recall_score(y_eval, predictions)

# ROC AUC measures how well the model ranks rows, so it must be scored on the
# predicted probability of the positive class. Feeding it 0/1 labels reduces
# the ROC curve to a single operating point and misreports the AUC.
eval_fraud_proba = clf.predict_proba(X_eval)[:, 1]
roc = roc_auc_score(y_eval, eval_fraud_proba)

print("Accuracy: %.2f%% " % (accuracy * 100.0))
print("Precision: %.2f%% " % (precision * 100))
print("Recall: %.2f%% " % (recall * 100))
print("AUC: %.2f%% " % (roc * 100))

In [None]:
# Score the test rows, keeping the second probability column (positive class),
# and write the filled-in template to disk. The values are assigned
# positionally, row by row.
test_fraud_proba = clf.predict_proba(X_test)[:, 1]
sample_submission['isFraud'] = test_fraud_proba
sample_submission.to_csv('simple_xgboost.csv')