In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import datetime 
import lightgbm as lgb

import os
import gc

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
train_transaction = reduce_mem_usage(pd.read_csv('fraud-dataset-benchmark/tmp/train_transaction.csv', index_col='TransactionID'))
test_transaction = reduce_mem_usage(pd.read_csv('fraud-dataset-benchmark/tmp/test_transaction.csv', index_col='TransactionID'))

train_identity = reduce_mem_usage(pd.read_csv('fraud-dataset-benchmark/tmp/train_identity.csv', index_col='TransactionID'))
test_identity = reduce_mem_usage(pd.read_csv('fraud-dataset-benchmark/tmp/test_identity.csv', index_col='TransactionID'))

sample_submission = pd.read_csv('fraud-dataset-benchmark/tmp/sample_submission.csv', index_col='TransactionID')

Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 489.41 MB
Decreased by 72.4%
Memory usage of dataframe is 1519.24 MB
Memory usage after optimization is: 427.17 MB
Decreased by 71.9%
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.55 MB
Decreased by 76.6%
Memory usage of dataframe is 44.39 MB
Memory usage after optimization is: 10.39 MB
Decreased by 76.6%


In [21]:
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

In [22]:
def corret_card_id(x): 
    x=x.replace('.0','')
    x=x.replace('-999','nan')
    return x

def define_indexes(df):
    
    # create date column
    START_DATE = '2017-12-01'
    startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    df['TransactionDT'] = df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
    
    df['year'] = df['TransactionDT'].dt.year
    df['month'] = df['TransactionDT'].dt.month
    df['dow'] = df['TransactionDT'].dt.dayofweek
    df['hour'] = df['TransactionDT'].dt.hour
    df['day'] = df['TransactionDT'].dt.day
   
    # create card ID 
    cards_cols= ['card1', 'card2', 'card3', 'card5']
    for card in cards_cols: 
        if '1' in card: 
            df['card_id']= df[card].map(str)
        else : 
            df['card_id']+= ' '+df[card].map(str)
    
    # small correction of the Card_ID
    df['card_id']=df['card_id'].apply(corret_card_id)

    return df

In [23]:
train = define_indexes(train)
test = define_indexes(test)

  df['year'] = df['TransactionDT'].dt.year
  df['month'] = df['TransactionDT'].dt.month
  df['dow'] = df['TransactionDT'].dt.dayofweek
  df['hour'] = df['TransactionDT'].dt.hour
  df['day'] = df['TransactionDT'].dt.day
  df['card_id']= df[card].map(str)
  df['year'] = df['TransactionDT'].dt.year
  df['month'] = df['TransactionDT'].dt.month
  df['dow'] = df['TransactionDT'].dt.dayofweek
  df['hour'] = df['TransactionDT'].dt.hour
  df['day'] = df['TransactionDT'].dt.day
  df['card_id']= df[card].map(str)


In [24]:
print('id-02' in test.columns)

True


In [25]:
train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

test['id_02_to_mean_card1'] = test['id-02'] / test.groupby(['card1'])['id-02'].transform('mean')
test['id_02_to_mean_card4'] = test['id-02'] / test.groupby(['card4'])['id-02'].transform('mean')
test['id_02_to_std_card1'] = test['id-02'] / test.groupby(['card1'])['id-02'].transform('std')
test['id_02_to_std_card4'] = test['id-02'] / test.groupby(['card4'])['id-02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

  train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
  train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
  train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
  train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')
  test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
  test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
  test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
  test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].trans

In [26]:
one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.9]
many_null_cols_test = [col for col in test.columns if test[col].isnull().sum() / test.shape[0] > 0.9]

big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols+ one_value_cols_test))

cols_to_drop.remove('isFraud')

# rename columns
for to_rename in ['id-07', 'id-25', 'id-21', 'id-26', 'id-22', 'id-24', 'id-27', 'id-08', 'id-23']:
    # rename - with _ in cols_to_drop
    new_name = to_rename.replace('-', '_')
    cols_to_drop = [i.replace(to_rename, new_name) for i in cols_to_drop]


train.drop(cols_to_drop, axis=1, inplace=True)

for to_rename in ['id_22', 'id_23', 'id_21', 'id_27', 'id_26', 'id_24', 'id_08', 'id_26', 'id_25', 'id_07', 'id_24', 'id_18', 'id_23', 'id_07', 'id_08', 'id_27', 'id_25', 'id_22', 'id_21']:
    # rename - with _ in cols_to_drop
    new_name = to_rename.replace('_', '-')
    cols_to_drop = [i.replace(to_rename, new_name) for i in cols_to_drop]

test.drop(cols_to_drop, axis=1, inplace=True)

In [27]:
del train_transaction, train_identity, test_transaction, test_identity

target = train['isFraud'].copy()

X_train = train.drop('isFraud', axis=1)
X_train.drop('TransactionDT', axis=1, inplace=True)
X_test = test.drop('TransactionDT', axis=1)

del train, test

In [35]:
gc.collect()

10352

In [40]:
for f in X_train.select_dtypes(include='category').columns.tolist() + X_train.select_dtypes(include='object').columns.tolist():
    lbl = LabelEncoder()
    if 'id_' in f:
        f = f.replace('_', '-')
        # f_test = f.replace('_', '-')
    lbl.fit(list(X_train[f].values) + list(X_test[f].values))
    X_train[f] = lbl.transform(list(X_train[f].values))
    X_test[f] = lbl.transform(list(X_test[f].values))  

In [63]:
params = {'num_leaves': 500,
          'min_child_weight': 0.03,
          'feature_fraction': 0.3,
          'bagging_fraction': 0.4,
          'min_data_in_leaf': 100,
          'objective': 'binary',
          'max_depth': 20,
          'learning_rate': 0.006,
          "boosting_type": "gbdt",
          "bagging_seed": 10,
          "metric": 'average_precision',
          "verbosity": -1,
          'reg_alpha': 0.389,
          'reg_lambda': 0.64,
          'random_state': 47
         }

In [64]:
splits = 5
folds = KFold(n_splits = splits)
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

In [67]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train[:].values, target[:].values)):
    print("Fold {}".format(fold_))
    train_df, y_train_df = X_train.iloc[trn_idx], target.iloc[trn_idx]
    valid_df, y_valid_df = X_train.iloc[val_idx], target.iloc[val_idx]
    
    trn_data = lgb.Dataset(train_df, label=y_train_df)
    val_data = lgb.Dataset(valid_df, label=y_valid_df)
    
    clf = lgb.train(params,
                    trn_data,
                    100,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=5,
                    early_stopping_rounds=5)

    pred = clf.predict(valid_df)
    oof[val_idx] = pred
    print( "  auc = ", roc_auc_score(y_valid_df, pred) )
    # predictions += clf.predict(X_test) / splits

Fold 0




Training until validation scores don't improve for 5 rounds
[5]	training's average_precision: 0.622829	valid_1's average_precision: 0.38568
Early stopping, best iteration is:
[4]	training's average_precision: 0.614236	valid_1's average_precision: 0.386933
  auc =  0.8572101366545911
Fold 1




Training until validation scores don't improve for 5 rounds
[5]	training's average_precision: 0.60377	valid_1's average_precision: 0.524011
[10]	training's average_precision: 0.629626	valid_1's average_precision: 0.537533
[15]	training's average_precision: 0.642281	valid_1's average_precision: 0.547188
[20]	training's average_precision: 0.651086	valid_1's average_precision: 0.553658
[25]	training's average_precision: 0.658323	valid_1's average_precision: 0.55848
[30]	training's average_precision: 0.663661	valid_1's average_precision: 0.561555
[35]	training's average_precision: 0.668858	valid_1's average_precision: 0.564808
[40]	training's average_precision: 0.674343	valid_1's average_precision: 0.566923
[45]	training's average_precision: 0.679132	valid_1's average_precision: 0.56848
[50]	training's average_precision: 0.683046	valid_1's average_precision: 0.571226
[55]	training's average_precision: 0.686812	valid_1's average_precision: 0.57265
[60]	training's average_precision: 0.690383



Training until validation scores don't improve for 5 rounds
[5]	training's average_precision: 0.604683	valid_1's average_precision: 0.506953
[10]	training's average_precision: 0.632117	valid_1's average_precision: 0.531629
[15]	training's average_precision: 0.643668	valid_1's average_precision: 0.541835
[20]	training's average_precision: 0.653759	valid_1's average_precision: 0.545617
[25]	training's average_precision: 0.661148	valid_1's average_precision: 0.54862
[30]	training's average_precision: 0.667244	valid_1's average_precision: 0.547967
Early stopping, best iteration is:
[26]	training's average_precision: 0.662709	valid_1's average_precision: 0.548926
  auc =  0.8926089441099662
Fold 3




Training until validation scores don't improve for 5 rounds
[5]	training's average_precision: 0.602562	valid_1's average_precision: 0.522295
[10]	training's average_precision: 0.626549	valid_1's average_precision: 0.537136
[15]	training's average_precision: 0.6393	valid_1's average_precision: 0.543103
[20]	training's average_precision: 0.648962	valid_1's average_precision: 0.55044
[25]	training's average_precision: 0.656481	valid_1's average_precision: 0.551272
Early stopping, best iteration is:
[23]	training's average_precision: 0.653237	valid_1's average_precision: 0.554309
  auc =  0.9047546070552976
Fold 4




Training until validation scores don't improve for 5 rounds
[5]	training's average_precision: 0.620522	valid_1's average_precision: 0.461255
[10]	training's average_precision: 0.643111	valid_1's average_precision: 0.474367
[15]	training's average_precision: 0.65565	valid_1's average_precision: 0.467202
Early stopping, best iteration is:
[11]	training's average_precision: 0.646243	valid_1's average_precision: 0.474703
  auc =  0.8782263489803079


In [None]:
sample_submission = sample_submission.reset_index()
sample_submission["isFraud"] = predictions
sample_submission.to_csv("lgb_sub.csv", index=False)