# Fraud Detection Binary Classification (Data Preparetion and LDGM Model)

## Imports

In [1]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
import seaborn as sns

from sklearn.model_selection import train_test_split


%matplotlib inline
plt.rcParams['axes.unicode_minus'] = False 

# encoder
from sklearn.preprocessing import LabelEncoder

## Load Data

In [2]:
# card transaction data
df_train = pd.read_csv('/Users/alex/Documents/ieee-fraud-detection/train_transaction.csv')
df_test = pd.read_csv('/Users/alex/Documents/ieee-fraud-detection/test_transaction.csv')

In [3]:
# identity data
id_train = pd.read_csv('/Users/alex/Documents/ieee-fraud-detection/train_identity.csv')
id_test = pd.read_csv('/Users/alex/Documents/ieee-fraud-detection/test_identity.csv')

## Merge transaction and identity

In [4]:
# labels
train_labels = df_train.pop('isFraud')

In [5]:
# merge id transaction and identity datasets
df_train = df_train.merge(id_train, on = 'TransactionID', how = 'left')
df_test = df_test.merge(id_test, on = 'TransactionID', how = 'left')
print(df_train.shape)
print(df_test.shape)

(590540, 433)
(506691, 433)


In [7]:
# create feature data frame with train and test data
feature_df = df_train.append(df_test)
print(feature_df.shape)

(1097231, 433)


## Missing values

In [9]:
# functio to check for missing data
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [10]:
# missing values
missing_values = missing_values_table(feature_df)


Your selected dataframe has 433 columns.
There are 428 columns that have missing values.


In [11]:
missing_values.head()

Unnamed: 0,Missing Values,% of Total Values
id_24,1087744,99.1
id_25,1087060,99.1
id_26,1087021,99.1
id_08,1087017,99.1
id_07,1087017,99.1


## Data Types

Categorical Features:

 - ProductCD
 - card1 - card6
 - addr1, addr2
 - P_emaildomain
 - R_emaildomain
 - M1 - M9
 - Categorical Features - Identity
 - DeviceType
 - DeviceInfo
 - id_12 - id_38

In [12]:
# data type counts
print(feature_df.dtypes.value_counts())


float64    399
object      31
int64        3
dtype: int64


## OHE 'ProductCD'

In [13]:
# ProductCD unique values
df_test['ProductCD'].value_counts()

W    360987
C     69266
R     35647
H     29373
S     11418
Name: ProductCD, dtype: int64

In [14]:
# ohe 
encoded_columns = pd.get_dummies(feature_df['ProductCD'], prefix='Product_')
feature_df = pd.concat([feature_df, encoded_columns], axis=1).drop('ProductCD', axis=1)

## Renove missing values from 'card 4', 'card 5'

In [18]:
# replace value to 'unknown'
feature_df.card6.replace('debit or credit', 'unknown', inplace=True)

# fill missing value
feature_df.card4.fillna('unknown', inplace=True)

# fill missing value
feature_df.card6.fillna('unknown', inplace=True)

In [19]:
# ohe 
encoded_columns = pd.get_dummies(feature_df['card4'], prefix='card4_')
feature_df = pd.concat([feature_df, encoded_columns], axis=1).drop('card4', axis=1)

encoded_columns = pd.get_dummies(feature_df['card6'], prefix='card6_')
feature_df = pd.concat([feature_df, encoded_columns], axis=1).drop('card6', axis=1)

In [20]:
print(feature_df.shape)

(1097231, 444)


## Time

In [21]:
# transaction date and time convert to datetime type
feature_df['TransactionDT'] = feature_df['TransactionDT'].apply(lambda t: datetime.datetime.fromtimestamp(int(t)).strftime('%Y-%m-%d %H:%M:%S'))
feature_df['TransactionDT']= feature_df['TransactionDT'].apply(lambda t: pd.to_datetime(t, format='%Y-%m-%d %H:%M:%S'))

In [22]:
# create transaction hour feature
feature_df['hour'] = feature_df['TransactionDT'].apply(lambda t: t.hour)

# create transaction weekday feature
feature_df['day'] = feature_df['TransactionDT'].apply(lambda t: t.weekday())

## 'C' Features

In [25]:
feature_df[['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']].head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0


## 'M' features

In [26]:
feature_df[['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']].head()

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,T,T,T,M2,F,T,,,
1,,,,M0,T,T,,,
2,T,T,T,M0,F,F,F,F,F
3,,,,M0,T,F,,,
4,,,,,,,,,


In [27]:
# encode lables
for col in ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']:
  feature_df[col] = feature_df[col] = feature_df[col].astype('category').cat.codes
  print(col)

M1
M2
M3
M4
M5
M6
M7
M8
M9


In [28]:
feature_df[['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']].head()

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
0,1,1,1,2,0,1,-1,-1,-1
1,-1,-1,-1,0,1,1,-1,-1,-1
2,1,1,1,0,0,0,0,0,0
3,-1,-1,-1,0,1,0,-1,-1,-1
4,-1,-1,-1,-1,-1,-1,-1,-1,-1


## 'D' features

In [29]:
df_test[['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15']].head()

Unnamed: 0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
0,419.0,419.0,27.0,398.0,27.0,,,,,418.0,203.0,,,,409.0
1,149.0,149.0,7.0,634.0,7.0,,,,,231.0,634.0,,,,634.0
2,137.0,137.0,10.0,97.0,10.0,,,,,136.0,136.0,,,,97.0
3,42.0,42.0,41.0,242.0,41.0,,,,,242.0,242.0,,,,242.0
4,22.0,22.0,0.0,22.0,0.0,,,,,22.0,22.0,,,,22.0


## 'Card1' Frequency Encoding

In [65]:
# frequency encoding for card1 feature
for col in ['card1']:     
    col_encoded = feature_df[col].value_counts().to_dict()   
    feature_df['card1_fr'] = feature_df[col].map(col_encoded)

## 'id_01' - 'id_11' Featues

In [33]:
feature_df[['id_01', 'id_02', 'id_03', 'id_04', 'id_05',
                'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11']].head()

Unnamed: 0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,0.0,70787.0,,,,,,,,,100.0


## 'id_12' - 'id_38' Features

In [34]:
# check unique values
feature_df[['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 
            'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']].nunique()

id_12      2
id_13     55
id_14     28
id_15      3
id_16      2
id_17    127
id_18     19
id_19    568
id_20    547
id_21    734
id_22     35
id_23      3
id_24     17
id_25    440
id_26    115
id_27      2
id_28      2
id_29      2
id_30     87
id_31    172
id_32      6
id_33    461
id_34      4
id_35      2
id_36      2
id_37      2
id_38      2
dtype: int64

In [36]:
# encode labels for some id features
for col in ['id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']:
  feature_df[col] = feature_df[col] = feature_df[col].astype('category').cat.codes
  print(col)

id_12
id_15
id_16
id_23
id_27
id_28
id_29
id_34
id_35
id_36
id_37
id_38


In [37]:
# check other id features
feature_df[['id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 
            'id_30', 'id_31', 'id_33']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1097231 entries, 0 to 506690
Data columns (total 14 columns):
id_13    257606 non-null float64
id_14    151401 non-null float64
id_17    275335 non-null float64
id_18    95988 non-null float64
id_19    275224 non-null float64
id_20    274894 non-null float64
id_21    10218 non-null float64
id_22    10231 non-null float64
id_24    9487 non-null float64
id_25    10171 non-null float64
id_26    10210 non-null float64
id_30    148224 non-null object
id_31    276907 non-null object
id_33    143960 non-null object
dtypes: float64(11), object(3)
memory usage: 125.6+ MB


In [39]:
feature_df[['id_30', 'id_31', 'id_33']].nunique()

id_30     88
id_31    173
id_33    462
dtype: int64

In [38]:
# encode labels for 'id_30', 'id_31', 'id_33'
le = LabelEncoder()
for col in ['id_30', 'id_31', 'id_33']:
    feature_df[col] = le.fit_transform(feature_df[col].astype(str))
    #print(feature_df[['target', col]].corr())
    #train_df[col].plot.hist(title = 'Days Employment Histogram');

In [40]:
for col in ['P_emaildomain', 'R_emaildomain']:
    feature_df[col] = le.fit_transform(feature_df[col].astype(str))

In [41]:
for col in ['DeviceType', 'DeviceInfo']:
    feature_df[col] = le.fit_transform(feature_df[col].astype(str))

In [42]:
feature_df.dtypes.value_counts()

float64           399
int8               21
uint8              14
int64              12
datetime64[ns]      1
dtype: int64

In [43]:
feature_df[['id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 
            'id_30', 'id_31', 'id_33']].head()

Unnamed: 0,id_13,id_14,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_30,id_31,id_33
0,,,,,,,,,,,,86,136,461
1,,,,,,,,,,,,86,136,461
2,,,,,,,,,,,,86,136,461
3,,,,,,,,,,,,86,136,461
4,,-480.0,166.0,,542.0,144.0,,,,,,7,162,268


In [44]:
df_train.tail()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
590535,3577535,15811047,49.0,W,6550,,150.0,visa,226.0,debit,...,,,,,,,,,,
590536,3577536,15811049,39.5,W,10444,225.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
590537,3577537,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
590538,3577538,15811088,117.0,W,7826,481.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
590539,3577539,15811131,279.95,W,15066,170.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,


In [118]:
#feature_df = feature_df.drop(['TransactionID', 'TransactionDT'], axis=1)

In [45]:
feature_df.iloc[590540:].head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,card4__mastercard,card4__unknown,card4__visa,card6__charge card,card6__credit,card6__debit,card6__unknown,hour,day,card1_fr
0,3663549,1970-08-02 03:00:24,31.95,10409,111.0,150.0,226.0,170.0,87.0,1.0,...,0,0,1,0,0,1,0,3,6,150
1,3663550,1970-08-02 03:01:03,49.0,4272,111.0,150.0,226.0,299.0,87.0,4.0,...,0,0,1,0,0,1,0,3,6,2077
2,3663551,1970-08-02 03:01:50,171.0,4476,574.0,150.0,226.0,472.0,87.0,2635.0,...,0,0,1,0,0,1,0,3,6,83
3,3663552,1970-08-02 03:01:50,284.95,10989,360.0,150.0,166.0,205.0,87.0,17.0,...,0,0,1,0,0,1,0,3,6,1924
4,3663553,1970-08-02 03:01:57,67.95,18018,452.0,150.0,117.0,264.0,87.0,6.0,...,1,0,0,0,0,1,0,3,6,2786


## Baseline LGBM 

In [47]:
# lgbm
import lightgbm as lgb


In [49]:
# train test split
train=feature_df.iloc[:590540].drop(['TransactionID', 'TransactionDT', 'card1_fr'], axis=1).copy()
test=feature_df.iloc[590540:].drop(['TransactionID', 'TransactionDT', 'card1_fr'], axis=1).copy()
#train_labels = df_train.isFraud

X_train, X_test, y_train, y_test = train_test_split(train, train_labels, random_state=21,
                                                    stratify=train_labels, test_size=0.3)


In [50]:
# check model shape
print(train.shape)
print(test.shape)

(590540, 444)
(506691, 444)


### Baseline LGBM

In [51]:
# model
model = lgb.LGBMClassifier(random_state = 21)
model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=21, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [52]:
# imports
from timeit import default_timer as timer
from sklearn.metrics import roc_auc_score

In [53]:
# maodel fit
start = timer()
model.fit(X_train, y_train)
train_time = timer() - start

predictions = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, predictions)

print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))

The baseline score on the test set is 0.9265.
The baseline training time is 49.9797 seconds


### Model Features

In [54]:
columns = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'Product__C', 'Product__H', 'Product__R',
           'Product__S', 'Product__W', 'card4__american express', 'card4__discover', 'card4__mastercard', 'card4__unknown', 'card4__visa',
           'card6__charge card', 'card6__credit', 'card6__debit', 'card6__unknown', 'addr1', 'addr2', 'hour', 'dist1', 'dist2' ,
           'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
           'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15',
           'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'day', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13',
           'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34',
            'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55',
           'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76',
           'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97',
            'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116',
           'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134',
           'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152',
           'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170',
            'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188',
            'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206',
            'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224',
            'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242',
           'V243', 'V244', 'V245', 'V246', 'V247', 'V248', 'V249', 'V250', 'V251', 'V252', 'V253', 'V254', 'V255', 'V256', 'V257', 'V258', 'V259', 'V260',
           'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V269', 'V270', 'V271', 'V272', 'V273', 'V274','V275', 'V276', 'V277', 'V278', 'V279',
            'V280', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297',
           'V298', 'V299', 'V300', 'V301', 'V302', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314',
           'V315', 'V316', 'V317', 'V318', 'V319', 'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 'V330', 'V331', 'V332',
           'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339']

In [55]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(train[columns], train_labels, random_state=21,
                                                    stratify=train_labels, test_size=0.3)



### Model (Features)

In [57]:
model = lgb.LGBMClassifier(random_state = 21)
model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=21, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [58]:
start = timer()
model.fit(X_train, y_train)
train_time = timer() - start

predictions = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, predictions)

print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))

The baseline score on the test set is 0.9271.
The baseline training time is 42.1566 seconds


### Cross Validation LGBM

In [59]:
# train test split
train=feature_df.iloc[:590540].drop(['TransactionDT', 'card1'], axis=1).copy()
test=feature_df.iloc[590540:].drop(['TransactionDT', 'card1'], axis=1).copy()

train['TARGET'] = train_labels

In [61]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

def model(features, test_features, encoding = 'ohe', n_folds = 3):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    """
    
    # extract the ids
    train_ids = features['TransactionID']
    test_ids = test_features['TransactionID']
    
    # extract the labels for training
    labels = features['TARGET']
    
    # remove ids and target
    features = features.drop(columns = ['TransactionID', 'TARGET'])
    test_features = test_features.drop(columns = ['TransactionID'])
    
    
    # ohe
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=1000, objective = 'binary', 
                                   class_weight = 'balanced', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'TransactionID': test_ids, 'isFraud': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [62]:
# model execution
submission, fi, metrics = model(train, test)

Training Data Shape:  (590540, 444)
Testing Data Shape:  (506691, 444)
Training until validation scores don't improve for 100 rounds.
[200]	train's binary_logloss: 0.313	train's auc: 0.945384	valid's binary_logloss: 0.303582	valid's auc: 0.929809
[400]	train's binary_logloss: 0.263483	train's auc: 0.96536	valid's binary_logloss: 0.26132	valid's auc: 0.942257
[600]	train's binary_logloss: 0.229521	train's auc: 0.976235	valid's binary_logloss: 0.23267	valid's auc: 0.948836
[800]	train's binary_logloss: 0.203936	train's auc: 0.98289	valid's binary_logloss: 0.210728	valid's auc: 0.952687
[1000]	train's binary_logloss: 0.183288	train's auc: 0.987399	valid's binary_logloss: 0.192835	valid's auc: 0.955432
Did not meet early stopping. Best iteration is:
[1000]	train's binary_logloss: 0.183288	train's auc: 0.987399	valid's binary_logloss: 0.192835	valid's auc: 0.955432
Training until validation scores don't improve for 100 rounds.
[200]	train's binary_logloss: 0.315634	train's auc: 0.944949	val

In [63]:
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.012605
1,3663550,0.021027
2,3663551,0.037019
3,3663552,0.034412
4,3663553,0.029279


In [64]:
# submission file
submission.to_csv('/Users/alex/Documents/ieee-fraud-detection/submission3cv_lgbm.csv', index=False)