In [5]:
# https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419 ideas referenced from nroman's kaggle kernel
# https://www.kaggle.com/nroman/recursive-feature-elimination
import tensorflow as tf
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold, cross_val_score, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import multiprocessing

In [10]:
files = ['Inputs/train_transaction.csv', 'Inputs/train_identity.csv', 'Inputs/test_transaction.csv', 'Inputs/test_identity.csv', 'Inputs/sample_submission.csv']
def read(input):
    ''' reads all the csv files by TransactionID '''
    data = pd.read_csv(input, index_col='TransactionID')
    return data

with multiprocessing.Pool() as pool:
    train_transaction, train_identity, test_transaction, test_identity, sub = pool.map(read, files)

In [3]:
def merge(file1, file2):
    ''' merges transaction and identity file '''
    merged = pd.merge(file1, file2, on= 'TransactionID', how='left')#, left_index=True, right_index=True)
    return merged

train = merge(train_transaction, train_identity)
test = merge(test_transaction, test_identity)

In [4]:
def ratio(data):
    fraud = 0
    for num in data['isFraud']:
        if num == 1:
            fraud += 1
    non_fraud = data.shape[0]
    _ratio = np.true_divide(non_fraud, fraud)
    _ratio = _ratio
    return {0:1., 1:_ratio}

In [5]:
'''
Remove useless features
1. Feature has only one value
2. Feature has 85% missing value
3. Feature has 85% or more of same value
4. Correlation between features is not useful???
'''
features = []
feature_list = train.columns
for feature in feature_list:
    if feature == 'isFraud':
        continue
    else:
        if train[feature].nunique() <= 1 or test[feature].nunique() <= 1:
            features.append(feature)
        if (np.count_nonzero(train[feature].isnull()) / len(train[feature]) >= .85) or  (np.count_nonzero(test[feature].isnull()) / len(test[feature]) >= .85):
            features.append(feature)
        if train[feature].value_counts(dropna=False, normalize=True).values[0] >= .85 or test[feature].value_counts(dropna=False, normalize=True).values[0] >= .85:
            features.append(feature)
features = set(features)
#features.remove('isFraud')

print('There will be {} features dropped because they are not useful'.format(len(features)))
train.drop(features, axis=1)
test.drop(features, axis=1)

There will be 155 features dropped because they are not useful


Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card4,card5,card6,addr1,dist1,...,id_20,id_28,id_29,id_31,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.950,W,10409,111.0,visa,226.0,debit,170.0,1.0,...,,,,,,,,,,
3663550,18403263,49.000,W,4272,111.0,visa,226.0,debit,299.0,4.0,...,,,,,,,,,,
3663551,18403310,171.000,W,4476,574.0,visa,226.0,debit,472.0,2635.0,...,,,,,,,,,,
3663552,18403310,284.950,W,10989,360.0,visa,166.0,debit,205.0,17.0,...,,,,,,,,,,
3663553,18403317,67.950,W,18018,452.0,mastercard,117.0,debit,264.0,6.0,...,,,,,,,,,,
3663554,18403323,57.950,W,12839,321.0,visa,226.0,debit,512.0,,...,,,,,,,,,,
3663555,18403350,87.000,W,16560,476.0,visa,126.0,debit,110.0,,...,,,,,,,,,,
3663556,18403387,390.000,W,15066,170.0,mastercard,102.0,credit,194.0,303.0,...,,,,,,,,,,
3663557,18403405,103.950,W,2803,100.0,visa,226.0,debit,494.0,3.0,...,,,,,,,,,,
3663558,18403416,117.000,W,12544,321.0,visa,226.0,debit,476.0,8.0,...,,,,,,,,,,


In [6]:
for feature in train.columns:
    if train[feature].dtype == 'object':
        encoder = LabelEncoder()
        if feature is 'isFraud':
            encoder.fit(list(train[feature].astype(str).values))
            train[feature] = encoder.transform(list(train[feature].astype(str).values))
        else:
            encoder.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
            train[feature] = encoder.transform(list(train[feature].astype(str).values))
        #test_x[feature] = encoder.transform(list(test_x[feature].values))

In [7]:
''' resets the index head so that we can drop transactionid after we are sorting '''
train_x = train.sort_values('TransactionDT').reset_index().drop(['TransactionID', 'isFraud', 'TransactionDT'], axis=1)
train_y = train.sort_values('TransactionDT')['isFraud']

train_x.fillna(-99, inplace=True)


In [8]:
params = {'num_leaves': 491,
          'min_child_weight': 0.02454473273214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 100,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.010283254663721497,
          "boosting_type": "gbdt",
          "bagging_seed": 15,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.2899927210061127,
          'reg_lambda': 0.4485237330340494,
          'random_state': 53
         }

In [9]:
clf = lgb.LGBMClassifier(**params)
#(n_splits=6, shuffle=False) 'accuracy', 'binary_logloss', 'precision', 'recall'
rfe = RFECV(estimator=clf, step=10, cv=5, scoring='roc_auc', verbose=2)

In [10]:
rfe.fit(train_x, train_y)

Fitting estimator with 431 features.
Fitting estimator with 421 features.
Fitting estimator with 411 features.
Fitting estimator with 401 features.
Fitting estimator with 391 features.
Fitting estimator with 381 features.
Fitting estimator with 371 features.
Fitting estimator with 361 features.
Fitting estimator with 351 features.
Fitting estimator with 341 features.
Fitting estimator with 331 features.
Fitting estimator with 321 features.
Fitting estimator with 311 features.
Fitting estimator with 301 features.
Fitting estimator with 291 features.
Fitting estimator with 281 features.
Fitting estimator with 271 features.
Fitting estimator with 261 features.
Fitting estimator with 251 features.
Fitting estimator with 241 features.
Fitting estimator with 231 features.
Fitting estimator with 221 features.
Fitting estimator with 211 features.
Fitting estimator with 201 features.
Fitting estimator with 191 features.
Fitting estimator with 181 features.
Fitting estimator with 171 features.
F

RFECV(cv=5,
      estimator=LGBMClassifier(bagging_fraction=0.4181193142567742,
                               bagging_seed=15, boosting_type='gbdt',
                               class_weight=None, colsample_bytree=1.0,
                               feature_fraction=0.3797454081646243,
                               importance_type='split',
                               learning_rate=0.010283254663721497, max_depth=-1,
                               metric='auc', min_child_samples=20,
                               min_child_weight=0.02454473273214212,
                               min_data_in_leaf=100, min_split_gain=0.0,
                               n_estimators=100, n_jobs=-1, num_leaves=491,
                               objective='binary', random_state=53,
                               reg_alpha=0.2899927210061127,
                               reg_lambda=0.4485237330340494, silent=True,
                               subsample=1.0, subsample_for_bin=200000,
            

In [11]:
for col in train_x.columns[rfe.ranking_ == 1]:
    print(col)

TransactionAmt
ProductCD
card1
card2
card3
card4
card5
card6
addr1
addr2
dist1
dist2
P_emaildomain
R_emaildomain
C1
C2
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D2
D3
D4
D5
D6
D7
D8
D9
D10
D11
D12
D13
D14
D15
M2
M3
M4
M5
M6
M7
M8
M9
V2
V3
V4
V5
V6
V7
V9
V11
V12
V13
V15
V16
V18
V19
V20
V23
V24
V25
V26
V29
V30
V33
V34
V35
V36
V37
V38
V39
V40
V42
V43
V44
V45
V46
V47
V48
V49
V51
V52
V53
V54
V55
V56
V57
V58
V59
V60
V61
V62
V63
V64
V66
V67
V69
V70
V71
V72
V73
V74
V75
V76
V77
V78
V79
V80
V81
V82
V83
V85
V86
V87
V90
V91
V93
V94
V95
V96
V97
V98
V99
V100
V101
V102
V103
V105
V106
V109
V115
V116
V123
V124
V125
V126
V127
V128
V129
V130
V131
V132
V133
V134
V135
V136
V137
V139
V140
V142
V143
V145
V146
V147
V148
V149
V150
V151
V152
V154
V156
V157
V158
V159
V160
V161
V162
V163
V164
V165
V166
V167
V168
V169
V170
V171
V172
V173
V175
V176
V177
V178
V179
V183
V184
V185
V186
V187
V188
V189
V192
V195
V197
V198
V199
V200
V201
V202
V203
V204
V205
V206
V207
V208
V209
V210
V211
V212
V213
V214
V215
V216
V217
V218


In [14]:
most_influential = pd.DataFrame([col for col in train_x.columns[rfe.ranking_==1]], columns=['features'])
most_influential.to_csv('Import_feature.csv')

In [8]:
most_influential = pd.read_csv('Inputs/Import_feature.csv')
useful_features = most_influential['features'].tolist()
with multiprocessing.Pool() as pool:
    train_transaction, train_identity, test_transaction, test_identity, sub = pool.map(read, files)

train = merge(train_transaction, train_identity)
test = merge(test_transaction, test_identity)

bad_feature = []
for feature in train.columns:
    if feature not in useful_features:
        bad_feature.append(feature)

bad_feature.remove('isFraud')
bad_feature.remove('TransactionDT')
print(train.shape)
train = train.drop(bad_feature, axis=1)
test = test.drop(bad_feature, axis=1)

NameError: name 'read' is not defined

In [117]:
for feature in train.columns:
    if train[feature].dtype == 'object':
        encoder = LabelEncoder()
        if feature is 'isFraud':
            encoder.fit(list(train[feature].astype(str).values))
            train[feature] = encoder.transform(list(train[feature].astype(str).values))
            test[feature] = encoder.transform(list(test[feature].astype(str).values))  
        else:
            encoder.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
            train[feature] = encoder.transform(list(train[feature].astype(str).values))
            test[feature] = encoder.transform(list(test[feature].astype(str).values))  

In [118]:
''' resets the index head so that we can drop transactionid after we are sorting '''
train_x = train.sort_values('TransactionDT').reset_index().drop(['TransactionID', 'isFraud', 'TransactionDT'], axis=1)
train_y = train.sort_values('TransactionDT')['isFraud']

test = test.sort_values('TransactionDT').reset_index().drop(['TransactionID', 'TransactionDT'], axis=1)


In [119]:
params = {'num_leaves': 350,
          'min_child_weight': 0.02454473273214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 50,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.010283254663721497,
          "boosting_type": "gbdt",
          "bagging_seed": 15,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha':0.2899927210061127, 
          'reg_lambda': 0.4485237330340494,
          'max_delta_step': 0.5,
          'random_state': 53
         }

In [120]:
folds = TimeSeriesSplit(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(folds.split(train_x, train_y)):
    train_data = lgb.Dataset(train_x.iloc[train_idx], label=train_y.iloc[train_idx])
    val_data = lgb.Dataset(train_x.iloc[test_idx], label=train_y.iloc[test_idx])
    model = lgb.train(params, train_data, 10000, valid_sets = [train_data, val_data], verbose_eval=1000, early_stopping_rounds=800)
    

Training until validation scores don't improve for 800 rounds.
[1000]	training's auc: 0.999052	valid_1's auc: 0.896395
[2000]	training's auc: 1	valid_1's auc: 0.898687
Early stopping, best iteration is:
[1572]	training's auc: 0.999996	valid_1's auc: 0.898972
Training until validation scores don't improve for 800 rounds.
[1000]	training's auc: 0.995969	valid_1's auc: 0.910036
[2000]	training's auc: 0.999964	valid_1's auc: 0.91032
Early stopping, best iteration is:
[1276]	training's auc: 0.998738	valid_1's auc: 0.911856
Training until validation scores don't improve for 800 rounds.
[1000]	training's auc: 0.991241	valid_1's auc: 0.906549
[2000]	training's auc: 0.999149	valid_1's auc: 0.903974
Early stopping, best iteration is:
[1378]	training's auc: 0.996748	valid_1's auc: 0.908183
Training until validation scores don't improve for 800 rounds.
[1000]	training's auc: 0.985516	valid_1's auc: 0.923736
[2000]	training's auc: 0.997287	valid_1's auc: 0.923501
Early stopping, best iteration is:


In [121]:
model.best_iteration

1520

In [122]:
fraud_model = lgb.LGBMClassifier(**params, num_boost_round=model.best_iteration)
fraud_model.fit(train_x, train_y)

LGBMClassifier(bagging_fraction=0.4181193142567742, bagging_seed=15,
               boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.3797454081646243, importance_type='split',
               learning_rate=0.010283254663721497, max_delta_step=0.5,
               max_depth=-1, metric='auc', min_child_samples=20,
               min_child_weight=0.02454473273214212, min_data_in_leaf=50,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1,
               num_boost_round=1520, num_leaves=350, objective='binary',
               random_state=53, reg_alpha=0.2899927210061127,
               reg_lambda=0.4485237330340494, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0, verbosity=-1)

In [123]:
sub['isFraud'] = fraud_model.predict_proba(test)[:, 1]

In [124]:
sub.reset_index().to_csv('CSV_Submissions/ieee_cis_fraud_detection_v5.csv', index=False)