In [2]:
import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [57]:
X = pd.read_csv("../data/model-data.csv")
print(X.shape)
X.head()

(144233, 366)


Unnamed: 0,isfraud,transactionamt,card1,card2,card3,card5,c1,c2,c3,c4,...,browseropera,browserother,browserpalemoon,browserpuffin,browsersafari,browsersamsung,browserseamonkey,browsersilk,browserwaterfox,browserzte
0,0,50.0,4497,514.0,150.0,102.0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,15.0,2803,100.0,150.0,226.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,75.887,16496,352.0,117.0,134.0,1,4,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,16.495,4461,375.0,185.0,224.0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,30.0,1790,555.0,150.0,226.0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
train = X.copy()
train = train[['isfraud', 'transactionamt', 'card1', 'card2','card3','card5']]
train.head()

Unnamed: 0,isfraud,transactionamt,card1,card2,card3,card5
0,0,50.0,4497,514.0,150.0,102.0
1,0,15.0,2803,100.0,150.0,226.0
2,0,75.887,16496,352.0,117.0,134.0
3,0,16.495,4461,375.0,185.0,224.0
4,0,30.0,1790,555.0,150.0,226.0


In [61]:
# get the labels
y = train.isfraud.ravel()
train.drop(['isfraud'], inplace=True, axis=1)
x = train.values

In [62]:
## Create training and validation sets
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [63]:
## Identify categorical features
categorical_features = []
for c,col in enumerate(train.columns):
    if train[col].dtype.name == "categorical":
        print(col)
        categorical_features.append(col)

In [64]:
train_data = lgbm.Dataset(x, label=y, feature_name = ['transactionamt', 'card1', 'card2','card3','card5'])
test_data = lgbm.Dataset(x_test, label=y_test)

In [68]:
## Train the model

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

model = lgbm.train(parameters,
                       train_data,
                       valid_sets=[test_data],
                       num_boost_round=100, 
                   verbose_eval=False)

In [77]:
import pprint
pprint.pprint(model.model_to_string())

('tree\n'
 'version=v2\n'
 'num_class=1\n'
 'num_tree_per_iteration=1\n'
 'label_index=0\n'
 'max_feature_idx=4\n'
 'objective=binary sigmoid:1\n'
 'feature_names=transactionamt card1 card2 card3 card5\n'
 'feature_infos=[0.251:1800] [1000:18388] [100:600] [100:231] [100:237]\n'
 'tree_sizes=2460 2444 2469 2471 2451 2493 2453 2465 2473 2487 2480 2491 2528 '
 '2490 2471 2518 2495 2503 2511 2507 2512 2512 2521 2530 2520 2509 2507 2545 '
 '2541 2518 2509 2500 2535 2526 2513 2510 2532 2521 2530 2533 2532 2523 2534 '
 '2499 2500 2523 2513 2525 2512 2526 2513 2562 2529 2549 2507 2515 2539 2546 '
 '2532 2531 2523 2560 2553 2545 2555 2525 2534 2538 2549 2545 2555 2538 2535 '
 '2545 2519 2526 2557 2523 2522 2574 2557 2508 2551 2533 2538 2526 2542 2555 '
 '2525 2520 2546 2532 2526 2537 2510 2547 2577 2543 2542 2536\n'
 '\n'
 'Tree=0\n'
 'num_leaves=31\n'
 'num_cat=0\n'
 'split_feature=2 0 2 2 2 2 0 2 2 2 2 2 0 2 2 2 2 2 2 0 0 2 2 0 2 2 2 0 2 0\n'
 'split_gain=6234.45 4432.42 3080.22 2563.19 2543

 '168.50000000000003 184.50000000000003 138.50000000000003 130.50000000000003 '
 '141.50000000000003 202.50000000000003 474.50000000000006 428.50000000000006 '
 '476.50000000000006 170.50000000000003 134.50000000000003 116.50000000000001 '
 '141.50000000000003 526.00000000000011 562.50000000000011 555.50000000000011 '
 '196.00000000000003 204.00000000000003 218.00000000000003 190.50000000000003 '
 '499.50000000000006 407.50000000000006 438.50000000000006\n'
 'decision_type=8 10 8 8 10 10 10 10 10 10 8 10 10 10 10 10 8 10 10 10 10 8 10 '
 '10 10 10 10 10 10 10\n'
 'left_child=10 2 4 7 -2 9 19 11 -9 16 17 -3 -13 14 -14 -15 -4 18 -1 -6 -17 '
 '-12 24 -24 26 -26 -10 28 -28 -30\n'
 'right_child=1 3 5 -5 6 -7 -8 8 22 -11 21 12 13 15 -16 20 -18 -19 -20 -21 -22 '
 '-23 23 -25 25 -27 27 -29 29 -31\n'
 'leaf_value=0.017317181149727835 -0.015609427451535039 -0.0029242801129528203 '
 '-0.097589572593916998 0.034793130845977432 0.012612050831073916 '
 '-0.035723967130513797 -0.1099648280252052 -0.0

## On untransformed data

In [3]:
train_identity = pd.read_csv("../data/raw/train_identity.csv")
train_transaction = pd.read_csv("../data/raw/train_transaction.csv")

In [66]:
test_identity = pd.read_csv("../data/raw/test_identity.csv")
test_transaction = pd.read_csv("../data/raw/test_transaction.csv")

In [4]:
X = train_identity.merge(train_transaction)
print(X.shape)
X.head()

(144233, 434)


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987004,0.0,70787.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,,,,,,,,,,
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,,,,,,,,,,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
for c in X.columns:
    print(c)

TransactionID
id_01
id_02
id_03
id_04
id_05
id_06
id_07
id_08
id_09
id_10
id_11
id_12
id_13
id_14
id_15
id_16
id_17
id_18
id_19
id_20
id_21
id_22
id_23
id_24
id_25
id_26
id_27
id_28
id_29
id_30
id_31
id_32
id_33
id_34
id_35
id_36
id_37
id_38
DeviceType
DeviceInfo
isFraud
TransactionDT
TransactionAmt
ProductCD
card1
card2
card3
card4
card5
card6
addr1
addr2
dist1
dist2
P_emaildomain
R_emaildomain
C1
C2
C3
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D2
D3
D4
D5
D6
D7
D8
D9
D10
D11
D12
D13
D14
D15
M1
M2
M3
M4
M5
M6
M7
M8
M9
V1
V2
V3
V4
V5
V6
V7
V8
V9
V10
V11
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
V29
V30
V31
V32
V33
V34
V35
V36
V37
V38
V39
V40
V41
V42
V43
V44
V45
V46
V47
V48
V49
V50
V51
V52
V53
V54
V55
V56
V57
V58
V59
V60
V61
V62
V63
V64
V65
V66
V67
V68
V69
V70
V71
V72
V73
V74
V75
V76
V77
V78
V79
V80
V81
V82
V83
V84
V85
V86
V87
V88
V89
V90
V91
V92
V93
V94
V95
V96
V97
V98
V99
V100
V101
V102
V103
V104
V105
V106
V107
V108
V109
V110
V111
V112
V113
V114
V115
V116
V117

In [50]:
train = X.copy()
train = train[['isFraud', 'TransactionAmt', 'card1', 'card2','card3','card5','DeviceType']]
train['DeviceType'].astype('category')
train.dropna()
train.head()

Unnamed: 0,isFraud,TransactionAmt,card1,card2,card3,card5,DeviceType
0,0,50.0,4497,514.0,150.0,102.0,mobile
1,0,15.0,2803,100.0,150.0,226.0,mobile
2,0,75.887,16496,352.0,117.0,134.0,desktop
3,0,16.495,4461,375.0,185.0,224.0,desktop
4,0,30.0,1790,555.0,150.0,226.0,desktop


In [55]:
## Convert categorical features to ints
def cat_map(x):
    d = {'mobile':1, 
         'desktop':2}
    if (x is not np.nan) and (x is not np.NAN) and (x is not np.NaN):
        val = d[x]
    else:
        val = 0 ## missing
    return val 

train.DeviceType = train.DeviceType.map(cat_map)

In [56]:
# get the labels
y = train.isFraud.ravel()
train.drop(['isFraud'], inplace=True, axis=1)
x = train.values

In [57]:
## Create training and validation sets
#x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
n = train.shape[0]
n_test = int(.2*n)
x_test = train.iloc[(n-n_test):n]
y_test = y[(n-n_test):n]

In [58]:
## Identify categorical features
categorical_features = []
for c,col in enumerate(train.columns):
    if (train[col].dtype.name.find('int') == -1) and (train[col].dtype.name.find('float') == -1):
        print(col)
        categorical_features.append(col)

In [64]:
train_data = lgbm.Dataset(x, 
                          label=y, 
                          feature_name = ['transactionamt', 'card1', 'card2','card3','card5','DeviceType'],
                          categorical_feature=categorical_features,
                                            free_raw_data = False,)

test_data = lgbm.Dataset(x_test, label=y_test, categorical_feature=categorical_features,                   free_raw_data = False,)

In [65]:
## Train the model

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'learning_rate':0.05,
    'verbose': 0
}

model = lgbm.train(parameters,
                       train_data,
                       valid_sets=[test_data],
                       num_boost_round=100, 
                   verbose_eval=True)



[1]	valid_0's auc: 0.739989
[2]	valid_0's auc: 0.751113
[3]	valid_0's auc: 0.753991
[4]	valid_0's auc: 0.759882
[5]	valid_0's auc: 0.763636
[6]	valid_0's auc: 0.764901
[7]	valid_0's auc: 0.765666
[8]	valid_0's auc: 0.766477
[9]	valid_0's auc: 0.768605
[10]	valid_0's auc: 0.769526
[11]	valid_0's auc: 0.770462
[12]	valid_0's auc: 0.770804
[13]	valid_0's auc: 0.771673
[14]	valid_0's auc: 0.772301
[15]	valid_0's auc: 0.772815
[16]	valid_0's auc: 0.772871
[17]	valid_0's auc: 0.773282
[18]	valid_0's auc: 0.773777
[19]	valid_0's auc: 0.774238
[20]	valid_0's auc: 0.774277
[21]	valid_0's auc: 0.77432
[22]	valid_0's auc: 0.774841
[23]	valid_0's auc: 0.77523
[24]	valid_0's auc: 0.775959
[25]	valid_0's auc: 0.776239
[26]	valid_0's auc: 0.776673
[27]	valid_0's auc: 0.776805
[28]	valid_0's auc: 0.777521
[29]	valid_0's auc: 0.77796
[30]	valid_0's auc: 0.778455
[31]	valid_0's auc: 0.7787
[32]	valid_0's auc: 0.779008
[33]	valid_0's auc: 0.779285
[34]	valid_0's auc: 0.779708
[35]	valid_0's auc: 0.779819

In [78]:
## prediction
# 7 entities, each contains 10 features
new_data = test_transaction.merge(test_identity, how="left")
transaction_ids = new_data.TransactionID
new_data = new_data[['TransactionAmt', 'card1', 'card2', 'card3','card5','DeviceType']]
new_data.DeviceType = new_data.DeviceType.map(cat_map)
new_data.shape

(506691, 6)

In [79]:
new_data.head()

Unnamed: 0,TransactionAmt,card1,card2,card3,card5,DeviceType
0,31.95,10409,111.0,150.0,226.0,0
1,49.0,4272,111.0,150.0,226.0,0
2,171.0,4476,574.0,150.0,226.0,0
3,284.95,10989,360.0,150.0,166.0,0
4,67.95,18018,452.0,150.0,117.0,0


In [75]:
ypred = model.predict(new_data)

In [81]:
ypred

array([0.09987767, 0.08975913, 0.15466638, ..., 0.12162086, 0.17931066,
       0.20117609])

In [82]:
## new submission

sub.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.099878
1,3663550,0.089759
2,3663551,0.154666
3,3663552,0.546914
4,3663553,0.219062


In [95]:
import os
import time
def make_submission(ypred, sub_description):
    d = {"TransactionID":transaction_ids, "isFraud":ypred}
    sub = pd.DataFrame(d)
    
    ## get timestamp for filename
    tstamp = round(time.time())
    sub_filename = f"0912_{tstamp}.csv"

    ## write to csv
    sub.to_csv(f"../output/submissions/{sub_filename}", index=False)
    line = sub_filename.replace(".csv","") + " | " + sub_description +"\n"
    print(line)
    
    ## save description
    with open("../output/submissions/submission-descriptions.txt", "a") as outfile:
        outfile.write(line)
        
    ## submit to kaggle
    kaggle_command = f"kaggle competitions submit -c ieee-fraud-detection -f ../output/submissions/{sub_filename} -m \"{sub_description}\""
    print(kaggle_command)
    os.system(kaggle_command)

0912_1568307854 | lightgbm model without FE



In [97]:
make_submission(ypred=ypred, sub_description="lightgbm model without FE")

kaggle competitions submit -c ieee-fraud-detection -f ../output/submissions/0912_1568307854.csv -m "lightgbm model without FE"


0

In [90]:
!ls ../output/submissions

0820_120316                 0820_122047.csv
0820_120316.csv             0820_122052.csv
0820_120938.csv             0820_122202.csv
0820_121205.csv             0820_122208.csv
0820_121520.csv             0820_122316.csv
0820_121638.csv             0820_122320.csv
0820_121822.csv             0820_122639.csv
0820_121904.csv             0911_135219.csv
0820_121907.csv             0912_1568307409.csv
0820_121911.csv             submission-descriptions
0820_121914.csv             submission-descriptions.txt
0820_122036.csv
