In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from pandas.api.types import CategoricalDtype

In [3]:
def create_col_name(base_str: str, start_int: int, end_int: int) ->  list:
    return [base_str + str(i) for i in range(start_int, end_int + 1)]

In [4]:
create_col_name("card", 1, 7)

['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'card7']

In [5]:
cat_cols = (['ProductCD'] +  create_col_name('card', 1, 6) + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain'] + 
            create_col_name('M', 1, 9) + ['DeviceType', 'DeviceInfo'] +  create_col_name('id_', 12, 38))

id_cols = ['TransactionID', 'TransactionDT']

target = 'isFraud'

In [6]:
type_map = {c: str for c in cat_cols + id_cols}

In [30]:
df_train_id = pd.read_csv('data/train_identity.csv')
df_train_trans = pd.read_csv('data/train_transaction.csv')

df_test_id = pd.read_csv('data/test_identity.csv')
df_test_trans = pd.read_csv('data/test_transaction.csv')

In [8]:
df_train_id.shape, df_train_trans.shape

((144233, 41), (590540, 394))

In [9]:
df_train_id.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [10]:
df_test_id.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0
3,3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
4,3663602,-95.0,328680.0,,,7.0,-33.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G9650 Build/R16NW


In [31]:
df_train = df_train_trans.merge(df_train_id, on = "TransactionID", how = 'left')

In [32]:
df_test = df_test_trans.merge(df_test_id, on = "TransactionID", how = 'left')

In [33]:
df_test_id.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0
3,3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
4,3663602,-95.0,328680.0,,,7.0,-33.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G9650 Build/R16NW


In [34]:
numeric_cols = [col for col in df_train.columns.tolist() if col not in cat_cols + id_cols + [target]]

In [35]:
assert(df_train.shape[0] == df_train_trans.shape[0])

In [36]:
lines = list(df_train)

In [37]:
df_train[cat_cols].head()

Unnamed: 0,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,P_emaildomain,...,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38
0,W,13926,,150.0,discover,142.0,credit,315.0,87.0,,...,,,,,,,,,,
1,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,gmail.com,...,,,,,,,,,,
2,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,outlook.com,...,,,,,,,,,,
3,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,yahoo.com,...,,,,,,,,,,
4,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,gmail.com,...,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T


In [38]:
assert(df_test.shape[0] == df_test_trans.shape[0])

In [39]:
## Modeling

In [40]:
from catboost import Pool, CatBoostClassifier, cv

In [41]:
features = cat_cols + numeric_cols

In [42]:
df_train.loc[:,cat_cols] = df_train[cat_cols].fillna('<UNK>')
df_test.loc[:,cat_cols] = df_test[cat_cols].fillna('<UNK>')


In [52]:
def cat_to_int_train(df_train, col):
    catDtype = CategoricalDtype(categories = df_train[col].value_counts().index.values)
    return df_train[col].astype(catDtype).cat.codes.values

def cat_to_int_test(df_test, col):
    catDtype = CategoricalDtype(categories = df_test[col].value_counts().index.values)
    return df_test[col].astype(catDtype).cat.codes.values 


In [53]:
cat_cols_encoded = list()
for col in cat_cols:
    df_train[col]  = cat_to_int_train(df_train, col)
    df_test[col] = cat_to_int_test(df_test, col)
    cat_cols_encoded.append(col)

In [72]:

df_train['addr1']

0         315.0
1         325.0
2         330.0
3         476.0
4         420.0
          ...  
590535    272.0
590536    204.0
590537    231.0
590538    387.0
590539    299.0
Name: addr1, Length: 590540, dtype: object

In [54]:
train_data = Pool(
    data = df_train[features],
    label = df_train[target],
    cat_features = cat_cols_encoded,
)

In [57]:
test_data = Pool(
    data = df_test[features],
    cat_features = cat_cols_encoded
)

In [80]:
params = {
    'iterations': 200,
    'custom_metric': 'AUC',
    'learning_rate': 0.05,
    'loss_function': 'CrossEntropy'}

In [60]:
cv_results = cv(train_data, params, fold_count = 3, plot = False)

0:	learn: 0.6444716	test: 0.6444731	best: 0.6444731 (0)	total: 12.3s	remaining: 10m
1:	learn: 0.5998007	test: 0.5998147	best: 0.5998147 (1)	total: 20.8s	remaining: 8m 20s
2:	learn: 0.5594245	test: 0.5594350	best: 0.5594350 (2)	total: 27.8s	remaining: 7m 16s
3:	learn: 0.5216342	test: 0.5216560	best: 0.5216560 (3)	total: 34.8s	remaining: 6m 39s
4:	learn: 0.4873778	test: 0.4874086	best: 0.4874086 (4)	total: 38.1s	remaining: 5m 43s
5:	learn: 0.4556835	test: 0.4557063	best: 0.4557063 (5)	total: 47.9s	remaining: 5m 51s
6:	learn: 0.4266075	test: 0.4266397	best: 0.4266397 (6)	total: 1m 3s	remaining: 6m 28s
7:	learn: 0.4013086	test: 0.4013445	best: 0.4013445 (7)	total: 1m 8s	remaining: 5m 58s
8:	learn: 0.3772740	test: 0.3773223	best: 0.3773223 (8)	total: 1m 12s	remaining: 5m 32s
9:	learn: 0.3551094	test: 0.3551612	best: 0.3551612 (9)	total: 1m 17s	remaining: 5m 8s
10:	learn: 0.3346956	test: 0.3347556	best: 0.3347556 (10)	total: 1m 22s	remaining: 4m 50s
11:	learn: 0.3162397	test: 0.3162966	best:

In [61]:
print(cv_results)

    iterations  test-CrossEntropy-mean  test-CrossEntropy-std  \
0            0                0.644473               0.000192   
1            1                0.599815               0.001103   
2            2                0.559435               0.001094   
3            3                0.521656               0.001294   
4            4                0.487409               0.002162   
5            5                0.455706               0.002094   
6            6                0.426640               0.003046   
7            7                0.401345               0.002826   
8            8                0.377322               0.002487   
9            9                0.355161               0.002547   
10          10                0.334756               0.001408   
11          11                0.316297               0.000630   
12          12                0.298860               0.001005   
13          13                0.283680               0.001548   
14          14           

In [82]:
model = CatBoostClassifier(**params)
model.fit(train_data)

0:	learn: 0.6120415	total: 12.4s	remaining: 41m 5s
1:	learn: 0.5417567	total: 19.6s	remaining: 32m 24s
2:	learn: 0.4822747	total: 24.9s	remaining: 27m 15s
3:	learn: 0.4306512	total: 29.6s	remaining: 24m 8s
4:	learn: 0.3865012	total: 34.2s	remaining: 22m 13s
5:	learn: 0.3489530	total: 38.8s	remaining: 20m 54s
6:	learn: 0.3180862	total: 43.3s	remaining: 19m 53s
7:	learn: 0.2895576	total: 47.5s	remaining: 18m 59s
8:	learn: 0.2666954	total: 51.4s	remaining: 18m 11s
9:	learn: 0.2474223	total: 55.2s	remaining: 17m 29s
10:	learn: 0.2294559	total: 59.5s	remaining: 17m 1s
11:	learn: 0.2134498	total: 1m 3s	remaining: 16m 28s
12:	learn: 0.2010534	total: 1m 7s	remaining: 16m 7s
13:	learn: 0.1896435	total: 1m 11s	remaining: 15m 46s
14:	learn: 0.1788810	total: 1m 15s	remaining: 15m 29s
15:	learn: 0.1710201	total: 1m 19s	remaining: 15m 14s
16:	learn: 0.1638149	total: 1m 23s	remaining: 15m
17:	learn: 0.1582043	total: 1m 27s	remaining: 14m 46s
18:	learn: 0.1521950	total: 1m 31s	remaining: 14m 29s
19:	l

154:	learn: 0.0681059	total: 12m 6s	remaining: 3m 30s
155:	learn: 0.0677601	total: 12m 11s	remaining: 3m 26s
156:	learn: 0.0676935	total: 12m 16s	remaining: 3m 21s
157:	learn: 0.0676681	total: 12m 20s	remaining: 3m 16s
158:	learn: 0.0675902	total: 12m 25s	remaining: 3m 12s
159:	learn: 0.0675040	total: 12m 29s	remaining: 3m 7s
160:	learn: 0.0673224	total: 12m 34s	remaining: 3m 2s
161:	learn: 0.0671321	total: 12m 39s	remaining: 2m 58s
162:	learn: 0.0669525	total: 12m 45s	remaining: 2m 53s
163:	learn: 0.0668696	total: 12m 49s	remaining: 2m 49s
164:	learn: 0.0668349	total: 12m 53s	remaining: 2m 44s
165:	learn: 0.0667937	total: 12m 58s	remaining: 2m 39s
166:	learn: 0.0667462	total: 13m 2s	remaining: 2m 34s
167:	learn: 0.0666011	total: 13m 8s	remaining: 2m 30s
168:	learn: 0.0664743	total: 13m 13s	remaining: 2m 25s
169:	learn: 0.0664344	total: 13m 17s	remaining: 2m 20s
170:	learn: 0.0663502	total: 13m 22s	remaining: 2m 16s
171:	learn: 0.0662705	total: 13m 27s	remaining: 2m 11s
172:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x7fd0d54fac70>

In [75]:
y_test_hat = model.predict_proba(test_data)[:,1]

In [78]:
df_test['isFraud'] = y_test_hat

In [79]:
df_test[['TransactionID', 'isFraud']].to_csv('data/submission_catboost_v1.csv', index = False)