In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from haversine import haversine, haversine_vector
import joblib

In [2]:
# Read the labelled training split and the unlabelled test split, then print
# a quick sanity summary: shapes, column names, and the class balance.
train, test = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

print(train.shape, test.shape)
print(train.columns, test.columns, sep="\n")
print(train["target"].value_counts())

(786431, 18) (262144, 17)
Index(['transaction_time', 'merch', 'cat_id', 'amount', 'name_1', 'name_2',
       'gender', 'street', 'one_city', 'us_state', 'post_code', 'lat', 'lon',
       'population_city', 'jobs', 'merchant_lat', 'merchant_lon', 'target'],
      dtype='object')
Index(['transaction_time', 'merch', 'cat_id', 'amount', 'name_1', 'name_2',
       'gender', 'street', 'one_city', 'us_state', 'post_code', 'lat', 'lon',
       'population_city', 'jobs', 'merchant_lat', 'merchant_lon'],
      dtype='object')
target
0    781927
1      4504
Name: count, dtype: int64


In [3]:
# Show the dtype pandas inferred for each training column (useful for deciding
# which columns need parsing or categorical handling before modelling).
print(train.dtypes)

transaction_time     object
merch                object
cat_id               object
amount              float64
name_1               object
name_2               object
gender               object
street               object
one_city             object
us_state             object
post_code             int64
lat                 float64
lon                 float64
population_city       int64
jobs                 object
merchant_lat        float64
merchant_lon        float64
target                int64
dtype: object


In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,post_code,lat,lon,population_city,jobs,merchant_lat,merchant_lon,target
0,2019-12-27 15:21,fraud_Cormier LLC,health_fitness,148.04,Daniel,Martinez,M,8510 Acevedo Burgs,Kent,OR,97033,45.0838,-120.6649,60,Museum education officer,45.042827,-120.709327,0
1,2019-04-17 23:09,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,Grace,Williams,F,28812 Charles Mill Apt. 628,Plantersville,AL,36758,32.6176,-86.9475,1412,Drilling engineer,31.872266,-87.828247,0
2,2019-09-23 15:02,fraud_Ruecker-Mayert,kids_pets,52.96,Kyle,Park,M,7507 Larry Passage Suite 859,Mount Perry,OH,43760,39.8788,-82.188,1831,Barrister's clerk,40.010874,-81.841249,0
3,2019-05-13 16:00,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,Monique,Martin,F,68276 Matthew Springs,Ratcliff,TX,75858,31.3833,-95.0619,43,"Engineer, production",30.888406,-95.141609,0
4,2019-08-18 07:27,fraud_Luettgen PLC,gas_transport,51.59,Christine,Johnson,F,8011 Chapman Tunnel Apt. 568,Blairsden-Graeagle,CA,96103,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),39.376017,-121.311691,0


In [5]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,post_code,lat,lon,population_city,jobs,merchant_lat,merchant_lon
0,2019-09-14 02:46,"fraud_Stokes, Christiansen and Sipes",grocery_net,25.79,Michael,Rodriguez,M,172 Paula Inlet Apt. 650,Cross Plains,TX,76443,32.1482,-99.1872,1897,Chief Operating Officer,31.772057,-99.103183
1,2019-07-25 20:30,fraud_Thompson-Gleason,health_fitness,87.8,Curtis,Young,M,4319 Watson Shoals Suite 658,Falconer,NY,14733,42.1239,-79.1895,3833,Metallurgist,42.635312,-78.334559
2,2020-01-05 17:27,fraud_Friesen Inc,shopping_pos,10.98,Brandy,Quinn,F,9734 Beard Fields Suite 885,Altair,TX,77412,29.6047,-96.5249,106,"Editor, film/video",29.363782,-95.624994
3,2019-07-21 00:14,"fraud_Jenkins, Hauck and Friesen",gas_transport,71.97,Ashley,Mcdonald,F,3160 Tina Estates Suite 234,Marietta,SC,29661,35.0296,-82.5136,5648,Museum/gallery exhibitions officer,35.142659,-82.489028
4,2019-03-13 00:45,"fraud_Kovacek, Dibbert and Ondricka",grocery_pos,210.5,Kimberly,Rice,F,63991 Destiny Rue Apt. 651,Tyler,TX,75703,32.2768,-95.3031,144160,Sports development officer,31.833016,-94.746542


In [6]:
# Feature engineering, applied in place to every frame in `df_list`:
#   * hour / dayofweek  - calendar parts of the transaction timestamp
#   * distance_km       - haversine distance between the customer coordinates
#                         (lat, lon) and the merchant coordinates
# NOTE(review): only `train` is listed; `test` will need the same derived
# columns before it can be scored - confirm where that happens.
df_list = [train]

for df in df_list:
    # A missing column now raises KeyError right here instead of being
    # swallowed and surfacing later as a confusing error during model setup.
    # Re-running the cell is safe: to_datetime leaves a datetime column as is.
    df["transaction_time"] = pd.to_datetime(df["transaction_time"])
    df["hour"] = df["transaction_time"].dt.hour
    df["dayofweek"] = df["transaction_time"].dt.dayofweek  # Monday=0 ... Sunday=6

    # haversine_vector pairs row i of coords1 with row i of coords2, each row
    # being a (latitude, longitude) point.
    coords1 = np.column_stack([df["lat"].to_numpy(), df["lon"].to_numpy()])
    coords2 = np.column_stack([df["merchant_lat"].to_numpy(), df["merchant_lon"].to_numpy()])
    df["distance_km"] = haversine_vector(coords1, coords2)

# Display the enriched training frame by name rather than via the leaked
# loop variable.
train.head()

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,...,lat,lon,population_city,jobs,merchant_lat,merchant_lon,target,hour,dayofweek,distance_km
0,2019-12-27 15:21:00,fraud_Cormier LLC,health_fitness,148.04,Daniel,Martinez,M,8510 Acevedo Burgs,Kent,OR,...,45.0838,-120.6649,60,Museum education officer,45.042827,-120.709327,0,15,4,5.738662
1,2019-04-17 23:09:00,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,Grace,Williams,F,28812 Charles Mill Apt. 628,Plantersville,AL,...,32.6176,-86.9475,1412,Drilling engineer,31.872266,-87.828247,0,23,2,117.172344
2,2019-09-23 15:02:00,fraud_Ruecker-Mayert,kids_pets,52.96,Kyle,Park,M,7507 Larry Passage Suite 859,Mount Perry,OH,...,39.8788,-82.188,1831,Barrister's clerk,40.010874,-81.841249,0,15,0,33.007313
3,2019-05-13 16:00:00,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,Monique,Martin,F,68276 Matthew Springs,Ratcliff,TX,...,31.3833,-95.0619,43,"Engineer, production",30.888406,-95.141609,0,16,0,55.550246
4,2019-08-18 07:27:00,fraud_Luettgen PLC,gas_transport,51.59,Christine,Johnson,F,8011 Chapman Tunnel Apt. 568,Blairsden-Graeagle,CA,...,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),39.376017,-121.311691,0,7,6,75.267365


In [7]:
# Column roles used to build the CatBoost pools.
target_col = "target"
cat_cols = ["merch", "cat_id", "name_1", "name_2", "gender", "one_city", "us_state", "jobs"]
num_cols = ["amount", "post_code", "population_city", "hour", "dayofweek", "distance_km"]
ts_cols = ["transaction_time"]
text_cols = ["street"]

X = train[cat_cols + num_cols + ts_cols + text_cols].copy()
y = train[target_col]

# Stratified 80/20 hold-out so both parts keep the (very low) positive rate.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Integer form of the transaction timestamp, handed to CatBoost as the per-row
# timestamp. NOTE(review): this is milliseconds only if the column is
# datetime64[ns] - confirm the unit if the pandas version changes.
ts = X_train[ts_cols].astype("int64") // 10**6
ts_val = X_valid[ts_cols].astype("int64") // 10**6

# The raw datetime column is not a model feature. Drop it by rebinding rather
# than mutating the split outputs in place, so the cell stays re-runnable and
# never modifies a frame derived from another one.
X_train = X_train.drop(columns=ts_cols)
X_valid = X_valid.drop(columns=ts_cols)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols, text_features=text_cols, timestamp=ts)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_cols, text_features=text_cols, timestamp=ts_val)

model = CatBoostClassifier(
    iterations=600,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    auto_class_weights="Balanced",  # counteracts the heavy class imbalance in `target`
    has_time=True,  # NOTE(review): presumably meant to pair with the pool timestamps - confirm against CatBoost docs
    verbose=100,  # log every 100th iteration
    random_seed=42,
    use_best_model=True,  # keep the iteration with the best eval-set metric
)

In [8]:
# Fit on the training pool while scoring the hold-out pool as the eval set;
# the model was configured with use_best_model=True, so the eval set also
# decides which iteration is kept.
model.fit(train_pool, eval_set=valid_pool)

0:	test: 0.9343379	best: 0.9343379 (0)	total: 273ms	remaining: 2m 43s
100:	test: 0.9928122	best: 0.9928122 (100)	total: 22.9s	remaining: 1m 53s
200:	test: 0.9939769	best: 0.9939898 (197)	total: 42.2s	remaining: 1m 23s
300:	test: 0.9958258	best: 0.9958258 (300)	total: 1m 3s	remaining: 1m 2s
400:	test: 0.9967198	best: 0.9967198 (400)	total: 1m 26s	remaining: 42.9s
500:	test: 0.9970773	best: 0.9970792 (499)	total: 1m 49s	remaining: 21.6s
599:	test: 0.9972473	best: 0.9972532 (596)	total: 2m 10s	remaining: 0us

bestTest = 0.9972531753
bestIteration = 596

Shrink model to first 597 iterations.


<catboost.core.CatBoostClassifier at 0x116483d10>

In [9]:
# Persist the trained model. Create the output directory first so the save
# does not fail on a fresh checkout where ../models does not exist yet.
Path("../models").mkdir(parents=True, exist_ok=True)
model.save_model("../models/model.cbm")

In [16]:
# Persist the exact column order the model was trained on, taken from the
# training frame itself. Do not hand-assemble this list from the role lists:
# the training frame is ordered cat_cols + num_cols + text_cols, and any other
# ordering would misalign features at inference time.
feature_order = list(X_train.columns)
# Create the output directory first so the dump works on a fresh checkout.
Path("../artifacts").mkdir(parents=True, exist_ok=True)
joblib.dump(feature_order, "../artifacts/feature_order.joblib")

['../artifacts/feature_order.joblib']

In [11]:
# Positive-class probabilities (column 1 of predict_proba) on the hold-out
# features, scored with ROC AUC.
# NOTE(review): this hold-out is the same eval set used during fitting to pick
# the best iteration, so the figure may be slightly optimistic.
preds = model.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, preds)

0.9972590587452348

In [12]:
# Per-feature importances of the fitted model, returned as a table
# (prettified=True) for display.
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,amount,58.776523
1,cat_id,17.911208
2,hour,10.235033
3,street,5.743025
4,merch,1.885544
5,population_city,1.30274
6,one_city,1.158066
7,dayofweek,0.893183
8,name_2,0.658015
9,distance_km,0.557451
