In [2]:
import pandas as pd

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [4]:
train

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,post_code,lat,lon,population_city,jobs,merchant_lat,merchant_lon,target
0,2019-12-27 15:21,fraud_Cormier LLC,health_fitness,148.04,Daniel,Martinez,M,8510 Acevedo Burgs,Kent,OR,97033,45.0838,-120.6649,60,Museum education officer,45.042827,-120.709327,0
1,2019-04-17 23:09,"fraud_Brown, Homenick and Lesch",health_fitness,39.40,Grace,Williams,F,28812 Charles Mill Apt. 628,Plantersville,AL,36758,32.6176,-86.9475,1412,Drilling engineer,31.872266,-87.828247,0
2,2019-09-23 15:02,fraud_Ruecker-Mayert,kids_pets,52.96,Kyle,Park,M,7507 Larry Passage Suite 859,Mount Perry,OH,43760,39.8788,-82.1880,1831,Barrister's clerk,40.010874,-81.841249,0
3,2019-05-13 16:00,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,Monique,Martin,F,68276 Matthew Springs,Ratcliff,TX,75858,31.3833,-95.0619,43,"Engineer, production",30.888406,-95.141609,0
4,2019-08-18 07:27,fraud_Luettgen PLC,gas_transport,51.59,Christine,Johnson,F,8011 Chapman Tunnel Apt. 568,Blairsden-Graeagle,CA,96103,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),39.376017,-121.311691,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786426,2019-04-10 12:35,"fraud_O'Connell, Botsford and Hand",home,76.56,Ryan,Foster,M,03921 Cole Mission Suite 882,Hampton,FL,32044,29.8575,-82.1483,2060,Oncologist,29.235257,-82.407844,0
786427,2019-12-15 09:34,"fraud_Adams, Kovacek and Kuhlman",grocery_net,68.58,Jim,Johnson,M,868 Brady Mill Apt. 837,Gretna,LA,70056,29.8872,-90.0331,55581,Biomedical scientist,29.015274,-90.564712,0
786428,2019-10-12 10:22,"fraud_Lind, Huel and McClure",gas_transport,66.66,Christopher,Horn,M,956 Sanchez Highway,Mallie,KY,41836,37.2692,-82.9161,798,Facilities manager,37.515508,-82.443788,0
786429,2019-10-18 09:01,fraud_Rempel PLC,grocery_net,38.06,Samuel,Sandoval,M,0005 Morrison Land,Mounds,OK,74047,35.8896,-96.0887,7163,Fitness centre manager,35.203864,-96.999902,0


In [6]:
import catboost

In [49]:
import numpy as np
from datetime import datetime


def process_time(df):
    df = df.copy()
    
    df['transaction_time'] = pd.to_datetime(df['transaction_time'])
    
    df['hour'] = df['transaction_time'].dt.hour
    df['minute'] = df['transaction_time'].dt.minute
    df['time_of_day'] = (df['hour'] + df['minute'] / 60) / 24

    df['time_sin'] = np.sin(2 * np.pi * df['time_of_day'])
    df['time_cos'] = np.cos(2 * np.pi * df['time_of_day'])

    df['is_weekend'] = (df['transaction_time'].dt.dayofweek >= 5).astype(int)

    df = df.drop(columns=['hour', 'minute', 'time_of_day'])

    return df


def process_features(df):
    df = df.copy()
    df = process_time(df)

    df['amount_log'] = np.log(df['amount'] + 1)
    df['population_city_log'] = (df['population_city'] - df['population_city'].mean()) / df['population_city'].std()

    df['distance'] = np.sqrt((df['lat'] - df['merchant_lat']) ** 2 + (df['lon'] - df['merchant_lon']) ** 2)

    df.drop(columns=['name_1', 'name_2', 'population_city'], inplace=True)

    return df

In [72]:
from sklearn.model_selection import train_test_split

my_train, my_val = train_test_split(train, test_size=0.2, random_state=42)

train_y = my_train['target']
train_X = my_train.drop(columns=['target'])

val_y = my_val['target']
val_X = my_val.drop(columns=['target'])

In [73]:
processed_train = process_features(train_X)
processed_val = process_features(val_X)

In [74]:
cat_features = [
    'merch', 'cat_id', 'gender', 'street', 'one_city', 'us_state', 'post_code', 'jobs'
]
num_features = [
    'amount', 'lat', 'lon', 'merchant_lat', 'merchant_lon', 'distance', 'amount_log', 'population_city_log'
]

In [77]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(processed_train[num_features])
processed_train[num_features] = scaler.transform(processed_train[num_features])
processed_val[num_features] = scaler.transform(processed_val[num_features])

In [78]:
classifier = catboost.CatBoostClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    class_weights=[1, 5]
)
classifier.fit(processed_train, train_y, cat_features=cat_features, verbose=50)

0:	learn: 0.5692461	total: 94.4ms	remaining: 9.35s
50:	learn: 0.0359631	total: 4.25s	remaining: 4.09s
99:	learn: 0.0282843	total: 8.38s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2b4398220>

In [80]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

def predict_and_evaluate(classifier, X, y, th):
    predicted = classifier.predict_proba(X)[:, 1]
    predicted = (predicted > th).astype(int)
    return predicted

predicted = predict_and_evaluate(classifier, processed_val, val_y, 0.5)

print('ROC AUC:', roc_auc_score(val_y, predicted))
print('Accuracy:', accuracy_score(val_y, predicted))
print('Precision:', precision_score(val_y, predicted))
print('Recall:', recall_score(val_y, predicted))
print('F1 Score:', f1_score(val_y, predicted))

ROC AUC: 0.8853072948778975
Accuracy: 0.997030905287786
Precision: 0.7297297297297297
Recall: 0.7722772277227723
F1 Score: 0.7504008551576697


In [81]:
processed_test = process_features(test)

In [82]:
predicted_test = classifier.predict(processed_test)

In [83]:
result = pd.DataFrame({'index': np.arange(len(predicted_test)), 'prediction': predicted_test})
result.to_csv('../data/result.csv', index=False)

In [84]:
pd.read_csv('../data/result.csv')

Unnamed: 0,index,prediction
0,0,0
1,1,0
2,2,0
3,3,1
4,4,1
...,...,...
262139,262139,1
262140,262140,0
262141,262141,0
262142,262142,1


In [86]:
classifier.save_model('../models/catboost_model.cbm') 

In [87]:
import joblib
joblib.dump(scaler, '../models/scaler.pkl')

['../models/scaler.pkl']