In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from typing import Counter
from sklearn.model_selection import GridSearchCV

In [3]:
train_standard = pd.read_csv('pre-processed_train.csv')
test_standard = pd.read_csv('pre-processed_test.csv')

In [4]:
test_standard.shape, train_standard.shape

((147635, 259), (442905, 259))

In [3]:
X_train = train_standard.drop(axis="columns", labels="isFraud").to_numpy().astype(np.float64)
y_train = train_standard["isFraud"].to_numpy().astype(np.float64)
X_test = test_standard.drop(axis="columns", labels="isFraud").to_numpy().astype(np.float64)

# Re-sampling

In [4]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [5]:
over = RandomOverSampler(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

pipeline = Pipeline(steps=[('o', over), ('u', under)])
train_X_re, train_y_re = pipeline.fit_resample(X_train, y_train)

# XGBoost model

In [6]:
import xgboost as xgb

param_test1 = {<br>
    'max_depth': [7, 9, 11],<br>
    'min_child_weight': [1, 3]<br>
}<br>

gsearch_1 = GridSearchCV(estimator=xgb_classifier, param_grid=param_test1, scoring='roc_auc', cv=3)

Best parameters:  {'max_depth': 11, 'min_child_weight': 1}<br>

param_test2 = {<br>
    'n_estimators' : [2000, 2500, 3000],<br>
    'max_depth': [8, 10, 12],<br>
    'min_child_weight': [1, 2]<br>
}<br>

gsearch_2 = GridSearchCV(estimator=xgb_classifier, param_grid=param_test2, scoring='roc_auc', cv=3)

Best parameters:  {'max_depth': 12, 'min_child_weight': 1, 'n_estimators': 2000}

In [7]:
xgb_classifier_opt = xgb.XGBClassifier(learning_rate=0.2, n_estimators=2000, tree_method='gpu_hist', max_depth=12)

In [8]:
xgb_classifier_opt.fit(train_X_re, train_y_re)

In [9]:
ypred_test_xgb = xgb_classifier_opt.predict(X_test)
ypred_train_xgb = xgb_classifier_opt.predict(X_train)

In [10]:
print(f1_score(ypred_train_xgb, y_train))

0.8670377933812627


In [11]:
test_results = pd.Series(ypred_test_xgb.astype('int32'), name="isFraud")
test_results.to_csv("submission_1.csv")

In [12]:
ypred_test_xgb_prob = xgb_classifier_opt.predict_proba(X_test)

In [17]:
ypred_test_xgb_prob

array([[9.2814398e-01, 7.1856052e-02],
       [1.1801720e-05, 9.9998820e-01],
       [9.9999940e-01, 5.8488860e-07],
       ...,
       [9.9999827e-01, 1.7460846e-06],
       [9.9999952e-01, 4.5834420e-07],
       [9.9797678e-01, 2.0232005e-03]], dtype=float32)

In [20]:
prob = []
for i in range(ypred_test_xgb_prob.shape[0]):
    prob.append(ypred_test_xgb_prob[i][1])

In [21]:
prob

[0.07185605,
 0.9999882,
 5.848886e-07,
 4.3796467e-06,
 2.2438755e-05,
 3.71236e-05,
 0.00029950353,
 0.015428688,
 1.0896682e-06,
 0.27876195,
 0.00010929316,
 1.1058468e-05,
 0.00067249127,
 0.00028338688,
 0.00034544268,
 5.3433923e-05,
 0.0026560714,
 0.00010728121,
 0.0050285854,
 0.0035064365,
 2.756082e-06,
 0.0014834405,
 5.947661e-05,
 0.00012190868,
 1.6320748e-05,
 4.1565574e-05,
 3.3594094e-07,
 1.2389384e-06,
 8.0585386e-07,
 0.00030001166,
 8.449739e-07,
 1.5109147e-06,
 9.812354e-05,
 2.3329799e-06,
 0.00026719784,
 3.4281757e-06,
 7.256112e-05,
 1.5280426e-06,
 0.9999968,
 0.00029352587,
 0.00018342305,
 4.728598e-06,
 3.221676e-07,
 1.4351253e-06,
 2.0477379e-05,
 3.219522e-05,
 0.0011752811,
 5.897379e-06,
 0.750539,
 1.031386e-06,
 1.3836599e-06,
 1.3050107e-05,
 7.5460316e-06,
 7.1671043e-06,
 4.2662155e-05,
 0.00025205596,
 0.00011131555,
 4.292441e-07,
 0.002908398,
 1.0868497e-05,
 2.784796e-06,
 0.00021006768,
 0.014363415,
 0.0012392128,
 0.002450319,
 7.20980

In [22]:
test_results = pd.Series(prob, name="isFraud")
test_results.to_csv("submission_1_prob.csv")