In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc
import lightgbm as lgb
import os
import joblib

## Import datasets

In [37]:
def import_datasets(path: str) -> pd.DataFrame:
    x_train = pd.read_feather(os.path.join(path, 'x_train.feather'))
    y_train = pd.read_csv(os.path.join(path, 'y_train.csv'), index_col=False, header=0)
    y_train = y_train['isFraud']
    x_test = pd.read_feather(os.path.join(path, 'x_test.feather'))
    y_test = pd.read_csv(os.path.join(path, 'y_test.csv'), index_col=False, header=0)
    y_test = y_test['isFraud']

    return x_train, y_train, x_test, y_test

In [38]:
x_train, y_train, x_test, y_test = import_datasets('../data')

In [39]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442903 entries, 0 to 442902
Columns: 228 entries, TransactionAmt to hour_of_day
dtypes: float64(184), int32(30), int64(14)
memory usage: 719.7 MB


## Train model

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
def optimize_parameters(x_train: pd.DataFrame, y_train: pd.DataFrame, x_test: pd.DataFrame, y_test: pd.DataFrame):
    params = {'n_estimators': [100, 200, 500, 1000, 1500, 2000], 'max_depth': [3, 6, 12], 'subsample': [0.4, 0.8]}

    clf = lgb.LGBMClassifier(
        learning_rate=0.02, random_state=42, colsample_bytree=0.4)

    model = GridSearchCV(estimator=clf, param_grid=params, scoring='roc_auc', verbose=3, n_jobs=-1)
    model.fit(x_train, y_train)

    print (model.best_params_)

    return model

In [42]:
model = optimize_parameters(x_train, y_train, x_test, y_test)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'max_depth': 12, 'n_estimators': 2000, 'subsample': 0.4}


In [43]:
joblib.dump(model, '../models/lgbm_gscv.joblib')

['../models/lgbm_gscv.joblib']