# In [None]:
import sys
import warnings
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
from argparse import ArgumentParser

# self defined methods
sys.path.append('..')
from Utils.CrossValidate import CrossValidate, lgb_f1_score
from Utils.Feature import FeatureEngineer
from Utils.func import train_submit
from Utils.read_data import read

warnings.simplefilter('ignore')


def get_dataset():
    """Load the train and test pickles and return them stacked in one frame.

    There're 23 attributes in the training data set.
    Use the first 22 attributes to predict the last attribute - `fraud_ind` (bool)

    Returns:
        A single DataFrame holding the rows of ``../data/train_mr.pkl``
        followed by the rows of ``../data/test_mr.pkl``, restricted to the
        columns in ``attrs`` (in that order) and re-indexed from 0.
    """
    print('Getting dataset...')

    attrs = ['acquirer', 'bank', 'card', 'money', 'trade_cat', 'coin', 'online', 'trade_type',
             'fallback', '3ds', 'pay_type', 'install', 'term', 'date', 'time', 'mcc', 'shop', 'excess',
             'city', 'nation', 'status', 'txkey', 'fraud_ind']

    # Local names deliberately avoid `train`, which is also the name of the
    # training entry point defined in this module.
    train_df = read('../data/train_mr.pkl')
    test_df = read('../data/test_mr.pkl')

    # reset dataframe column order (affect : LGBM sort by column index)
    # ignore_index=True gives the stacked frame a fresh 0..n-1 index, so the
    # train rows come first and can later be selected by position-like labels.
    combine = pd.concat([train_df, test_df], ignore_index=True)[attrs]

    return combine


def train(action='cv', file_name='submit001'):
    """Run expanding-window cross-validation or produce a submission.

    Args:
        action: ``'cv'`` to cross-validate on the training rows and print the
            mean score, or ``'submit'`` to hand the full dataset to
            ``train_submit``.
        file_name: Base name forwarded to ``train_submit`` and echoed in the
            final message; only used when ``action == 'submit'``.

    Raises:
        ValueError: If ``action`` is neither ``'cv'`` nor ``'submit'``.
    """
    # Fail fast: reject an unknown action before the (expensive) data load
    # instead of loading everything and then silently doing nothing.
    if action not in ('cv', 'submit'):
        raise ValueError(
            "action must be 'cv' or 'submit', got {!r}".format(action))

    # Rows with index labels 0..TRAIN_SHAPE-1 are treated as the training part
    # of the combined frame (presumably the row count of the train file —
    # TODO confirm against the data).
    TRAIN_SHAPE = 1521787
    not_train = ['txkey', 'date', 'time', 'fraud_ind']
    need_encode = ['acquirer', 'bank', 'card',
                   'coin', 'mcc', 'shop', 'city', 'nation']
    cat = ['status', 'trade_cat', 'pay_type', 'trade_type']

    # get dataset
    dataset = get_dataset()

    # pre process
    # The return value is not used, so engineer_all is assumed to modify
    # `dataset` in place — verify against FeatureEngineer.
    preprocessor = FeatureEngineer()
    preprocessor.engineer_all(dataset)

    if action == 'cv':
        # split train / test
        excluded = set(not_train + need_encode)
        feature_cols = [col for col in dataset.columns if col not in excluded]
        # .loc slicing is label-based and inclusive, hence the "- 1".
        X = dataset.loc[:TRAIN_SHAPE - 1, feature_cols]
        y = dataset.loc[:TRAIN_SHAPE - 1, 'fraud_ind']
        print(X.shape, y.shape)

        # by 0.191 threshold
        cv = CrossValidate()
        res = cv.expanding_window(X, y, cat, boost_round=1000)
        # Mean of the per-fold results returned by expanding_window.
        print(sum(res) / len(res))
        print('base line: 0.6034704709308101')
    else:
        # action == 'submit'; the return value of train_submit is not needed.
        train_submit(dataset, cat, not_train + need_encode, file_name=file_name)
        print('\nPrediction written to ./submit/{}.csv'.format(file_name))


def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``action`` (``'cv'`` or ``'submit'``,
        default ``'cv'``) and ``file_name`` (default ``'submit001'``).
    """
    parser = ArgumentParser()
    # nargs='?' makes the positional optional; without it argparse treats the
    # argument as required and the default below is never applied.
    parser.add_argument("action", nargs='?', choices=['cv', 'submit'],
                        default='cv', type=str)
    parser.add_argument("--file_name", default='submit001', type=str)
    args = parser.parse_args(argv)
    return args


def main():
    """Entry point: call `train` in 'cv' mode with the file name 'submit001'."""
    # Settings are fixed here rather than read from the command line.
    train(action='cv', file_name='submit001')


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()
