## ***this notebook regards feature engineering testing and a gradient boosting baseline solution pipeline***

#### **imports**

In [1]:
import sys
import os

current = os.getcwd()
parent = os.path.dirname(current)
sys.path.append(parent)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import train
from xgboost import XGBClassifier, DMatrix
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from utils.evaluation import label_evaluation
from data_factory.DataLoader import DataLoader

#### **hyperparameters and constants**

In [3]:
delay = 7
result_file = '../data/predict.csv'
truth_file = '../data/ground_truth.hdf'

In [4]:
MA_MIN = 5
MA_MAX = 100
MA_STEP = 5
VALID_SET_SIZE = 0.2
RANDOM_SEED = 999
SPLITS = 10
THRESHOLD = 0.26
EARLY_STOP = 10

#### **load data**

In [5]:
dl = DataLoader(use_previous_files=True, config_file="../config/config.yml")

INFO:data_factory.DataLoader:Use previously generated file ../data/data_export_train.csv_test.csv_fmd-False_s-True.p. Can not redo preprocessing by loading from generated file.


#### **feature engineering**

In [6]:
moving_avg_lens = [ma for ma in range(MA_MIN, MA_MAX+MA_STEP, MA_STEP)]

for dataset in (dl.train, dl.val, dl.test):
    for length in moving_avg_lens:
        # moving average
        dataset[f'MA{length}'] = dataset.groupby('kpi_id')['value'].transform(lambda x: x.rolling(window=length, min_periods=1).mean())
        dataset[f'DIFF_MA{length}'] = abs(dataset[f'MA{length}'] - dataset['value'])
        # exponential moving average
        dataset[f'EMA{length}'] = dataset.groupby('kpi_id')['value'].transform(lambda x: x.ewm(span=length, min_periods=1).mean())
        dataset[f'DIFF_EMA{length}'] = abs(dataset[f'EMA{length}'] - dataset['value'])
    # cumulative moving average
    dataset[f'CMA'] = dataset.groupby('kpi_id')['value'].transform(lambda x: x.expanding().mean().mean())
    dataset[f'DIFF_CMA'] = abs(dataset[f'CMA'] - dataset['value'])

In [7]:
c_ignored = ['timestamp', 'datetime', 'kpi_id', 'label']
X_train, y_train = dl.train.loc[:, [c for c in dl.train.columns if c not in c_ignored]], dl.train.loc[:, 'label']
X_val, y_val = dl.val.loc[:, [c for c in dl.val.columns if c not in c_ignored]], dl.val.loc[:, 'label']
X_test = dl.test.loc[:, [c for c in dl.test.columns if c not in c_ignored]]

#### **best features**

In [8]:
dtrain = DMatrix(X_train, label=y_train)
dvalid = DMatrix(X_val, label=y_val)
watchlist = [(dvalid, 'valid')]
params = {
    'max_depth': 7,
    'min_child_weight': 200, 
    'colsample_bytree': 0.8, 
    'subsample': 0.8, 
    'eta': 0.04,    
    'seed': RANDOM_SEED,
    'eval_metric': 'auc'
}
booster = train(params, dtrain, num_boost_round=20, evals=watchlist, early_stopping_rounds=EARLY_STOP, verbose_eval=True)
best_features = pd.DataFrame(booster.get_score(importance_type='gain').items(), columns=['features', 'importance'])

[0]	valid-auc:0.70846
[1]	valid-auc:0.86873
[2]	valid-auc:0.91206
[3]	valid-auc:0.91407
[4]	valid-auc:0.92235
[5]	valid-auc:0.93385
[6]	valid-auc:0.93314
[7]	valid-auc:0.93300
[8]	valid-auc:0.93181
[9]	valid-auc:0.93168
[10]	valid-auc:0.93412
[11]	valid-auc:0.95073
[12]	valid-auc:0.95056
[13]	valid-auc:0.95543
[14]	valid-auc:0.95532
[15]	valid-auc:0.95534
[16]	valid-auc:0.95539
[17]	valid-auc:0.95909
[18]	valid-auc:0.95939
[19]	valid-auc:0.96010


##### **models**

In [9]:
model_params = {
    'max_depth': 7,
    'n_estimators': 100,
    'min_child_weight': 200,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.04,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'seed': RANDOM_SEED
}

model = XGBClassifier(**model_params)

#### **training**

In [10]:
fit_params = {
    'eval_metric': 'auc',
    'eval_set': [(X_train, y_train), (X_val, y_val)],
    'early_stopping_rounds': EARLY_STOP,
    'verbose': True
}

model.fit(X_train, y_train, **fit_params)

[0]	validation_0-auc:0.90834	validation_1-auc:0.72038
[1]	validation_0-auc:0.91908	validation_1-auc:0.86659
[2]	validation_0-auc:0.92871	validation_1-auc:0.90009
[3]	validation_0-auc:0.93098	validation_1-auc:0.90715
[4]	validation_0-auc:0.93163	validation_1-auc:0.91769
[5]	validation_0-auc:0.93172	validation_1-auc:0.91182
[6]	validation_0-auc:0.93796	validation_1-auc:0.95491
[7]	validation_0-auc:0.93818	validation_1-auc:0.95506
[8]	validation_0-auc:0.93829	validation_1-auc:0.95505
[9]	validation_0-auc:0.93944	validation_1-auc:0.95444
[10]	validation_0-auc:0.94043	validation_1-auc:0.95439
[11]	validation_0-auc:0.94178	validation_1-auc:0.95510
[12]	validation_0-auc:0.94199	validation_1-auc:0.95500
[13]	validation_0-auc:0.94203	validation_1-auc:0.95562
[14]	validation_0-auc:0.94230	validation_1-auc:0.95459
[15]	validation_0-auc:0.94262	validation_1-auc:0.95453
[16]	validation_0-auc:0.94305	validation_1-auc:0.95437
[17]	validation_0-auc:0.94289	validation_1-auc:0.95530
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.04, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.0399999991, max_delta_step=0, max_depth=7,
              min_child_weight=200, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              predictor='auto', random_state=999, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=999, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)

#### **evaluation**

In [11]:
for name, X, dataset in (('train', X_train, dl.train), ('val', X_val, dl.val)):
    print(f'evaluating: {name}')
    pred = model.predict_proba(X)[:,1]
    pred = np.where(pred > THRESHOLD, 1, 0)
    prediction = dataset.loc[:, ['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
    prediction['predict'] = pred
    prediction.to_csv(result_file)

    ground_truth = dataset.loc[:, ['timestamp', 'kpi_id', 'label']].rename(columns={'kpi_id': 'KPI ID'})
    ground_truth.to_hdf(truth_file, key='df')

    print(label_evaluation(truth_file, result_file, delay))

evaluating: train
{"result": true, "data": 0.7379173654520506, "message": "计算成功"}
evaluating: val
{"result": true, "data": 0.9261160181427549, "message": "计算成功"}


#### **submission**

In [12]:
pred = model.predict_proba(X_test)[:,1]
pred = np.where(pred > THRESHOLD, 1, 0)
prediction = dl.test.loc[:, ['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
prediction['predict'] = pred
prediction.to_csv(result_file)