In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# models
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, DMatrix
from xgboost import train
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
import sys

sys.path.append('../')

# evaluation file
from src.evaluation import label_evaluation

In [3]:
%run ../src/DataLoader.py

DEBUG:__main__:Use config {'device': 'cpu', 'seed': False, 'data': {'path': '../data/', 'train_file': 'train.csv', 'test_file': 'test.csv'}, 'preprocessing': {'scale': True, 'fill_missing_dates': True}}
INFO:__main__:Use previously generated file ../data//data_export_train.csv_test.csv_fmd-True_s-True.p. Can not redo preprocessing by loading from generated file.


(2649242, 6)
(139438, 6)
(2345211, 5)


In [4]:
VALID_SET_SIZE = 0.2
RANDOM_SEED = 999
SPLITS = 10
THRESHOLD = 0.26

result_file = 'predict.csv'
truth_file = 'ground_truth.hdf'
delay = 7

Feature Engineering

In [5]:
mas = [ma for ma in range(5, 105, 5)]

for ma in mas:
    dl.train[f'MA{ma}'] = dl.train.groupby('kpi_id')['value'].transform(lambda x: x.rolling(window=ma, min_periods=1).mean())
    dl.train[f'DIFF_MA{ma}'] = abs(dl.train[f'MA{ma}'] - dl.train['value'])
    
    dl.val[f'MA{ma}'] = dl.val.groupby('kpi_id')['value'].transform(lambda x: x.rolling(window=ma, min_periods=1).mean())
    dl.val[f'DIFF_MA{ma}'] = abs(dl.val[f'MA{ma}'] - dl.val['value'])

    dl.test[f'MA{ma}'] = dl.test.groupby('kpi_id')['value'].transform(lambda x: x.rolling(window=ma, min_periods=1).mean())
    dl.test[f'DIFF_MA{ma}'] = abs(dl.test[f'MA{ma}'] - dl.test['value'])

In [10]:
c_ignored = ['timestamp', 'datetime', 'kpi_id', 'label']
X_train, y_train = dl.train.loc[:, [c for c in dl.train.columns if c not in c_ignored]], dl.train.loc[:, 'label']
X_val, y_val = dl.val.loc[:, [c for c in dl.val.columns if c not in c_ignored]], dl.val.loc[:, 'label']
X_test = dl.test.loc[:, [c for c in dl.test.columns if c not in c_ignored]]

In [11]:
dtrain = DMatrix(X_train, label=y_train)
dvalid = DMatrix(X_val, label=y_val)
watchlist = [(dvalid, 'valid')]
params = {
    'max_depth': 7,
    'min_child_weight': 200, 
    'colsample_bytree': 0.8, 
    'subsample': 0.8, 
    'eta': 0.04,    
    'seed': RANDOM_SEED,
    'eval_metric': 'auc'
}
booster = train(params, dtrain, num_boost_round=20, evals=watchlist, early_stopping_rounds=5, verbose_eval=True)
best_features = pd.DataFrame(booster.get_score(importance_type='gain').items(), columns=['features', 'importance'])

[0]	valid-auc:0.95717
[1]	valid-auc:0.96146
[2]	valid-auc:0.95988
[3]	valid-auc:0.95984
[4]	valid-auc:0.96295
[5]	valid-auc:0.96283
[6]	valid-auc:0.96477
[7]	valid-auc:0.96359
[8]	valid-auc:0.96620
[9]	valid-auc:0.96681
[10]	valid-auc:0.96852
[11]	valid-auc:0.96824
[12]	valid-auc:0.96809
[13]	valid-auc:0.96798
[14]	valid-auc:0.96798
[15]	valid-auc:0.96797


In [12]:
model_params = {
    'max_depth': 7,
    'n_estimators': 30,
    'min_child_weight': 200,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'eta': 0.04,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'seed': RANDOM_SEED
}

model = XGBClassifier(**model_params)

In [13]:
fit_params = {
    'eval_metric': 'auc',
    'eval_set': [(X_train, y_train), (X_val, y_val)],
    'early_stopping_rounds': 5,
    'verbose': True
}

model.fit(X_train, y_train, **fit_params)



[0]	validation_0-auc:0.89021	validation_1-auc:0.95459
[1]	validation_0-auc:0.91085	validation_1-auc:0.95864
[2]	validation_0-auc:0.90550	validation_1-auc:0.96146
[3]	validation_0-auc:0.90511	validation_1-auc:0.96024
[4]	validation_0-auc:0.90687	validation_1-auc:0.96176
[5]	validation_0-auc:0.90740	validation_1-auc:0.96177
[6]	validation_0-auc:0.90995	validation_1-auc:0.96395
[7]	validation_0-auc:0.91328	validation_1-auc:0.96379
[8]	validation_0-auc:0.91526	validation_1-auc:0.96432
[9]	validation_0-auc:0.91615	validation_1-auc:0.96592
[10]	validation_0-auc:0.91725	validation_1-auc:0.96687
[11]	validation_0-auc:0.91671	validation_1-auc:0.96676
[12]	validation_0-auc:0.91710	validation_1-auc:0.96752
[13]	validation_0-auc:0.91671	validation_1-auc:0.96752
[14]	validation_0-auc:0.91755	validation_1-auc:0.96726
[15]	validation_0-auc:0.92050	validation_1-auc:0.96973
[16]	validation_0-auc:0.91826	validation_1-auc:0.96909
[17]	validation_0-auc:0.92117	validation_1-auc:0.97008
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False, eta=0.04,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.0399999991, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=200,
              missing=nan, monotone_constraints='()', n_estimators=30, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=999,
              reg_alpha=0, ...)

In [14]:
pred = model.predict_proba(X_train)[:,1]
pred = np.where(pred > THRESHOLD, 1, 0)
prediction = dl.train.loc[:, ['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
prediction['predict'] = pred
prediction.to_csv(result_file)

ground_truth = dl.train.loc[:, ['timestamp', 'kpi_id', 'label']].rename(columns={'kpi_id': 'KPI ID'})
ground_truth.to_hdf(truth_file, key='df')

print(label_evaluation(truth_file, result_file, delay))

{"result": true, "data": 0.5910519667459827, "message": "计算成功"}


In [15]:
pred = model.predict_proba(X_val)[:,1]
pred = np.where(pred > THRESHOLD, 1, 0)
prediction = dl.val.loc[:, ['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
prediction['predict'] = pred
prediction.to_csv(result_file)

ground_truth = dl.val.loc[:, ['timestamp', 'kpi_id', 'label']].rename(columns={'kpi_id': 'KPI ID'})
ground_truth.to_hdf(truth_file, key='df')

print(label_evaluation(truth_file, result_file, delay))

{"result": true, "data": 0.9149949849548645, "message": "计算成功"}


In [17]:
pred = model.predict_proba(X_test)[:,1]
pred = np.where(pred > THRESHOLD, 1, 0)
prediction = dl.test.loc[:, ['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
prediction['predict'] = pred
prediction.to_csv(result_file)