In [1]:
!pip3 install tables

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from data_factory.DataLoader import DataLoader
# evaluation file
from utils.evaluation import label_evaluation

# models
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, DMatrix
from xgboost import train
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier



  from pandas import MultiIndex, Int64Index


In [2]:
# Load the data through the project DataLoader; later cells read dl.train,
# dl.val and dl.test from it.
# use_previous_files=True reuses a previously generated preprocessed file
# (see the INFO log printed by this cell), so preprocessing is not redone here.
dl = DataLoader(use_previous_files=True, config_file="../config/config.yml")

INFO:data_factory.DataLoader:Use previously generated file ../data//data_export_train.csv_test.csv_fmd-False_s-True.p. Can not redo preprocessing by loading from generated file.


In [3]:
# Notebook-wide configuration.
VALID_SET_SIZE = 0.2  # not referenced in the cells shown here; presumably a validation fraction - TODO confirm
RANDOM_SEED = 999  # seed handed to both XGBoost models
SPLITS = 10  # not referenced in the cells shown here; presumably a CV fold count - TODO confirm
THRESHOLD = 0.26  # predicted probabilities above this value are labelled 1, otherwise 0

# Files exchanged with utils.evaluation.label_evaluation.
result_file = '../data/predict.csv'  # predictions are written here as CSV
truth_file = '../data/ground_truth.hdf'  # labels are written here as HDF under key 'df'
delay = 7  # third argument of label_evaluation; presumably the tolerated detection delay - TODO confirm

Feature Engineering

In [4]:
def add_moving_average_features(frame, windows):
    """Add rolling-mean features of ``value`` to ``frame`` in place.

    For every window size ``w`` two columns are written (in this order):

    * ``MA{w}``      - mean of the last ``w`` rows of ``value`` within the same
      ``kpi_id`` group. ``min_periods=1`` means the first rows of a group are
      averaged over however many rows exist so far instead of being NaN.
    * ``DIFF_MA{w}`` - absolute difference between ``MA{w}`` and ``value``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``kpi_id`` and ``value``. Modified in place.
    windows : iterable of int
        Rolling window sizes, counted in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenience.
    """
    # The grouping does not depend on the window size, so build it once.
    values_by_kpi = frame.groupby('kpi_id')['value']
    for window in windows:
        rolling_mean = values_by_kpi.transform(
            lambda series: series.rolling(window=window, min_periods=1).mean()
        )
        frame[f'MA{window}'] = rolling_mean
        frame[f'DIFF_MA{window}'] = (rolling_mean - frame['value']).abs()
    return frame


mas = list(range(5, 105, 5))  # window sizes 5, 10, ..., 100

# Every split gets the same feature columns in the same order; each split is
# rolled over its own rows only, so no values cross between splits.
for split in (dl.train, dl.val, dl.test):
    add_moving_average_features(split, mas)

In [5]:
# Columns that identify a row or carry the target; everything else is a feature.
c_ignored = ['timestamp', 'datetime', 'kpi_id', 'label']


def _feature_columns(frame):
    """Return the columns of ``frame`` that are not listed in ``c_ignored``."""
    return [col for col in frame.columns if col not in c_ignored]


X_train = dl.train.loc[:, _feature_columns(dl.train)]
y_train = dl.train.loc[:, 'label']

X_val = dl.val.loc[:, _feature_columns(dl.val)]
y_val = dl.val.loc[:, 'label']

# No target is extracted for the test split.
X_test = dl.test.loc[:, _feature_columns(dl.test)]

In [6]:
# Short run with the native XGBoost API; the resulting booster is only used to
# read gain-based feature importances.
dtrain, dvalid = DMatrix(X_train, label=y_train), DMatrix(X_val, label=y_val)
watchlist = [(dvalid, 'valid')]  # the only evaluation set, so early stopping follows it
params = dict(
    max_depth=7,
    min_child_weight=200,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.04,
    seed=RANDOM_SEED,
    eval_metric='auc',
)
booster = train(
    params,
    dtrain,
    num_boost_round=20,
    evals=watchlist,
    early_stopping_rounds=5,
    verbose_eval=True,
)

# One row per entry returned by get_score(): feature name and its gain.
gain_by_feature = booster.get_score(importance_type='gain')
best_features = pd.DataFrame(gain_by_feature.items(), columns=['features', 'importance'])

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	valid-auc:0.94420
[1]	valid-auc:0.96137
[2]	valid-auc:0.96095
[3]	valid-auc:0.96068
[4]	valid-auc:0.96388
[5]	valid-auc:0.96483
[6]	valid-auc:0.96354
[7]	valid-auc:0.96380
[8]	valid-auc:0.96341
[9]	valid-auc:0.96321


In [7]:
# Hyper-parameters for the scikit-learn style XGBoost classifier.
model_params = dict(
    max_depth=7,
    n_estimators=30,
    min_child_weight=200,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.04,
    objective='binary:logistic',
    use_label_encoder=False,
    seed=RANDOM_SEED,
)

model = XGBClassifier(**model_params)

In [8]:
# AUC is reported on both splits during fitting. XGBoost uses the last
# eval_set entry for early stopping, so the validation pair has to stay last.
fit_params = dict(
    eval_metric='auc',
    eval_set=[(X_train, y_train), (X_val, y_val)],
    early_stopping_rounds=5,
    verbose=True,
)

# Left as the bare last expression so the cell displays the fitted model.
model.fit(X_train, y_train, **fit_params)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[0]	validation_0-auc:0.91130	validation_1-auc:0.94275
[1]	validation_0-auc:0.92383	validation_1-auc:0.96016
[2]	validation_0-auc:0.92390	validation_1-auc:0.95776
[3]	validation_0-auc:0.92464	validation_1-auc:0.95801
[4]	validation_0-auc:0.93065	validation_1-auc:0.96050
[5]	validation_0-auc:0.93358	validation_1-auc:0.96251
[6]	validation_0-auc:0.93463	validation_1-auc:0.96235
[7]	validation_0-auc:0.93393	validation_1-auc:0.96237
[8]	validation_0-auc:0.93463	validation_1-auc:0.96206
[9]	validation_0-auc:0.93491	validation_1-auc:0.96166


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.04, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.0399999991, max_delta_step=0, max_depth=7,
              min_child_weight=200, missing=nan, monotone_constraints='()',
              n_estimators=30, n_jobs=12, num_parallel_tree=1, predictor='auto',
              random_state=999, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=999, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)

In [10]:
# Training split: threshold the second predict_proba column (class 1) and score
# the result with the project evaluation script.
train_proba = model.predict_proba(X_train)[:, 1]
pred = np.where(train_proba > THRESHOLD, 1, 0)

kpi_rename = {'kpi_id': 'KPI ID'}  # column name used in the files handed to label_evaluation

prediction = dl.train[['timestamp', 'kpi_id']].rename(columns=kpi_rename)
prediction['predict'] = pred
prediction.to_csv(result_file)

# Matching ground truth for the same rows.
ground_truth = dl.train[['timestamp', 'kpi_id', 'label']].rename(columns=kpi_rename)
ground_truth.to_hdf(truth_file, key='df')

print(label_evaluation(truth_file, result_file, delay))

{"result": true, "data": 0.03757672421991134, "message": "计算成功"}


Scale: True
Fill: True
{"result": true, "data": 0.5910519667459827, "message": "计算成功"}

Scale: True
Fill: False
{"result": true, "data": 0.5910519667459827, "message": "计算成功"}

In [11]:
# Validation split: threshold the second predict_proba column (class 1) and
# score the result with the project evaluation script.
val_proba = model.predict_proba(X_val)[:, 1]
pred = np.where(val_proba > THRESHOLD, 1, 0)

kpi_rename = {'kpi_id': 'KPI ID'}  # column name used in the files handed to label_evaluation

prediction = dl.val[['timestamp', 'kpi_id']].rename(columns=kpi_rename)
prediction['predict'] = pred
prediction.to_csv(result_file)

# Matching ground truth for the same rows.
ground_truth = dl.val[['timestamp', 'kpi_id', 'label']].rename(columns=kpi_rename)
ground_truth.to_hdf(truth_file, key='df')

print(label_evaluation(truth_file, result_file, delay))

{"result": true, "data": 0.12783233913220984, "message": "计算成功"}


Scale: True
Fill: True
{"result": true, "data": 0.9149949849548645, "message": "计算成功"}

In [12]:
# Test split: threshold the second predict_proba column (class 1) and export
# the predictions; no evaluation is run in this cell.
test_proba = model.predict_proba(X_test)[:, 1]
pred = np.where(test_proba > THRESHOLD, 1, 0)

prediction = dl.test[['timestamp', 'kpi_id']].rename(columns={'kpi_id': 'KPI ID'})
prediction['predict'] = pred
prediction.to_csv(result_file)  # overwrites whatever an earlier cell wrote to result_file