# Train and Predict
Train and evaluate the model, then predict on the test set.
- <a href='#1'>1. lightgbm</a> 
- <a href='#2'>2. task2</a> 
- <a href='#3'>3. ensemble</a>

In [1]:
import sys
import os
import gc
from time import time
from datetime import timedelta, datetime
import base64

import pandas as pd
from IPython.core.display import display, HTML
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import lightgbm as lgb
from sklearn import metrics

sys.path.append('../')
import conf
from mlpipeline import (
    feature_engineering,
    train,
    predict,
)
from utils import (
    check_columns,
    check_nan_value,
    #     correct_colum_type,
    decrypt_model,
    plot_dist_of_cols,
)

In [2]:
# global settings
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

# pandas display limits, applied one option at a time
for _option_name, _option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 1000),
    ('display.width', 100),
):
    pd.set_option(_option_name, _option_value)

# default figure size for seaborn/matplotlib plots
sns.set(rc={'figure.figsize': (20, 10)})

# Expose only GPU index 2 to CUDA-aware libraries started from this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
# global variables
# Presumably the fill value for missing float features; not referenced in the
# visible cells of this notebook — TODO confirm where it is consumed.
DEFAULT_MISSING_FLOAT = 0
# Presumably the label value that marks a faulty disk (assumed from the name) —
# confirm against the `tag` column used in the evaluation cell.
FAULT_TAG = 1

In [4]:
# notebook settings
# Load IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell executes, so edits to the local packages (conf, mlpipeline, utils)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# functions
def __dummy():
    pass

### <a id='1'>1. lightgbm</a>

In [6]:
# Show the on-disk size of every entry in ../data (shell escape; needs a POSIX `du`).
! du -sh ../data/*

4.3M	../data/chunk_data_for_test.h5
4.2G	../data/data_2017_all.h5
2.3G	../data/data_2017_tag_flag.h5
2.5G	../data/data_2017_tag_flag_raw.h5
20M	../data/data_201808_test.h5
41M	../data/data_201808_test_all.h5
24M	../data/data_201808_test_raw.h5
6.9G	../data/data_2018_all.h5
3.9G	../data/data_2018_tag_flag.h5
4.2G	../data/data_2018_tag_flag_raw.h5
36K	../data/disk_sample_fault_tag.csv
1.2G	../data/disk_sample_smart_log_201707.csv
228M	../data/disk_sample_smart_log_201707_norm.csv
233M	../data/disk_sample_smart_log_201707_raw.csv
2.2G	../data/disk_sample_smart_log_201708.csv
446M	../data/disk_sample_smart_log_201708_norm.csv
460M	../data/disk_sample_smart_log_201708_raw.csv
2.2G	../data/disk_sample_smart_log_201709.csv
450M	../data/disk_sample_smart_log_201709_norm.csv
468M	../data/disk_sample_smart_log_201709_raw.csv
2.5G	../data/disk_sample_smart_log_201710.csv
507M	../data/disk_sample_smart_log_201710_norm.csv
530M	../data/disk_sample_smart_log_201710_raw.csv
2.6

In [None]:
# feature engineering
# Arguments for mlpipeline.feature_engineering: which raw files to load, where to
# save the engineered frames, and the date windows for the train/valid split.
params = dict(
    train_filename='data_2018_tag_flag_raw.h5',  # raw train file to load
    test_filename='data_201808_test_raw.h5',
    # engineered train frame, named after the train start date (02-01)
    train_fe_save_filename='train_fe_df_02_01_raw.h5',
    # engineered test frame, same naming scheme as the train one
    test_fe_save_filename='test_fe_df_02_01_raw.h5',
    is_sampling=False,
    # whether the valid set is sampled together with the train set
    sample_validset=False,
    train_start_date='2018-02-01',
    train_end_date='2018-06-30',  # this window also contains the validation period
    # lets the sampling step tell validation rows apart from train rows
    valid_start_date='2018-06-01',
    valid_end_date='2018-06-30',
    use_model_one=True,
    num_processes=14,
)

train_fe_df, test_fe_df = feature_engineering(**params)

2020-03-02 21:55:04,628 - mlpipeline.feature_engineering - INFO - feature_engineering开始
2020-03-02 21:55:04,630 - mlpipeline.feature_engineering - INFO - _load_dataset_by_filename开始
2020-03-02 21:55:04,633 - mlpipeline.feature_engineering - INFO - 加载训练数据集: ../data/data_2018_tag_flag_raw.h5
2020-03-02 21:58:17,895 - mlpipeline.feature_engineering - INFO - 加载训练数据集完成,共用时: 0:03:13
2020-03-02 21:58:17,899 - mlpipeline.feature_engineering - INFO - 加载测试数据集: ../data/data_201808_test_raw.h5
2020-03-02 21:58:19,920 - mlpipeline.feature_engineering - INFO - 加载测试数据集完成,共用时: 0:00:02
2020-03-02 21:58:19,922 - mlpipeline.feature_engineering - INFO - _load_dataset_by_filename已完成，共用时0:03:15
2020-03-02 21:58:19,923 - mlpipeline.feature_engineering - INFO - _data_preprocess开始
2020-03-02 21:58:34,833 - utils.utils - INFO - correct_column_type开始
2020-03-02 21:58:42,393 - utils.utils - INFO - col_types: serial_number            object
manufacturer             object
model                      int8
dt        

2020-03-02 22:29:15,051 - utils.utils - INFO - smart_5raw_max_30 - 0.0 
2020-03-02 22:29:15,088 - utils.utils - INFO - smart_7raw_max_30 - 0.0 
2020-03-02 22:29:15,123 - utils.utils - INFO - smart_9raw_max_30 - 0.0 
2020-03-02 22:29:15,160 - utils.utils - INFO - smart_12raw_max_30 - 0.0 
2020-03-02 22:29:15,202 - utils.utils - INFO - smart_184raw_max_30 - 0.0 
2020-03-02 22:29:15,240 - utils.utils - INFO - smart_187raw_max_30 - 0.0 
2020-03-02 22:29:15,281 - utils.utils - INFO - smart_188raw_max_30 - 0.0 
2020-03-02 22:29:15,322 - utils.utils - INFO - smart_189raw_max_30 - 0.0 
2020-03-02 22:29:15,363 - utils.utils - INFO - smart_190raw_max_30 - 0.0 
2020-03-02 22:29:15,401 - utils.utils - INFO - smart_192raw_max_30 - 0.0 
2020-03-02 22:29:15,454 - utils.utils - INFO - smart_193raw_max_30 - 0.0 
2020-03-02 22:29:15,520 - utils.utils - INFO - smart_194raw_max_30 - 0.0 
2020-03-02 22:29:15,577 - utils.utils - INFO - smart_195raw_max_30 - 0.0 
2020-03-02 22:29:15,626 - utils.utils - INFO 

In [None]:
# eval
# LightGBM hyper-parameters for the evaluation run.
model_params = dict(
    objective="binary",
    boosting='gbdt',
    learning_rate=0.001,
    scale_pos_weight=25,  # disabled alternative: "is_unbalance": True
    num_leaves=32,
    metric=["auc"],
    subsample=0.8,
    colsample_bytree=0.6,
    num_threads=10,
    lambda_l2=0.8,
    lambda_l1=0.8,
    random_state=2019,
    min_data_in_leaf=80,
    num_boost_round=1000,
    device='cpu',  # 'cpu' or 'gpu'
)
# GPU-only options kept for reference (disabled): "gpu_device_id": 2, "max_bin": 255

# Arguments for mlpipeline.train in evaluation mode.
params = dict(
    model_params=model_params,
    model_name='lgb',
    train_fe_filename='train_fe_df_02_01_raw.h5',
    is_eval=True,
    is_standard=True,  # whether using sklearn-standard (original author's note)
    train_start_date='2018-02-01',
    train_end_date='2018-04-30',
    eval_on_model_id=1,
    # Date used to split train from valid. The original note says to pick one of
    # split date and n_splits.
    # NOTE(review): both are supplied here — confirm which one train() honours.
    valid_start_date='2018-06-01',
    valid_end_date='2018-06-30',
    n_splits=3,  # number of splits for cross validation
)

# With is_eval=True the result is unpacked as (model, eval_df).
model, eval_df = train(**params)

In [None]:
# Among the rows the model flagged as faulty (prediction == 1), count how many
# also carry tag == 1, i.e. the true positives of the evaluation run.
eval_fault_df = eval_df[eval_df['prediction'] == 1]
# Every row in this subset has prediction == 1, so the mask is True exactly
# where tag == 1 (missing tags compare as False). Count through the mask
# instead of leaving it unused and filtering the frame a second time.
mask = eval_fault_df['prediction'] == eval_fault_df['tag']
int(mask.sum())

In [None]:
# train
# LightGBM hyper-parameters for the final fit.
# NOTE(review): num_boost_round is 100 here while the eval cell uses 1000 with the
# same learning rate — confirm that the shorter run is intended.
model_params = dict(
    objective="binary",
    boosting='gbdt',
    learning_rate=0.001,
    scale_pos_weight=25,  # disabled alternative: "is_unbalance": True
    num_leaves=32,
    metric=["auc"],
    subsample=0.8,
    colsample_bytree=0.6,
    num_threads=10,
    lambda_l2=0.8,
    lambda_l1=0.8,
    random_state=2019,
    min_data_in_leaf=80,
    num_boost_round=100,
    device='cpu',  # 'cpu' or 'gpu'
)
# GPU-only options kept for reference (disabled): "gpu_device_id": 2, "max_bin": 255

# Arguments for mlpipeline.train in training mode (no evaluation split).
params = dict(
    model_params=model_params,
    model_name='lgb',
    train_fe_filename='train_fe_df_02_01_raw.h5',
    is_eval=False,
    is_standard=True,
    train_start_date='2018-02-01',
    train_end_date='2018-06-30',
)

# With is_eval=False the result is unpacked as (model, scaler); the scaler is
# handed to predict() in the next cell.
model, scaler = train(**params)

In [None]:
# predict
# Arguments for mlpipeline.predict; `scaler` is the object returned by the
# training cell.
params = dict(
    model_name='lgb',
    test_fe_filename='test_fe_df_02_01_raw.h5',
    is_standard=True,
    pred_on_model_id=1,
    scaler=scaler,
    test_month=8,
)

ret, submission_df = predict(**params)

In [None]:
# Preview the first rows of the submission frame returned by predict().
submission_df.head()

In [15]:
# check the distribution of test set
# test_fe_df = pd.read_hdf(os.path.join(conf.DATA_DIR, 'test_fe_df.h5'))
# train_fe_df = pd.read_hdf(os.path.join(conf.DATA_DIR, 'train_fe_df.h5'))
# _, cate_cols, cont_cols, _ = check_columns(train_fe_df.dtypes.to_dict())
# test_fe_df = test_fe_df[cate_cols + cont_cols]
# valid_fe_df = train_fe_df[train_fe_df.dt>=split_date][cate_cols + cont_cols]
# train_fe_df = train_fe_df[train_fe_df.dt<split_date][cate_cols + cont_cols]

In [16]:
# %%time
# proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{'name': 'test', 'table': test_fe_df},
#                                                                 {'name':'train','table':train_fe_df},
#                                                                 {'name':'valid','table':valid_fe_df}])
# protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

In [17]:
# HTML_TEMPLATE = """
#         <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
#         <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
#         <facets-overview id="elem"></facets-overview>
#         <script>
#           document.querySelector("#elem").protoInput = "{protostr}";
#         </script>"""
# html = HTML_TEMPLATE.format(protostr=protostr)
# display(HTML(html))