# Train and Eval
Train and evaluate model
- <a href='#1'>1. lightgbm</a> 
- <a href='#2'>2. task2</a> 
- <a href='#3'>3. ensemble</a>

In [1]:
import sys
import os 
import gc
from time import time 
from datetime import timedelta
from datetime import datetime

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import lightgbm as lgb
from sklearn import metrics

sys.path.append('../')
import conf
from utils.utils import check_columns, encrypt_model, decrypt_model, timer, check_nan_value, correct_colum_type
from mlpipeline.train_eval import train_pipeline_lightgbm, pipeline_inference

In [2]:
# global settings
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
sns.set(rc={'figure.figsize':(11,4)})

In [3]:
# global variables
DEFAULT_MISSING_FLOAT = 0
FONT = fm.FontProperties(fname = os.path.join(conf.LIB_DIR,'simsun.ttc'))
CLASS_NAME = ['无故障','有故障']

In [4]:
# functions
def __dummy():
    pass

### <a id='1'> 1.lightgbm</a>

In [5]:
! du -sh ../data/*

3.2G	../data/data_4_5_6_norm_tag.h5
36K	../data/disk_sample_fault_tag.csv
1.2G	../data/disk_sample_smart_log_201707.csv
228M	../data/disk_sample_smart_log_201707_norm.csv
233M	../data/disk_sample_smart_log_201707_raw.csv
2.2G	../data/disk_sample_smart_log_201708.csv
446M	../data/disk_sample_smart_log_201708_norm.csv
460M	../data/disk_sample_smart_log_201708_raw.csv
2.2G	../data/disk_sample_smart_log_201709.csv
450M	../data/disk_sample_smart_log_201709_norm.csv
468M	../data/disk_sample_smart_log_201709_raw.csv
2.5G	../data/disk_sample_smart_log_201710.csv
507M	../data/disk_sample_smart_log_201710_norm.csv
530M	../data/disk_sample_smart_log_201710_raw.csv
2.6G	../data/disk_sample_smart_log_201711.csv
523M	../data/disk_sample_smart_log_201711_norm.csv
551M	../data/disk_sample_smart_log_201711_raw.csv
2.9G	../data/disk_sample_smart_log_201712.csv
590M	../data/disk_sample_smart_log_201712_norm.csv
625M	../data/disk_sample_smart_log_201712_raw.csv
3.2G	../data/disk_sample

In [None]:
fe_df = pd.read_hdf(os.path.join(conf.DATA_DIR, 'data_4_5_6_norm_tag.h5'))

In [None]:
fe_df.head()

In [None]:
# get model 1 data
mask = fe_df['model']==1
mod_one_fe_df = fe_df[mask]

In [None]:
drop_na_cols = check_nan_value(mod_one_fe_df)

In [None]:
mod_one_fe_df.drop(columns=drop_na_cols, inplace=True)

In [None]:
mod_one_fe_df.head()

In [None]:
# correct col type
correct_colum_type(mod_one_fe_df)

In [None]:
mod_one_fe_df.loc[mod_one_fe_df.tag!=0,'tag']=1

In [None]:
mod_one_fe_df.tag.value_counts()

In [None]:
mod_one_fe_df.memory_usage().sum()/1024**2

In [None]:
del fe_df
gc.collect()

In [None]:
mod_one_fe_df.head()

In [None]:
def sampling(fe_df, majority_label=1, threshold=60000):
    """Sampling the majority label data"""
    sub_dfs = dict(tuple(fe_df[fe_df['tag']==majority_lable].groupby(['model', 'serie'])))
    if len(sub_dfs) > threshold:
            sample_rate = threshold * 1.0 / len(sub_dfs)
            sample_dfs = dict([(x, sub_dfs[x]) for x in list(sub_dfs) if np.random.random() < sample_rate])
            sub_dfs = sample_dfs
    return sub_dfs

In [None]:
params = {"objective": "binary", 
          "learning_rate": 0.01,
          'scale_pos_weight':100,               
          'metric':['binary_log_loss','auc'], 'subsample':0.7, 'max_bin':255, 'n_thread':3}

In [22]:
model, eval_df = train_pipeline_lightgbm(mod_one_fe_df, split_date='2018-06-01', parmas=params, )

开始训练
[1]	valid_0's auc: 0.654015
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.645875
[3]	valid_0's auc: 0.642065
[4]	valid_0's auc: 0.649644
[5]	valid_0's auc: 0.650183
[6]	valid_0's auc: 0.651959
[7]	valid_0's auc: 0.651064
[8]	valid_0's auc: 0.656524
[9]	valid_0's auc: 0.661531
[10]	valid_0's auc: 0.661854
[11]	valid_0's auc: 0.665668
[12]	valid_0's auc: 0.665266
[13]	valid_0's auc: 0.668917
[14]	valid_0's auc: 0.669202
[15]	valid_0's auc: 0.669072
[16]	valid_0's auc: 0.669407
[17]	valid_0's auc: 0.670327
[18]	valid_0's auc: 0.670056
[19]	valid_0's auc: 0.669536
[20]	valid_0's auc: 0.670266
[21]	valid_0's auc: 0.670602
[22]	valid_0's auc: 0.669998
[23]	valid_0's auc: 0.668926
[24]	valid_0's auc: 0.668789
[25]	valid_0's auc: 0.668697
[26]	valid_0's auc: 0.672986
[27]	valid_0's auc: 0.673243
[28]	valid_0's auc: 0.673821
[29]	valid_0's auc: 0.673953
[30]	valid_0's auc: 0.673542
[31]	valid_0's auc: 0.674869
[32]	valid_0's auc: 0.674555
[33]	valid_0's 

In [24]:
eval_df.head(100)

Unnamed: 0,tag,serial_number,manufacturer,model,dt,prediction
0,0,disk_58088,A,1,2018-06-01,0
1,0,disk_58088,A,1,2018-06-01,0
2,0,disk_58089,A,1,2018-06-01,0
3,0,disk_58094,A,1,2018-06-01,0
4,0,disk_58096,A,1,2018-06-01,0
...,...,...,...,...,...,...
95,0,disk_58242,A,1,2018-06-01,0
96,0,disk_58242,A,1,2018-06-01,0
97,0,disk_58244,A,1,2018-06-01,0
98,0,disk_58244,A,1,2018-06-01,0


In [25]:
# clean test data
test_fe_df = pd.read_csv(os.path.join(conf.DATA_DIR, 'disk_sample_smart_log_test_a_norm.csv'))

In [26]:
drop_na_cols = check_nan_value(test_fe_df)

serial_number - 0.0 
manufacturer - 0.0 
model - 0.0 
dt - 0.0 
smart_1_normalized - 0.0 
smart_3_normalized - 0.0 
smart_4_normalized - 0.0 
smart_5_normalized - 0.0 
smart_7_normalized - 0.0 
smart_9_normalized - 0.0 
smart_10_normalized - 0.0 
smart_12_normalized - 0.0 
smart_184_normalized - 0.0 
smart_187_normalized - 0.0 
smart_188_normalized - 0.0 
smart_189_normalized - 0.0 
smart_190_normalized - 0.0 
smart_191_normalized - 0.0 
smart_192_normalized - 0.0 
smart_193_normalized - 0.0 
smart_194_normalized - 0.0 
smart_195_normalized - 0.0 
smart_197_normalized - 0.0 
smart_198_normalized - 0.0 
smart_199_normalized - 0.0 
smart_240_normalized - 99.46 
smart_241_normalized - 99.46 
smart_242_normalized - 99.46 


In [27]:
test_fe_df.drop(columns=drop_na_cols, inplace=True)

In [28]:
test_fe_df.head()

Unnamed: 0,serial_number,manufacturer,model,dt,smart_1_normalized,smart_3_normalized,smart_4_normalized,smart_5_normalized,smart_7_normalized,smart_9_normalized,smart_10_normalized,smart_12_normalized,smart_184_normalized,smart_187_normalized,smart_188_normalized,smart_189_normalized,smart_190_normalized,smart_191_normalized,smart_192_normalized,smart_193_normalized,smart_194_normalized,smart_195_normalized,smart_197_normalized,smart_198_normalized,smart_199_normalized
0,disk_119164,A,1,20180829,73.0,97.0,100.0,100.0,93.0,57.0,100.0,100.0,100.0,100.0,100.0,100.0,69.0,100.0,100.0,100.0,31.0,62.0,100.0,100.0,200.0
1,disk_119437,A,1,20180815,83.0,96.0,100.0,100.0,93.0,64.0,100.0,100.0,100.0,100.0,100.0,100.0,69.0,100.0,100.0,100.0,31.0,23.0,100.0,100.0,200.0
2,disk_119991,A,1,20180807,75.0,95.0,100.0,100.0,96.0,59.0,100.0,100.0,100.0,100.0,100.0,85.0,68.0,100.0,100.0,100.0,32.0,60.0,100.0,100.0,200.0
3,disk_119991,A,1,20180812,81.0,95.0,100.0,100.0,96.0,59.0,100.0,100.0,100.0,100.0,100.0,85.0,68.0,100.0,100.0,100.0,32.0,57.0,100.0,100.0,200.0
4,disk_120372,A,1,20180826,83.0,96.0,100.0,100.0,93.0,58.0,100.0,100.0,100.0,100.0,100.0,100.0,69.0,100.0,100.0,100.0,31.0,23.0,100.0,100.0,200.0


In [29]:
test_fe_df.model.value_counts()

1    178096
Name: model, dtype: int64

In [31]:
ret = pipeline_inference(test_fe_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [33]:
ret[ret.prediction==1]

Unnamed: 0,serial_number,manufacturer,model,dt,prediction
175,disk_104948,A,1,20180830,1
176,disk_104948,A,1,20180812,1
791,disk_104948,A,1,20180809,1
792,disk_104948,A,1,20180810,1
793,disk_104948,A,1,20180821,1
...,...,...,...,...,...
177713,disk_7904,A,1,20180801,1
177797,disk_76519,A,1,20180801,1
177820,disk_26514,A,1,20180801,1
178033,disk_141663,A,1,20180801,1


In [41]:
output_df = ret[ret.prediction==1][['manufacturer','model', 'serial_number','dt']]

In [42]:
output_df.prediction.value_counts()

1    1034
Name: prediction, dtype: int64

In [43]:
output_df.to_csv(os.path.join('../submission', 'submission_%s.csv'%datetime.now()), index=False, header=False)