# Mlpipeline
Reformat and preprocess the data, engineer features, train and evaluate the model, and generate the submission.
- <a href='#1'>1. feature_engineering</a> 
- <a href='#2'>2. train</a> 
- <a href='#3'>3. predict</a>

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import gc
from time import time
from datetime import timedelta, datetime
import warnings
from collections import defaultdict
from tqdm import tqdm

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import lightgbm as lgb
from sklearn import metrics
from dinglingling import wx_reminder
import torch

sys.path.append('../')
import conf
from mlpipeline import (
    feature_engineering_pandas,
    train,
    predict,
)
from utils import (
    check_columns,
    check_nan_value,
    load_model,
)

In [2]:
# global settings
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
sns.set(rc={'figure.figsize':(20,10)})
# Expose only GPUs 0 and 1 to CUDA-aware libraries used in this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# RankWarning lives in numpy.exceptions on recent NumPy; the top-level alias
# np.RankWarning no longer exists in NumPy 2.x, so fall back to it only on
# older releases that lack the numpy.exceptions module.
try:
    _rank_warning = np.exceptions.RankWarning
except AttributeError:
    _rank_warning = np.RankWarning
warnings.simplefilter('ignore', _rank_warning)

In [3]:
# global variables
# Key passed to wx_reminder(SCKEY=...) for the "started/finished" notifications
# around feature engineering and model training.
# Read from the environment so the secret is never stored in the notebook.
# NOTE(review): a literal key used to be committed here; it should be rotated.
# NOTE(review): behaviour of wx_reminder with an empty key is not visible from
# this notebook; export WX_REMINDER_SCKEY before starting the kernel.
SCKEY = os.environ.get('WX_REMINDER_SCKEY', '')

In [4]:
# functions
def __dummy():
    """Placeholder that does nothing and returns None."""
    return None

@wx_reminder(SCKEY=SCKEY, remind_started=True)
def feature_engineering_wrapper(params):
    """
    Run ``feature_engineering_pandas`` under the ``wx_reminder`` decorator.

    The wrapper exists only so that the decorator can send its reminders
    around the call.

    :param params: dict of keyword arguments forwarded unchanged to
        ``feature_engineering_pandas``.
    :return: tuple ``(train_fe_df, test_fe_df)`` as produced by
        ``feature_engineering_pandas``.
    """
    fe_result = feature_engineering_pandas(**params)
    # Unpack explicitly: exactly two values are expected back.
    train_features, test_features = fe_result
    return train_features, test_features
    
@wx_reminder(SCKEY=SCKEY, remind_started=True)
def train_wrapper(params):
    """
    Run ``train`` under the ``wx_reminder`` decorator.

    :param params: dict of keyword arguments forwarded unchanged to ``train``;
        must contain ``'is_eval'`` and, when not evaluating, ``'model_type'``.
    :return: ``None`` when ``params['is_eval']`` is truthy; the trained model
        when ``params['model_type'] == 'neural'``; otherwise the tuple
        ``(model, scaler)``.
    """
    if params['is_eval']:
        # Evaluation only: the two returned values are discarded.
        _, _ = train(**params)
        return None

    if params['model_type'] == 'neural':
        return train(**params)

    fitted_model, fitted_scaler = train(**params)
    return fitted_model, fitted_scaler

In [5]:
# Shell escape: print the on-disk size of every entry under ../data.
! du -sh ../data/*

1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
249M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
244M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
63M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
1008K	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
412M	../data/creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
1.7G	../data/creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
1.7G	../data/creative_id_window_

### <a id='1'> 1.feature_engineering</a>

In [None]:
# feature engineering
# Keyword arguments for feature_engineering_pandas (passed via feature_engineering_wrapper).
params = {
    'train_preprocessed_data_filename':'raw_train_round_one_df.feather', 
    'test_preprocessed_data_filename':'raw_test_df.feather', 
    'train_fe_save_filename': 'neural_train_fe_df.feather',
    'test_fe_save_filename': 'neural_test_fe_df.feather',
    'emb_method':'w2v',
    'max_df':0.9,  # param for tf_idf
    'min_df':3,  # param for tf_idf
    'emb_dim':128,  
    'window':150,  
    # Fixed typo: was 'advertise_id'. The feature is spelled 'advertiser_id'
    # in the option list below, in the advertiser_id_* artifacts under ../data
    # and in the model parameters used by the training cells.
    'sparse_feat':'advertiser_id',  # advertiser_id, product_category, 
    'min_count':1, 
#     'sample':6e-5, 
#     'negative':0,  
    'hs':0, 
#     'alpha':0.03,
#     'min_alpha':0.0007,
    'iter_':10,
    'workers':20,
    'sg':1,
    'num_processes': 40,
    'is_train':True,
    'is_neural_network':True
}

# Run feature engineering through the reminder wrapper; it returns the
# train and test feature frames as a 2-tuple.
train_fe_df, test_fe_df = feature_engineering_wrapper(params)

2020-05-27 20:14:10,252 - mlpipeline.feature_engineering.feature_engineering - INFO - feature_engineering_pandas开始
2020-05-27 20:14:10,255 - mlpipeline.feature_engineering.feature_engineering - INFO - is_train: True, is_neural_network: True
2020-05-27 20:14:10,256 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data开始
2020-05-27 20:14:11,351 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data已完成，共用时0:00:01


In [6]:
# eval 
# lgb_model_params = {
#                'objective': 'multiclass',  # multiclass, binary 
#                'boosting': 'gbdt',
#                'learning_rate': 0.15,
#                'metric': ['multi_logloss'],  # 'binary_logloss', 'multi_logloss'
#                'num_threads': 20,
#                'random_state': 2019,
#                'num_boost_round': 1000,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'num_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'lambda_l1': 1.0,  # (0.2,3)
#                'lambda_l2': 1.0,  # (0.2,3)
# }

# 0.3903862863136468, 0.39146038751369455
# xgb_model_params = {
#                'objective': 'multi:softmax',  # multiclass, binary 
#                'booster': 'gbtree',
#                'eta': 0.15,
#                'eval_metric': ['mlogloss'],  # 'binary_logloss', 'multi_logloss'
#                'nthread': 15,
#                'random_state': 2019,
#                'tree_method':'auto',
#                'n_estimators': 2,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'max_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'reg_lambda': 1.0,  # (0.2,3)
#                'reg_alpha': 1.0,  # (0.2,3)
# }
# lstm_model_params ={
#     'model_name':'lstm', 
#     'num_classes':20, 
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_window_150_dim_300_sg_hs_w2v.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':3,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'hidden_size':128,
#     'use_pad':True,
#     'max_seq_len':110,
#     'seed':1234,
#     'init_method':'xavier' , # 'kaiming'
#     'num_layers' : 1,
#     'bidirectional':False
# }

# textcnn_model_params={
#     'model_name':'textcnn', 
#     'num_classes':20, 
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':20,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'filter_size':[3,5,7],
#     'num_filters':3,
#     'use_pad':True,
#     'pad_size':64,
# }

# transformer_model_params = {
#     'model_name':'transformer',
#     'num_classes':20,
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':5,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'dim_model':300,
#     'hidden':1024,
#     'last_hidden':512,
#     'num_head':5,
#     'init_method':'kaiming',
#     'num_encoder':2,
#     'use_pad':True,
#     'seed':1,
#     'pad_size':64,
# }
# Hyper-parameters for the 'bilstm_attention' model used by the evaluation run
# in this cell. 'sparse_feat', 'embed' and 'vocab_paths' are parallel lists
# (one entry per id feature).
bilstm_attention_model_params = dict(
    model_name='bilstm_attention',
    num_classes=20,
    sparse_feat=['creative_id', 'advertiser_id'],
    embed=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    vocab_paths=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    dropout=0.3,
    required_improvement=1000,
    num_epochs=5,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=128,
    use_pad=True,
    max_seq_len=90,
    seed=1234,
    init_method='xavier',  # alternatives: 'kaiming', 'xavier'
    num_layers=1,
    # attention_size=256,
    bidirectional=True,
)

# lstm_attention_model_params ={
#     'model_name':'lstm_attention',
#     'num_classes':20,
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':3,
#     'batch_size':1,
#     'learning_rate':1e-3,
#     'hidden_size':256,
#     'use_pad':True,
#     'seed':1234,
#     'init_method':'kaiming' , # 'kaiming', 'xavier'
#     'num_layers' : 1,
#     'max_seq_len':90,
#     'bidirectional':False
# }

# Evaluation run: is_eval=True with 2-fold cross-validation on the combined
# creative/advertiser feature file; train_wrapper returns None in this mode.
params = dict(
    fe_filename='creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather',
    is_eval=True,
    model_type='neural',
    model_name='bilstm_attention',
    model_params=bilstm_attention_model_params,
    use_log=False,
    use_std=False,
    use_cv=True,
    n_splits=2,
)
train_wrapper(params)

2020-05-28 12:57:22,463 - mlpipeline.train - INFO - train开始
2020-05-28 12:57:22,465 - mlpipeline.train - INFO - using_fe_df: creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather, use_label: y, is_eval: True, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: True, n_splits: 2
2020-05-28 12:57:23,984 - mlpipeline.train - INFO - _train_pipeline_neural开始
2020-05-28 12:57:27,680 - mlpipeline.train - INFO - 模型参数: {'model_name': 'bilstm_attention', 'num_classes': 20, 'sparse_feat': ['creative_id', 'advertiser_id'], 'embed': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy'], 'vocab_paths': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl'], 'dropout': 0.3, 'required_improvement': 1000, 'num_epochs': 5, 'batch_size': 128, 'learning_rate': 0.001, 'hidden_size': 128

2020-05-28 13:06:35,769 - utils.utils - INFO - Iter:   4900,  Train Loss:   1.6,  Train Acc: 39.84%,Time: 0:06:40
2020-05-28 13:06:43,900 - utils.utils - INFO - Iter:   5000,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:06:48
2020-05-28 13:06:52,005 - utils.utils - INFO - Iter:   5100,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:06:56
2020-05-28 13:07:00,057 - utils.utils - INFO - Iter:   5200,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:07:04
2020-05-28 13:07:08,711 - utils.utils - INFO - Iter:   5300,  Train Loss:   1.6,  Train Acc: 42.19%,Time: 0:07:12
2020-05-28 13:07:17,293 - utils.utils - INFO - Iter:   5400,  Train Loss:   1.4,  Train Acc: 49.22%,Time: 0:07:21
2020-05-28 13:07:25,598 - utils.utils - INFO - Iter:   5500,  Train Loss:   1.5,  Train Acc: 47.66%,Time: 0:07:29
2020-05-28 13:07:33,596 - utils.utils - INFO - Iter:   5600,  Train Loss:   1.4,  Train Acc: 44.53%,Time: 0:07:37
2020-05-28 13:07:41,668 - utils.utils - INFO - Iter:   5700,  Train Loss:   1.5,  Train 

2020-05-28 13:16:21,339 - utils.utils - INFO - Iter:  12000,  Train Loss:   1.6,  Train Acc: 42.19%,Time: 0:16:25
2020-05-28 13:16:29,485 - utils.utils - INFO - Iter:  12100,  Train Loss:   1.7,  Train Acc: 38.28%,Time: 0:16:33
2020-05-28 13:16:37,754 - utils.utils - INFO - Iter:  12200,  Train Loss:   1.3,  Train Acc: 45.31%,Time: 0:16:41
2020-05-28 13:16:46,290 - utils.utils - INFO - Iter:  12300,  Train Loss:   1.4,  Train Acc: 48.44%,Time: 0:16:50
2020-05-28 13:16:54,448 - utils.utils - INFO - Iter:  12400,  Train Loss:   1.3,  Train Acc: 45.31%,Time: 0:16:58
2020-05-28 13:17:02,398 - utils.utils - INFO - Iter:  12500,  Train Loss:   1.3,  Train Acc: 42.19%,Time: 0:17:06
2020-05-28 13:17:11,065 - utils.utils - INFO - Iter:  12600,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:17:15
2020-05-28 13:17:19,366 - utils.utils - INFO - Iter:  12700,  Train Loss:   1.5,  Train Acc: 47.66%,Time: 0:17:23
2020-05-28 13:17:27,421 - utils.utils - INFO - Iter:  12800,  Train Loss:   1.5,  Train 

2020-05-28 13:25:45,323 - mlpipeline.train - INFO - age_Confusion Matrix...
2020-05-28 13:25:45,323 - mlpipeline.train - INFO - [[ 8408  6343  1793   197   293   276   122    64    54    48]
 [ 2030 41273 26694  2065  1339   770   135    82   158    90]
 [  419 14733 60877 17669  4521  2141   374   233   348   140]
 [  120  2376 25060 33217  9071  4093   596   271   347   138]
 [  101   783  9297 16356 19152 16263  1911   763   496   212]
 [   88   376  3570  5319 12815 21651  4585  1620   616   221]
 [   93   213  1588  1777  4388 12971  7339  3608  1185   194]
 [   69    91   690   456   987  3364  2789  4546  2630   362]
 [   21    40   379   181   350   996   528  1946  4238  1059]
 [   20    10   218    92    86   292    92   261  1733  2951]]
2020-05-28 13:25:53,629 - utils.utils - INFO - build_dataset开始
2020-05-28 13:25:55,344 - utils.utils - INFO - ../data/creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl has been loaded
2020-05-28 13:25:55,363 - utils.utils - INFO - .

2020-05-28 13:36:03,556 - utils.utils - INFO - Iter:   5600,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:07:48
2020-05-28 13:36:12,192 - utils.utils - INFO - Iter:   5700,  Train Loss:   1.4,  Train Acc: 47.66%,Time: 0:07:56
2020-05-28 13:36:20,893 - utils.utils - INFO - Iter:   5800,  Train Loss:   1.3,  Train Acc: 44.53%,Time: 0:08:05
2020-05-28 13:36:29,055 - utils.utils - INFO - Iter:   5900,  Train Loss:   1.6,  Train Acc: 41.41%,Time: 0:08:13
2020-05-28 13:36:37,336 - utils.utils - INFO - Iter:   6000,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:08:21
2020-05-28 13:36:45,534 - utils.utils - INFO - Iter:   6100,  Train Loss:   1.6,  Train Acc: 40.62%,Time: 0:08:30
2020-05-28 13:36:54,333 - utils.utils - INFO - Iter:   6200,  Train Loss:   1.6,  Train Acc: 34.38%,Time: 0:08:38
2020-05-28 13:37:02,772 - utils.utils - INFO - Iter:   6300,  Train Loss:   1.3,  Train Acc: 57.81%,Time: 0:08:47
2020-05-28 13:37:11,432 - utils.utils - INFO - Iter:   6400,  Train Loss:   1.6,  Train 

2020-05-28 13:45:49,363 - utils.utils - INFO - Iter:  12700,  Train Loss:   1.3,  Train Acc: 51.56%,Time: 0:17:34
2020-05-28 13:45:58,316 - utils.utils - INFO - Iter:  12800,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:17:42
2020-05-28 13:46:07,132 - utils.utils - INFO - Iter:  12900,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:17:51
2020-05-28 13:46:15,479 - utils.utils - INFO - Iter:  13000,  Train Loss:   1.3,  Train Acc: 49.22%,Time: 0:18:00
2020-05-28 13:46:23,585 - utils.utils - INFO - Iter:  13100,  Train Loss:   1.5,  Train Acc: 52.34%,Time: 0:18:08
2020-05-28 13:46:32,078 - utils.utils - INFO - Iter:  13200,  Train Loss:   1.4,  Train Acc: 42.97%,Time: 0:18:16
2020-05-28 13:46:40,366 - utils.utils - INFO - Iter:  13300,  Train Loss:   1.4,  Train Acc: 42.19%,Time: 0:18:25
2020-05-28 13:46:48,583 - utils.utils - INFO - Iter:  13400,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:18:33
2020-05-28 13:46:56,872 - utils.utils - INFO - Iter:  13500,  Train Loss:   1.5,  Train 

2020-05-28 13:54:20,404 - mlpipeline.train - INFO - gender result list: [0.9407230125820545, 0.9410947701524909], cv mean: 0.9409088913672727
2020-05-28 13:54:20,406 - mlpipeline.train - INFO - total score list: [1.39327698 1.36378041], total score cv mean: 1.3785286922450717
2020-05-28 13:54:26,415 - mlpipeline.train - INFO - _train_pipeline_neural已完成，共用时0:57:02
2020-05-28 13:54:26,416 - mlpipeline.train - INFO - bilstm_attention模型训练完成!
2020-05-28 13:54:26,557 - mlpipeline.train - INFO - train已完成，共用时0:57:04


In [None]:
# train 
# Hyper-parameters for the 'lstm' model. Not referenced by the `params` dict
# built later in this cell, which selects bilstm_attention_model_params.
lstm_model_params = dict(
    model_name='lstm',
    num_classes=20,
    sparse_feat='creative_id',
    embed='embedding_creative_id_300.npy',
    dropout=0.5,
    required_improvement=1000,
    num_epochs=5,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=128,
    use_pad=True,
    pad_size=64,
    seed=1234,
    init_method='kaiming',  # 'kaiming'
    num_layers=1,
    bidirectional=True,
)

# Hyper-parameters for the 'bilstm_attention' model trained by this cell.
# 'sparse_feat', 'embed' and 'vocab_paths' are parallel lists (one entry per
# id feature).
bilstm_attention_model_params = dict(
    model_name='bilstm_attention',
    num_classes=20,
    sparse_feat=['creative_id', 'advertiser_id'],
    embed=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    vocab_paths=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    dropout=0.3,
    required_improvement=1000,
    num_epochs=5,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=128,
    use_pad=True,
    max_seq_len=90,
    seed=1234,
    init_method='xavier',  # alternatives: 'kaiming', 'xavier'
    num_layers=1,
    # attention_size=256,
    bidirectional=True,
)

# Hyper-parameters for the 'lstm_attention' model. Not referenced by the
# `params` dict built later in this cell.
lstm_attention_model_params = dict(
    model_name='lstm_attention',
    num_classes=20,
    sparse_feat='creative_id',
    embed='embedding_creative_id_300.npy',
    dropout=0.2,
    required_improvement=1000,
    num_epochs=3,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=256,
    use_pad=True,
    pad_size=90,
    seed=1234,
    init_method='kaiming',  # 'kaiming'
    num_layers=1,
    max_seq_len=90,
    bidirectional=False,
)

# Final training run (is_eval=False, no cross-validation). With
# model_type='neural', train_wrapper returns the trained model only.
params = dict(
    fe_filename='creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather',
    is_eval=False,
    model_type='neural',
    model_name='bilstm_attention',
    model_params=bilstm_attention_model_params,
    use_log=False,
    use_std=False,
    use_cv=False,
)

model = train_wrapper(params)

2020-05-28 13:55:51,495 - mlpipeline.train - INFO - train开始
2020-05-28 13:55:51,497 - mlpipeline.train - INFO - using_fe_df: creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather, use_label: y, is_eval: False, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: False, n_splits: 2
2020-05-28 13:55:52,992 - mlpipeline.train - INFO - _train_pipeline_neural开始
2020-05-28 13:55:56,610 - mlpipeline.train - INFO - 模型参数: {'model_name': 'bilstm_attention', 'num_classes': 20, 'sparse_feat': ['creative_id', 'advertiser_id'], 'embed': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy'], 'vocab_paths': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl'], 'dropout': 0.3, 'required_improvement': 1000, 'num_epochs': 5, 'batch_size': 128, 'learning_rate': 0.001, 'hidden_size': 1

2020-05-28 14:04:40,227 - utils.utils - INFO - Iter:   5200,  Train Loss:   1.6,  Train Acc: 46.09%,Time: 0:06:26
2020-05-28 14:04:47,553 - utils.utils - INFO - Iter:   5300,  Train Loss:   1.6,  Train Acc: 41.41%,Time: 0:06:33
2020-05-28 14:04:54,873 - utils.utils - INFO - Iter:   5400,  Train Loss:   1.4,  Train Acc: 50.00%,Time: 0:06:40
2020-05-28 14:05:02,244 - utils.utils - INFO - Iter:   5500,  Train Loss:   1.6,  Train Acc: 42.97%,Time: 0:06:48
2020-05-28 14:05:09,649 - utils.utils - INFO - Iter:   5600,  Train Loss:   1.5,  Train Acc: 45.31%,Time: 0:06:55
2020-05-28 14:05:17,156 - utils.utils - INFO - Iter:   5700,  Train Loss:   1.7,  Train Acc: 36.72%,Time: 0:07:03
2020-05-28 14:05:24,649 - utils.utils - INFO - Iter:   5800,  Train Loss:   1.6,  Train Acc: 34.38%,Time: 0:07:10
2020-05-28 14:05:31,917 - utils.utils - INFO - Iter:   5900,  Train Loss:   1.6,  Train Acc: 37.50%,Time: 0:07:17
2020-05-28 14:05:39,266 - utils.utils - INFO - Iter:   6000,  Train Loss:   1.6,  Train 

In [6]:
# predict 
# Hyper-parameters for the 'lstm_attention' model. Not referenced by the
# `params` dict built later in this cell.
lstm_attention_model_params = dict(
    model_name='lstm_attention',
    num_classes=20,
    sparse_feat='creative_id',
    embed='embedding_creative_id_300.npy',
    dropout=0.2,
    required_improvement=1000,
    num_epochs=20,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=256,
    use_pad=True,
    pad_size=90,
    seed=1234,
    init_method='kaiming',  # 'kaiming'
    num_layers=1,
    max_seq_len=90,
    bidirectional=False,
)


# Hyper-parameters for the 'bilstm_attention' model used for prediction in
# this cell. 'sparse_feat', 'embed' and 'vocab_paths' are parallel lists (one
# entry per id feature).
bilstm_attention_model_params = dict(
    model_name='bilstm_attention',
    num_classes=20,
    sparse_feat=['creative_id', 'advertiser_id'],
    embed=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    vocab_paths=[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    dropout=0.3,
    required_improvement=1000,
    num_epochs=5,
    batch_size=128,
    learning_rate=1e-3,
    hidden_size=128,
    use_pad=True,
    max_seq_len=90,
    seed=1234,
    init_method='xavier',  # alternatives: 'kaiming', 'xavier'
    num_layers=1,
    # attention_size=256,
    bidirectional=True,
)

# Inference on the combined creative/advertiser test features; the result of
# predict() is kept as the submission frame.
params = dict(
    test_fe_filename='creative_advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather',
    use_log=False,
    use_std=False,
    model_type='neural',
    model_name='bilstm_attention',
    model_params=bilstm_attention_model_params,
)

submission_df = predict(**params)

2020-05-28 08:30:54,296 - mlpipeline.predict - INFO - predict开始
2020-05-28 08:30:54,297 - mlpipeline.predict - INFO - test_fe_filename: creative_id_window_150_dim_400_sg_1_hs_0_neural_test_fe_df.feather, use_log: False, use_std: False, model_type: neural, model_name: bilstm_attention
2020-05-28 08:30:55,336 - mlpipeline.predict - INFO - inference_pipeline_neural开始
2020-05-28 08:31:04,832 - utils.utils - INFO - build_dataset开始
2020-05-28 08:31:06,880 - utils.utils - INFO - ../data/creative_id_window_150_dim_400_sg_1_hs_0_vocab.pkl has been loaded
2020-05-28 08:32:26,777 - utils.utils - INFO - build_dataset已完成，共用时0:01:22
2020-05-28 08:32:26,779 - mlpipeline.predict - INFO - Loading data...
2020-05-28 08:32:26,780 - utils.utils - INFO - build_iterater开始
2020-05-28 08:32:26,781 - utils.utils - INFO - build_iterater已完成，共用时0:00:00
2020-05-28 08:32:26,781 - mlpipeline.predict - INFO - Time usage:0:00:00
  "num_layers={}".format(dropout, num_layers))
A value is trying to be set on a copy of a 

In [7]:
# Preview the first rows of the frame returned by predict().
submission_df.head()

Unnamed: 0,user_id,predicted_age
0,3000001,3
1,3000002,7
2,3000003,3
3,3000004,3
4,3000005,4


In [8]:
# combine age and gender
# Step 1: load the previously saved age-only submission from conf.SUBMISSION_DIR.
submission_age_df = pd.read_csv(
    filepath_or_buffer=os.path.join(
        conf.SUBMISSION_DIR,
        'submission_age_2020-05-28T08:34:08.417593.csv',
    ),
)

In [9]:
# Preview the age-only submission that was just loaded.
submission_age_df.head()

Unnamed: 0,user_id,predicted_age
0,3000001,3
1,3000002,7
2,3000003,3
3,3000004,3
4,3000005,4


In [10]:
# Load the earlier combined (gender + age) submission from conf.SUBMISSION_DIR.
submission_y_df = pd.read_csv(
    filepath_or_buffer=os.path.join(
        conf.SUBMISSION_DIR,
        'submission_y_2020-05-23T09:46:08.001373.csv',
    ),
)

In [12]:
# Preview the combined submission before its age column is replaced.
submission_y_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,3000001,1,3
1,3000002,2,7
2,3000003,2,2
3,3000004,1,3
4,3000005,1,4


In [13]:
# Replace the age column with the newer age-only predictions.
# Rows are matched on user_id instead of the positional index, so a CSV that
# is ordered or indexed differently cannot silently assign ages to the wrong
# users. Duplicate user_id values in the age file make the lookup raise.
age_by_user = submission_age_df.set_index('user_id')['predicted_age']
aligned_age = submission_y_df['user_id'].map(age_by_user)
if aligned_age.isna().any():
    # Fail loudly rather than writing a submission with missing ages.
    raise ValueError('some user_id values in submission_y_df have no predicted_age in submission_age_df')
# map() upcasts to float when anything is missing; restore the source dtype.
submission_y_df['predicted_age'] = aligned_age.astype(age_by_user.dtype)

In [14]:
# Preview the combined submission after the age column was replaced.
submission_y_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,3000001,1,3
1,3000002,2,7
2,3000003,2,3
3,3000004,1,3
4,3000005,1,4


In [15]:
# Destination for the combined submission: a file inside conf.SUBMISSION_DIR
# whose name embeds the current local time in ISO-8601 form.
submission_save_path = os.path.join(
    conf.SUBMISSION_DIR,
    'submission_y_%s.csv' % (datetime.now().isoformat()),
)

In [16]:
# submission_save_path already starts with conf.SUBMISSION_DIR, so it is used
# as-is; joining the directory a second time would duplicate it whenever
# SUBMISSION_DIR is a relative path.
submission_y_df.to_csv(submission_save_path, index=False)