# Mlpipeline
Data reformat , preprocess, feature engineering, train, evaluate model and generate submission 
- <a href='#1'>1. feature_engineering</a> 
- <a href='#2'>2. train</a> 
- <a href='#3'>3. predict</a>

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import gc
from time import time
from datetime import timedelta, datetime
import warnings
from collections import defaultdict
from tqdm import tqdm

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import lightgbm as lgb
from sklearn import metrics
from dinglingling import wx_reminder
import torch

sys.path.append('../')
import conf
from mlpipeline import (
    feature_engineering_pandas,
    train,
    predict,
)
from utils import (
    check_columns,
    check_nan_value,
    load_model,
)

In [2]:
# global settings
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
sns.set(rc={'figure.figsize':(20,10)})
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
warnings.simplefilter('ignore', np.RankWarning)

In [3]:
# global variables
SCKEY = 'SCU92138T03d57ff9d4b08ced24c2cceb440cd3bd5e843242680de'  # used for reminding when feature engineering or model training completes

In [4]:
# functions
def __dummy():
    pass

@wx_reminder(SCKEY=SCKEY, remind_started=True)  
def feature_engineering_wrapper(params):
        """
        wrapper for feature engineering func 
        for reminding when it completes
        """
        train_fe_df, test_fe_df = feature_engineering_pandas(**params)
        
        return train_fe_df, test_fe_df
    
@wx_reminder(SCKEY=SCKEY, remind_started=True)  
def train_wrapper(params):
        if params['is_eval']:
            _,_ = train(**params)
        else:
            if params['model_type'] == 'neural':
                model = train(**params) 
                return model
            else:
                model, scaler = train(**params) 
                return model, scaler

In [5]:
! du -sh ../data/*

1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
249M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
244M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
63M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
1008K	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
48K	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
52K	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
75M	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_

### <a id='1'> 1.feature_engineering</a>

In [None]:
# feature engineering
params = {
    'train_preprocessed_data_filename':'raw_train_round_one_df.feather', 
    'test_preprocessed_data_filename':'raw_test_df.feather', 
    'train_fe_save_filename': 'neural_train_fe_df.feather',
    'test_fe_save_filename': 'neural_test_fe_df.feather',
    'emb_method':'w2v',
    'max_df':0.9,  # param for tf_idf
    'min_df':3,  # param for tf_idf
    'emb_dim':128,  
    'window':150,  
    'sparse_feat':'advertise_id',  # advertiser_id, product_category, 
    'min_count':1, 
#     'sample':6e-5, 
#     'negative':0,  
    'hs':0, 
#     'alpha':0.03,
#     'min_alpha':0.0007,
    'iter_':10,
    'workers':20,
    'sg':1,
    'num_processes': 40,
    'is_train':True,
    'is_neural_network':True
}

train_fe_df, test_fe_df = feature_engineering_wrapper(params)

2020-05-27 20:14:10,252 - mlpipeline.feature_engineering.feature_engineering - INFO - feature_engineering_pandas开始
2020-05-27 20:14:10,255 - mlpipeline.feature_engineering.feature_engineering - INFO - is_train: True, is_neural_network: True
2020-05-27 20:14:10,256 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data开始
2020-05-27 20:14:11,351 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data已完成，共用时0:00:01


In [None]:
# eval 
# lgb_model_params = {
#                'objective': 'multiclass',  # multiclass, binary 
#                'boosting': 'gbdt',
#                'learning_rate': 0.15,
#                'metric': ['multi_logloss'],  # 'binary_logloss', 'multi_logloss'
#                'num_threads': 20,
#                'random_state': 2019,
#                'num_boost_round': 1000,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'num_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'lambda_l1': 1.0,  # (0.2,3)
#                'lambda_l2': 1.0,  # (0.2,3)
# }

# xgb_model_params = {
#                'objective': 'multi:softmax',  # multiclass, binary 
#                'booster': 'gbtree',
#                'eta': 0.15,
#                'eval_metric': ['mlogloss'],  # 'binary_logloss', 'multi_logloss'
#                'nthread': 15,
#                'random_state': 2019,
#                'tree_method':'auto',
#                'n_estimators': 2,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'max_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'reg_lambda': 1.0,  # (0.2,3)
#                'reg_alpha': 1.0,  # (0.2,3)
# }

# textcnn_model_params={
#     'model_name':'textcnn', 
#     'num_classes':20, 
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':20,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'filter_size':[3,5,7],
#     'num_filters':3,
#     'use_pad':True,
#     'pad_size':64,
# }

# transformer_model_params = {
#     'model_name':'transformer',
#     'num_classes':20,
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':5,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'dim_model':300,
#     'hidden':1024,
#     'last_hidden':512,
#     'num_head':5,
#     'init_method':'kaiming',
#     'num_encoder':2,
#     'use_pad':True,
#     'seed':1,
#     'pad_size':64,
# }
bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':20,
    'sparse_feat':[
        'creative_id',
        'advertiser_id',
        'ad_id',
        
#         'industry',
#         'product_category',
#         'product_id',
    ] ,
    'embed':[
             'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'industry_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_category_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    'vocab_paths':[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'industry_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_category_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    'dropout':0.3,
    'required_improvement':1200,
    'num_epochs':20,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'max_seq_len':90,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming','xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}

lstm_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':[
        'creative_id',
        'advertiser_id',
        'ad_id',
#         'product_category'
    ] ,
    'embed':[
             'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_category_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy'
    ],
    'vocab_paths':[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_category_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':5,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'max_seq_len':110,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming'
    'num_layers' : 1,
    'bidirectional':False
}

lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':[
        'creative_id',
        'advertiser_id',
        'ad_id',
#         'product_category'
    ] ,
    'embed':[
             'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_category_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy'
    ],
    'vocab_paths':[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_category_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':5,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming', 'xavier'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}

params = {
    'fe_filename':'creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather', 
    'is_eval':True, 
    'model_type': 'neural',
    'model_name': 'bilstm_attention',
    'model_params': bilstm_attention_model_params,
    'use_log': False,
    'use_std': False,
    'use_cv': True,  
    'n_splits':2,
}
train_wrapper(params)

2020-05-31 09:49:18,039 - mlpipeline.train - INFO - train开始
2020-05-31 09:49:18,041 - mlpipeline.train - INFO - using_fe_df: creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather, use_label: y, is_eval: True, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: True, n_splits: 2
2020-05-31 09:49:20,472 - mlpipeline.train - INFO - _train_pipeline_neural开始


In [6]:
# train 
lstm_model_params ={
    'model_name':'lstm', 
    'num_classes':20, 
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.5,
    'required_improvement':1000,
    'num_epochs':5,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'pad_size':64,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'bidirectional':True
}

bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':20,
    'sparse_feat':[
        'creative_id',
        'advertiser_id',
        'ad_id',
        
#         'industry',
#         'product_category',
#         'product_id',
    ] ,
    'embed':[
             'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'industry_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_category_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    'vocab_paths':[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'industry_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_category_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    'dropout':0.3,
    'required_improvement':1200,
    'num_epochs':4,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'max_seq_len':90,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming','xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}

lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':3,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'pad_size':90,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}

params = {
    'fe_filename':'creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather', 
    'is_eval':False, 
    'model_type': 'neural',
    'model_name': 'bilstm_attention',
    'model_params': bilstm_attention_model_params,
    'use_log': False,
    'use_std': False,
    'use_cv': False,  
}

model = train_wrapper(params)

2020-05-31 09:49:46,992 - mlpipeline.train - INFO - train开始
2020-05-31 09:49:46,994 - mlpipeline.train - INFO - using_fe_df: creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather, use_label: y, is_eval: False, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: False, n_splits: 2
2020-05-31 09:49:49,316 - mlpipeline.train - INFO - _train_pipeline_neural开始
2020-05-31 09:49:56,910 - mlpipeline.train - INFO - 模型参数: {'model_name': 'bilstm_attention', 'num_classes': 20, 'sparse_feat': ['creative_id', 'advertiser_id', 'ad_id'], 'embed': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy', 'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy'], 'vocab_paths': ['creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl', 'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl', 'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.p

2020-05-31 10:04:05,024 - utils.utils - INFO - Iter:   5000,  Train Loss:   1.5,  Train Acc: 39.06%,Time: 0:10:13
2020-05-31 10:04:16,636 - utils.utils - INFO - Iter:   5100,  Train Loss:   1.4,  Train Acc: 45.31%,Time: 0:10:25
2020-05-31 10:04:28,567 - utils.utils - INFO - Iter:   5200,  Train Loss:   1.6,  Train Acc: 42.97%,Time: 0:10:37
2020-05-31 10:04:40,528 - utils.utils - INFO - Iter:   5300,  Train Loss:   1.6,  Train Acc: 34.38%,Time: 0:10:49
2020-05-31 10:04:53,066 - utils.utils - INFO - Iter:   5400,  Train Loss:   1.4,  Train Acc: 46.09%,Time: 0:11:01
2020-05-31 10:05:05,357 - utils.utils - INFO - Iter:   5500,  Train Loss:   1.5,  Train Acc: 45.31%,Time: 0:11:13
2020-05-31 10:05:16,710 - utils.utils - INFO - Iter:   5600,  Train Loss:   1.5,  Train Acc: 46.88%,Time: 0:11:25
2020-05-31 10:05:28,944 - utils.utils - INFO - Iter:   5700,  Train Loss:   1.7,  Train Acc: 38.28%,Time: 0:11:37
2020-05-31 10:05:41,020 - utils.utils - INFO - Iter:   5800,  Train Loss:   1.5,  Train 

2020-05-31 10:18:17,855 - utils.utils - INFO - Iter:  12200,  Train Loss:   1.3,  Train Acc: 42.19%,Time: 0:24:26
2020-05-31 10:18:29,487 - utils.utils - INFO - Iter:  12300,  Train Loss:   1.5,  Train Acc: 48.44%,Time: 0:24:37
2020-05-31 10:18:41,125 - utils.utils - INFO - Iter:  12400,  Train Loss:   1.4,  Train Acc: 44.53%,Time: 0:24:49
2020-05-31 10:18:52,838 - utils.utils - INFO - Iter:  12500,  Train Loss:   1.3,  Train Acc: 48.44%,Time: 0:25:01
2020-05-31 10:19:04,949 - utils.utils - INFO - Iter:  12600,  Train Loss:   1.5,  Train Acc: 40.62%,Time: 0:25:13
2020-05-31 10:19:16,609 - utils.utils - INFO - Iter:  12700,  Train Loss:   1.5,  Train Acc: 46.88%,Time: 0:25:25
2020-05-31 10:19:28,009 - utils.utils - INFO - Iter:  12800,  Train Loss:   1.5,  Train Acc: 45.31%,Time: 0:25:36
2020-05-31 10:19:39,197 - utils.utils - INFO - Iter:  12900,  Train Loss:   1.4,  Train Acc: 48.44%,Time: 0:25:47
2020-05-31 10:19:50,982 - utils.utils - INFO - Iter:  13000,  Train Loss:   1.4,  Train 

2020-05-31 10:32:24,156 - utils.utils - INFO - Iter:  19400,  Train Loss:   1.5,  Train Acc: 42.97%,Time: 0:38:32
2020-05-31 10:32:36,064 - utils.utils - INFO - Iter:  19500,  Train Loss:   1.5,  Train Acc: 43.75%,Time: 0:38:44
2020-05-31 10:32:47,906 - utils.utils - INFO - Iter:  19600,  Train Loss:   1.4,  Train Acc: 46.09%,Time: 0:38:56
2020-05-31 10:32:59,598 - utils.utils - INFO - Iter:  19700,  Train Loss:   1.3,  Train Acc: 51.56%,Time: 0:39:08
2020-05-31 10:33:11,784 - utils.utils - INFO - Iter:  19800,  Train Loss:   1.5,  Train Acc: 42.97%,Time: 0:39:20
2020-05-31 10:33:24,010 - utils.utils - INFO - Iter:  19900,  Train Loss:   1.3,  Train Acc: 57.03%,Time: 0:39:32
2020-05-31 10:33:35,777 - utils.utils - INFO - Iter:  20000,  Train Loss:   1.6,  Train Acc: 41.41%,Time: 0:39:44
2020-05-31 10:33:48,382 - utils.utils - INFO - Iter:  20100,  Train Loss:   1.4,  Train Acc: 48.44%,Time: 0:39:56
2020-05-31 10:34:01,163 - utils.utils - INFO - Iter:  20200,  Train Loss:   1.4,  Train 

2020-05-31 10:47:21,898 - utils.utils - INFO - Iter:  26600,  Train Loss:   1.4,  Train Acc: 49.22%,Time: 0:53:30
2020-05-31 10:47:34,305 - utils.utils - INFO - Iter:  26700,  Train Loss:   1.2,  Train Acc: 50.78%,Time: 0:53:42
2020-05-31 10:47:46,464 - utils.utils - INFO - Iter:  26800,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:53:54
2020-05-31 10:47:58,766 - utils.utils - INFO - Iter:  26900,  Train Loss:   1.4,  Train Acc: 39.06%,Time: 0:54:07
2020-05-31 10:48:11,286 - utils.utils - INFO - Iter:  27000,  Train Loss:   1.4,  Train Acc: 48.44%,Time: 0:54:19
2020-05-31 10:48:23,821 - utils.utils - INFO - Iter:  27100,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:54:32
2020-05-31 10:48:36,094 - utils.utils - INFO - Iter:  27200,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:54:44
2020-05-31 10:48:48,450 - utils.utils - INFO - Iter:  27300,  Train Loss:   1.4,  Train Acc: 39.84%,Time: 0:54:56
2020-05-31 10:49:01,126 - utils.utils - INFO - Iter:  27400,  Train Loss:   1.4,  Train 

In [7]:
# predict 
lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':20,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'pad_size':90,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}

bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':20,
    'sparse_feat':[
        'creative_id',
        'advertiser_id',
        'ad_id',
        
#         'industry',
#         'product_category',
#         'product_id',
    ] ,
    'embed':[
             'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
             'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'industry_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_category_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
#              'product_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy',
    ],
    'vocab_paths':[
        'creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
        'ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'industry_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_category_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
#         'product_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl',
    ],
    'dropout':0.3,
    'required_improvement':1200,
    'num_epochs':4,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'max_seq_len':90,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming','xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}

params = {
          'test_fe_filename':'creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather',
          'use_log':False,
          'use_std': False,
          'model_type': 'neural',
          'model_name':'bilstm_attention',
          'model_params': bilstm_attention_model_params
            }

submission_df = predict(**params)

2020-05-31 10:51:14,496 - mlpipeline.predict - INFO - predict开始
2020-05-31 10:51:14,498 - mlpipeline.predict - INFO - test_fe_filename: creative_advertiser_ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather, use_log: False, use_std: False, model_type: neural, model_name: bilstm_attention
2020-05-31 10:51:16,712 - mlpipeline.predict - INFO - inference_pipeline_neural开始
2020-05-31 10:51:22,289 - utils.utils - INFO - build_dataset开始
2020-05-31 10:51:24,538 - utils.utils - INFO - ../data/creative_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl has been loaded
2020-05-31 10:51:24,566 - utils.utils - INFO - ../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl has been loaded
2020-05-31 10:51:26,423 - utils.utils - INFO - ../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl has been loaded
2020-05-31 10:55:07,852 - utils.utils - INFO - build_dataset已完成，共用时0:03:46
2020-05-31 10:55:07,853 - mlpipeline.predict - INFO - Loading data...
2020-05-31 10:55

In [8]:
submission_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,1,1,5
1,2,1,3
2,3,2,6
3,4,1,5
4,5,1,5


In [8]:
# combine age and gender 
submission_age_df = pd.read_csv(os.path.join(conf.SUBMISSION_DIR,'submission_age_2020-05-28T08:34:08.417593.csv')) 

In [9]:
submission_age_df.head()

Unnamed: 0,user_id,predicted_age
0,3000001,3
1,3000002,7
2,3000003,3
3,3000004,3
4,3000005,4


In [10]:
submission_y_df = pd.read_csv(os.path.join(conf.SUBMISSION_DIR,'submission_y_2020-05-23T09:46:08.001373.csv')) 

In [12]:
submission_y_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,3000001,1,3
1,3000002,2,7
2,3000003,2,2
3,3000004,1,3
4,3000005,1,4


In [13]:
submission_y_df['predicted_age'] = submission_age_df['predicted_age']

In [14]:
submission_y_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,3000001,1,3
1,3000002,2,7
2,3000003,2,3
3,3000004,1,3
4,3000005,1,4


In [15]:
submission_save_path = os.path.join(conf.SUBMISSION_DIR,'submission_y_%s.csv'%(datetime.now().isoformat()))

In [16]:
submission_y_df.to_csv(os.path.join(conf.SUBMISSION_DIR, submission_save_path),index=False)