# Mlpipeline
Reformat and preprocess the data, engineer features, train and evaluate the model, and generate the submission file.
- <a href='#1'>1. feature_engineering</a> 
- <a href='#2'>2. train</a> 
- <a href='#3'>3. predict</a>

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import gc
from time import time
from datetime import timedelta, datetime
import warnings
from collections import defaultdict
from tqdm import tqdm

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import lightgbm as lgb
from sklearn import metrics
from dinglingling import wx_reminder
import torch

sys.path.append('../')
import conf
from mlpipeline import (
    feature_engineering_pandas,
    train,
    predict,
)
from utils import (
    check_columns,
    check_nan_value,
    load_model,
)

In [2]:
# global settings
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Widen the pandas display limits used when frames are rendered in the notebook.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
# Default figure size for seaborn/matplotlib plots.
sns.set(rc={'figure.figsize':(20,10)})
# Expose only CUDA device 2 to this process.
# NOTE(review): torch is already imported above; this presumably still takes effect
# as long as CUDA has not been initialised yet - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Silence numpy's RankWarning. The class lives at np.RankWarning on NumPy 1.x and at
# np.exceptions.RankWarning on NumPy 2.x, so look it up in a version-tolerant way
# instead of failing with AttributeError on newer NumPy.
warnings.simplefilter('ignore', getattr(np, 'RankWarning', None) or np.exceptions.RankWarning)

In [3]:
# global variables
# Push key handed to the wx_reminder decorators below (used for reminding when
# feature engineering or model training completes).
# It is read from the SCKEY environment variable so the secret is never stored in
# the notebook; the key that used to be hardcoded here should be treated as leaked
# and rotated.
SCKEY = os.environ.get('SCKEY', '')
if not SCKEY:
    warnings.warn('SCKEY environment variable is not set; wx_reminder notifications may fail.')

In [4]:
# functions
def __dummy():
    """Placeholder that does nothing and returns None."""
    pass

@wx_reminder(SCKEY=SCKEY, remind_started=True)
def feature_engineering_wrapper(params):
    """
    Run ``feature_engineering_pandas`` under the ``wx_reminder`` decorator.

    The wrapper exists so that a reminder is sent around the (long) feature
    engineering call; it adds no logic of its own.

    Args:
        params: dict unpacked as keyword arguments into
            ``feature_engineering_pandas``.

    Returns:
        The two values produced by ``feature_engineering_pandas``, as a
        ``(train_fe_df, test_fe_df)`` tuple.
    """
    engineered_train, engineered_test = feature_engineering_pandas(**params)
    return engineered_train, engineered_test
    
@wx_reminder(SCKEY=SCKEY, remind_started=True)
def train_wrapper(params):
    """
    Run ``train`` under the ``wx_reminder`` decorator.

    Args:
        params: dict unpacked as keyword arguments into ``train``. The keys
            ``'is_eval'`` and (when not evaluating) ``'model_type'`` are also
            read here to decide what is returned.

    Returns:
        - ``None`` when ``params['is_eval']`` is truthy (the two values
          returned by ``train`` are discarded);
        - the single value returned by ``train`` when
          ``params['model_type'] == 'neural'``;
        - otherwise the ``(model, scaler)`` pair returned by ``train``.
    """
    if params['is_eval']:
        # Evaluation run: train() is expected to return two values; both are dropped.
        _, _ = train(**params)
        return None

    if params['model_type'] == 'neural':
        return train(**params)

    fitted_model, fitted_scaler = train(**params)
    return fitted_model, fitted_scaler

def merge_dfs(df1,dfs,sparse_feats):
    """
    Left-join one feature column from each frame in ``dfs`` onto ``df1``.

    Args:
        df1: base DataFrame containing a ``user_id`` column.
        dfs: iterable of DataFrames, each containing ``user_id`` and the
            feature column named at the same position in ``sparse_feats``.
        sparse_feats: column names, indexed in step with ``dfs``.

    Returns:
        ``df1`` with one extra column per entry of ``dfs`` (joined on
        ``user_id`` with ``how='left'``); ``df1`` itself when ``dfs`` is empty.
    """
    merged = df1
    for position, feat_df in enumerate(dfs):
        feat_col = sparse_feats[position]
        # Only the join key and the requested feature column take part in the merge.
        feat_slice = feat_df[['user_id', feat_col]]
        merged = merged.merge(feat_slice, how='left', on='user_id')
    return merged

In [5]:
! du -sh ../data/*

1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
1.5G	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
249M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
244M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
63M	../data/ad_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
29M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather
195M	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather
1008K	../data/advertiser_id_window_150_dim_128_sg_1_hs_0_iter_10_vocab.pkl
48K	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_10_embedding.bin
52K	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_10_embedding.npy
75M	../data/click_times_window_150_dim_128_sg_1_hs_0_iter_

### <a id='1'> 1.feature_engineering</a>

In [6]:
# feature engineering
# Keyword arguments for feature_engineering_pandas (unpacked inside
# feature_engineering_wrapper). NOTE(review): emb_dim/window/min_count/hs/iter_/
# workers/sg look like word2vec hyper-parameters - confirm against the callee.
params = {
    'train_preprocessed_data_filename':'raw_train_round_one_df.feather', 
    'test_preprocessed_data_filename':'raw_test_df.feather', 
    'train_fe_save_filename': 'neural_train_fe_df.feather',
    'test_fe_save_filename': 'neural_test_fe_df.feather',
    'emb_method':'w2v',
    'max_df':0.9,  # param for tf_idf
    'min_df':3,  # param for tf_idf
    'emb_dim':128,  
    'window':150,  
    'sparse_feat': 'time',  # other values noted by the author: advertise_id (presumably advertiser_id), product_category, product_id
    'min_count':1, 
#     'sample':6e-5, 
#     'negative':0,  
    'hs':0, 
#     'alpha':0.03,
#     'min_alpha':0.0007,
    'iter_':10,
    'workers':20,
    'sg':1,
    'num_processes': 40,
    'is_train':True,
    'is_neural_network':True
}

# Run the pipeline through the reminder-decorated wrapper; it returns the
# (train, test) pair produced by feature_engineering_pandas.
train_fe_df, test_fe_df = feature_engineering_wrapper(params)

2020-05-31 10:00:43,807 - mlpipeline.feature_engineering.feature_engineering - INFO - feature_engineering_pandas开始
2020-05-31 10:00:43,811 - mlpipeline.feature_engineering.feature_engineering - INFO - is_train: True, is_neural_network: True
2020-05-31 10:00:43,812 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data开始
2020-05-31 10:00:44,670 - mlpipeline.feature_engineering.feature_engineering - INFO - _load_preprocessed_data已完成，共用时0:00:01
2020-05-31 10:00:54,032 - mlpipeline.feature_engineering.feature_engineering - INFO - _generate_emb_for_sparse_feat开始
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sparse_feat_seq_df[sparse_feat] = sparse_feat_seq_df[sparse_feat].astype(str)
2020-05-31 10:07:39,986 - mlpipeline.feature_engineering.featur

In [22]:
# Load the combined creative_id / advertiser_id / ad_id / product_category feature
# frames (test first, then train) from conf.DATA_DIR.
creative_advertiser_ad_id_product_cate_test_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_id_product_cate_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather'))
creative_advertiser_ad_id_product_cate_train_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_id_product_cate_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather'))

In [23]:
# Load the industry feature frames (train and test) from conf.DATA_DIR.
industry_train_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'industry_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather'))
industry_test_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'industry_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather'))

In [24]:
# Load the product_id feature frames (train and test) from conf.DATA_DIR.
product_id_train_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'product_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather'))
product_id_test_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'product_id_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather'))

In [25]:
# Left-join the 'industry' column, then the 'product_id' column, onto the train
# feature frame (keyed on user_id) using the merge_dfs helper defined above.
creative_advertiser_ad_id_product_cate_industry_train_fe_df = merge_dfs(
    creative_advertiser_ad_id_product_cate_train_fe_df,
    [industry_train_fe_df],
    ['industry'],
)
creative_advertiser_ad_product_id_product_cate_industry_train_fe_df = merge_dfs(
    creative_advertiser_ad_id_product_cate_industry_train_fe_df,
    [product_id_train_fe_df],
    ['product_id'],
)

In [26]:
# Left-join the 'industry' column, then the 'product_id' column, onto the test
# feature frame (keyed on user_id) using the merge_dfs helper defined above.
creative_advertiser_ad_id_product_cate_industry_test_fe_df = merge_dfs(
    creative_advertiser_ad_id_product_cate_test_fe_df,
    [industry_test_fe_df],
    ['industry'],
)
creative_advertiser_ad_product_id_product_cate_industry_test_fe_df = merge_dfs(
    creative_advertiser_ad_id_product_cate_industry_test_fe_df,
    [product_id_test_fe_df],
    ['product_id'],
)

In [27]:
creative_advertiser_ad_product_id_product_cate_industry_train_fe_df.head()

Unnamed: 0,user_id,creative_id,age,gender,y,advertiser_id,ad_id,product_category,industry,product_id
0,1,877468 821396 209778 1683713 122032 71691 1940...,4,1,3,29455 7293 9702 14668 11411 14681 17189 367 44...,773445 724607 188507 1458878 109959 66210 1678...,5 5 2 5 2 18 5 5 18 2 2 2 2,106 326 6 326 0 326 73 217 64 245 238 245 6,0 0 136 0 1334 0 0 0 0 64 1454 64 1261
1,2,63441 155822 39714 609050 13069 1266180 441462...,10,1,9,22885 10686 18562 25932 768 34505 22885 26006 ...,58788 139702 38066 541125 14495 1107111 392680...,2 2 2 2 2 18 2 18 2 18 2 18 2 18 18 18 2 2 2 2...,318 238 6 6 317 47 318 47 6 47 242 6 6 47 47 4...,87 80 129 129 1400 0 87 0 1261 0 111 0 129 0 0...
2,3,661347 808612 710859 825434 593522 726940 3920...,7,2,16,32974 9877 18492 14186 17018 9058 8371 2336 39...,586668 713448 629278 728308 527601 643108 3502...,17 17 2 17 2 18 18 2 2 2 18 12 2 2 18 18 8 4 2...,0 0 322 0 322 6 54 6 322 322 205 302 322 322 2...,36256 40905 1674 35985 1674 0 0 1031 1786 2258...
3,4,39588 589886 574787 1892854 31070 1962706 1230...,5,1,4,19451 7976 13084 12130 13299 23664 10172 811 1...,37966 524312 511235 1638619 30773 1698206 1076...,2 18 2 17 17 17 17 4 18 2 2 2 2 2 2 2 5 2 2 2 ...,238 25 248 0 0 0 0 0 88 319 238 6 319 238 319 ...,1862 0 2625 38743 39422 41265 39904 37758 0 15...
4,5,296145 350759 24333 43235 852327 1054434 12964...,4,1,3,11882 992 22885 9706 38760 2862 17745 31552 22...,265971 314795 24966 41148 751113 925792 113378...,5 8 2 2 18 2 5 18 18 2 18 2 5 5 2 18 2 18 18 2...,297 0 318 6 322 6 288 322 319 238 322 322 203 ...,0 0 87 136 0 136 0 0 0 1064 0 2620 0 0 136 0 2...


In [28]:
creative_advertiser_ad_product_id_product_cate_industry_train_fe_df.to_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_product_id_product_cate_industry_train_fe_df.feather'))

In [29]:
creative_advertiser_ad_product_id_product_cate_industry_test_fe_df.to_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_product_id_product_cate_industry_test_fe_df.feather'))

In [16]:
creative_advertiser_ad_id_product_cate_train_fe_df.head()

Unnamed: 0,user_id,creative_id,age,gender,y,advertiser_id,ad_id,product_category
0,1,877468 821396 209778 1683713 122032 71691 1940...,4,1,3,29455 7293 9702 14668 11411 14681 17189 367 44...,773445 724607 188507 1458878 109959 66210 1678...,5 5 2 5 2 18 5 5 18 2 2 2 2
1,2,63441 155822 39714 609050 13069 1266180 441462...,10,1,9,22885 10686 18562 25932 768 34505 22885 26006 ...,58788 139702 38066 541125 14495 1107111 392680...,2 2 2 2 2 18 2 18 2 18 2 18 2 18 18 18 2 2 2 2...
2,3,661347 808612 710859 825434 593522 726940 3920...,7,2,16,32974 9877 18492 14186 17018 9058 8371 2336 39...,586668 713448 629278 728308 527601 643108 3502...,17 17 2 17 2 18 18 2 2 2 18 12 2 2 18 18 8 4 2...
3,4,39588 589886 574787 1892854 31070 1962706 1230...,5,1,4,19451 7976 13084 12130 13299 23664 10172 811 1...,37966 524312 511235 1638619 30773 1698206 1076...,2 18 2 17 17 17 17 4 18 2 2 2 2 2 2 2 5 2 2 2 ...
4,5,296145 350759 24333 43235 852327 1054434 12964...,4,1,3,11882 992 22885 9706 38760 2862 17745 31552 22...,265971 314795 24966 41148 751113 925792 113378...,5 8 2 2 18 2 5 18 18 2 18 2 5 5 2 18 2 18 18 2...


In [30]:
creative_advertiser_ad_product_id_product_cate_industry_test_fe_df.head()

Unnamed: 0,user_id,creative_id,advertiser_id,ad_id,product_category,industry,product_id
0,3000001,351878 665090 103064 593698 1508864 1711578 22...,7579 7000 13084 9950 39179 32735 17963 39179 3...,315858 589862 93662 527764 1312021 1482336 194...,18 2 2 3 18 3 3 18 18 5 18,322 247 248 247 322 317 322 322 322 288 322,0 1701 1794 8938 0 27031 8593 0 0 32368 0
1,3000002,152519 151984 12838 176984 72773 64667 81234 7...,12993 27800 10690 24661 10334 10334 13295 2288...,136730 136259 14271 158762 67139 59874 74563 6...,2 18 2 2 2 2 2 2 2 2 8 2 5 8 2 2 2 2 2 2 2 2 1...,322 24 317 5 5 5 317 318 317 5 328 5 296 277 3...,1674 0 140 1687 1377 1377 143 87 140 1377 1118...
2,3000003,161840 73137 115761 36634 367084 150407 41212 ...,22367 22367 25260 22338 40491 18479 14681 2469...,145199 67477 104623 35431 331464 134931 39373 ...,2 2 18 2 5 18 18 2 2 5 18 18 2 18 18 2 18 2 2 ...,319 319 47 238 202 25 326 238 6 146 296 28 317...,82 82 0 2065 34504 0 0 1447 129 0 0 0 63 0 0 6...
3,3000004,108656 849706 678427 9870 157180 94025 907546 ...,8520 28323 2286 818 2302 8520 8371 2302 38471 ...,98506 748945 601456 11324 140916 85594 799834 ...,2 18 18 2 2 2 18 2 2 18 18 2 2 5 18 2 18 2 18 2,319 40 54 319 5 319 54 5 246 47 47 319 319 291...,1036 0 0 1323 1268 1036 0 1268 2400 0 0 1810 1...
4,3000005,123860 183003 40625 26793 71219 259607 167448 ...,30710 29243 23746 22388 6763 21001 26544 16228...,111593 164114 38229 27193 65782 233585 150078 ...,2 2 2 18 18 2 2 2 2 18 18 2 2 2 2 18 18 18 2 2...,238 26 6 25 28 6 319 6 6 74 25 247 247 318 319...,1896 1987 129 0 0 1538 23 1567 1567 0 0 1944 1...


In [18]:
# Left-join the 'product_category' column onto the creative/advertiser/ad_id frames (keyed on user_id).
# NOTE(review): creative_advertiser_ad_id_{train,test}_fe_df and product_category_{train,test}_fe_df
# are not assigned in any cell visible in this notebook, and this cell's execution count is lower
# than that of the cells placed above it - it presumably relies on leftover kernel state and would
# fail on Restart & Run All; confirm and add the missing load cells.
creative_advertiser_ad_id_product_cate_train_fe_df = creative_advertiser_ad_id_train_fe_df.merge(product_category_train_fe_df[['user_id','product_category']],how='left',on='user_id')
creative_advertiser_ad_id_product_cate_test_fe_df = creative_advertiser_ad_id_test_fe_df.merge(product_category_test_fe_df[['user_id','product_category']],how='left',on='user_id')

In [20]:
creative_advertiser_ad_id_product_cate_train_fe_df.to_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_id_product_cate_window_150_dim_128_sg_1_hs_0_iter_10_neural_train_fe_df.feather')) 
creative_advertiser_ad_id_product_cate_test_fe_df.to_feather(os.path.join(conf.DATA_DIR, 'creative_advertiser_ad_id_product_cate_window_150_dim_128_sg_1_hs_0_iter_10_neural_test_fe_df.feather')) 

In [None]:
# eval 
# lgb_model_params = {
#                'objective': 'multiclass',  # multiclass, binary 
#                'boosting': 'gbdt',
#                'learning_rate': 0.15,
#                'metric': ['multi_logloss'],  # 'binary_logloss', 'multi_logloss'
#                'num_threads': 20,
#                'random_state': 2019,
#                'num_boost_round': 1000,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'num_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'lambda_l1': 1.0,  # (0.2,3)
#                'lambda_l2': 1.0,  # (0.2,3)
# }

# 0.3903862863136468, 0.39146038751369455
# xgb_model_params = {
#                'objective': 'multi:softmax',  # multiclass, binary 
#                'booster': 'gbtree',
#                'eta': 0.15,
#                'eval_metric': ['mlogloss'],  # 'binary_logloss', 'multi_logloss'
#                'nthread': 15,
#                'random_state': 2019,
#                'tree_method':'auto',
#                'n_estimators': 2,
#                'device': 'cpu',
#                'num_class':20,  # 2, 20 ,10
#                'max_leaves':32,  # [16,32,64,128]
#                'subsample': 0.9,  # [0.7,0.8,0.9,1]
#                'colsample_bytree': 0.9, # [0.2,0.3,0.4,0.5,0.6]
#                'min_data_in_leaf': 40, # [20,40,60,80,100]
#                'reg_lambda': 1.0,  # (0.2,3)
#                'reg_alpha': 1.0,  # (0.2,3)
# }
lstm_model_params ={
    'model_name':'lstm', 
    'num_classes':20, 
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_window_150_dim_300_sg_hs_w2v.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':3,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'max_seq_len':110,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming'
    'num_layers' : 1,
    'bidirectional':False
}

# textcnn_model_params={
#     'model_name':'textcnn', 
#     'num_classes':20, 
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':20,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'filter_size':[3,5,7],
#     'num_filters':3,
#     'use_pad':True,
#     'pad_size':64,
# }

# transformer_model_params = {
#     'model_name':'transformer',
#     'num_classes':20,
#     'sparse_feat':'creative_id', 
#     'embed':'embedding_creative_id_300.npy',
#     'dropout':0.2,
#     'required_improvement':1000,
#     'num_epochs':5,
#     'batch_size':128,
#     'learning_rate':1e-3,
#     'dim_model':300,
#     'hidden':1024,
#     'last_hidden':512,
#     'num_head':5,
#     'init_method':'kaiming',
#     'num_encoder':2,
#     'use_pad':True,
#     'seed':1,
#     'pad_size':64,
# }
# Hyper-parameters for the 'bilstm_attention' model; this dict is the one passed
# as params['model_params'] to train_wrapper at the end of this cell.
# NOTE(review): num_classes is 10 here but 20 in the train/predict cells further
# down - presumably because this run targets a different label; confirm.
bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':10,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_window_150_dim_300_sg_hs_w2v.npy',
    'vocab_path':'creative_id_window_150_dim_300_sg_hs_w2v_vocab.pkl',
    'dropout':0.3,
    'required_improvement':1000,
    'num_epochs':20,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'max_seq_len':110,
    'seed':1234,
    'init_method':'xavier' , # options noted by the author: 'kaiming', 'xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}

lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':3,
    'batch_size':1,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming', 'xavier'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}

# Evaluation run: keyword arguments for train(), unpacked inside train_wrapper.
# NOTE(review): the target label is not passed here; the recorded log for this cell
# shows "use_label: age" while the training cell's log shows "use_label: y", so the
# label presumably comes from a default inside train() that changed between runs -
# consider passing it explicitly.
params = {
    'fe_filename':'neural_train_fe_df.feather', 
    'is_eval':True, 
    'model_type': 'neural',
    'model_name': 'bilstm_attention',
    'model_params': bilstm_attention_model_params,
    'use_log': False,
    'use_std': False,
    'use_cv': True,  
    'n_splits':2,
}
# With is_eval=True, train_wrapper discards what train() returns and yields None.
train_wrapper(params)

2020-05-23 22:08:34,450 - mlpipeline.train - INFO - train开始
2020-05-23 22:08:34,452 - mlpipeline.train - INFO - using_fe_df: neural_train_fe_df.feather, use_label: age, is_eval: True, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: True, n_splits: 2
2020-05-23 22:08:35,210 - mlpipeline.train - INFO - _train_pipeline_neural开始
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
2020-05-23 22:08:42,254 - mlpipeline.train - INFO - 模型参数: {'model_name': 'bilstm_attention', 'num_classes': 10, 'sparse_feat': 'creative_id', 'embed': 'embedding_creative_id_window_150_dim_300_sg_hs_w2v.npy', 'vocab_path': 'creative_id_window_150_dim_300_sg_hs_w2v_vocab.pkl', 'dropout': 0.3, 'required_improvement': 

In [6]:
# train 
lstm_model_params ={
    'model_name':'lstm', 
    'num_classes':20, 
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.5,
    'required_improvement':1000,
    'num_epochs':5,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':128,
    'use_pad':True,
    'pad_size':64,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'bidirectional':True
}

bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.3,
    'required_improvement':1000,
    'num_epochs':4,
    'batch_size':256,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'max_seq_len':110,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming','xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}


lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':3,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'pad_size':90,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}
# Training run: keyword arguments for train(), unpacked inside train_wrapper.
params = {
    'fe_filename':'neural_train_fe_df.feather', 
    'is_eval':False, 
    'model_type': 'neural',
    'model_name': 'bilstm_attention',
    'model_params': bilstm_attention_model_params,
    'use_log': False,
    'use_std': False,
    'use_cv': False,  
}
# With is_eval=False and model_type='neural', train_wrapper returns the single
# value produced by train().
model = train_wrapper(params)

2020-05-23 09:26:41,157 - mlpipeline.train - INFO - train开始
2020-05-23 09:26:41,159 - mlpipeline.train - INFO - using_fe_df: neural_train_fe_df.feather, use_label: y, is_eval: False, model_type: neural, model_name: bilstm_attention, use_log: False, use_std: False, use_cv: False, n_splits: 2
2020-05-23 09:26:42,109 - mlpipeline.train - INFO - _train_pipeline_neural开始
2020-05-23 09:26:49,944 - mlpipeline.train - INFO - 模型参数: {'model_name': 'bilstm_attention', 'num_classes': 20, 'sparse_feat': 'creative_id', 'embed': 'embedding_creative_id_300.npy', 'dropout': 0.3, 'required_improvement': 1000, 'num_epochs': 4, 'batch_size': 256, 'learning_rate': 0.001, 'hidden_size': 256, 'use_pad': True, 'max_seq_len': 110, 'seed': 1234, 'init_method': 'xavier', 'num_layers': 1, 'bidirectional': True}
2020-05-23 09:26:50,000 - utils.utils - INFO - build_dataset开始
2020-05-23 09:26:52,080 - utils.utils - INFO - ../data/creative_id_vocab.pkl has been loaded
2020-05-23 09:27:56,043 - utils.utils - INFO - bu

2020-05-23 09:33:09,836 - utils.utils - INFO - Iter:   5600,  Train Loss:   1.4,  Train Acc: 41.02%,Time: 0:05:09
2020-05-23 09:33:15,291 - utils.utils - INFO - Iter:   5700,  Train Loss:   1.6,  Train Acc: 42.19%,Time: 0:05:14
2020-05-23 09:33:20,755 - utils.utils - INFO - Iter:   5800,  Train Loss:   1.4,  Train Acc: 45.70%,Time: 0:05:19
2020-05-23 09:33:26,178 - utils.utils - INFO - Iter:   5900,  Train Loss:   1.6,  Train Acc: 39.45%,Time: 0:05:25
2020-05-23 09:33:31,638 - utils.utils - INFO - Iter:   6000,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:05:30
2020-05-23 09:33:37,108 - utils.utils - INFO - Iter:   6100,  Train Loss:   1.5,  Train Acc: 42.19%,Time: 0:05:36
2020-05-23 09:33:42,566 - utils.utils - INFO - Iter:   6200,  Train Loss:   1.4,  Train Acc: 41.02%,Time: 0:05:41
2020-05-23 09:33:48,130 - utils.utils - INFO - Iter:   6300,  Train Loss:   1.5,  Train Acc: 44.14%,Time: 0:05:47
2020-05-23 09:33:53,954 - utils.utils - INFO - Iter:   6400,  Train Loss:   1.5,  Train 

2020-05-23 09:39:47,203 - utils.utils - INFO - Iter:  12700,  Train Loss:   1.5,  Train Acc: 39.84%,Time: 0:11:46
2020-05-23 09:39:52,719 - utils.utils - INFO - Iter:  12800,  Train Loss:   1.5,  Train Acc: 44.14%,Time: 0:11:51
2020-05-23 09:39:58,179 - utils.utils - INFO - Iter:  12900,  Train Loss:   1.3,  Train Acc: 46.88%,Time: 0:11:57
2020-05-23 09:40:03,638 - utils.utils - INFO - Iter:  13000,  Train Loss:   1.5,  Train Acc: 44.14%,Time: 0:12:02
2020-05-23 09:40:09,146 - utils.utils - INFO - Iter:  13100,  Train Loss:   1.4,  Train Acc: 45.70%,Time: 0:12:08
2020-05-23 09:40:14,610 - utils.utils - INFO - Iter:  13200,  Train Loss:   1.4,  Train Acc: 46.88%,Time: 0:12:13
2020-05-23 09:40:20,073 - utils.utils - INFO - Iter:  13300,  Train Loss:   1.4,  Train Acc: 44.53%,Time: 0:12:19
2020-05-23 09:40:25,497 - utils.utils - INFO - Iter:  13400,  Train Loss:   1.5,  Train Acc: 48.83%,Time: 0:12:24
2020-05-23 09:40:30,951 - utils.utils - INFO - Iter:  13500,  Train Loss:   1.5,  Train 

In [6]:
# predict 
lstm_attention_model_params ={
    'model_name':'lstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.2,
    'required_improvement':1000,
    'num_epochs':20,
    'batch_size':128,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'pad_size':90,
    'seed':1234,
    'init_method':'kaiming' , # 'kaiming'
    'num_layers' : 1,
    'max_seq_len':90,
    'bidirectional':False
}


bilstm_attention_model_params = {
    'model_name':'bilstm_attention',
    'num_classes':20,
    'sparse_feat':'creative_id', 
    'embed':'embedding_creative_id_300.npy',
    'dropout':0.3,
    'required_improvement':1000,
    'num_epochs':4,
    'batch_size':256,
    'learning_rate':1e-3,
    'hidden_size':256,
    'use_pad':True,
    'max_seq_len':110,
    'seed':1234,
    'init_method':'xavier' , # 'kaiming','xavier'
    'num_layers' : 1,
#     'attention_size':256,
    'bidirectional':True
}

# Inference run: keyword arguments for predict(); model_params is the
# bilstm_attention_model_params dict defined earlier in this cell.
params = {
          'test_fe_filename':'neural_test_fe_df.feather',
          'use_log':False,
          'use_std': False,
          'model_type': 'neural',
          'model_name':'bilstm_attention',
          'model_params': bilstm_attention_model_params
            }

# predict() is called directly (no reminder wrapper); its result is displayed with .head() in the next cell.
submission_df = predict(**params)

2020-05-23 09:43:41,710 - mlpipeline.predict - INFO - predict开始
2020-05-23 09:43:41,712 - mlpipeline.predict - INFO - test_fe_filename: neural_test_fe_df.feather, use_log: False, use_std: False, model_type: neural, model_name: bilstm_attention
2020-05-23 09:43:42,692 - mlpipeline.predict - INFO - inference_pipeline_neural开始
2020-05-23 09:43:49,264 - utils.utils - INFO - build_dataset开始
2020-05-23 09:43:51,056 - utils.utils - INFO - ../data/creative_id_vocab.pkl has been loaded
2020-05-23 09:45:01,610 - utils.utils - INFO - build_dataset已完成，共用时0:01:12
2020-05-23 09:45:01,611 - mlpipeline.predict - INFO - Loading data...
2020-05-23 09:45:01,612 - utils.utils - INFO - build_iterater开始
2020-05-23 09:45:01,613 - utils.utils - INFO - build_iterater已完成，共用时0:00:00
2020-05-23 09:45:01,614 - mlpipeline.predict - INFO - Time usage:0:00:00
  "num_layers={}".format(dropout, num_layers))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [7]:
submission_df.head()

Unnamed: 0,user_id,predicted_gender,predicted_age
0,3000001,1,3
1,3000002,2,7
2,3000003,2,2
3,3000004,1,3
4,3000005,1,4


In [None]:
# combine age and gender 
submission_age_df = pd.read_csv(os.path.join(conf.SUBMISSION_DIR,'submission_age_2020-05-16T14:43:48.536108.csv')) 

In [None]:
submission_y_df = pd.read_csv(os.path.join(conf.SUBMISSION_DIR,'submission_y_2020-05-16T11:27:59.072741.csv')) 

In [None]:
submission_age_df.head()

In [None]:
submission_y_df.head()

In [None]:
# Attach the age predictions by matching on user_id instead of relying on both
# CSVs having identical row order (plain column assignment aligns on the row index).
# Users missing from submission_age_df get NaN. Assumes user_id is unique in
# submission_age_df - TODO confirm.
submission_y_df['predicted_age'] = submission_y_df['user_id'].map(
    submission_age_df.set_index('user_id')['predicted_age']
)

In [None]:
submission_y_df.head()

In [None]:
submission_save_path = os.path.join(conf.SUBMISSION_DIR,'submission_y_%s.csv'%(datetime.now().isoformat()))

In [None]:
# submission_save_path already starts with conf.SUBMISSION_DIR, so write to it
# directly; joining the directory a second time yields a wrong path whenever
# SUBMISSION_DIR is a relative path.
submission_y_df.to_csv(submission_save_path, index=False)