# Neural Network
build neural network
- <a href='#1'>1. data preprocess</a> 
- <a href='#2'>2. lstm</a> 
- <a href='#3'>3. task3</a>

In [1]:
# ! pip install tensorboardX -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
%load_ext autoreload
%autoreload 2
import sys
import os 
import time

import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import torch 
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tensorboardX import SummaryWriter
from gensim.models import Word2Vec, KeyedVectors
from tqdm import tqdm
import pickle as pkl

sys.path.append('../')
import conf
from utils import (
timer,
get_time_diff,
check_columns,
LogManager,
build_iterater,
build_dataset
)
from models import (
lstm
)

In [2]:
# global settings
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns',1000)
pd.set_option('display.width',100)
sns.set(rc={'figure.figsize':(11,4)})

In [3]:
# global variables
DEFAULT_MISSING_VALUE = 0
DEFAULT_EMBEDDING_SIZE = 100
DEFAULT_INITIAL_VALUE = 0
UNK, PAD = '<UNK>', '<PAD>'  
MAX_VOCAB_SIZE = 10000000
use_label_cols = ['y']
FONT = fm.FontProperties(fname = os.path.join(conf.LIB_DIR,'simsun.ttc'))
LogManager.created_filename = os.path.join(conf.LOG_DIR, 'feature_engineering.log')
logger = LogManager.get_logger(__name__)

In [8]:
# functions
def __dummy():
    pass

### <a id='1'> 1.data preprocess</a>

In [11]:
train_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'neural_train_fe_df.feather'))

In [12]:
test_fe_df = pd.read_feather(os.path.join(conf.DATA_DIR, 'neural_test_fe_df.feather'))

In [13]:
train_fe_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry,age,gender,y
0,1,358946,216469,1,194447,8826.0,3,30464,,6,1,16
1,1,652708,53492,1,49831,26858.0,3,29963,60.0,4,1,14
2,1,866186,220806,1,198186,,12,18103,6.0,5,1,15
3,1,866186,63440,1,58787,87.0,2,22885,318.0,5,1,15
4,1,672438,20857,1,21792,,18,22918,319.0,5,1,15


In [14]:
test_fe_df.head()

Unnamed: 0,time,user_id,creative_id,click_times,ad_id,product_id,product_category,advertiser_id,industry
0,1,3892830,112276,1,101587,,18,8371,54.0
1,1,3530749,14391,1,15775,1261.0,2,10988,6.0
2,1,3711528,208390,1,187259,1261.0,2,10925,6.0
3,1,3711528,236102,1,212289,1261.0,2,19056,98.0
4,1,3998628,90699,1,82895,,18,10955,238.0


In [20]:
index_cols, cate_cols, cont_cols, label_cols = check_columns(train_fe_df.dtypes.to_dict())

### <a id='2'> 2.lstm</a>

In [5]:
# train lstm
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样
config = lstm.Config(
    model_name='lstm', 
    num_classes=20, 
    sparse_feat='creative_id', 
    embed = 'embedding_creative_id_300.npy',
    dropout = 0.2,
    required_improvement = 1000,
    num_epochs = 2,
    batch_size = 128,
    learning_rate = 1e-3,
    hidden_size = 128
)

X_train = pd.read_feather(config.train_path)
vocab_dict, train_data = build_dataset(config,['y'],X_train)
config.n_vocab = len(vocab_dict)

<torch._C.Generator at 0x7fac09ad1450>

2020-05-19 16:48:18,601 - utils.utils - INFO - ../data/creative_id_vocab.pkl has been loaded


In [6]:
X_train.head()

Unnamed: 0,user_id,creative_id,age,gender,y,len_creative_id,creative_id_to_idx
0,1,"[877468, 821396, 209778, 1683713, 122032, 7169...",4,1,3,13,"[83214, 388181, 30714, 55758, 1136, 57, 16949,..."
1,2,"[63441, 155822, 39714, 609050, 13069, 1266180,...",10,1,9,45,"[88, 244165, 18, 198, 54, 2940, 128, 575, 9150..."
2,3,"[661347, 808612, 710859, 825434, 593522, 72694...",7,2,16,30,"[8220, 806960, 4592, 26226, 4090, 1982, 23661,..."
3,4,"[39588, 589886, 574787, 1892854, 31070, 196270...",5,1,4,29,"[82943, 8810, 31918, 7163, 5912, 462469, 1176,..."
4,5,"[296145, 350759, 24333, 43235, 852327, 1054434...",4,1,3,33,"[292289, 664755, 12, 37484, 1203, 8240, 82674,..."


In [7]:
train_data[:50]

[([83214,
   388181,
   30714,
   55758,
   1136,
   57,
   16949,
   2454,
   3188,
   1809,
   17863,
   1809,
   124198],
  3,
  13),
 ([88,
   244165,
   18,
   198,
   54,
   2940,
   128,
   575,
   9150,
   321,
   1821,
   2224,
   934,
   39,
   39,
   150,
   14,
   1494,
   1671,
   14,
   575,
   734243,
   823,
   79,
   43166,
   1129225,
   949822,
   2612,
   1423351,
   347,
   567901,
   530,
   1099310,
   3226,
   6629,
   953,
   61980,
   369744,
   22632,
   11981,
   338,
   877,
   800,
   369399,
   211058],
  9,
  45),
 ([8220,
   806960,
   4592,
   26226,
   4090,
   1982,
   23661,
   19612,
   1142,
   1311519,
   86609,
   248763,
   12059,
   1489751,
   127741,
   101147,
   157230,
   49302,
   1803034,
   2753459,
   137230,
   28,
   25284,
   13352,
   552,
   5317,
   38386,
   117,
   28347,
   269],
  16,
  30),
 ([82943,
   8810,
   31918,
   7163,
   5912,
   462469,
   1176,
   462450,
   727301,
   977,
   1647299,
   9181,
   5451,
   4427,