In [1]:
from functools import wraps
import collections
import os
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Binarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def lazy_property(func):
    """Turn a zero-argument method into a read-only property computed once.

    The first access calls ``func(self)`` and stores the result on the
    instance under ``'_lazy_' + func.__name__``; every later access returns
    the stored value without calling ``func`` again.
    """
    cache_name = '_lazy_' + func.__name__
    missing = object()  # sentinel: distinguishes "not cached yet" from a cached None

    @wraps(func)
    def cached_getter(self):
        if getattr(self, cache_name, missing) is missing:
            setattr(self, cache_name, func(self))
        return getattr(self, cache_name)

    return property(cached_getter)

class ParamModel1(object):
    """Plain container for the hyper-parameters and paths of model 1.

    All settings are stored as ordinary instance attributes, and
    ``str()`` / ``repr()`` show them as the instance ``__dict__``.
    """

    def __init__(self):
        settings = (
            ('name', 'model1'),
            ('lr', 0.05),
            ('keep_prob', 0.8),
            ('epochs', 2),
            ('batch_size', 5000),
            ('val_size', 2000),
            ('vocabulary_size', 10000),
            ('max_seq_len', 200),
            ('hidden_units', 16),
            ('seq_embed_dim', 60),
        )
        for key, value in settings:
            setattr(self, key, value)
        # The log directory is derived from the model name set above.
        self.log_path = '/Users/zhouzhirui/Desktop/log/%s/' % self.name
        self.save_path = '/Users/zhouzhirui/Desktop/model_save'

    def __setattr__(self, attr, value):
        # Write straight into the instance dictionary.
        vars(self)[attr] = value

    def __str__(self):
        return str(vars(self))

    __repr__ = __str__

# Configuration instance shared by the cells below (model sizes, batch/validation sizes, paths).
p1 = ParamModel1()

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# Candidate data directories; each ends with a path separator because it is used as a prefix.
# kaggle_path is not referenced by the visible cells; mac_path is an absolute, machine-specific path.
kaggle_path = '../input/'
mac_path = '/Users/zhouzhirui/data/Mercari_Price_Forcast/'

def handle_missing(dataset):
    """Fill missing text fields with the placeholder string ``'missing'``.

    The ``category``, ``brand`` and ``description`` columns have their NaN
    values replaced by ``'missing'``; descriptions equal to
    ``'No description yet'`` are mapped to ``'missing'`` as well.

    Args:
        dataset: DataFrame with ``category``, ``brand`` and ``description``
            columns. It is modified in place.

    Returns:
        The same ``dataset`` object, for chaining.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # an attribute-accessed column: that chained in-place form does not update
    # the frame when pandas copy-on-write is active.
    for col in ('category', 'brand', 'description'):
        dataset[col] = dataset[col].fillna('missing')
    dataset.loc[dataset['description'] == 'No description yet', 'description'] = 'missing'
    return dataset

def upper2lower(dataset):
    """Lower-case every object-dtype column of ``dataset``.

    Args:
        dataset: DataFrame to normalise. It is modified in place.

    Returns:
        The same ``dataset`` object, for chaining.
    """
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (column, dtype) pairs and exists in older versions too.
    for col, dtype in dataset.dtypes.items():
        if dtype == 'object':
            dataset[col] = dataset[col].str.lower()
    return dataset

def load_data(path, max_rows=100000):
    """Read train.tsv / test.tsv from ``path`` and return cleaned frames.

    Both files are concatenated (train rows first), the columns are renamed
    to short names, missing text is filled, text columns are lower-cased and
    the result is split back into train and test using ``train_id``.

    Args:
        path: directory containing ``train.tsv`` and ``test.tsv``; a trailing
            separator is optional.
        max_rows: debug limit; only the first ``max_rows`` rows of the
            concatenated frame are kept. Because train rows come first, a
            limit smaller than the number of train rows leaves the returned
            test frame empty. Pass ``None`` to keep everything.

    Returns:
        ``(train, test)``: rows with a non-null / null ``train_id``, each
        with a fresh 0-based index.
    """
    print('load data ..')
    train = pd.read_table(os.path.join(path, 'train.tsv'))
    test = pd.read_table(os.path.join(path, 'test.tsv'))
    merge = pd.concat([train, test], axis=0).reset_index(drop=True)
    merge = merge.rename(columns={
        'brand_name': 'brand',
        'category_name': 'category',
        'item_condition_id': 'condition',
        'item_description': 'description'
    })
    if max_rows is not None:
        print('测试，只取%d条数据' % max_rows)
        # Copy the slice so the in-place cleaning below works on an
        # independent frame rather than on a view of the full data.
        merge = merge[:max_rows].copy()
    print('handle missing ..')
    merge = handle_missing(merge)
    print('upper2lower ..')
    merge = upper2lower(merge)
    has_train_id = merge['train_id'].notnull()
    train = merge[has_train_id].reset_index(drop=True)
    test = merge[~has_train_id].reset_index(drop=True)
    return train, test
# Load and clean the data from the local data directory, then preview the first train rows.
train, test = load_data(mac_path)
train.head(3)

load data ..
测试，只取100000条数据
handle missing ..
upper2lower ..


Unnamed: 0,brand,category,condition,description,name,price,shipping,test_id,train_id
0,missing,men/tops/t-shirts,3,missing,mlb cincinnati reds t shirt size xl,10.0,1,,0.0
1,razer,electronics/computers & tablets/components & p...,3,this keyboard is in great condition and works ...,razer blackwidow chroma keyboard,52.0,0,,1.0
2,target,women/tops & blouses/blouse,1,adorable top with a hint of lace and a key hol...,ava-viv blouse,10.0,1,,2.0


In [3]:
def text2seq(dtrain, dtest, vocabulary_size):
    """Add token-id sequence columns for the text fields of both frames.

    A single Tokenizer limited to ``vocabulary_size`` words is fitted on the
    category, description and name texts of ``dtrain`` only, and then used to
    create ``seq_name``, ``seq_category`` and ``seq_description`` columns in
    ``dtrain`` and ``dtest`` (both frames are modified in place).

    Returns:
        ``(dtrain, dtest, tokenizer)``.
    """
    tokenizer = Tokenizer(num_words=vocabulary_size)
    fit_corpus = np.hstack([dtrain[source] for source in ('category', 'description', 'name')])
    tokenizer.fit_on_texts(fit_corpus)

    text_columns = ['name', 'category', 'description']
    for frame in (dtrain, dtest):
        for col in text_columns:
            frame['seq_' + col] = tokenizer.texts_to_sequences(frame[col])
    return dtrain, dtest, tokenizer

# Add the seq_* token-id columns to both frames; the fitted tokenizer itself is discarded.
train, test, _ = text2seq(train, test, p1.vocabulary_size)

def label_encoding(dtrain, dtest):
    """Replace category, brand and condition with integer label codes.

    For each column one encoding is fitted on the values of ``dtrain`` and
    ``dtest`` together, so both frames share the same codes. Both frames are
    modified in place and returned as ``(dtrain, dtest)``.
    """
    encoder = LabelEncoder()
    for col in ('category', 'brand', 'condition'):
        combined_values = np.hstack([dtrain[col], dtest[col]])
        encoder.fit(combined_values)
        for frame in (dtrain, dtest):
            frame[col] = encoder.transform(frame[col])
    return dtrain, dtest

# Integer-encode category / brand / condition in both frames and preview the result.
train, test = label_encoding(train, test)
train.head(3)

Unnamed: 0,brand,category,condition,description,name,price,shipping,test_id,train_id,seq_name,seq_category,seq_description
0,1231,571,2,missing,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,"[2367, 8086, 7463, 74, 103, 7, 201]","[77, 43, 74, 75]",[80]
1,1544,79,2,this keyboard is in great condition and works ...,razer blackwidow chroma keyboard,52.0,0,,1.0,"[8184, 2634]","[61, 923, 845, 3092, 1393]","[34, 2634, 11, 8, 50, 17, 1, 259, 66, 19, 1219..."
2,1813,939,0,adorable top with a hint of lace and a key hol...,ava-viv blouse,10.0,1,,2.0,"[7902, 268]","[2, 43, 76, 268]","[736, 72, 10, 5, 5527, 12, 243, 1, 5, 1050, 13..."


In [4]:
def get_train_val_test(dtrain, dtest, val_size, random_state=None):
    """Shuffle ``dtrain`` and hold out its first ``val_size`` rows for validation.

    Args:
        dtrain: training frame; it is not modified, shuffled copies are returned.
        dtest: test frame, returned unchanged.
        val_size: number of shuffled rows moved to the validation frame.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced; ``None`` keeps the unseeded shuffle.

    Returns:
        ``(train, val, test)`` where train and val have fresh 0-based indices.
    """
    shuffled = dtrain.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    dval = shuffled.iloc[:val_size, :].reset_index(drop=True)
    dtrain = shuffled.iloc[val_size:, :].reset_index(drop=True)
    return dtrain, dval, dtest

# Shuffle the train frame and hold out p1.val_size rows as the validation set.
train, val, test = get_train_val_test(train, test, p1.val_size)

def gen_batch_data(dataset, batch_size):
    """Yield consecutive full batches from a freshly shuffled copy of ``dataset``.

    Only ``len(dataset) // batch_size`` batches are produced, so leftover rows
    that do not fill a complete batch are dropped. Each batch keeps the index
    labels of the shuffled, re-indexed frame.
    """
    shuffled = dataset.sample(frac=1.0).reset_index(drop=True)
    n_batches = shuffled.shape[0] // batch_size
    for start in range(0, n_batches * batch_size, batch_size):
        yield shuffled.iloc[start:start + batch_size, :]

def gen_tf_data(subdata, max_len):
    """Build the arrays fed to the model from one batch of rows.

    Args:
        subdata: DataFrame with a ``seq_description`` column of token-id lists
            and a ``price`` column.
        max_len: width of the padded description matrix passed to
            ``pad_sequences``; ``None`` lets it pad to the longest sequence.

    Returns:
        ``(seq_desc, seq_desc_len, price)``: the post-padded id matrix, the
        per-row sequence lengths, and ``log1p`` of the price.
    """
    seq_desc_len = subdata['seq_description'].apply(len).values
    if max_len is not None:
        # pad_sequences truncates longer sequences to max_len, so the reported
        # lengths must not exceed the width of the padded matrix.
        seq_desc_len = np.minimum(seq_desc_len, max_len)
    seq_desc = tf.keras.preprocessing.sequence.pad_sequences(
        subdata['seq_description'], maxlen=max_len, padding='post')
    price = np.log1p(subdata['price']).values
    return seq_desc, seq_desc_len, price

# Build the validation arrays once; the training loop feeds them at every evaluation step.
val_seq_desc, val_seq_desc_len, val_price = gen_tf_data(subdata=val, max_len=p1.max_seq_len)

## 实验1

In [5]:
class Inputs1(object):
    """Placeholders fed to the model, grouped under three name scopes.

    Attributes (all ``tf.placeholder`` tensors):
        seq_desc: int32, shape ``[None, param.max_seq_len]``, scope 'feature'.
        seq_desc_len: int32, shape ``[None]``, scope 'feature'.
        keep_prob: float32 with unspecified shape, scope 'control'.
        label: float32, shape ``[None]``, named 'price', scope 'label'.
    """

    def __init__(self, param):
        padded_width = param.max_seq_len

        with tf.name_scope('feature'):
            self.seq_desc = tf.placeholder(tf.int32, [None, padded_width], name='seq_desc')
            self.seq_desc_len = tf.placeholder(tf.int32, [None], name='seq_desc_len')

        with tf.name_scope('control'):
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        with tf.name_scope('label'):
            self.label = tf.placeholder(tf.float32, [None], name='price')

            

class Model1(object):
    """LSTM regressor over the padded description token ids.

    Graph: embedding lookup -> BasicLSTMCell run by ``dynamic_rnn`` -> output
    of the last time step -> dense layer with one unit -> predicted value.
    The loss is the mean squared difference to ``inputs.label`` and the train
    op minimises it with Adam.

    Args:
        mode: 'train', 'val' or 'predict'; selects the batch size used to
            build the LSTM zero state.
        param: configuration object providing vocabulary_size, seq_embed_dim,
            hidden_units, batch_size, val_size and lr.
        inputs: object exposing the ``seq_desc`` and ``label`` placeholders.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """

    def __init__(self, mode, param, inputs):
        self.mode = mode
        self.param = param
        self.inputs = inputs
        # Touch every lazy property once so the complete graph is built here.
        self.predict
        self.loss
        self.train

    def _add_embed_layer(self, name, inputs, inputs_dim, outputs_dim):
        """Look up ``inputs`` in an ``[inputs_dim, outputs_dim]`` embedding matrix.

        The matrix is obtained with ``tf.get_variable`` inside variable scope
        ``name`` and initialised from a random normal distribution.
        """
        with tf.name_scope(name+'_layer'):
            with tf.variable_scope(name):
                initializer = tf.initializers.random_normal()
                matrix = tf.get_variable(name=name+'_matrix', shape=[inputs_dim, outputs_dim], initializer=initializer)
                embed = tf.nn.embedding_lookup(matrix, inputs)
        return embed

    def _add_dense_layer(self, name, inputs, hidden_units, act=None, keep_prob=1.):
        """Fully connected layer: ``dropout(act(inputs @ W + b), keep_prob)``.

        Args:
            name: prefix of the weight/bias variable names and name scope.
            inputs: 2-D tensor; its second dimension sets the number of rows of W.
            hidden_units: number of output units.
            act: optional activation applied to the affine output.
            keep_prob: keep probability passed to ``tf.nn.dropout``.

        Returns:
            The layer output tensor.
        """
        with tf.name_scope(name):
            w = tf.get_variable(
                name+'_W',
                shape=[inputs.get_shape()[1], hidden_units],
                initializer=tf.initializers.random_normal(dtype=tf.float32, mean=0., stddev=1.)
            )
            b = tf.get_variable(
                name+'_b',
                shape=[1, hidden_units],
                initializer=tf.initializers.zeros()
            )
            wx_plus_b = tf.matmul(inputs, w) + b
            if act:
                # Activate the affine output; the earlier code passed only the
                # bias ``b`` to ``act``, discarding ``inputs @ W``.
                wx_plus_b = act(wx_plus_b)
            wx_plus_b = tf.nn.dropout(wx_plus_b, keep_prob)
            tf.summary.histogram('W', w)
            tf.summary.histogram('b', b)
            return wx_plus_b

    @lazy_property
    def predict(self):
        """Predicted value per example, a tensor reshaped to rank 1."""
        desc_seq_embed = self._add_embed_layer(
            name='seq_embed',
            inputs=self.inputs.seq_desc,
            inputs_dim=self.param.vocabulary_size+1,   # vocabulary size (number of embedding rows)
            outputs_dim=self.param.seq_embed_dim     # embedding dimension
        )
        self._cell = tf.nn.rnn_cell.BasicLSTMCell(self.param.hidden_units)
        # The zero state is built for a fixed batch size chosen by the mode.
        if self.mode == 'train':
            batch_size = self.param.batch_size
        elif self.mode == 'val':
            batch_size = self.param.val_size
        elif self.mode == 'predict':
            batch_size = 1
        else:
            raise ValueError("wrone model mode")
        self._init_state = self._cell.zero_state(batch_size, dtype=tf.float32)

        # sequence_length is intentionally not passed (it was disabled in the
        # original notebook), so the RNN runs over every time step.
        # NOTE(review): the last time step is used as the sequence summary; if
        # the fed sequences are padded at the end, that step is computed on
        # padding for short sequences -- confirm this is intended.
        outputs, state = tf.nn.dynamic_rnn(
            self._cell,
            desc_seq_embed,
            initial_state=self._init_state,
        )
        outputs = tf.reshape(outputs[:,-1,:], [-1, self.param.hidden_units])
        price = self._add_dense_layer('output', outputs, 1)
        price = tf.reshape(price, [-1])
        return price

    @lazy_property
    def loss(self):
        """Mean squared difference between ``predict`` and ``inputs.label``."""
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(tf.square(tf.subtract(self.predict, self.inputs.label)), axis=-1)
        return loss

    @lazy_property
    def train(self):
        """Adam update op minimising ``loss`` with learning rate ``param.lr``."""
        with tf.variable_scope('train_op'):
            optimizer = tf.train.AdamOptimizer(self.param.lr)
            train_op = optimizer.minimize(self.loss)
        return train_op

    def reset_size(self, n):
        """Replace ``self._init_state`` with a zero state for batch size ``n``.

        Only the attribute is replaced; ``predict`` is cached by
        ``lazy_property``, so the RNN built earlier is not rebuilt here.
        """
        self._init_state = self._cell.zero_state(n, dtype=tf.float32)

In [6]:
# Build everything in a dedicated graph: the placeholders, a training model and a
# validation model that read from the same placeholders.
graph = tf.Graph()
with graph.as_default():
    inputs = Inputs1(p1)
    with tf.name_scope('training'):
        train_model = Model1(mode='train', param=p1, inputs=inputs)
    # Switch the current variable scope to reuse mode so the get_variable calls made
    # while building val_model pick up the variables created for train_model.
    tf.get_variable_scope().reuse_variables()
    with tf.name_scope('val'):
        val_model = Model1(mode='val', param=p1, inputs=inputs)
    # Single op that evaluates every summary registered so far.
    merge = tf.summary.merge_all()
    # Creating the writer with graph=graph stores the graph definition under p1.log_path.
    writer = tf.summary.FileWriter(p1.log_path, graph=graph)
    init = tf.global_variables_initializer()

In [7]:
# Train on shuffled mini-batches and report the validation loss every second step.
# Entering the Session itself (rather than Session.as_default()) makes it the default
# session AND closes it when the block exits, releasing its resources.
with tf.Session(graph=graph) as sess:
    sess.run(init)
    k = 0  # number of training steps taken so far, across epochs
    # The epoch count comes from the config object instead of a hard-coded literal.
    for i in range(p1.epochs):
        for sub in gen_batch_data(dataset=train, batch_size=p1.batch_size):
            t1 = time.time()
            seq_desc, seq_desc_len, price = gen_tf_data(subdata=sub, max_len=p1.max_seq_len)
            # Summaries are not fetched: nothing is written to the FileWriter in this loop.
            _, loss_i = sess.run(
                [train_model.train, train_model.loss],
                feed_dict={inputs.seq_desc: seq_desc,
                           inputs.seq_desc_len: seq_desc_len,
                           inputs.label: price,
                           inputs.keep_prob: p1.keep_prob})

            k += 1
            tcost = time.time() - t1  # wall time of batch preparation plus the train step
            if k % 2 == 0:
                loss = sess.run(
                    val_model.loss,
                    feed_dict={inputs.seq_desc: val_seq_desc,
                               inputs.seq_desc_len: val_seq_desc_len,
                               inputs.label: val_price,
                               inputs.keep_prob: 1.})
                print('epoch %d step %d train loss: %.2f ; val loss: %.2f, step cost time %.1f'%(i+1, k, loss_i, loss, tcost))

epoch 1 step 2 train loss: 3.21 ; val loss: 2.31, step cost time 3.1
epoch 1 step 4 train loss: 0.72 ; val loss: 0.65, step cost time 3.0
epoch 1 step 6 train loss: 0.88 ; val loss: 0.93, step cost time 2.9


KeyboardInterrupt: 

### 问题
1. 给 dynamic_rnn 设置 sequence_length 后反而速度变慢？而且收敛也变慢了
2. cell的zero state问题 ——已经解决

In [19]:
# Scratch check: a 1x3 nested list of integers used to probe tf.reduce_mean.
x = [[1,2,3]]

In [20]:
# Mean over all elements of x (no axis given). The op is created in the default graph,
# not in `graph`. x holds integers; the evaluated result shown further down is 2.
x_mean = tf.reduce_mean(x)

In [21]:
# Evaluate x_mean in a session that is closed as soon as the value has been fetched,
# instead of leaving an open session bound to a module-level name.
with tf.Session() as sess:
    x_mean_value = sess.run(x_mean)
x_mean_value

2