In [1]:
from functools import wraps
import collections
import os
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Binarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def lazy_property(func):
    """Expose ``func`` as a read-only property that is evaluated at most once.

    On first access ``func(self)`` is called and its result is stored on the
    instance under the attribute ``'_lazy_' + func.__name__``; every later
    access returns that stored value without calling ``func`` again.
    """
    cache_name = '_lazy_' + func.__name__

    @wraps(func)
    def cached_getter(self):
        if hasattr(self, cache_name):
            return getattr(self, cache_name)
        # First access on this instance: compute and memoise the value.
        setattr(self, cache_name, func(self))
        return getattr(self, cache_name)

    return property(cached_getter)

# Vocabulary-size constant. NOTE(review): presumably intended for the `voc_size`
# argument of get_train_val_test (same value, 50000) -- confirm.
VOC_SIZE = 50000

  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
# Candidate dataset directories; both strings end with '/'.
# Relative input directory (presumably the Kaggle kernel layout -- confirm).
kaggle_path = '../input/'
# Absolute directory on a local machine; not portable to other environments.
mac_path = '/Users/zhouzhirui/data/Mercari_Price_Forcast/'
def get_train_val_test(path, voc_size, val_size, seed=None):
    """Load, clean and encode the train/test TSV files, then split off a validation set.

    Pipeline: read ``train.tsv`` / ``test.tsv`` from ``path``, concatenate them,
    keep the first 100000 rows, fill missing text fields with ``"missing"``,
    lower-case every object column, tokenize the text columns into integer
    sequences, label-encode the categorical columns, and finally shuffle the
    training rows and move the first ``val_size`` of them into a validation frame.

    Parameters
    ----------
    path : str
        Directory that contains ``train.tsv`` and ``test.tsv``.
    voc_size : int
        ``num_words`` handed to the Keras ``Tokenizer``.
    val_size : int
        Number of shuffled training rows placed in the validation frame.
    seed : int or None, optional
        ``random_state`` for the shuffle that precedes the split. ``None``
        (the default) leaves the shuffle unseeded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``.
    """

    def handle_missing(dataset):
        """Replace missing category/brand/description values by the token 'missing'."""
        # Assign the filled column back instead of calling fillna(inplace=True)
        # through attribute access: the in-place form operates on a temporary
        # Series and is not guaranteed to reach the parent frame.
        for col in ('category', 'brand', 'description'):
            dataset[col] = dataset[col].fillna('missing')
        dataset.loc[dataset['description'] == 'No description yet', 'description'] = 'missing'
        return dataset

    def upper2lower(dataset):
        """Lower-case every column whose dtype is ``object``."""
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for col, dtype in dataset.dtypes.items():
            if dtype == 'object':
                dataset[col] = dataset[col].str.lower()
        return dataset

    def load_data(data_dir):
        """Read both TSV files from ``data_dir`` and return cleaned (train, test) frames."""
        print('load data ..')
        train = pd.read_table(os.path.join(data_dir, 'train.tsv'))
        test = pd.read_table(os.path.join(data_dir, 'test.tsv'))
        merge = pd.concat([train, test], axis=0).reset_index(drop=True)
        merge.rename(columns={
            'brand_name':'brand',
            'category_name':'category',
            'item_condition_id':'condition',
            'item_description':'description'
        }, inplace=True)
        print('测试，只取100000条数据')
        # Debug-sized subset. Copy it so the cleaning steps below modify an
        # independent frame rather than a slice of `merge`.
        merge = merge[:100000].copy()
        print('handle missing ..')
        merge = handle_missing(merge)
        print('upper2lower ..')
        merge = upper2lower(merge)
        # Rows that carry a train_id came from train.tsv; the others from test.tsv.
        train = merge[merge.train_id.notnull()].reset_index(drop=True)
        test = merge[merge.train_id.isnull()].reset_index(drop=True)
        return train, test

    # Use the caller-supplied directory (previously the global mac_path was
    # read here and the `path` argument was silently ignored).
    train, test = load_data(path)

    def text2seq(dtrain, dtest, vocabulary_size):
        """Fit one tokenizer on the training text and add ``seq_<col>`` columns to both frames."""
        tokenizer = Tokenizer(num_words=vocabulary_size)
        tokenizer.fit_on_texts(np.hstack([dtrain['category'], dtrain['description'], dtrain['name']]))
        for col in ['name', 'category', 'description']:
            dtrain['seq_'+col] = tokenizer.texts_to_sequences(dtrain[col])
            dtest['seq_'+col] = tokenizer.texts_to_sequences(dtest[col])
        return dtrain, dtest, tokenizer

    print('text2seq ...')
    train, test, _ = text2seq(train, test, voc_size)

    def label_encoding(dtrain, dtest):
        """Replace category/brand/condition values by integer codes fitted on train and test together."""
        le = LabelEncoder()
        for col in ['category', 'brand', 'condition']:
            # The encoder is refitted per column on the union of both frames so
            # that every value seen in either frame has a code.
            le.fit(np.hstack([dtrain[col], dtest[col]]))
            dtrain[col] = le.transform(dtrain[col])
            dtest[col] = le.transform(dtest[col])
        return dtrain, dtest

    print('label encoding ...')
    train, test = label_encoding(train, test)

    def split_train_val(dtrain, dtest, n_val):
        """Shuffle ``dtrain`` and move its first ``n_val`` rows into a validation frame."""
        dtrain = dtrain.sample(frac=1.0, random_state=seed).reset_index(drop=True)
        dval = dtrain.iloc[:n_val,:].reset_index(drop=True)
        dtrain = dtrain.iloc[n_val:,:].reset_index(drop=True)
        return dtrain, dval, dtest

    print('data splite ...')
    train, val, test = split_train_val(train, test, val_size)

    return train, val, test

In [3]:
# Build the three frames once. The vocabulary size comes from the shared
# VOC_SIZE constant (50000) instead of a repeated literal.
train, val, test = get_train_val_test(mac_path, voc_size=VOC_SIZE, val_size=5000)

load data ..
测试，只取100000条数据
handle missing ..
upper2lower ..
text2seq ...
label encoding ...
data splite ...


In [4]:
def gen_batch_data(dataset, batch_size):
    """Yield consecutive, equally sized row batches from a shuffled copy of ``dataset``.

    The frame is shuffled and re-indexed once, then cut into
    ``len(dataset) // batch_size`` slices of exactly ``batch_size`` rows each;
    rows left over after the last full batch are not yielded.
    """
    shuffled = dataset.sample(frac=1.0).reset_index(drop=True)
    n_batches = shuffled.shape[0] // batch_size
    start = 0
    for _ in range(n_batches):
        stop = start + batch_size
        yield shuffled.iloc[start:stop, :]
        start = stop

def gen_tf_data(subdata):
    """Convert a batch frame into a dict of arrays keyed by feed name.

    The three token-sequence columns are padded/truncated to fixed lengths
    (200 for the description, 75 for category and name) with padding added
    after the tokens. The scalar columns are reshaped to column vectors, and
    the price is transformed with ``log1p`` before being reshaped.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    tfdata = {
        'seq_desc': pad(subdata['seq_description'], maxlen=200, padding='post'),
        'seq_cate': pad(subdata['seq_category'], maxlen=75, padding='post'),
        'seq_name': pad(subdata['seq_name'], maxlen=75, padding='post'),
    }
    # Scalar features become (n_rows, 1) arrays.
    for column in ('condition', 'shipping', 'brand', 'category'):
        tfdata[column] = subdata[column].values.reshape(-1, 1)
    log_price = np.log1p(subdata['price'])
    tfdata['price'] = log_price.values.reshape(-1, 1)
    return tfdata

# Convert the whole validation frame to feed arrays once, up front.
val_tfdata = gen_tf_data(subdata=val)

In [5]:
class Param(object):
    """Plain attribute bag holding the hyper-parameters and output paths of one run.

    ``name`` identifies the run and is interpolated into ``log_path``. All
    values live directly in the instance ``__dict__``, which is also what
    ``str()`` and ``repr()`` display.
    """

    def __init__(self, name):
        settings = (
            ('name', name),
            # optimisation
            ('lr', 0.05),
            ('keep_prob', 0.9),
            ('epochs', 5),
            ('batch_size', 5000),
            ('val_size', 5000),
            # text sequences
            ('vocabulary_size', 50000),
            ('seq_desc_max_len', 200),
            ('seq_name_max_len', 75),
            ('seq_cate_max_len', 75),
            ('seq_embed_dim', 60),
            # categorical embeddings
            ('category_num', 3000),
            ('category_embed_dim', 60),
            ('brand_num', 3000),
            ('brand_embed_dim', 60),
            # output locations
            ('log_path', '/Users/zhouzhirui/Desktop/log/%s/' % name),
            ('save_path', '/Users/zhouzhirui/Desktop/model_save'),
        )
        for key, value in settings:
            setattr(self, key, value)

    def __setattr__(self, attr, value):
        # Store every attribute straight in the instance dictionary.
        vars(self)[attr] = value

    def __str__(self):
        return str(vars(self))

    def __repr__(self):
        return str(vars(self))


In [6]:
class Inputs(object):
    """Holds the placeholders through which one batch is fed into the graph.

    All placeholders are created under the ``inputs`` name scope with an
    unspecified batch dimension. Sequence widths come from ``param``; every
    scalar feature is a single-column tensor.
    """

    def __init__(self, param):
        def int_input(width, name=None):
            return tf.placeholder(dtype=tf.int32, shape=[None, width], name=name)

        def float_input(width, name=None):
            return tf.placeholder(dtype=tf.float32, shape=[None, width], name=name)

        with tf.name_scope('inputs'):
            # Token-id sequences, already padded to their fixed lengths.
            self.seq_desc = int_input(param.seq_desc_max_len, 'seq_desc')
            self.seq_cate = int_input(param.seq_cate_max_len, 'seq_cate')
            self.seq_name = int_input(param.seq_name_max_len, 'seq_name')
            # Scalar features; these placeholders get auto-generated names.
            self.brand = int_input(1)
            self.category = int_input(1)
            self.shipping = float_input(1)
            self.condition = int_input(1)
            # Regression target, named 'log_price' in the graph.
            self.price = float_input(1, 'log_price')
                

In [7]:
def add_fc_layer(name, inputs, units, activation=None, keep_prob=None):
    """Append a fully connected layer ``activation(inputs @ W + b)`` with optional dropout.

    Variables ``W`` (truncated-normal initialised) and ``b`` (initializer left
    to ``tf.get_variable``'s default) are created in the variable scope
    ``name``. ``activation`` and ``keep_prob`` are applied only when truthy.
    Returns the resulting tensor.
    """
    with tf.name_scope(name), tf.variable_scope(name):
        in_dim = inputs.get_shape()[1]
        weights = tf.get_variable(
            'W',
            shape=[in_dim, units],
            initializer=tf.initializers.truncated_normal(),
        )
        bias = tf.get_variable('b', shape=[units])
        projected = tf.matmul(inputs, weights, name='wx')
        layer_out = tf.nn.bias_add(projected, bias, name='wx_plus_b')
        if activation:
            layer_out = activation(layer_out)
        if keep_prob:
            layer_out = tf.nn.dropout(layer_out, keep_prob=keep_prob)
        return layer_out

def add_embed_layer(name, inputs, input_dim=None, output_dim=None, reuse=False):
    """Look up ``inputs`` in the embedding matrix stored in variable scope ``name``.

    With ``reuse`` falsy a new ``[input_dim, output_dim]`` variable called
    ``matrix`` is created (truncated-normal initialised). With ``reuse``
    truthy the scope is switched to reuse mode and the existing ``matrix`` is
    fetched, so ``input_dim`` / ``output_dim`` are not needed. Returns the
    looked-up embedding tensor.
    """
    with tf.name_scope(name), tf.variable_scope(name) as scope:
        if not reuse:
            table = tf.get_variable(
                'matrix',
                shape=[input_dim, output_dim],
                initializer=tf.initializers.truncated_normal(),
            )
        else:
            scope.reuse_variables()
            table = tf.get_variable('matrix')
        return tf.nn.embedding_lookup(table, inputs)

def add_rnn_layer(name, inputs, units, batch_size=None):
    """Run a single-layer LSTM over ``inputs`` and return the output of the last time step.

    Parameters
    ----------
    name : str
        Used for both the name scope and the variable scope of the layer.
    inputs : Tensor
        Batch-major sequence tensor (the second axis is indexed as time below).
    units : int
        Number of LSTM units.
    batch_size : int or None, optional
        Accepted for backward compatibility and no longer used. The initial
        zero state is derived from the runtime batch dimension of ``inputs``,
        so the layer accepts batches of any size (including a final partial
        batch or a validation set whose size differs from the training batch).

    Returns
    -------
    Tensor
        ``outputs[:, -1, :]``, i.e. one vector of width ``units`` per example.
    """
    with tf.name_scope(name), tf.variable_scope(name):
        cell = tf.nn.rnn_cell.BasicLSTMCell(units)
        # Passing dtype (instead of a zero_state built from a static batch
        # size) lets dynamic_rnn create the zero initial state for whatever
        # batch size is actually fed.
        outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
        return outputs[:, -1, :]

def add_concat_layer(name, tensors):
    """Join ``tensors`` along axis 1 inside the name/variable scope ``name``."""
    with tf.name_scope(name), tf.variable_scope(name):
        return tf.concat(tensors, axis=1)

In [8]:
class RNN:
    """LSTM-based price regressor built on top of an ``Inputs`` placeholder set.

    The graph is assembled eagerly in the constructor by touching the three
    lazy properties in dependency order:

    * ``predict`` -- network output, one value per example;
    * ``loss``    -- mean squared error against ``inputs.price`` (also logged
      as the scalar summary ``mse``);
    * ``train``   -- Adam minimisation op for ``loss``.

    Parameters
    ----------
    is_train : bool
        ``True`` selects the training batch size and dropout keep probability
        from ``param``; ``False`` uses batch size 1 and disables dropout.
    param : Param
        Hyper-parameter container.
    inputs : Inputs
        Placeholders feeding the network. Only the three sequences, ``brand``,
        ``category`` and ``price`` are read here.
    """

    def __init__(self, is_train, param, inputs):
        # No trailing comma here: `is_train,` would store a one-element tuple,
        # which is always truthy and made the inference branch unreachable.
        self.is_train = is_train
        self.param = param
        self.inputs = inputs
        # Force graph construction now so all ops exist before initialisation.
        self.predict
        self.loss
        self.train

    @lazy_property
    def predict(self):
        """Build the forward pass and return the output tensor."""
        if self.is_train:
            batch_size = self.param.batch_size
            keep_prob = self.param.keep_prob
        else:
            batch_size = 1
            keep_prob = None

        # One embedding matrix ('seq_embed') is shared by all three token
        # sequences: it is created for the description and reused afterwards.
        seq_desc_embed = add_embed_layer(
            'seq_embed',
            self.inputs.seq_desc,
            self.param.vocabulary_size,
            self.param.seq_embed_dim
        )
        seq_cate_embed = add_embed_layer(
            'seq_embed',
            self.inputs.seq_cate,
            reuse=True
        )
        seq_name_embed = add_embed_layer(
            'seq_embed',
            self.inputs.seq_name,
            reuse=True
        )

        # Single-id features: the lookup result is reshaped so that each
        # example becomes one row of width <embed_dim>.
        brand_embed = add_embed_layer(
            'brand_embed',
            self.inputs.brand,
            self.param.brand_num,
            self.param.brand_embed_dim
        )
        brand_embed = tf.reshape(brand_embed, shape=[-1, self.param.brand_embed_dim])

        category_embed = add_embed_layer(
            'category_embed',
            self.inputs.category,
            self.param.category_num,
            self.param.category_embed_dim
        )
        category_embed = tf.reshape(category_embed, shape=[-1, self.param.category_embed_dim])

        # One LSTM per text field; each contributes its last-step output.
        desc_rnn = add_rnn_layer('desc_lstm', seq_desc_embed, 16, batch_size)
        name_rnn = add_rnn_layer('name_lstm', seq_name_embed, 8, batch_size)
        cate_rnn = add_rnn_layer('cate_lstm', seq_cate_embed, 8, batch_size)

        main = add_concat_layer('concat', [desc_rnn, name_rnn, cate_rnn, brand_embed, category_embed])

        # Two ReLU layers (64 and 32 units, dropout when training), then a
        # linear single-unit output.
        fc1 = add_fc_layer('fc1', main, 64, tf.nn.relu, keep_prob)
        fc2 = add_fc_layer('fc2', fc1, 32, tf.nn.relu, keep_prob)
        output = add_fc_layer('predict', fc2, 1, None, None)

        return output

    @lazy_property
    def loss(self):
        """Mean squared error between the prediction and ``inputs.price``."""
        # Keyword arguments keep labels and predictions in their documented
        # roles (they were previously passed in swapped positions).
        mse = tf.losses.mean_squared_error(
            labels=self.inputs.price,
            predictions=self.predict
        )
        tf.summary.scalar('mse', mse)
        return mse

    @lazy_property
    def train(self):
        """Adam update op minimising ``loss`` with learning rate ``param.lr``."""
        opt = tf.train.AdamOptimizer(self.param.lr).minimize(self.loss)
        return opt

In [9]:
# Build the hyper-parameters, the placeholders and the training-mode model.
# The RNN constructor touches predict/loss/train, so the full graph exists
# once it returns.
param = Param('rnn')
inputs = Inputs(param)
model = RNN(True, param, inputs)

# Merge all registered summaries and create a writer that logs the default
# graph under param.log_path.
merge_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(param.log_path, tf.get_default_graph())
# Create the initialiser after the graph is complete, then run it in a new
# session; the session is left open here.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [10]:
def _make_feed(tfdata):
    """Map one gen_tf_data() dict onto the placeholders the model reads."""
    return {
        model.inputs.price: tfdata['price'],
        model.inputs.seq_cate: tfdata['seq_cate'],
        model.inputs.seq_desc: tfdata['seq_desc'],
        model.inputs.seq_name: tfdata['seq_name'],
        model.inputs.brand: tfdata['brand'],
        model.inputs.category: tfdata['category'],
    }

# The validation arrays never change, so their feed dict is built once.
val_feed = _make_feed(val_tfdata)

# Summary step that keeps increasing across epochs. The per-epoch batch index
# k restarts at 0 each epoch and would make summaries from different epochs
# share the same step values.
global_step = 0
for i in range(param.epochs):
    for k, d in enumerate(gen_batch_data(train, param.batch_size)):
        train_tfdata = gen_tf_data(d)
        _, loss, summary = sess.run(
            [model.train, model.loss, merge_summary],
            _make_feed(train_tfdata)
        )
        writer.add_summary(summary, global_step)
        global_step += 1
        # Evaluate on the validation set every second batch.
        if k % 2 == 0:
            loss_val = sess.run(model.loss, val_feed)
            print('epoch:%d  step%d : train_loss:%.4f ,val_loss:%.4f'%(i, k, loss, loss_val))

epoch:0  step0 : train_loss:10267.1348 ,val_loss:4299.2983
epoch:0  step2 : train_loss:2156.8088 ,val_loss:1283.1630
epoch:0  step4 : train_loss:787.2867 ,val_loss:721.5554
epoch:0  step6 : train_loss:651.7065 ,val_loss:454.6009
epoch:0  step8 : train_loss:282.5908 ,val_loss:205.6822
epoch:0  step10 : train_loss:176.9879 ,val_loss:148.7789
epoch:0  step12 : train_loss:130.3295 ,val_loss:98.4908
epoch:0  step14 : train_loss:80.5414 ,val_loss:64.4563
epoch:0  step16 : train_loss:57.0116 ,val_loss:54.1215
epoch:0  step18 : train_loss:50.5724 ,val_loss:46.4794
epoch:1  step0 : train_loss:45.2719 ,val_loss:40.5143
epoch:1  step2 : train_loss:34.1311 ,val_loss:28.1366
epoch:1  step4 : train_loss:24.4079 ,val_loss:20.5545
epoch:1  step6 : train_loss:17.5147 ,val_loss:16.6572
epoch:1  step8 : train_loss:15.5065 ,val_loss:14.6179
epoch:1  step10 : train_loss:12.7383 ,val_loss:12.5913
epoch:1  step12 : train_loss:11.2943 ,val_loss:11.2834
epoch:1  step14 : train_loss:10.2449 ,val_loss:9.3360
epo