### RNN 预测产品价格、嵌入空间可视化

#### 背景：

数据来自[kaggle]()，赛题为根据产品描述、类别、品牌等信息预测产品价格。  
讨论区中已经有tf-idf + xgboost/lightgmb和rnn + keras的开源实现。  
本文将以tensorflow作为主要工具，实现多种架构的神经网络，通过tensorboard可视化调参提升性能。

#### 环境
+ python 3.6.3
+ tensorflow 1.4

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, Binarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

  return f(*args, **kwds)


#### 读入数据

In [2]:
kaggle_path = '../input/'
mac_path = '/Users/zhouzhirui/data/Mercari_Price_Forcast/'

def load_data(path):
    train = pd.read_table(path+'train.tsv')
    test = pd.read_table(path+'test.tsv')
    merge = pd.concat([train, test], axis=0).reset_index(drop=True)
    return merge
merge = load_data(mac_path)
merge.head(3)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0


In [4]:
merge = merge[merge.train_id.notnull()]

#### 处理数据
+ fill missing
+ convert upper-case to lower-case
+ encoding
+ onehot

In [5]:
def onehot_condition(dataset):
    onehot = pd.get_dummies(merge.condition, drop_first=True, prefix='condition')
    dataset = pd.concat([dataset, onehot], axis=1).drop('condition', axis=1)
    return dataset

def handle_missing(dataset):
    dataset.category_name.fillna(value="missing", inplace=True)
    dataset.brand_name.fillna(value="missing", inplace=True)
    dataset.item_description.fillna(value="missing", inplace=True)
    dataset.loc[dataset.item_description == 'No description yet', 'item_description'] = 'missing'
    return dataset

def upper2lower(dataset):
    for (col, dtype) in dataset.dtypes.iteritems():
        if dtype == 'object':
            dataset[col] = dataset[col].str.lower()
    return dataset

def label_encoding(dataset):
    le_category = LabelEncoder()
    dataset['category'] = le_category.fit_transform(dataset.category_name)
    le_brand = LabelEncoder()
    dataset['brand'] = le_brand.fit_transform(dataset.brand_name)
    dataset['condition'] = dataset['item_condition_id']
    del dataset['brand_name'], dataset['item_condition_id']
    return dataset, le_category, le_brand

merge = handle_missing(merge)
merge = upper2lower(merge)
merge, le_category, le_brand = label_encoding(merge)
merge = onehot_condition(merge)
merge.head(3)

Unnamed: 0,category_name,item_description,name,price,shipping,test_id,train_id,category,brand,condition_2,condition_3,condition_4,condition_5
0,men/tops/t-shirts,missing,mlb cincinnati reds t shirt size xl,10.0,1,,0.0,808,2937,0,1,0,0
1,electronics/computers & tablets/components & p...,this keyboard is in great condition and works ...,razer blackwidow chroma keyboard,52.0,0,,1.0,86,3615,0,1,0,0
2,women/tops & blouses/blouse,adorable top with a hint of lace and a key hol...,ava-viv blouse,10.0,1,,2.0,1255,4242,0,0,0,0


### 样本切割

In [6]:
merge = merge.sample(frac=1).reset_index(drop=True)
dtrain = merge.iloc[:1470000,:].reset_index(drop=True)
dtest = merge.iloc[1470000:,:].reset_index(drop=True)

### 训练样本迭代器

In [11]:
def batch_generator(dataset, batch_size):
    data = dataset[['brand','category','price','shipping','condition_2','condition_3','condition_4','condition_5']].values
    np.random.shuffle(data)
    datalen = data.shape[0]
    idx = 0
    while idx * batch_size < datalen:
        brand = data[idx*batch_size:(idx+1)*batch_size, 0].reshape(-1, 1)
        category = data[idx*batch_size:(idx+1)*batch_size, 1].reshape(-1, 1)
        price = np.log1p(data[idx*batch_size:(idx+1)*batch_size, 2]).reshape(-1, 1)
        num = data[idx*batch_size:(idx+1)*batch_size, 3:]
        idx += 1
        yield brand, category, num, price

### 验证集数据准备

In [10]:
def get_val_data(dataset):
    data = dataset[['brand','category','price','shipping','condition_2','condition_3','condition_4','condition_5']].values
    brand = data[:, 0].reshape(-1, 1)
    category = data[:, 1].reshape(-1, 1)
    price = np.log1p(data[:, 2]).reshape(-1, 1)
    num = data[:, 3:]
    return brand, category, num, price

val_brand, val_category, val_num, val_price = get_val_data(dtest)

### Model1 without text feature

#### 定义模型参数

In [135]:
class Param(object):        
    def __setattr__(self, attr, value):
        self.__dict__[attr] = value
    def __str__(self):
        return str(self.__dict__)
    __repr__ = __str__

param = Param()
param.lr = 0.005
param.epochs = 8
param.batchsize = 10000
param.keep_prob = 0.7
param.logdir = '/Users/zhouzhirui/Desktop/log1'
param.brand_num = merge.brand.max() + 1
param.brand_embed_dim = 30  # brand类别过多，添加embedding layer降维
param.category_num = merge.category.max() + 1
param.category_embed_dim = 20  # 同理brand
print(param)

{'lr': 0.005, 'epochs': 8, 'batchsize': 10000, 'keep_prob': 0.7, 'logdir': '/Users/zhouzhirui/Desktop/log1', 'brand_num': 4810, 'brand_embed_dim': 30, 'category_num': 1288, 'category_embed_dim': 20}


## 定义模型

In [143]:
class Model1(object):
    """无文本建模模型"""
    def __init__(self, param):
        self.graph = tf.Graph()
        with self.graph.as_default():
            
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
            
            with tf.name_scope('inputs'):
                self.add_input()
                
            with tf.name_scope('brand_embedding'):
                self.brand_embedding = self.add_brand_embedding(param.brand_num, param.brand_embed_dim)
                
            with tf.name_scope('category_embedding'):
                self.category_embedding = self.add_category_embedding(param.category_num, param.category_embed_dim)
                
            self.main = tf.concat([self.input_num, 
                                   tf.reshape(self.brand_embedding, [-1, param.brand_embed_dim]), 
                                   tf.reshape(self.category_embedding, [-1, param.category_embed_dim])], axis=1)
            
            with tf.name_scope('dense1'):
                self.dense1 = self.add_dence_layer(inputs=self.main, 
                                                   input_size=param.category_embed_dim + param.brand_embed_dim + 5, 
                                                   output_size=64, 
                                                   activation=tf.nn.relu, 
                                                   keep_prob=self.keep_prob)
                
            with tf.name_scope('dense2'):
                self.dense2 = self.add_dence_layer(inputs=self.dense1, 
                                                   input_size=64, 
                                                   output_size=32, 
                                                   activation=tf.nn.relu, 
                                                   keep_prob=self.keep_prob)
            
                
            with tf.name_scope('output_layer'):
                self.output = self.add_dence_layer(inputs=self.dense2,
                                                   input_size=32,
                                                   output_size=1,
                                                   activation='false', 
                                                   keep_prob='false')
            with tf.name_scope('loss'):
                self.loss = tf.losses.mean_squared_error(self.target, self.output)

            with tf.name_scope('optimizer'):
                self.train = tf.train.AdamOptimizer(param.lr).minimize(self.loss)

            self.init = tf.global_variables_initializer()

               
    def add_input(self):
        self.input_brand = tf.placeholder(tf.int32, [None, 1], name='brand')
        self.input_category = tf.placeholder(tf.int32, [None, 1], name='category')
        self.input_num = tf.placeholder(tf.float32, [None, 5], name='num')
        self.target = tf.placeholder(tf.float32, [None,1], name='price')
        
    
    def add_brand_embedding(self, input_dim, output_dim):
        self.brand_embed_matrix = tf.Variable(tf.random_uniform([input_dim, output_dim], -1., 1.0), name='brand_embed_matrix')
        embeding = tf.nn.embedding_lookup(self.brand_embed_matrix, self.input_category, name='brand_embed_lookup')
        return embeding
    
    def add_category_embedding(self, input_dim, output_dim):
        self.category_embed_matrix = tf.Variable(tf.random_uniform([input_dim, output_dim], -1., 1.0), name='category_embed_matrix')
        embeding = tf.nn.embedding_lookup(self.category_embed_matrix, self.input_category, name='category_embed_lookup')
        return embeding
    
    def add_dence_layer(self, inputs, input_size, output_size, activation=None, keep_prob='false'):
        W = tf.Variable(tf.random_normal(dtype=tf.float32, shape=[input_size, output_size], mean=0, stddev=0.1), name='W')
        b = tf.Variable(tf.zeros(dtype=tf.float32, shape=[1, output_size]) + 0.1, name='b')
        output = tf.matmul(inputs, W) + b
        if activation != 'false':
            output = tf.nn.relu(output)
        if keep_prob != 'false':
            output = tf.nn.dropout(output, keep_prob)
        return output

In [144]:
model1 = Model1(param)
tf.summary.histogram(name='brand_embedding',values=model1.brand_embed_matrix)
tf.summary.histogram(name='category_embedding',values=model1.category_embed_matrix)
tf.summary.scalar('loss',model1.loss)
summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(param.logdir, model1.graph)
# val_writer = tf.summary.FileWriter(param.logdir+'/val', model1.graph)
sess = tf.Session(graph=model1.graph)
sess.run(model1.init)

for i in range(param.epochs):
    k = 0
    for (brand, category, num, price) in batch_generator(dtrain, param.batchsize):
        _, loss = sess.run([model1.train, model1.loss], {model1.input_brand:brand, 
                                                        model1.input_category:category,
                                                        model1.input_num:num, 
                                                        model1.target:price,
                                                        model1.keep_prob:param.keep_prob})
        if k%20 == 0:
            val_loss = sess.run(model1.loss, {model1.input_brand:val_brand, 
                                                        model1.input_category:val_category,
                                                        model1.input_num:val_num, 
                                                        model1.target:val_price,
                                                        model1.keep_prob:1.})
            train_loss = sess.run(model1.loss, {model1.input_brand:brand, 
                                                        model1.input_category:category,
                                                        model1.input_num:num, 
                                                        model1.target:price,
                                                        model1.keep_prob:1.})
            print('epoch:%d  step%d : train_loss:%.4f ,val_loss:%.4f, embeddsum:%.2f'%(i, k, train_loss, val_loss, sess.run(model1.brand_embed_matrix).sum()))
        k+=1

epoch:0  step0 : train_loss:8.5754 ,val_loss:8.7049, embeddsum:29.96
epoch:0  step20 : train_loss:0.5533 ,val_loss:0.5786, embeddsum:145.46
epoch:0  step40 : train_loss:0.4689 ,val_loss:0.4707, embeddsum:149.95
epoch:0  step60 : train_loss:0.4328 ,val_loss:0.4303, embeddsum:162.12
epoch:0  step80 : train_loss:0.4229 ,val_loss:0.4270, embeddsum:173.30
epoch:0  step100 : train_loss:0.4126 ,val_loss:0.4161, embeddsum:182.48
epoch:0  step120 : train_loss:0.4091 ,val_loss:0.4158, embeddsum:181.68
epoch:0  step140 : train_loss:0.4061 ,val_loss:0.4131, embeddsum:184.66
epoch:1  step0 : train_loss:0.4125 ,val_loss:0.4154, embeddsum:185.13
epoch:1  step20 : train_loss:0.4162 ,val_loss:0.4138, embeddsum:187.81
epoch:1  step40 : train_loss:0.3987 ,val_loss:0.4110, embeddsum:188.94
epoch:1  step60 : train_loss:0.3938 ,val_loss:0.4101, embeddsum:186.92
epoch:1  step80 : train_loss:0.3957 ,val_loss:0.4088, embeddsum:185.51
epoch:1  step100 : train_loss:0.4040 ,val_loss:0.4110, embeddsum:186.48
epoch

In [148]:
merge['brand_name'] = le_brand.inverse_transform(merge.brand)
id2brand = dict(zip(range(4810), le_brand.classes_))
brand2id = dict(zip(le_brand.classes_,range(4810)))
embedding_matrix = sess.run(model1.brand_embed_matrix)

In [160]:
embedding_matrix[brand2id['adidas neo']]

array([-0.67389876, -0.70933992,  0.58079666, -0.21463154,  0.55771631,
        0.45473605, -0.45238081,  0.2506023 ,  0.05044049, -0.42083871,
        0.56253678,  0.08243759, -0.51152676, -0.07393313,  0.02964799,
        0.22313081, -0.89522874,  0.07326837, -0.68000102,  0.46859783,
        0.02646305, -0.13218412,  0.0137408 ,  0.13657908, -0.29262793,
        0.55516356,  0.08805592, -0.41509423,  0.63190967, -0.12040908], dtype=float32)

In [182]:
for i in np.argsort(np.sum(np.square(embedding_matrix - vec), axis=1))[:10]:
    print(id2brand[i])

adidas
all sport
lite brix
canon
aqua
old spice
john paul pet
xelement
equipment
basic editions


In [139]:
def distance(x, y):
    return np.sqrt(np.square(x-y).sum())
    
def similar(brand_name, n=5):
    brand_vec = embedding_matrix[brand2id[brand_name]]
    
    print(pd.Series(np.dot(embedding_matrix, brand_vec), index=le_brand.classes_).sort_values(ascending=False)[:n])

In [183]:
similar("adidas", n=3)

adidas          8.770928
acne studios    5.847715
mother          5.129402
dtype: float32


In [41]:
class A():
    a = 3
    def good(self):
        print(1)

In [42]:
a = A()

In [43]:
type(a.a)

int

In [44]:
a.good()

1


In [48]:
import tensorflow as tf

with tf.variable_scope("scope1"):
    w1 = tf.Variable("w1", shape=[])
    w2 = tf.Variable(0.0, name="w2")
with tf.variable_scope("scope1", reuse=True):
    w1_p = tf.get_variable("w1", shape=[])
    w2_p = tf.Variable(1.0, name="w2")

print(w1 is w1_p, w2 is w2_p)
#输出
#True  False

TypeError: __init__() got an unexpected keyword argument 'reuse'