In [1]:
from collections import Counter
import gc
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
%matplotlib inline

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
class Param:
    """Minimal attribute bag used to hold notebook-wide settings.

    Any attribute assigned on an instance is stored directly in the
    instance dictionary under the same name.
    """

    def __setattr__(self, attr, value):
        # Write straight into the per-instance namespace.
        vars(self)[attr] = value

# Settings shared by the preprocessing cells below.
param = Param()
param.brand_num = 2768 + 1         # 2768 brands occur at least 5 times; the +1 is presumably one extra slot -- TODO confirm
param.category_num = 1127 + 1      # 1127 categories occur at least 5 times; the +1 is presumably one extra slot -- TODO confirm
param.vocabulary_size = 150000     # passed to the keras Tokenizer as its word limit
# Sequence lengths for description / name / category text; not used in the
# cells visible here -- presumably padding lengths for pad_sequences (verify).
param.seq_desc_len = 200
param.seq_name_len = 50
param.seq_cate_len = 50

In [3]:
# Candidate data directories. Keep the trailing '/': the output file name
# later in the notebook is built as mac_path + 'merge.csv'.
# kaggle_path is not referenced in the cells visible here.
kaggle_path = '../input/'
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# as-is on the author's machine.
mac_path = '/Users/zhouzhirui/data/Mercari_Price_Forcast/'

def load_data(path):
    """Read the train and test splits from a data directory.

    Parameters
    ----------
    path : str or os.PathLike
        Directory containing ``train.tsv`` and ``test.tsv``. A trailing
        path separator is optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` in that order.
    """
    print('load data ..')
    # os.path.join inserts the separator only when it is missing, so both
    # '../input' and '../input/' resolve to the same files (plain string
    # concatenation silently produced '../inputtrain.tsv' for the former).
    train = pd.read_table(os.path.join(path, 'train.tsv'))
    test = pd.read_table(os.path.join(path, 'test.tsv'))
    return train, test

def columns_rename(dataset):
    """Give four verbose columns shorter names, modifying ``dataset`` in place.

    ``brand_name`` -> ``brand``, ``category_name`` -> ``category``,
    ``item_condition_id`` -> ``condition`` and
    ``item_description`` -> ``description``. Other columns are untouched.

    Returns the same DataFrame object that was passed in.
    """
    short_names = {
        'brand_name': 'brand',
        'category_name': 'category',
        'item_condition_id': 'condition',
        'item_description': 'description',
    }
    dataset.rename(columns=short_names, inplace=True)
    return dataset
    
def handle_missing(dataset):
    """Replace missing text values with the literal string ``"missing"``.

    NaN entries in the ``category``, ``brand`` and ``description`` columns
    are filled, and descriptions equal to the placeholder
    ``'No description yet'`` are treated as missing as well. The placeholder
    match is exact and case-sensitive.

    ``dataset`` is modified in place and also returned.
    """
    for col in ('category', 'brand', 'description'):
        # Assign the filled column back explicitly. Calling
        # ``dataset.col.fillna(..., inplace=True)`` operates on an
        # intermediate Series: it raises a FutureWarning on pandas 2.2 and
        # leaves ``dataset`` unchanged under Copy-on-Write.
        dataset[col] = dataset[col].fillna('missing')
    placeholder = dataset['description'] == 'No description yet'
    dataset.loc[placeholder, 'description'] = 'missing'
    return dataset

def upper2lower(dataset):
    """Lower-case every text column of ``dataset``.

    A column counts as text when its dtype is ``object`` or a pandas
    ``StringDtype``. Numeric and other columns are left untouched.

    ``dataset`` is modified in place and also returned.
    """
    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that exists in both old and new releases.
    for col, dtype in dataset.dtypes.items():
        # Also accept StringDtype so text columns are not silently skipped
        # when pandas infers a dedicated string dtype instead of object.
        if dtype == 'object' or isinstance(dtype, pd.StringDtype):
            dataset[col] = dataset[col].str.lower()
    return dataset

def category2id(dataset, column, most_common_n=None, start=1, ignore=0):
    """Replace the values of ``column`` with frequency-ranked integer ids.

    Frequencies are counted only over rows whose ``test_id`` is null. The
    most frequent value receives id ``start``, the next ``start + 1`` and so
    on, keeping at most ``most_common_n`` values (all of them when ``None``).
    Any value without an id -- dropped by the cut-off or never seen in the
    counted rows -- is mapped to ``ignore``.

    ``dataset`` is modified in place and also returned.
    """
    counted_rows = dataset.test_id.isnull()
    frequencies = Counter(dataset.loc[counted_rows, column].values)
    ranked = frequencies.most_common(most_common_n)
    value_to_id = {value: rank for rank, (value, _) in enumerate(ranked, start)}
    dataset[column] = dataset[column].apply(lambda v: value_to_id.get(v, ignore))
    return dataset

In [4]:
# Read both splits and stack them so every preprocessing step below is
# applied to train and test rows together.
train, test = load_data(mac_path)
merge = pd.concat([train, test], axis=0).reset_index(drop=True)
# Only the concatenated frame is used from here on; release the originals.
del train, test
gc.collect()
merge = columns_rename(merge)
merge = handle_missing(merge)
merge = upper2lower(merge)

# One tokenizer is shared by the category, name and description text. It is
# fitted only on rows whose test_id is null -- presumably the training rows,
# since only test.tsv is expected to carry a test_id (TODO confirm).
tk = Tokenizer(param.vocabulary_size)
tk.fit_on_texts(
    np.hstack([
        merge.loc[merge.test_id.isnull(), 'category'].values, 
        merge.loc[merge.test_id.isnull(), 'name'].values,
        merge.loc[merge.test_id.isnull(), 'description'].values
    ])
)
# Encode the three text columns of every row (train and test) as token-id
# sequences. Order matters: seq_category has to be built here, before
# category2id() below overwrites the raw category strings with integer ids.
merge['seq_category'] = tk.texts_to_sequences(merge['category'])
merge['seq_name'] = tk.texts_to_sequences(merge['name'])
merge['seq_description'] = tk.texts_to_sequences(merge['description'])

# Replace brand / condition / category values with frequency-ranked ids.
# brand and category use the category2id defaults (ids start at 1, values
# without an id become 0) and are capped at param.brand_num /
# param.category_num most common values; condition keeps every value, with
# ids starting at 0 and -1 for values without an id.
merge = category2id(merge, 'brand', param.brand_num)
merge = category2id(merge, 'condition', start=0, ignore=-1)
merge = category2id(merge, 'category', param.category_num)
# The raw text is no longer needed once the sequences exist; train_id is
# dropped along with it.
merge.drop(['description','name','train_id'], axis=1, inplace=True)
# Save the processed frame without the index column.
# NOTE(review): the seq_* columns hold Python lists, so they are stored as
# text in the CSV and will need parsing when the file is read back.
merge.to_csv(mac_path + 'merge.csv', index=None)

load data ..


In [11]:
# Display the first rows of `merge` as a quick sanity check.
merge.head()

Unnamed: 0,brand,category,condition,price,shipping,test_id,seq_category,seq_name,seq_description
0,1,19,1,10.0,1,,"[77, 41, 71, 72]","[2490, 8906, 6986, 71, 99, 7, 198]",[83]
1,585,242,1,52.0,0,,"[62, 921, 828, 3280, 1380]","[10846, 25263, 16315, 2749]","[33, 2749, 11, 8, 50, 17, 1, 256, 65, 21, 1218..."
2,82,9,0,10.0,1,,"[2, 41, 75, 276]","[7717, 10621, 276]","[700, 74, 10, 5, 5472, 12, 243, 1, 5, 993, 140..."
3,1,28,0,35.0,1,,"[37, 37, 196, 37, 196, 502]","[226, 2728, 620]","[6, 10, 80, 226, 6629, 285, 4, 22, 210, 1193, ..."
4,1,12,0,44.0,0,,"[2, 105, 348]","[4995, 127, 1143, 340]","[913, 10, 6995, 12, 2104]"
