In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import activations
from tensorflow.keras import callbacks
from tensorflow.keras import Input
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler


In [5]:
# Load the training table, keep the QuantitySold column separately as
# `labels`, and report the remaining feature columns together with their
# missing-value counts.
raw_frame = pd.read_csv('./data/TrainingSet.csv')
labels = raw_frame['QuantitySold']
data = raw_frame.drop(columns=['ReturnsAccepted', 'QuantitySold'])
columns = data.columns
print('特征个数: {0}'.format(len(columns)))
print(columns)
print('\n统计是否有缺失值')
# One vectorised pass yields the null count of every column, in column order.
for col_name, n_missing in data.isnull().sum().items():
    print('{0}: {1}'.format(col_name, n_missing))
# Columns that get lookup layers in the next cell and are later dropped from
# the numeric feature matrix.
embed_cols = ['EbayID', 'SellerName', 'Category', 'EndDay']

特征个数: 26
Index(['EbayID', 'Price', 'PricePercent', 'StartingBidPercent', 'SellerName',
       'SellerClosePercent', 'Category', 'PersonID', 'StartingBid', 'AvgPrice',
       'EndDay', 'HitCount', 'AuctionAvgHitCount', 'ItemAuctionSellPercent',
       'SellerSaleAvgPriceRatio', 'SellerAvg', 'SellerItemAvg',
       'AuctionHitCountAvgRatio', 'BestOffer', 'IsHOF', 'ItemListedCount',
       'AuctionCount', 'AuctionSaleCount', 'SellerAuctionCount',
       'SellerAuctionSaleCount', 'AuctionMedianPrice'],
      dtype='object')

统计是否有缺失值
EbayID: 0
Price: 0
PricePercent: 0
StartingBidPercent: 0
SellerName: 0
SellerClosePercent: 0
Category: 0
PersonID: 0
StartingBid: 0
AvgPrice: 0
EndDay: 0
HitCount: 0
AuctionAvgHitCount: 0
ItemAuctionSellPercent: 0
SellerSaleAvgPriceRatio: 0
SellerAvg: 0
SellerItemAvg: 0
AuctionHitCountAvgRatio: 0
BestOffer: 0
IsHOF: 0
ItemListedCount: 0
AuctionCount: 0
AuctionSaleCount: 0
SellerAuctionCount: 0
SellerAuctionSaleCount: 0
AuctionMedianPrice: 0


In [6]:
def _fit_lookup(layer_cls, values):
    """Instantiate `layer_cls`, adapt it to `values`, and return the layer
    together with the length of its learned vocabulary."""
    lookup = layer_cls()
    lookup.adapt(values)
    return lookup, len(lookup.get_vocabulary())


_preprocessing = layers.experimental.preprocessing

# Raw column values, addressed by their position in `embed_cols`.
ebay_id = data[embed_cols[0]].values
seller_name = data[embed_cols[1]].values
category = data[embed_cols[2]].values
end_day = data[embed_cols[3]].values

# EbayID and Category use IntegerLookup; EndDay and SellerName use
# StringLookup. Each layer is adapted on the full column.
ei_emb, ei_num = _fit_lookup(_preprocessing.IntegerLookup, ebay_id)
print('EbayID个数: {0}'.format(ei_num))
cate_emb, cate_num = _fit_lookup(_preprocessing.IntegerLookup, category)
print('Category个数: {0}'.format(cate_num))
ed_emb, ed_num = _fit_lookup(_preprocessing.StringLookup, end_day)
print('EndDay个数: {0}'.format(ed_num))
sn_emb, sn_num = _fit_lookup(_preprocessing.StringLookup, seller_name)
print('SellerName个数: {0}'.format(sn_num))

EbayID个数: 258589
Category个数: 46
EndDay个数: 8
SellerName个数: 11067


In [None]:
# Turn the remaining (non-lookup) columns into a plain array and rescale every
# column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole table; if the KFold import is
# used for validation later, fitting per training fold would avoid leaking
# validation statistics -- confirm against the training loop.
# NOTE(review): `data` changes from a DataFrame to an ndarray here, so this
# cell cannot be re-run without re-running the loading cell first.
numeric_features = data.drop(columns=embed_cols).values
print(numeric_features.shape)
print('数值特征归一化:')
scaler = MinMaxScaler(feature_range=(0, 1))
data = scaler.fit(numeric_features).transform(numeric_features)
print(data)

In [None]:
# Input branch of the model: each categorical feature goes through
# lookup -> embedding -> flatten and is then concatenated with the scaled
# numeric features.
#
# The EbayID lookup (`ei_emb`) was already disabled in this cell and stays out
# of the model. NOTE(review): presumably because EbayID is close to a per-row
# identifier -- confirm.

# IntegerLookup operates on integer keys, so the Category input is int64
# (it was declared float32 before).
cate_input = Input(shape=(1,), dtype='int64', name='cate_input')
ed_input = Input(shape=(1,), dtype='string', name='endday_input')
sn_input = Input(shape=(1,), dtype='string', name='sellername_input')
ox_input = Input(shape=(data.shape[-1],), dtype='float32', name='otherfea_input')

# Raw value -> vocabulary index. The results get their own names so the
# adapted lookup layers (`cate_emb`, `ed_emb`, `sn_emb`) are not overwritten
# and the cell can be re-run.
cate_idx = cate_emb(cate_input)
ed_idx = ed_emb(ed_input)
sn_idx = sn_emb(sn_input)

# Embedding on a (batch, 1) input returns (batch, 1, dim); flatten to
# (batch, dim) so the rank matches `ox_input` for the concatenation below.
cate_vec = layers.Flatten()(layers.Embedding(cate_num, 8)(cate_idx))
ed_vec = layers.Flatten()(layers.Embedding(ed_num, 4)(ed_idx))
sn_vec = layers.Flatten()(layers.Embedding(sn_num, 16)(sn_idx))

x = layers.Concatenate()([ox_input, cate_vec, ed_vec, sn_vec])
# The Dense layer has no bias because BatchNormalization follows it.
x = layers.Dense(128, use_bias=False)(x)
# Apply the layer to `x`; previously the layer object itself was assigned.
x = layers.BatchNormalization()(x)