In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import hive

In [2]:
#连接Hive
conn = hive.connect(
        host=os.environ['PYHIVE_HOST'], 
        port=os.environ['PYHIVE_PORT'],
        username=os.environ['JUPYTER_HADOOP_USER'],
        password=os.environ['HADOOP_USER_PASSWORD'],
        auth='LDAP',
        configuration={'mapreduce.job.queuename': os.environ['JUPYTER_HADOOP_QUEUE'],
                       'hive.resultset.use.unique.column.names':'false'})

sql = """
    select
        a1.*
    from
        (
            select
                order_id
            from
                sstg_zraq.tmp_speed_cheat_order_train
            where
                order_process in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
            group by
                order_id
            having
                count(order_process) = 10
        ) a0
        join (
            select
                *
            from
                sstg_zraq.tmp_speed_cheat_order_train
            where
                order_process in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
        ) a1 on a0.order_id = a1.order_id
"""

data = pd.read_sql_query(sql,conn)
data.head()

Unnamed: 0,driver_id,call_city,order_id,target,distance,calc_speed,speed,diff_speed,pct_speed,dir,next_dir,altitude,diff_altitude,pct_altitude,diff_createtime,diff_localtime,order_process
0,565089049842744,11,17765362920891,0,0.0,0.0,0.0,0.0,,349.609985,349.609985,24.9,0.0,0.0,3,3,0.1
1,565089049842744,11,17765362920891,0,0.0,0.0,0.0,0.0,,119.620003,119.620003,13.9,-0.4,0.028777,1,1,1.0
2,565089049842744,11,17765362920891,0,41.839233,13.946411,11.966262,2.276102,0.19021,123.07,125.419998,11.2,1.2,0.107143,3,3,0.9
3,565089049842744,11,17765362920891,0,25.362932,8.454311,5.338184,2.500006,0.468325,160.210007,185.110001,20.9,-1.3,0.062201,3,3,0.8
4,565089049842744,11,17765362920891,0,92.117933,30.705978,29.624323,0.975399,0.032926,194.320007,196.720001,19.4,0.5,0.025773,3,3,0.7


In [3]:
data_o1 = data.fillna(0)
data_s1 = pd.DataFrame(data_o1)
#data_s1.drop(columns=['driver_id','order_id','target'])

In [4]:
data_s1 = data_s1.astype({'call_city':'str','distance':'float32','calc_speed':'float32','speed':'float32','diff_speed':'float32','pct_speed':'float32','dir':'float32','next_dir':'float32','altitude':'float32','pct_altitude':'float32','diff_createtime':'int32','diff_localtime':'int32','order_process':'float32'})
data_s1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362400 entries, 0 to 362399
Data columns (total 17 columns):
driver_id          362400 non-null object
call_city          362400 non-null object
order_id           362400 non-null object
target             362400 non-null int64
distance           362400 non-null float32
calc_speed         362400 non-null float32
speed              362400 non-null float32
diff_speed         362400 non-null float32
pct_speed          362400 non-null float32
dir                362400 non-null float32
next_dir           362400 non-null float32
altitude           362400 non-null float32
diff_altitude      362400 non-null float64
pct_altitude       362400 non-null float32
diff_createtime    362400 non-null int32
diff_localtime     362400 non-null int32
order_process      362400 non-null float32
dtypes: float32(10), float64(1), int32(2), int64(1), object(3)
memory usage: 30.4+ MB


In [5]:
for i in range(len(data_s1)):
    format(data_s1['distance'][i],'.6f')
    format(data_s1['calc_speed'][i],'.6f')
    format(data_s1['speed'][i],'.6f')
    format(data_s1['diff_speed'][i],'.6f')
    format(data_s1['pct_speed'][i],'.6f')
    format(data_s1['dir'][i],'.6f')
    format(data_s1['next_dir'][i],'.6f')
    format(data_s1['altitude'][i],'.6f')
    format(data_s1['diff_altitude'][i],'.6f')
    format(data_s1['pct_altitude'][i],'.6f')
    format(data_s1['order_process'][i],'.3f')

In [36]:
#划分训练集验证集
from sklearn.model_selection import train_test_split
train_set_s1, test_set_s1 = train_test_split(data_s1, test_size=0.2, random_state=1226)
train_set_s1.head()

Unnamed: 0,driver_id,call_city,order_id,target,distance,calc_speed,speed,diff_speed,pct_speed,dir,next_dir,altitude,diff_altitude,pct_altitude,diff_createtime,diff_localtime,order_process
41424,580543344560275,61,35292374425572,0,47.94537,15.981791,16.26,0.34,0.02091,101.900002,100.0,4.0,2.0,0.5,3,3,0.7
242262,580542285118284,196,35292342421834,0,32.290028,10.763342,10.12,1.39,0.137352,266.600006,278.299988,571.870361,-0.86352,0.00151,3,3,0.2
42634,567950194114732,196,35292378730614,0,44.806255,14.935418,13.65,1.610001,0.117949,320.399994,320.700012,564.672974,0.7541,0.001335,3,3,0.9
121766,580543171866913,27,35292361361150,0,41.22887,13.742956,10.03,1.22,0.121635,175.34024,175.34024,175.057953,-2.31952,0.01325,3,4,0.5
152949,580546263517317,150,35292331139512,0,0.0,0.0,0.0,0.0,0.0,169.039993,169.009995,21.5,0.0,0.0,3,4,0.6


In [37]:
train_set_s1_g = pd.DataFrame(train_set_s1,columns=['call_city','distance','calc_speed','speed','diff_speed','pct_speed','dir','next_dir','altitude','diff_altitude','pct_altitude','diff_createtime','diff_localtime','order_process'])[train_set_s1['target']==0]
train_set_s1_b = pd.DataFrame(train_set_s1,columns=['call_city','distance','calc_speed','speed','diff_speed','pct_speed','dir','next_dir','altitude','diff_altitude','pct_altitude','diff_createtime','diff_localtime','order_process'])[train_set_s1['target']==1]
train_set_s1_g.head()
train_set_s1_b.head()

Unnamed: 0,call_city,distance,calc_speed,speed,diff_speed,pct_speed,dir,next_dir,altitude,diff_altitude,pct_altitude,diff_createtime,diff_localtime,order_process
312244,66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.022522,-0.00384,5.4e-05,3,3,0.9
85916,19,10.091279,3.36376,2.113449,1.107168,0.523868,237.679993,266.130005,1920.5,0.1,5.2e-05,3,3,0.1
302589,168,14.925586,4.975195,4.740077,0.113753,0.023998,13.68,13.07,66.599998,-0.3,0.004505,3,3,0.7
119978,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,203.658325,0.11346,0.000557,3,3,0.7
119974,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,241.723755,-0.7846,0.003246,3,4,0.9


In [38]:
from tensorflow.keras.layers import Input, Dense, Activation, Reshape,Dropout
from tensorflow.keras.layers import Concatenate
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.keras.models import Model
import math
import random as rn

In [39]:
def build_embedding_network(category_data,continuous_data):
    cat_cols = [x for x in category_data.columns]
    category_origin_dimension = [math.ceil(category_data[cat_col].drop_duplicates().size) for cat_col in cat_cols]
    category_embedding_dimension = [math.ceil(math.sqrt(category_data[cat_col].drop_duplicates().size)) for cat_col in cat_cols]

    inputs = []
    embeddings = []

    for cat_val, cat_origin_dim, cat_embedd_dim in list(zip(cat_cols,category_origin_dimension,category_embedding_dimension)):
        input_cate_feature = Input(shape=(1,))
        embedding = Embedding(input_dim=cat_origin_dim,output_dim=cat_embedd_dim,input_length=1)(input_cate_feature)
        embedding = Reshape(target_shape=(cat_embedd_dim,))(embedding)

        inputs.append(input_cate_feature)
        embeddings.append(embedding)

    cnt_val_num = continuous_data.shape[1]
    for cnt_val_num in range(cnt_val_num):
        input_numeric_features = Input(shape=(1,))
        embedding_numeric_features = Dense(units=16)(input_numeric_features)

        inputs.append(input_numeric_features)
        embeddings.append(embedding_numeric_features)

    x = Concatenate()(embeddings)
    x = Dense(units=16,activation = 'relu')(x)

    x = Dropout(0.15)(x)

    output = Dense(1,activation='relu')(x)
    model = Model(inputs,output)
    model.compile(loss='mean_squared_error', optimizer='adam')

    return model

In [40]:
category_data_train = pd.DataFrame(train_set_s1 ,columns = ['call_city'])
continuous_data_train = pd.DataFrame(train_set_s1,columns = ['distance','calc_speed','speed','diff_speed','pct_speed','dir','next_dir','altitude','diff_altitude','pct_altitude','diff_createtime','diff_localtime','order_process'])
category_data_test = pd.DataFrame(test_set_s1,columns = ['call_city'])
continuous_data_test = pd.DataFrame(test_set_s1,columns = ['distance','calc_speed','speed','diff_speed','pct_speed','dir','next_dir','altitude','diff_altitude','pct_altitude','diff_createtime','diff_localtime','order_process'])

X_train = pd.concat([category_data_train,continuous_data_train],axis=1)
y_train = pd.DataFrame(train_set_s1,columns=['target'])
X_test = pd.concat([category_data_test,continuous_data_test],axis=1)
y_test = pd.DataFrame(test_set_s1,columns=['target'])

In [41]:
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
continuous_data_train = pd.DataFrame(scaler.fit_transform(continuous_data_train),columns=continuous_data_train.columns,index=continuous_data_train.index)
continuous_data_test = pd.DataFrame(scaler.fit_transform(continuous_data_test),columns=continuous_data_test.columns,index=continuous_data_test.index)

In [42]:
continuous_data_train.head()
category_data_train.head()

Unnamed: 0,call_city
41424,61
242262,196
42634,196
121766,27
152949,150


In [43]:
NN = build_embedding_network(category_data_train,continuous_data_train)
NN.fit(X_train,y_train, epochs=3, batch_size=40, verbose=0)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 14 array(s), but instead got the following list of 1 arrays: [array([['61', 47.945369720458984, 15.981790542602539, ..., 3, 3,
        0.699999988079071],
       ['196', 32.2900276184082, 10.763341903686523, ..., 3, 3,
        0.20000000298023224],
       ['196...

In [44]:
arr_1 = X_train['call_city'].values
arr_2 = X_train['distance'].values
arr_3 = X_train['calc_speed'].values
arr_4 = X_train['speed'].values
arr_5 = X_train['diff_speed'].values
arr_6 = X_train['pct_speed'].values
arr_7 = X_train['dir'].values
arr_8 = X_train['next_dir'].values
arr_9 = X_train['altitude'].values
arr_10 = X_train['diff_altitude'].values
arr_11 = X_train['pct_altitude'].values
arr_12 = X_train['diff_createtime'].values
arr_13 = X_train['diff_localtime'].values
arr_14 = X_train['order_process'].values
X_train_2 = [arr_1,arr_2,arr_3,arr_4,arr_5,arr_6,arr_7,arr_8,arr_9,arr_10,arr_11,arr_12,arr_13,arr_14]
X_train_2

[array(['61', '196', '196', ..., '102', '188', '43'], dtype=object),
 array([47.94537 , 32.290028, 44.806255, ..., 15.254155, 31.169456,
        51.870834], dtype=float32),
 array([15.981791 , 10.763342 , 14.935418 , ...,  5.0847187, 10.389819 ,
        17.29028  ], dtype=float32),
 array([16.26   , 10.12   , 13.65   , ...,  5.2    ,  5.08   , 17.49463],
       dtype=float32),
 array([ 0.34000015,  1.3900003 ,  1.6100006 , ...,  0.6500001 ,
         4.6099997 , -1.2343254 ], dtype=float32),
 array([0.02091022, 0.13735181, 0.11794876, ..., 0.12500003, 0.90748024,
        0.07055452], dtype=float32),
 array([1.0190000e+02, 2.6660001e+02, 3.2039999e+02, ..., 1.4239999e+02,
        1.0000000e-01, 1.1795529e+02], dtype=float32),
 array([100.     , 278.3    , 320.7    , ..., 140.6    , 359.8    ,
        121.55035], dtype=float32),
 array([   4.     ,  571.87036,  564.673  , ...,  254.02228,   17.01587,
        1444.6442 ], dtype=float32),
 array([ 2.     , -0.86352,  0.7541 , ...,  2.39746,

In [45]:
NN.fit(X_train_2,y_train, epochs=3, batch_size=40, verbose=0)

InvalidArgumentError:  indices[29,0] = 329 is not in [0, 327)
	 [[node embedding_3/embedding_lookup (defined at <ipython-input-45-5b72fb64938b>:1) ]] [Op:__inference_keras_scratch_graph_10813]

Function call stack:
keras_scratch_graph


In [None]:
# cate_feature_num = category_data_train.columns.size 

# model = NN 
# for i in range(cate_feature_num):
#     layer_name = NN.get_config()['layers'][cate_feature_num+i]['name']
#     intermediate_layer_model = Model(NN.input,outputs=model.get_layer(layer_name).output)
#     intermediate_output = intermediate_layer_model.predict(X_train) 
#     intermediate_output.resize([X_train.shape[0],cate_embedding_dimension[i][1]])