In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
color = sns.color_palette()
%matplotlib inline

# Data directory, expressed relative to the location of this notebook.
project_path = '../../'
mainpath = project_path + r'Data/data_set_phase1/'

# Read the five raw CSV tables in one pass; the unpacking order matches the
# order of the file names below.
tr_queries, te_queries, tr_plans, te_plans, tr_click = (
    pd.read_csv(mainpath + csv_name)
    for csv_name in (
        'train_queries.csv',
        'test_queries.csv',
        'train_plans.csv',
        'test_plans.csv',
        'train_clicks.csv',
    )
)

In [2]:
from tqdm import tqdm_notebook
import json

In [3]:

# Training frame: every query, left-joined with its click record and its plan
# list, so queries without a click or without plans are kept (with NaN).
tr_data = tr_queries.merge(tr_click, on='sid', how='left')
tr_data = tr_data.merge(tr_plans, on='sid', how='left')
tr_data = tr_data.drop(['click_time'], axis=1)
# Queries with no click record get label 0.
tr_data['click_mode'] = tr_data['click_mode'].fillna(0)

# Test frame: same join against the plans; -1 is used as a placeholder label.
te_data = te_queries.merge(te_plans, on='sid', how='left')
te_data['click_mode'] = -1

# tr_data and te_data hold the same columns in a different order (click_mode
# is appended last in te_data). Passing sort=True pins the alphabetical column
# order explicitly, so the result no longer depends on the pandas version's
# default and the "sort" FutureWarning is not emitted.
data = pd.concat([tr_data, te_data], axis=0, sort=True)
data = data.drop(['plan_time'], axis=1)
data = data.reset_index(drop=True)
print('total data size: {}'.format(data.shape))
print('raw data columns: {}'.format(', '.join(data.columns)))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


total data size: (594358, 7)
raw data columns: click_mode, d, o, pid, plans, req_time, sid


In [4]:
# Peek at the last training rows; tr_data still carries plan_time, which is
# only dropped from the combined `data` frame.
tr_data.tail()

Unnamed: 0,sid,pid,req_time,o,d,click_mode,plan_time,plans
499995,3136563,113317.0,2018-10-26 12:53:24,"116.54,40.07","116.38,39.94",2.0,2018-10-26 12:53:24,"[{""distance"": 29571, ""price"": 600, ""eta"": 4449..."
499996,1775691,,2018-10-09 16:40:51,"116.50,40.00","116.58,39.91",0.0,,
499997,722748,193800.0,2018-10-01 16:46:52,"116.17,39.71","116.20,39.75",6.0,2018-10-01 16:46:52,"[{""distance"": 7634, ""price"": """", ""eta"": 2308, ..."
499998,1594185,,2018-10-03 12:27:51,"116.32,40.06","116.16,40.23",3.0,2018-10-03 12:27:51,"[{""distance"": 26955, ""price"": """", ""eta"": 2084,..."
499999,1795967,101673.0,2018-11-29 10:01:13,"116.50,39.89","116.37,39.94",4.0,2018-11-29 10:01:13,"[{""distance"": 19329, ""price"": 500, ""eta"": 3787..."


In [5]:
# Peek at the last rows of the combined frame; click_mode == -1 marks rows
# that came from te_data.
data.tail()

Unnamed: 0,click_mode,d,o,pid,plans,req_time,sid
594353,-1.0,"116.47,39.95","116.34,39.78",,"[{""distance"": 28555, ""price"": """", ""eta"": 2366,...",2018-12-01 22:32:41,471555
594354,-1.0,"116.34,39.78","116.34,39.71",,"[{""distance"": 10469, ""price"": """", ""eta"": 2223,...",2018-12-05 17:44:49,1123933
594355,-1.0,"116.29,39.91","116.25,39.92",162194.0,"[{""distance"": 6333, ""price"": 500, ""eta"": 2331,...",2018-12-03 12:02:04,1409146
594356,-1.0,"116.33,39.96","116.41,40.06",,"[{""distance"": 18235, ""price"": """", ""eta"": 2300,...",2018-12-04 22:34:20,1338606
594357,-1.0,"116.36,39.88","116.41,39.92",,"[{""distance"": 9836, ""price"": 600, ""eta"": 3109,...",2018-12-04 11:31:05,1353834


In [6]:

def gen_od_feas(data):
    """Split the 'o' and 'd' coordinate strings into four float columns.

    Each value of 'o' and 'd' is expected to be a string of the form
    "<first>,<second>" (e.g. "116.54,40.07"). The first and second components
    of 'o' become 'o1' and 'o2'; those of 'd' become 'd1' and 'd2'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing string columns 'o' and 'd'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'o' and 'd' removed and 'o1', 'o2', 'd1', 'd2'
        appended (in that order) after the remaining original columns.
        Missing coordinates (NaN) yield NaN in the derived columns.
    """
    # Drop first so the result is a fresh frame: the caller's DataFrame is
    # left untouched instead of silently gaining four extra columns.
    out = data.drop(['o', 'd'], axis=1)
    for source_col, target_cols in (('o', ('o1', 'o2')), ('d', ('d1', 'd2'))):
        # Split each coordinate string once and reuse the pieces for both
        # components; the .str accessor propagates NaN instead of raising.
        pieces = data[source_col].str.split(',')
        for position, target_col in enumerate(target_cols):
            out[target_col] = pieces.str[position].astype(float)
    return out

In [7]:

# Replace the 'o'/'d' strings with numeric o1, o2, d1, d2 columns.
# NOTE: this cell rebinds `data`, so it cannot be re-run on its own: the
# second call would look up the already-dropped 'o' column and fail.
data = gen_od_feas(data)

In [8]:
# Confirm that 'o' and 'd' were replaced by the float columns o1, o2, d1, d2.
data.tail()

Unnamed: 0,click_mode,pid,plans,req_time,sid,o1,o2,d1,d2
594353,-1.0,,"[{""distance"": 28555, ""price"": """", ""eta"": 2366,...",2018-12-01 22:32:41,471555,116.34,39.78,116.47,39.95
594354,-1.0,,"[{""distance"": 10469, ""price"": """", ""eta"": 2223,...",2018-12-05 17:44:49,1123933,116.34,39.71,116.34,39.78
594355,-1.0,162194.0,"[{""distance"": 6333, ""price"": 500, ""eta"": 2331,...",2018-12-03 12:02:04,1409146,116.25,39.92,116.29,39.91
594356,-1.0,,"[{""distance"": 18235, ""price"": """", ""eta"": 2300,...",2018-12-04 22:34:20,1338606,116.41,40.06,116.33,39.96
594357,-1.0,,"[{""distance"": 9836, ""price"": 600, ""eta"": 3109,...",2018-12-04 11:31:05,1353834,116.41,39.92,116.36,39.88


In [9]:
n = data.shape[0]
# Allocated here but not filled by this loop; a later cell wraps it in a
# DataFrame and overwrites its columns with the two arrays built below.
mode_list_feas = np.zeros((n, 22))
# One row per query, one column per transport mode: column index is
# transport_mode - 1 (modes are presumably 1..11 -- TODO confirm against the
# data description). A mode absent from a query's plans keeps the value 0.
speed, pricePerDis = np.zeros((n,11)), np.zeros((n,11))
# Wrap the values (which have a length) rather than the enumerate object, so
# the progress bar knows the total and can show real progress.
for i, plan in enumerate(tqdm_notebook(data['plans'].values)):
    try:
        cur_plan_list = json.loads(plan)
    except (TypeError, ValueError):
        # TypeError: the cell is NaN (query without plans) rather than a string.
        # ValueError: malformed JSON (json.JSONDecodeError is a ValueError).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        cur_plan_list = []
    if not cur_plan_list:
        # No plans for this query: mark every mode with the sentinel -1.
        speed[i] = -1
        pricePerDis[i] = -1
        continue
    for tmp_dit in cur_plan_list:
        mode_idx = int(tmp_dit['transport_mode']) - 1
        distance = int(tmp_dit['distance'])
        eta = int(tmp_dit['eta'])
        # An empty price string is treated as a price of 0.
        price = 0 if tmp_dit['price'] == '' else int(tmp_dit['price'])
        # Guard the two ratios so a zero eta / zero distance yields 0 instead
        # of aborting the whole loop with ZeroDivisionError.
        _speed = round(distance / eta, 4) if eta else 0.0
        _pricePerDis = round(price / distance, 4) if distance else 0.0
        speed[i][mode_idx] = _speed
        pricePerDis[i][mode_idx] = _pricePerDis

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [11]:
# Sanity check: both arrays should have one row per query and 11 columns.
speed.shape, pricePerDis.shape

((594358, 11), (594358, 11))

In [14]:
# Wrap the (n, 22) array in a DataFrame whose columns are named fea_0 .. fea_21.
feature_data = pd.DataFrame(
    mode_list_feas,
    columns=['fea_{}'.format(col_idx) for col_idx in range(22)],
)

In [15]:
# At this point every fea_* column is still all zeros; the real values are
# written by the following cell.
feature_data.tail()

Unnamed: 0,fea_0,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,fea_10,fea_11,fea_12,fea_13,fea_14,fea_15,fea_16,fea_17,fea_18,fea_19,fea_20,fea_21
594353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Interleave the two per-mode arrays: even-numbered fea_* columns receive the
# speed of mode column i // 2, odd-numbered ones its price per distance.
for i in range(22):
    mode_col, is_price = divmod(i, 2)
    source = pricePerDis if is_price else speed
    feature_data['fea_{}'.format(i)] = source[:, mode_col]

In [17]:
# Each even-numbered fea_* column now holds a speed value and each
# odd-numbered one the matching price-per-distance value.
feature_data.tail()

Unnamed: 0,fea_0,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,fea_10,fea_11,fea_12,fea_13,fea_14,fea_15,fea_16,fea_17,fea_18,fea_19,fea_20,fea_21
594353,0.0,0.0,5.25,0.02,12.07,0.0,10.71,0.32,0.0,0.0,0.0,0.0,4.96,0.03,0.0,0.0,6.6,0.02,0.0,0.0,0.0,0.0
594354,2.35,0.03,0.0,0.0,4.71,0.0,4.47,0.42,0.0,0.0,3.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594355,0.0,0.0,0.0,0.0,5.66,0.0,4.55,0.29,1.08,0.0,3.3,0.0,2.65,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
594356,0.0,0.0,0.0,0.0,7.93,0.0,7.01,0.37,0.0,0.0,0.0,0.0,5.37,0.03,0.0,0.0,7.15,0.02,0.0,0.0,0.0,0.0
594357,2.49,0.02,0.0,0.0,3.73,0.0,3.55,0.36,0.0,0.0,3.31,0.0,3.16,0.06,0.0,0.0,0.0,0.0,0.0,0.0,2.36,0.05
