In [1]:
import json
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from tqdm import tqdm
import math

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from time import gmtime, strftime

import lightgbm as lgb

import matplotlib.pyplot as plt #Visulization
import seaborn as sns #Visulization

import networkx as nx
from node2vec import Node2Vec

import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [2]:
def merge_raw_data_p2(input_dir = '../input/data_set_phase2/', version = 2, nrows = None):
    train_clicks_2 = pd.read_csv(input_dir+'train_clicks_phase{}.csv'.format(version),parse_dates=['click_time'],nrows=nrows)
    train_clicks_2['phase'] = 2
    train_clicks_1 = pd.read_csv(input_dir+'train_clicks_phase{}.csv'.format(version-1),parse_dates=['click_time'],nrows=nrows)
    train_clicks_2['phase'] = 1
    tr_click = train_clicks_2.append(train_clicks_1).reset_index(drop=True)
    
    train_plans_2 = pd.read_csv(input_dir+'train_plans_phase{}.csv'.format(version),parse_dates=['plan_time'],nrows=nrows)
    train_plans_1 = pd.read_csv(input_dir+'train_plans_phase{}.csv'.format(version-1),parse_dates=['plan_time'],nrows=nrows)
    tr_plans = train_plans_2.append(train_plans_1).reset_index(drop=True)
    
    train_queries_2 = pd.read_csv(input_dir+'train_queries_phase{}.csv'.format(version),parse_dates=['req_time'],nrows=nrows)
    train_queries_1 = pd.read_csv(input_dir+'train_queries_phase{}.csv'.format(version-1),parse_dates=['req_time'],nrows=nrows)
    tr_queries = train_queries_2.append(train_queries_1).reset_index(drop=True)
    
    te_queries = pd.read_csv(input_dir+'test_queries.csv',parse_dates=['req_time'],nrows=nrows)
    te_plans = pd.read_csv(input_dir+'test_plans.csv',parse_dates=['plan_time'],nrows=nrows)
    
    tr_data = tr_queries.merge(tr_click, on='sid', how='left')
    tr_data = tr_data.merge(tr_plans, on='sid', how='left')
    tr_data = tr_data.drop(['click_time'], axis=1)
    tr_data['click_mode'] = tr_data['click_mode'].fillna(0)

    te_data = te_queries.merge(te_plans, on='sid', how='left')
    te_data['click_mode'] = -1

    data = pd.concat([tr_data, te_data], axis=0)
    data = data.drop(['plan_time'], axis=1)
    data = data.reset_index(drop=True)
    print('total data size: {}'.format(data.shape))
    print('raw data columns: {}'.format(', '.join(data.columns)))
    return data

In [3]:
def merge_raw_data(input_dir = '../input/data_set_phase1/', nrows = None):
    tr_queries = pd.read_csv(input_dir+'train_queries.csv', nrows=nrows)
    te_queries = pd.read_csv(input_dir+'test_queries.csv', nrows=nrows)
    tr_plans = pd.read_csv(input_dir+'train_plans.csv', nrows=nrows)
    te_plans = pd.read_csv(input_dir+'test_plans.csv', nrows=nrows)
    tr_click = pd.read_csv(input_dir+'train_clicks.csv', nrows=nrows)

    tr_data = tr_queries.merge(tr_click, on='sid', how='left')
    tr_data = tr_data.merge(tr_plans, on='sid', how='left')
    tr_data = tr_data.drop(['click_time'], axis=1)
    tr_data['click_mode'] = tr_data['click_mode'].fillna(0)

    te_data = te_queries.merge(te_plans, on='sid', how='left')
    te_data['click_mode'] = -1

    data = pd.concat([tr_data, te_data], axis=0)
    data = data.drop(['plan_time'], axis=1)
    data = data.reset_index(drop=True)
    print('total data size: {}'.format(data.shape))
    print('raw data columns: {}'.format(', '.join(data.columns)))
    return data

# Graph Method

In [4]:
def get_deepgraph_model(data=None, cols=None, isWeight=False):
    G = nx.DiGraph()
    if isWeight:
        for idx in range(len(cols) -1):
            print("weighted node {} to {}".format(cols[idx], cols[idx+1]))
            idx_tmp_ = [cols[idx], cols[idx+1]]
            G.add_weighted_edges_from(data[idx_tmp_].astype(str).values)
    else:
        for idx in range(len(cols) -1):
            print("node {} to {}".format(cols[idx], cols[idx+1]))
            idx_tmp_ = [cols[idx], cols[idx+1]]
            G.add_edges_from(data[idx_tmp_].astype(str).values)
    node2vec = Node2Vec(
        G,
        dimensions=8,
        walk_length=5,
        num_walks=200,
        workers=40
    )
    model = node2vec.fit(window=30, min_count=1, batch_words=10)
    return model

In [5]:
def get_deepnode2v_embedding(data=None, cols=None, prefix="", model=None, vec_len=32, save_flag=True):
    def _concat_data(data=None, cols=None):
        result = data[cols[0]].astype(str)
        for idx in range(1, len(cols)):
            result = pd.concat([result,data[cols[idx]].astype(str)],axis=0,ignore_index=True)
        return result
    input_sets = _concat_data(data=data, cols=cols)
    tmp_groupby = pd.DataFrame(input_sets).drop_duplicates().reset_index(drop=True)
    tmp_groupby['tmp_node2vec'] = tmp_groupby[0].apply(lambda x:model[x])
    df_node2vec = []
    for i in tmp_groupby['tmp_node2vec'].values:
        df_node2vec.append(i)

    df_node2vec = pd.DataFrame(df_node2vec)
    df_node2vec.columns = ["{}_{}_vec_%d".format('n2v',prefix) % j for j in range(df_node2vec.shape[1])]
    df_node2vec['tmp_val'] = tmp_groupby[0]
    
    def _merge_node_embedding(data=None, df_node2vec=None, col=None, new_col='tmp_val', how='left', need_rename=True):
        df_tmp_ = data[[col]]
        df_tmp_.columns = [new_col]
        df_tmp_ = df_tmp_.merge(df_node2vec, on=new_col, how=how)
        if need_rename:
            df_tmp_ = df_tmp_.rename(columns= lambda x: col+"_"+x if x != new_col else col)
            df_tmp_ = df_tmp_.drop([col], axis=1)
        df_tmp_['sid'] = data['sid']
        return df_tmp_
    def _merge_result(data=None, df_node2vec=None, cols=None):
        result_tmp_ = _merge_node_embedding(data=data, df_node2vec=df_node2vec, col=cols[0])
        for idx in range(1, len(cols)):
            df_tmp_ = _merge_node_embedding(data=data, df_node2vec=df_node2vec, col=cols[idx])
            result_tmp_ = result_tmp_.merge(df_tmp_, on='sid', how='left')
        return result_tmp_
    def _concat_result(data=None, df_node2vec=None, cols=None):
        result_tmp_ = _merge_node_embedding(data=data, df_node2vec=df_node2vec, col=cols[0], need_rename=False)
        for idx in range(1, len(cols)):
            df_tmp_ = _merge_node_embedding(data=data, df_node2vec=df_node2vec, col=cols[idx], need_rename=False)
            result_tmp_ = pd.concat([result_tmp_, df_tmp_],axis=0,ignore_index=True)
        return result_tmp_.drop(['tmp_val'], axis=1)
    
    df_result_merge = _merge_result(data=data, df_node2vec=df_node2vec, cols=cols)
    df_result_concat = _concat_result(data=data, df_node2vec=df_node2vec, cols=cols)
    
    if save_flag:
        df_result_merge.to_pickle('../cache/p2_{}_graph_{}_merge.pkl'.format(prefix, df_result_merge.shape[1]))
        df_result_concat.to_pickle('../cache/p2_{}_graph_{}_concat.pkl'.format(prefix, df_result_concat.shape[1]))
        
    return df_result_merge, df_result_concat

In [8]:
def preprocessing2wordbag(data=None, col=None):
    data[col] = col + "_" + data[col].astype(str) 
    return data

In [6]:
def get_radio_fests(data=None, rank=6):
    for i in range(rank):
        postfix = "_" + str(i)
        data['ep'+postfix] = data['eta'+postfix] / data['pri'+postfix] # 单位时间所需价格
        data['dp'+postfix] = data['dis'+postfix] / data['pri'+postfix] # 单位距离所需价格
        data['de'+postfix] = data['dis'+postfix] / data['eta'+postfix] # 单位距离所需时间
        data['ed'+postfix] = data['eta'+postfix] / data['dis'+postfix] # 单位eta所需距离
        data['pe'+postfix] = data['pri'+postfix] / data['eta'+postfix]
        data['pd'+postfix] = data['pri'+postfix] / data['dis'+postfix]
    return data

In [7]:
def jsonLoads(strs,key):
    '''strs：传进来的json数据
       key：字典的键
    '''
    try:
        dict_ = json.loads(strs)
        return list(i[key] for i in dict_)
    except:
        return [-1]
    
def time_fun(x):
    try:
        return time.mktime(x.timetuple())
    except:
        return -1

def flatten_data_rank(data=None, col=None, prefix="tmp"):
    if prefix != "":
        prefix = prefix+"_"
    df = pd.DataFrame(list(data[col].values))
    df = df.rename(columns= lambda x: prefix+str(x))
    df['sid'] = data['sid']
    return df

In [13]:
def get_od_feas(data=None):
    result = data[['sid']]
    result['o1'] = data['o'].apply(lambda x: float(x.split(',')[0]))
    result['o2'] = data['o'].apply(lambda x: float(x.split(',')[1]))
    result['d1'] = data['d'].apply(lambda x: float(x.split(',')[0]))
    result['d2'] = data['d'].apply(lambda x: float(x.split(',')[1]))
    return result

def get_time_space_feats(data=None):
    result = data[['sid']]
    result['o'] = data['o']
    result['d'] = data['d']
    result['req_time'] = pd.to_datetime(data['req_time'])
    result['dayofweek'] = data['req_time'].dt.dayofweek
    result['weekofyear'] = data['req_time'].dt.weekofyear
    result['isweekend'] = (data['req_time'].dt.weekday>=5).astype(int)
    result['hour'] = data['req_time'].dt.hour
    result['hourminute'] = data['req_time'].dt.hour+data['req_time'].dt.minute/60
#     result['time_diff'] = data['plan_time'].astype(int) - data['req_time'].astype(int)
    return result

# Agg 

In [10]:
def multi_method(x):
    result = 1
    for tmp in x:
        result = result * tmp
    return result

def concat_result_agg(data=None):
    aggs = {}
    for col in data.columns:
        if col != 'sid':
            aggs[col] = ['sum', 'mean', 'std', 'max', 'min', multi_method]
    df_tmp = data.groupby(['sid']).agg(aggs)
    df_tmp.columns = pd.Index([e[0] + "_" + e[1] for e in df_tmp.columns.tolist()])
    return df_tmp.reset_index()

# Main

In [None]:
# all_data = pd.read_pickle("../cache_data/985_all_data_2304916.pickle")
# all_data['req_time'] = pd.to_datetime(all_data['req_time'])
# all_data.head()

In [11]:
all_data = merge_raw_data_p2()
all_data.head()

total data size: (2304916, 8)
raw data columns: click_mode, d, o, phase, pid, plans, req_time, sid


Unnamed: 0,click_mode,d,o,phase,pid,plans,req_time,sid
0,1.0,"114.09,22.57","114.12,22.55",1.0,,"[{""distance"": 6238, ""price"": 300, ""eta"": 1713,...",2018-11-15 18:42:55,1972109
1,5.0,"113.84,22.80","113.84,22.81",1.0,17261.0,"[{""distance"": 527, ""price"": """", ""eta"": 450, ""t...",2018-11-17 14:45:45,1684471
2,2.0,"121.50,31.24","121.51,31.23",1.0,,"[{""distance"": 2243, ""price"": 300, ""eta"": 1027,...",2018-10-06 22:14:36,702620
3,9.0,"113.30,23.15","113.31,23.11",1.0,6784.0,"[{""distance"": 8485, ""price"": 300, ""eta"": 1823,...",2018-11-10 17:46:42,1255247
4,2.0,"113.30,23.10","113.26,23.14",1.0,11803.0,"[{""distance"": 8753, ""price"": 400, ""eta"": 1519,...",2018-11-11 14:05:27,1374354


In [14]:
data_time = get_time_space_feats(data=all_data)
data_time.head()

Unnamed: 0,sid,o,d,req_time,dayofweek,weekofyear,isweekend,hour,hourminute
0,1972109,"114.12,22.55","114.09,22.57",2018-11-15 18:42:55,3,46,0,18,18.7
1,1684471,"113.84,22.81","113.84,22.80",2018-11-17 14:45:45,5,46,1,14,14.75
2,702620,"121.51,31.23","121.50,31.24",2018-10-06 22:14:36,5,40,1,22,22.233333
3,1255247,"113.31,23.11","113.30,23.15",2018-11-10 17:46:42,5,45,1,17,17.766667
4,1374354,"113.26,23.14","113.30,23.10",2018-11-11 14:05:27,6,45,1,14,14.083333


In [15]:
data_space = get_od_feas(data=data_time)
data_space.head()

Unnamed: 0,sid,o1,o2,d1,d2
0,1972109,114.12,22.55,114.09,22.57
1,1684471,113.84,22.81,113.84,22.8
2,702620,121.51,31.23,121.5,31.24
3,1255247,113.31,23.11,113.3,23.15
4,1374354,113.26,23.14,113.3,23.1


In [16]:
data_timespace = data_time.merge(data_space, on='sid', how='left')

In [17]:
data_timespace.head()

Unnamed: 0,sid,o,d,req_time,dayofweek,weekofyear,isweekend,hour,hourminute,o1,o2,d1,d2
0,1972109,"114.12,22.55","114.09,22.57",2018-11-15 18:42:55,3,46,0,18,18.7,114.12,22.55,114.09,22.57
1,1684471,"113.84,22.81","113.84,22.80",2018-11-17 14:45:45,5,46,1,14,14.75,113.84,22.81,113.84,22.8
2,702620,"121.51,31.23","121.50,31.24",2018-10-06 22:14:36,5,40,1,22,22.233333,121.51,31.23,121.5,31.24
3,1255247,"113.31,23.11","113.30,23.15",2018-11-10 17:46:42,5,45,1,17,17.766667,113.31,23.11,113.3,23.15
4,1374354,"113.26,23.14","113.30,23.10",2018-11-11 14:05:27,6,45,1,14,14.083333,113.26,23.14,113.3,23.1


In [18]:
processing_list = [
    'o1','o2','d1','d2','hour','dayofweek','weekofyear','isweekend'
]
for col in processing_list:
    data_timespace = preprocessing2wordbag(data=data_timespace, col=col)
data_timespace.head()

Unnamed: 0,sid,o,d,req_time,dayofweek,weekofyear,isweekend,hour,hourminute,o1,o2,d1,d2
0,1972109,"114.12,22.55","114.09,22.57",2018-11-15 18:42:55,dayofweek_3,weekofyear_46,isweekend_0,hour_18,18.7,o1_114.12,o2_22.55,d1_114.09,d2_22.57
1,1684471,"113.84,22.81","113.84,22.80",2018-11-17 14:45:45,dayofweek_5,weekofyear_46,isweekend_1,hour_14,14.75,o1_113.84,o2_22.81,d1_113.84,d2_22.8
2,702620,"121.51,31.23","121.50,31.24",2018-10-06 22:14:36,dayofweek_5,weekofyear_40,isweekend_1,hour_22,22.233333,o1_121.51,o2_31.23,d1_121.5,d2_31.24
3,1255247,"113.31,23.11","113.30,23.15",2018-11-10 17:46:42,dayofweek_5,weekofyear_45,isweekend_1,hour_17,17.766667,o1_113.31,o2_23.11,d1_113.3,d2_23.15
4,1374354,"113.26,23.14","113.30,23.10",2018-11-11 14:05:27,dayofweek_6,weekofyear_45,isweekend_1,hour_14,14.083333,o1_113.26,o2_23.14,d1_113.3,d2_23.1


In [20]:
processing_list = [
    'o1','o2','d1','d2','hour','dayofweek','weekofyear','isweekend'
]

In [21]:
model = get_deepgraph_model(
    data = data_timespace,
    cols = processing_list
)

node o1 to o2
node o2 to d1
node d1 to d2
node d2 to hour
node hour to dayofweek
node dayofweek to weekofyear
node weekofyear to isweekend


Computing transition probabilities: 100%|██████████| 1771/1771 [00:05<00:00, 348.73it/s]


In [22]:
result_merge, result_concat = get_deepnode2v_embedding(
    data = data_timespace,
    cols=processing_list,
    model=model,
    prefix='o1o2d1d2hdowwoywd',
)

In [23]:
result_all = concat_result_agg(data=result_concat)

In [24]:
result_all.to_pickle('../cache/p2_timespace_agg_49_dpt7.pkl')

In [28]:
result_merge.to_pickle('../cache/p2_timespace_org_65_dpt7_提升0.2个千分点.pkl')

In [26]:
result_all.head()

Unnamed: 0,sid,n2v_o1o2d1d2hdowwoywd_vec_0_sum,n2v_o1o2d1d2hdowwoywd_vec_0_mean,n2v_o1o2d1d2hdowwoywd_vec_0_std,n2v_o1o2d1d2hdowwoywd_vec_0_max,n2v_o1o2d1d2hdowwoywd_vec_0_min,n2v_o1o2d1d2hdowwoywd_vec_0_multi_method,n2v_o1o2d1d2hdowwoywd_vec_1_sum,n2v_o1o2d1d2hdowwoywd_vec_1_mean,n2v_o1o2d1d2hdowwoywd_vec_1_std,...,n2v_o1o2d1d2hdowwoywd_vec_6_std,n2v_o1o2d1d2hdowwoywd_vec_6_max,n2v_o1o2d1d2hdowwoywd_vec_6_min,n2v_o1o2d1d2hdowwoywd_vec_6_multi_method,n2v_o1o2d1d2hdowwoywd_vec_7_sum,n2v_o1o2d1d2hdowwoywd_vec_7_mean,n2v_o1o2d1d2hdowwoywd_vec_7_std,n2v_o1o2d1d2hdowwoywd_vec_7_max,n2v_o1o2d1d2hdowwoywd_vec_7_min,n2v_o1o2d1d2hdowwoywd_vec_7_multi_method
0,0,-1.482367,-0.185296,3.768091,4.848811,-5.644242,31.279555,-0.129617,-0.016202,2.510478,...,2.817604,5.214931,-4.65442,-0.280956,9.338462,1.167308,2.648454,5.249259,-2.109113,-2.608662
1,1,-0.973502,-0.121688,3.613436,4.807146,-5.003404,-10.515536,-0.162761,-0.020345,2.527474,...,2.784057,5.214931,-4.596247,0.117861,8.72789,1.090986,2.61506,5.249259,-2.100688,-0.033686
2,2,-1.400336,-0.175042,3.583576,4.78417,-5.059458,-2.991416,0.369042,0.04613,2.56072,...,2.760592,5.238727,-4.517534,0.160933,8.769814,1.096227,2.607557,5.2519,-2.100688,0.017867
3,3,-0.923306,-0.115413,3.641817,4.774739,-5.128265,10.503671,0.317095,0.039637,2.531002,...,2.80564,5.214931,-4.684876,0.077315,8.976954,1.122119,2.6935,5.249259,-2.139433,1.468988
4,4,-1.311442,-0.16393,3.651794,4.848811,-5.272961,3.46005,-0.124816,-0.015602,2.54235,...,2.805384,5.177782,-4.65442,0.306287,8.794183,1.099273,2.650932,5.249259,-2.106559,1.947792


In [27]:
result_merge.head()

Unnamed: 0,o1_n2v_o1o2d1d2hdowwoywd_vec_0,o1_n2v_o1o2d1d2hdowwoywd_vec_1,o1_n2v_o1o2d1d2hdowwoywd_vec_2,o1_n2v_o1o2d1d2hdowwoywd_vec_3,o1_n2v_o1o2d1d2hdowwoywd_vec_4,o1_n2v_o1o2d1d2hdowwoywd_vec_5,o1_n2v_o1o2d1d2hdowwoywd_vec_6,o1_n2v_o1o2d1d2hdowwoywd_vec_7,sid,o2_n2v_o1o2d1d2hdowwoywd_vec_0,...,weekofyear_n2v_o1o2d1d2hdowwoywd_vec_6,weekofyear_n2v_o1o2d1d2hdowwoywd_vec_7,isweekend_n2v_o1o2d1d2hdowwoywd_vec_0,isweekend_n2v_o1o2d1d2hdowwoywd_vec_1,isweekend_n2v_o1o2d1d2hdowwoywd_vec_2,isweekend_n2v_o1o2d1d2hdowwoywd_vec_3,isweekend_n2v_o1o2d1d2hdowwoywd_vec_4,isweekend_n2v_o1o2d1d2hdowwoywd_vec_5,isweekend_n2v_o1o2d1d2hdowwoywd_vec_6,isweekend_n2v_o1o2d1d2hdowwoywd_vec_7
0,-0.021389,0.033446,0.007218,-0.025591,0.052929,0.052003,0.03977,0.002867,1972109,-0.105528,...,-4.596247,-1.099687,4.770511,-4.113794,-0.822136,2.055679,-2.756926,-0.449202,-0.557466,5.249259
1,0.03323,-0.058257,0.059826,0.055996,0.004538,0.021719,-0.011802,0.045829,1684471,0.445442,...,-4.596247,-1.099687,4.78417,-3.988956,-0.823715,2.113321,-2.737811,-0.574446,-0.499958,5.2519
2,0.009293,0.000243,-0.002376,-0.060933,0.011914,0.035899,0.0196,-0.05463,702620,1.558865,...,-4.517534,-1.096882,4.78417,-3.988956,-0.823715,2.113321,-2.737811,-0.574446,-0.499958,5.2519
3,0.036789,-0.033547,0.046498,0.047731,-0.022168,0.026637,0.059493,0.017157,1255247,1.127046,...,-4.652964,-1.107869,4.78417,-3.988956,-0.823715,2.113321,-2.737811,-0.574446,-0.499958,5.2519
4,-0.017843,0.008748,-0.045551,0.058378,-0.049001,-0.045818,-0.021498,0.047044,1374354,1.002555,...,-4.652964,-1.107869,4.78417,-3.988956,-0.823715,2.113321,-2.737811,-0.574446,-0.499958,5.2519
