In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as DF
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import json
from sklearn.metrics import f1_score
import geohash

In [2]:
input_dir = '../input/data_set_phase1/'
cv = 5
random_seed = 2019
for_test = True

In [3]:
import time
t1 = time.time()
print("Now Input Data...")
profiles = pd.read_csv(input_dir+'profiles.csv')
train_clicks = pd.read_csv(input_dir+'train_clicks.csv',parse_dates=['click_time'])
train_plans = pd.read_csv(input_dir+'train_plans.csv',parse_dates=['plan_time'])
train_queries = pd.read_csv(input_dir+'train_queries.csv',parse_dates=['req_time'])
test_plans = pd.read_csv(input_dir+'test_plans.csv',parse_dates=['plan_time'])
test_queries = pd.read_csv(input_dir+'test_queries.csv',parse_dates=['req_time'])
print("Use Time {}".format(time.time()-t1))

Now Input Data...
Use Time 3.926577091217041


In [4]:
print(profiles.shape,train_clicks.shape,train_plans.shape,train_queries.shape)
print(test_plans.shape,test_queries.shape)

(63090, 67) (453336, 3) (491054, 3) (500000, 5)
(92571, 3) (94358, 5)


In [5]:
def jsonLoads(strs,key):
    '''strs：传进来的json数据
       key：字典的键
    '''
    try:
        dict_ = json.loads(strs)
        return list(i[key] for i in dict_)
    except:
        return [-1]
    
def time_fun(x):
    try:
        return time.mktime(x.timetuple())
    except:
        return -1
    
def flatten_data(col):
    """
    把plans  flatten
    """
    df = pd.DataFrame(list(plans[col].values))
    df['sid'] = plans['sid']
    dis = pd.DataFrame()
    for i in df.columns[:-1]:
        dis_df = df.loc[:,[i,'sid']].copy()
        dis_df.columns = [col,'sid']
        dis = pd.concat([dis,dis_df],axis=0,sort=False)
    dis = dis.dropna()
#     dis = dis.sort_values('sid').reset_index(drop = True)
    return dis

In [6]:
plans = pd.concat([train_plans,test_plans],axis=0).reset_index(drop = True)
queries = pd.concat([train_queries,test_queries],axis=0).reset_index(drop = True)
train_clicks = train_clicks.merge(train_queries[['sid']],how='right',on='sid')
train_clicks.fillna(0,inplace=True)
# train_clicks.dropna(inplace=True)
print(train_clicks.shape)

data = train_clicks[['sid','click_mode']].copy()
test_id = test_queries[['sid']].copy()
data = pd.concat([data,test_id],axis=0,sort=False).fillna(-1).reset_index(drop = True)
plans = data[['sid']].merge(plans,on='sid',how='left').reset_index(drop = True)
queries = data[['sid']].merge(queries,on='sid',how='left').reset_index(drop = True)

(500000, 3)


In [7]:
from tqdm import tqdm

for i in tqdm(['distance','price','eta','transport_mode']):
    plans[i] = plans['plans'].apply(jsonLoads, key=i)

100%|██████████| 4/4 [00:18<00:00,  4.70s/it]


In [8]:
distance = flatten_data(col = 'distance')
price = flatten_data(col = 'price')
price.replace('',np.nan,inplace=True)
eta = flatten_data(col = 'eta')
transport_mode = flatten_data(col = 'transport_mode')

"""transport_mode_rank"""
plans['transport_mode_rank'] = plans['transport_mode'].apply(lambda x:np.arange(len(x)))
transport_mode_rank = flatten_data(col = 'transport_mode_rank')

plans_df = pd.concat([distance,transport_mode_rank.iloc[:,0],eta.iloc[:,0],transport_mode.iloc[:,0],price.iloc[:,0]],axis=1)

transport_mode_list = plans[['sid','transport_mode']].copy()
transport_mode_list.columns = ['sid','transport_mode_list']
plans_df = plans_df.merge(plans[['sid','plan_time']], on='sid',how='left')
# plans_df = plans_df.merge(transport_mode_list, on='sid',how='left')

In [9]:
data = data.merge(plans_df, on='sid',how='left')
data = data.merge(queries, on='sid',how='left')
data['ep'] = data['eta'] / data['price'] # 单位时间所需价格
data['dp'] = data['distance'] / data['price'] # 单位距离所需价格
data['de'] = data['distance'] / data['eta'] # 单位距离所需时间
data['ed'] = data['eta'] / data['distance'] # 单位eta所需距离
data['pe'] = data['price'] / data['eta'] 
data['pd'] = data['price'] / data['distance']

data['price_rank'] = data[['sid','price']].groupby(['sid'])['price'].rank(method='min')
data['distance_rank'] = data[['sid','distance']].groupby(['sid'])['distance'].rank(method='min')
data['eta_rank'] = data[['sid','eta']].groupby(['sid'])['eta'].rank(method='min')

In [10]:
# Get Label
data['label'] = list(map(lambda x,y:1 if x==y else 0,data['transport_mode'],data['click_mode']))
data['label'] = list(map(lambda x,y:-1 if x==-1 else y,data['click_mode'],data['label']))
print(data['label'].value_counts())

# have_0 = data[data['label']==0]
# have_1 = data[data['label']==1]
# have_test = data[data['label']==-1]
# have_1.loc[~have_1.index.isin(have_1.drop_duplicates(subset=['sid','click_mode','label'],keep='last').index),'label'] = 0

# data = pd.concat([have_0.reset_index(drop=True),have_1.reset_index(drop=True),have_test.reset_index(drop=True)],axis=0).reset_index(drop=True)
# print(data['label'].value_counts())

data = data.sort_values(by=['sid','transport_mode_rank'])
data.head()


 0    1782333
 1     503435
-1     431590
Name: label, dtype: int64


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,ep,dp,de,ed,pe,pd,price_rank,distance_rank,eta_rank,label
2111516,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,8.97,14.49,1.615385,0.619048,0.111483,0.069013,1.0,3.0,4.0,0
2111517,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,,,3.317848,0.3014,,,,2.0,1.0,0
2111518,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,,,2.9363,0.340565,,,,5.0,2.0,0
2111519,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.850588,1.979412,2.327109,0.429718,1.175657,0.505201,3.0,5.0,3.0,0
2111520,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,10.79,14.785,1.37025,0.729794,0.092678,0.067636,1.0,4.0,5.0,0


In [11]:
from math import radians, atan, tan, sin, acos, cos

def get_ktime_feature(k,data,i):
    kfc = data.copy()
    for time in range(1,k):
        tmp = kfc.sort_values(by=['sid',i]).drop_duplicates(subset=['sid'],keep='first')
        kfc = kfc[~kfc.index.isin(tmp.index)]
    tmp = kfc.sort_values(by=['sid',i]).drop_duplicates(subset=['sid'],keep='first')
    return tmp

from scipy import stats

def get_mode(x):
    return stats.mode(x)[0][0]

def get_mode_count(x):
    return stats.mode(x)[1][0]

def getDistance(latA, lonA, latB, lonB):  
    ra = 6378140     # radius of equator: meter  
    rb = 6356755     # radius of polar: meter  
    flatten = (ra - rb) / ra   # Partial rate of the earth  
    # change angle to radians  
    radLatA = radians(latA)  
    radLonA = radians(lonA)  
    radLatB = radians(latB)  
    radLonB = radians(lonB)  
  
    try: 
        pA = atan(rb / ra * tan(radLatA))  
        pB = atan(rb / ra * tan(radLatB))  
        x = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(radLonA - radLonB))  
        c1 = (sin(x) - x) * (sin(pA) + sin(pB))**2 / cos(x / 2)**2  
        c2 = (sin(x) + x) * (sin(pA) - sin(pB))**2 / sin(x / 2)**2  
        dr = flatten / 8 * (c1 - c2)  
        distance = ra * (x + dr)  
        return distance   # meter   
    except:
        return 0.0000001
    
for i in tqdm(['o','d']):
    data[i+'x'] = data[i].apply(lambda x:float(x.split(',')[0]))
    data[i+'y'] = data[i].apply(lambda x:float(x.split(',')[1]))
data['odl2_dis'] = np.sqrt((data['dx']-data['ox'])**2+(data['dy']-data['oy'])**2)

# 球面距离及其衍生特征
sphere_dis = []
for i in tqdm(data[['oy','ox','dy','dx']].values):
    sphere_dis.append(getDistance(i[0],i[1],i[2],i[3]))
data['sphere_dis'] = sphere_dis

data['req_time_dow'] = data['req_time'].dt.dayofweek
data['req_time_woy'] = data['req_time'].dt.weekofyear
data['req_is_weekend'] = (data['req_time'].dt.weekday>=5).astype(int)
data['req_time_hour'] = data['req_time'].dt.hour+data['req_time'].dt.minute/60

data['o_geohash'] = list(map(lambda x,y:geohash.encode(x,y,8),data['oy'],data['ox']))
data['d_geohash'] = list(map(lambda x,y:geohash.encode(x,y,8),data['dy'],data['dx']))
base32 = {x:i+1 for i,x in enumerate(list('0123456789bcdefghjkmnpqrstuvwxyz') )}
print(base32)
def geohash2int(geohash_id):
    result = 0
    base = 1
    for each in geohash_id[::-1]:
        result = result + base32[each] * base
        base = base*32
    return result
    
data['o_geohash'] = data['o_geohash'].map(geohash2int)
data['d_geohash'] = data['d_geohash'].map(geohash2int)

data.head()

100%|██████████| 2/2 [00:05<00:00,  2.71s/it]
100%|██████████| 2717358/2717358 [00:09<00:00, 290788.45it/s]


{'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'j': 18, 'k': 19, 'm': 20, 'n': 21, 'p': 22, 'q': 23, 'r': 24, 's': 25, 't': 26, 'u': 27, 'v': 28, 'w': 29, 'x': 30, 'y': 31, 'z': 32}


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,dx,dy,odl2_dis,sphere_dis,req_time_dow,req_time_woy,req_is_weekend,req_time_hour,o_geohash,d_geohash
2111516,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
2111517,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
2111518,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
2111519,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
2111520,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609


In [12]:
print("Now Add Columns")
data['length_sid'] = data['sid'].map(data.groupby('sid').size())
add_columns = []

Now Add Columns


In [13]:
# 统计全局plan里 price_eta_distance的互相交叉作用

stat_1 = data[['distance','price','eta']].copy().drop_duplicates()

for i in tqdm(['price','eta','distance']):
    for j in ['price','eta','distance']:
        if i!=j:
            now = data[[i,j]].groupby([i])[j].agg(['mean','min','max','std']).add_prefix("{}_{}_".format(i,j))
            add_columns.extend(now.columns)
            now = now.reset_index()
            stat_1 = stat_1.merge(now,on=i,how='left')

stat_1.head()

100%|██████████| 3/3 [00:03<00:00,  1.27s/it]


Unnamed: 0,distance,price,eta,price_eta_mean,price_eta_min,price_eta_max,price_eta_std,price_distance_mean,price_distance_min,price_distance_max,...,eta_distance_max,eta_distance_std,distance_price_mean,distance_price_min,distance_price_max,distance_price_std,distance_eta_mean,distance_eta_min,distance_eta_max,distance_eta_std
0,2898.0,200.0,1794.0,2181.607738,445.0,12933.0,883.943658,4627.434087,331.0,53331.0,...,33308.0,6138.437704,667.241379,200.0,1800.0,570.437792,1399.0,411.0,2729.0,760.618301
1,2714.0,,818.0,,,,,,,,...,10988.0,1959.392855,613.043478,200.0,2000.0,536.303537,1372.166667,259.0,2544.0,723.450851
2,3365.0,,1146.0,,,,,,,,...,19477.0,3433.232057,617.1875,200.0,2700.0,628.567201,1604.537879,441.0,3101.0,832.760455
3,3365.0,1700.0,1446.0,2959.530882,1.0,18978.0,2632.387731,17047.997839,1.0,118436.0,...,26273.0,4543.045634,617.1875,200.0,2700.0,628.567201,1604.537879,441.0,3101.0,832.760455
4,2957.0,200.0,2158.0,2181.607738,445.0,12933.0,883.943658,4627.434087,331.0,53331.0,...,35249.0,6726.342007,794.545455,200.0,2000.0,584.496675,1490.685315,366.0,2762.0,812.454934


In [None]:
# 统计全局plan里 和od的交叉作用

stat_2 = data[['o','d','sphere_dis','odl2_dis']].copy().drop_duplicates()

for i in tqdm(['o','d','sphere_dis','odl2_dis']):
    now = data[[i]].groupby(i)[i].agg(['count']).add_prefix("{}_".format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    data = data.merge(now,on=i,how='left')
    for j in ['price','eta','distance']:
        if i!=j:
            now = data[[i,j]].groupby([i])[j].agg(['mean','min','max','std']).add_prefix("{}_{}_".format(i,j))
            add_columns.extend(now.columns)
            now = now.reset_index()
            stat_2 = stat_2.merge(now,on=i,how='left')
            
for i in tqdm(['price','eta','distance','ep','de','ed','pd','dp','pe','transport_mode_rank']):
    now = data[['o','d',i]].groupby(['o','d'])[i].agg(['mean','min','max','std']).add_prefix('o_d_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_2 = stat_2.merge(now,on=['o','d'],how='left')

for i in tqdm(['transport_mode']):
    now = data[['o','d',i]].groupby(['o','d'])[i].agg(['nunique','std',get_mode,get_mode_count]).add_prefix('o_d_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_2 = stat_2.merge(now,on=['o','d'],how='left')

stat_2.head()

100%|██████████| 4/4 [00:14<00:00,  3.57s/it]
100%|██████████| 10/10 [00:07<00:00,  1.36it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# pid 交叉统计

stat_3 = data[['pid']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank']):
    now = data[['pid',i]].groupby('pid')[i].agg(['mean','min','max','std']).add_prefix('pid_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_3 = stat_3.merge(now,on='pid',how='left')
    
for i in tqdm(['transport_mode']):
    now = data[['pid',i]].groupby('pid')[i].agg(['nunique','std',get_mode,get_mode_count]).add_prefix('pid_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_3 = stat_3.merge(now,on='pid',how='left')

stat_3.head()

In [None]:
# 与时间交叉

stat_4 = data[['req_time_dow']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank']):
    now = data[['req_time_dow',i]].groupby('req_time_dow')[i].agg(['mean','min','max','std','skew']).add_prefix('req_time_dow_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_4 = stat_4.merge(now,on='req_time_dow',how='left')
    
for i in tqdm(['transport_mode']):
    now = data[['req_time_dow',i]].groupby('req_time_dow')[i].agg(['nunique','std',get_mode,get_mode_count,'median']).add_prefix('req_time_dow_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_4 = stat_4.merge(now,on='req_time_dow',how='left')

stat_4.head()

In [None]:
# 与时间交叉

stat_5 = data[['req_time_hour']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank']):
    now = data[['req_time_hour',i]].groupby('req_time_hour')[i].agg(['mean','min','max','std','skew']).add_prefix('req_time_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_5 = stat_5.merge(now,on='req_time_hour',how='left')
    
for i in tqdm(['transport_mode']):
    now = data[['req_time_hour',i]].groupby('req_time_hour')[i].agg(['nunique','std',get_mode,get_mode_count,'median']).add_prefix('req_time_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_5 = stat_5.merge(now,on='req_time_hour',how='left')

stat_5.head()

In [None]:
# 时空交叉
stat_6 = data[['req_time_hour','o','d']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis']):
    
    now = data[['req_time_hour','o','d',i]].groupby(['req_time_hour','o','d'])[i].agg(['mean','min','max','std','median']).add_prefix('od_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_6 = stat_6.merge(now,on=['req_time_hour','o','d'],how='left')
    
stat_6.head()

In [None]:
stat_7 = data[['sid']].drop_duplicates()

for i in tqdm(['transport_mode','transport_mode_rank']):
    for j in ['price','eta','distance','odl2_dis','sphere_dis']:
        if i!=j:
            tmp = data[['sid',i,j]].groupby(['sid',i])[j].agg(['mean','min','max','std']).add_prefix("double_sid_{}_{}_".format(i,j)).reset_index()
            tmp_fea = [n for n in tmp if n not in ['sid',i]]
            tmp = tmp.groupby(['sid'])[tmp_fea].agg(['mean','std','max','min'])
            tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
            stat_7 = stat_7.merge(tmp.reset_index(),on='sid',how='left')
            
stat_7.head()

In [None]:
stat_8 = data[['sid','distance','price','eta']].drop_duplicates()

for i in tqdm(['distance','price','eta']):
    tmp = data[['sid',i]].groupby(['sid'])[i].agg(['mean','min','max','std'])
    tmp['mean_max_ratio'] = tmp['mean'] / tmp['max'] 
    tmp['min_mean_ratio'] = tmp['min'] / tmp['mean']
    tmp = tmp.add_prefix('sid_'+i+'_')
    stat_8 = stat_8.merge(tmp,on='sid',how='left')
    
stat_8.head()

In [None]:
stat_9 = data[['transport_mode']].drop_duplicates()

for i in tqdm(['transport_mode']):

    tmp = data[[i,'price','eta','distance','ep','pe','ed','de','dp','pd']].groupby([i]).agg({'price' : ['mean','min','max','std'],
                                                                                             'eta' : ['mean','min','max','std'],
                                                                                             'distance' : ['mean','min','max','std'],
                                                                                             'ep' : ['mean','min','max','std'],
                                                                                             'pe' : ['mean','min','max','std'],
                                                                                             'dp' : ['mean','min','max','std'],
                                                                                             'pd' : ['mean','min','max','std'],
                                                                                             'de' : ['mean','min','max','std'],
                                                                                             'ed' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_9 = stat_9.merge(tmp,how='left',on=i)

    tmp = data[[i,'sphere_dis','odl2_dis']].groupby([i]).agg({'sphere_dis' : ['mean','min','max','std'],
                                                              'odl2_dis' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_9 = stat_9.merge(tmp,how='left',on=i)
    
stat_9.shape

In [None]:
stat_10 = data[['transport_mode_rank']].drop_duplicates()

for i in tqdm(['transport_mode_rank']):

    tmp = data[[i,'price','eta','distance','ep','pe','ed','de','dp','pd']].groupby([i]).agg({'price' : ['mean','min','max','std'],
                                                                                             'eta' : ['mean','min','max','std'],
                                                                                             'distance' : ['mean','min','max','std'],
                                                                                             'ep' : ['mean','min','max','std'],
                                                                                             'pe' : ['mean','min','max','std'],
                                                                                             'dp' : ['mean','min','max','std'],
                                                                                             'pd' : ['mean','min','max','std'],
                                                                                             'de' : ['mean','min','max','std'],
                                                                                             'ed' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_10 = stat_10.merge(tmp,how='left',on=i)

    tmp = data[[i,'sphere_dis','odl2_dis']].groupby([i]).agg({'sphere_dis' : ['mean','min','max','std'],
                                                              'odl2_dis' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_10 = stat_10.merge(tmp,how='left',on=i)
    
stat_10.shape

In [None]:
for i in tqdm(data.columns):
    try:
        now = data[i].std()
        if now==0:
            del data[i]
    except:
        continue
        
data.shape

In [None]:
# More Feature

data['distance_sphere_ratio'] = data['distance'] / data['sphere_dis']
data['ori_eta'] = data['eta'] / data['distance_sphere_ratio']
data['ori_price'] = data['price'] / data['distance_sphere_ratio']


In [None]:
# profiles & plans
# 60 + 4*5 = 80

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

data = data.merge(profiles,on='pid',how='left')
print(data.shape)

tmp = data[['pid','transport_mode']].groupby(['pid'])['transport_mode'].agg(['median','std','nunique','count',get_mode,get_mode_count]).add_prefix('pid_transport_mode_').reset_index()
data = data.merge(tmp,how='left',on='pid')

tmp = data[['transport_mode','pid']].groupby(['transport_mode'])['pid'].agg(['std','nunique','count',get_mode,get_mode_count]).add_prefix('transport_mode_pid_').reset_index()
data = data.merge(tmp,how='left',on='transport_mode')

now = data[['pid']].drop_duplicates()
for i in tqdm(['price','eta','distance']):
    tmp = data[['pid',i]].groupby(['pid'])[i].agg(['std','nunique',get_mode,'mean']).add_prefix('pid_{}_'.format(i)).reset_index()
    now = now.merge(tmp,how='left',on='pid')

data = data.merge(now,how='left',on='pid')
data.shape

In [None]:
N_COM = 5

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

def tokenize(data):
    tokenized_docs = [word_tokenize(doc) for doc in data]
    alpha_tokens = [[t.lower() for t in doc if t.isalpha() == True] for doc in tokenized_docs]
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [[lemmatizer.lemmatize(alpha) for alpha in doc] for doc in alpha_tokens]
    X_stem_as_string = [" ".join(x_t) for x_t in lem_tokens]
    return X_stem_as_string

vct = CountVectorizer(lowercase=False)
svd = TruncatedSVD(n_components=N_COM, random_state=2019)
tfvec = TfidfVectorizer(ngram_range=(1, 6),analyzer='char_wb')

for i in tqdm(['plans','distance','price','eta','transport_mode']):
    x = plans[i].astype(str).fillna('NAN').values
    x = tfvec.fit_transform(x)
    x = svd.fit_transform(x)
    svd_feas = pd.DataFrame(x)
    svd_feas.columns = ['{}_svd_fea_{}'.format(i,j) for j in range(N_COM)]
    svd_feas['sid'] = plans['sid'].values
    data = data.merge(svd_feas, on='sid', how='left')

print(data.shape)

In [None]:
plans_ana = plans.merge(train_clicks,how='left',on='sid')
plans_ana['transport_mode'] = plans_ana['transport_mode'].astype('str')
tmp = plans_ana.groupby(['transport_mode'])['click_mode'].agg(['nunique','count',get_mode,get_mode_count]).add_prefix("all_transport_mode_click_mode_").reset_index()
new_col = [i for i in tmp.columns if i not in ['transport_mode']]
tmp = plans_ana.merge(tmp,how='left',on=['transport_mode'])
data = data.merge(tmp[new_col+['sid']],how='left',on=['sid'])
data.shape

In [None]:
tmp = data[['transport_mode','transport_mode_rank','price','eta','distance']].groupby(['transport_mode','transport_mode_rank']).\
                    agg({'price' : ['mean','min','max','std'],
                        'eta' : ['mean','min','max','std'],
                        'distance' : ['mean','min','max','std']})

tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
tmp = tmp.add_prefix('transport_mode_and_rank_').reset_index()
data = data.merge(tmp,how='left',on=['transport_mode','transport_mode_rank'])
data.shape

In [None]:
data = data.merge(stat_1,on=['distance','price','eta'],how='left').\
            merge(stat_2,on=['o','d','sphere_dis','odl2_dis'],how='left').\
            merge(stat_3,on='pid',how='left').\
            merge(stat_4,on='req_time_dow',how='left').\
            merge(stat_5,on='req_time_hour',how='left').\
            merge(stat_6,on=['o','d','req_time_hour'],how='left').\
            merge(stat_7,on=['sid'],how='left').\
            merge(stat_8,on=['sid','price','eta','distance'],how='left').\
            merge(stat_9,on=['transport_mode'],how='left').\
            merge(stat_10,on=['transport_mode_rank'],how='left')

all_data['nan_total'] = all_data.isnull().sum(axis=1)

data.shape

In [None]:
to_del = []
for i in tqdm(data.columns):
    try:
        now = data[i].std()
        if now<=0.1:
            to_del.append(i)
    except:
        continue
        
len(to_del)

In [None]:
all_data = data.copy()

feature_name = [i for i in all_data.columns if i not in ['label','click_mode','plan_time','req_time','transport_mode']]
feature_name = [i for i in feature_name if i not in to_del]
cate_feature = ['sid','o','d','req_time_woy','req_is_weekend','pid','o_geohash','d_geohash','ox','oy','dx','dy']

for i in tqdm(cate_feature):
    all_data[i] = all_data[i].astype('category')
    
print(len(cate_feature),' ',len(feature_name))

In [31]:
%%time
data.to_pickle("../cache_data/{}_raw_data.pickle".format(data.shape[1]))
# all_data.to_pickle("../cache_data/{}_all_data.pickle".format(all_data.shape[1]))

CPU times: user 7.3 s, sys: 12.8 s, total: 20.1 s
Wall time: 20.1 s


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def f1_macro(preds, labels):
    labels = np.argmax(labels.reshape(12, -1), axis=0)
    score = f1_score(y_true=labels, y_pred=preds, average='weighted')
    return 'f1_score', score, True

def get_f1_score(y, pred):
    pred_lst = pred.tolist()
    pred_lst = [item.index(max(item)) for item in pred_lst]
    score = []
    for i in range(12):
        score.append(f1_score([1 if i==item else 0 for item in y],
                              [1 if i==item else 0 for item in pred_lst]))
    c = Counter(y)
    score = [item*c[ix]/len(y) for ix, item in enumerate(score)]
    score = np.sum(score)
    print('f1-score = {:.4f}'.format(score))
    return score

# Define F1 Train

In [36]:
train_index = (all_data.req_time < '2018-11-23')
X_train     = all_data[train_index][feature_name].reset_index(drop=True)
train_label = all_data[train_index].label.reset_index(drop=True)

valid_index = (all_data.req_time > '2018-11-23') & (all_data.req_time < '2018-12-01')
X_val       = all_data[valid_index][feature_name].reset_index(drop=True)
y_val       = all_data[valid_index].label.reset_index(drop=True)

train_index = (all_data.req_time < '2018-12-01')
train       = all_data[train_index].reset_index(drop=True)

test_index = (all_data.req_time > '2018-12-01')
X_test     = all_data[test_index][feature_name].reset_index(drop=True)

In [37]:
from collections import Counter

# Offline
lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=128, reg_alpha=0.1, reg_lambda=2,
        max_depth=-1, n_estimators=3000, objective='binary',
        subsample=0.7, colsample_bytree=0.5, subsample_freq=1,min_child_samples=20,
        learning_rate=0.03, random_state=2019 , n_jobs=40, metric="None", importance_type='gain'
    )

eval_set = [(X_val, y_val)]
lgb_model.fit(X_train, train_label, eval_set=eval_set,verbose=10,early_stopping_rounds=40,eval_metric='auc')
iters = int(lgb_model.best_iteration_*1.1)
y_test = lgb_model.predict_proba(X_test)  
fi = DF()
fi['name'] = feature_name
fi['score'] = lgb_model.feature_importances_
print(list(fi.sort_values(by=['score'],ascending=False)['name'])[:100])

offline = data[valid_index][['sid','transport_mode']].copy().reset_index(drop=True)
offline['label'] = lgb_model.predict_proba(X_val)[:,1]
offline = offline.sort_values(['sid','label'],ascending=False)
offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
offline.replace(-1,0,inplace=True)
offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
print(f1_score(offline['click_mode'].values,offline['transport_mode'].values,average='weighted'))
offline.loc[offline['label']<0.2,'transport_mode'] = 0

# 0.14 6895

from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,recall_score,precision_score

dic_ = offline['click_mode'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        f_score += dic_[i] * f1_score(y_true=yt, y_pred= yp)
        print(i,dic_[i],f1_score(y_true=yt, y_pred= yp), precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

Training until validation scores don't improve for 40 rounds.
[10]	valid_0's auc: 0.919181
[20]	valid_0's auc: 0.919364
[30]	valid_0's auc: 0.91979
[40]	valid_0's auc: 0.920199
[50]	valid_0's auc: 0.92053
[60]	valid_0's auc: 0.920801
[70]	valid_0's auc: 0.921211
[80]	valid_0's auc: 0.921655
[90]	valid_0's auc: 0.921989
[100]	valid_0's auc: 0.922221
[110]	valid_0's auc: 0.922408
[120]	valid_0's auc: 0.922508
[130]	valid_0's auc: 0.922719
[140]	valid_0's auc: 0.92289
[150]	valid_0's auc: 0.922972
[160]	valid_0's auc: 0.923015
[170]	valid_0's auc: 0.923119
[180]	valid_0's auc: 0.923149
[190]	valid_0's auc: 0.923156
[200]	valid_0's auc: 0.92316
[210]	valid_0's auc: 0.923205
[220]	valid_0's auc: 0.923222
[230]	valid_0's auc: 0.923263
[240]	valid_0's auc: 0.923259
[250]	valid_0's auc: 0.923267
[260]	valid_0's auc: 0.923261
[270]	valid_0's auc: 0.923254
[280]	valid_0's auc: 0.923254
Early stopping, best iteration is:
[249]	valid_0's auc: 0.923271
['transport_mode_rank', 'transport_mode_rank_p

In [38]:
offline['transport_mode'].value_counts()

2.0     22036
7.0     14181
1.0     11823
5.0      7344
10.0     2462
9.0      2416
0.0      1471
11.0      436
6.0       417
8.0       385
3.0       348
4.0        69
Name: transport_mode, dtype: int64

In [38]:
lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=128, reg_alpha=0.1, reg_lambda=10,
        max_depth=-1, n_estimators=iters, objective='binary',
        subsample=0.6, colsample_bytree=0.5, subsample_freq=1,min_child_samples=20,
        learning_rate=0.1, random_state=2019 , n_jobs=-1, metric="None", importance_type='gain'
    )

lgb_model.fit(train[feature_name], train['label'])
y_test_all = lgb_model.predict_proba(X_test)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def auc(y,pred):
#     fpr, tpr, thresholds = metrics.roc_curve(y, pred)
    return roc_auc_score(y, pred)

X_test = all_data[all_data['label']==-1].drop(['transport_mode','label','click_mode'],axis=1)[feature_name]
train = all_data[all_data['label']!=-1].drop(['transport_mode','click_mode'],axis=1)
X_train = train.drop('label',axis=1)[feature_name]
y_train = train['label']

K = 5
skf = StratifiedKFold(n_splits = K, shuffle = True ,random_state=1998)

lgb_pred_te_all = 0
lgb_auc_mean = 0
lgb_auc_mean2 = 0
cv_pred = np.zeros(X_train.shape[0])

for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
#     if i==2:
#     if i!=10:
    y_tr, y_val = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()
    X_tr, X_val= X_train.iloc[train_index,:].copy(), X_train.iloc[test_index,:].copy()
    print( "\nFold ", i)
    lgb_params = {
                        'task': 'train',
                        'learning_rate': 0.05,
#                         'max_bin' : 200,
                        'min_sum_hessian_in_leaf': 10,
#                         'tree_learner' : 'voting',
                        'num_leaves' : 128,
#                         'max_depth': 5, 
                        'boosting_type': 'gbdt',
                        'objective': 'xentlambda',
#                         'is_unbalance':True,
                        'colsample_bytree' : 0.5,
                        'metric':'auc',
#                         'lambda_l1': 0.0001,
                        'lambda_l2': 3,
                        'bagging_freq': 1,
                        'verbose': 1,
                        'random_state': 2019,
                        'num_threads':40,}
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val)
#     watchlist = {(lgb_train, 'train'), (lgb_val, 'eval')}
    lgb_model = lgb.train( 
                           lgb_params, 
                           lgb_train, 
                           num_boost_round=3000, 
                           valid_sets = [lgb_train,lgb_val], 
                           verbose_eval=50, 
#                           feval=gini_lgb, 
                           early_stopping_rounds=50
                         )
    print( " Best iteration = ", lgb_model.best_iteration )

    pred = lgb_model.predict(X_val)
    cv_pred[test_index] = pred
    lgb_auc_mean = auc(y_val,pred) / K + lgb_auc_mean
    lgb_auc_mean2 = lgb_model.best_score['valid_1']['auc'] / K +lgb_auc_mean2
    print( " auc_LGB = ",lgb_model.best_score['valid_1']['auc'])
    print( " auc = ", auc(y_val,pred) )
    pred_te = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    lgb_pred_te_all = lgb_pred_te_all + pred_te / K
print( " mean_auc_LGB = ", lgb_auc_mean2 )
print( " mean_auc = ", lgb_auc_mean )

In [None]:
offline = all_data[all_data['label']!=-1][['sid','transport_mode']].copy().reset_index(drop=True)
offline['label'] = cv_pred
offline = offline.sort_values(['sid','label'],ascending=False)
offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
offline.replace(-1,0,inplace=True)
offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
print(f1_score(offline['click_mode'].values,offline['transport_mode'].values,average='weighted'))
offline.loc[offline['label']<0.2,'transport_mode'] = 0

# 0.14 6895

from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,recall_score,precision_score

dic_ = offline['click_mode'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        f_score += dic_[i] * f1_score(y_true=yt, y_pred= yp)
        print(i,dic_[i],f1_score(y_true=yt, y_pred= yp), precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

# 67438 67456

In [72]:
sub = data[data['label']==-1][['sid','transport_mode']].copy()
sub['label'] =  lgb_pred_te_all
sub = sub.sort_values(['sid','label'],ascending=False)
sub = sub.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
sub.replace(-1,0,inplace=True)
sub.loc[sub['label']<0.2,'transport_mode'] = 0
sub['transport_mode'].value_counts()

2.0     33701
7.0     22256
1.0     18507
5.0     10993
10.0     3530
0.0      2038
9.0      1647
8.0       563
11.0      440
6.0       309
3.0       305
4.0        69
Name: transport_mode, dtype: int64

In [None]:
sub[['sid','transport_mode']].to_csv('../submit/baseline.csv',index = False,header=None)

In [79]:
1

1

In [None]:
ana_ans = []
for i in range(len(y_test)):
    if y_test[i]!=y_test_all[i]:
        ana_ans.append(i)
        
len(ana_ans)

In [30]:
submit = []
for line in cv_pred:
    now = []
    for j in line:
        now.append(int(j))
    submit.append(np.argmax(np.bincount(now)))

In [31]:
np.mean(cv_score)

0.6814643765842411

In [29]:
sub = DF()
sub['sid'] = all_data[choose]['sid'].values
sub['label'] = y_test_all

In [30]:
sub.to_csv('../submit/带最后14天.csv',index = False,header=None)

In [None]:
kfc1 = pd.read_csv('../submit/688不带最后7天.csv',header=None)
kfc1.columns = ['sid','k1']
kfc2 = pd.read_csv('../submit/688带了最后7天.csv',header=None)

In [58]:
to_stack = 
all_data[feature_name]

Unnamed: 0,Recommand_0_transport_mode,transport_mode_svd_fea_2,odl2_dis,price_inMin_transport_mode,price_svd_fea_1,transport_mode_svd_fea_3,transport_mode_svd_fea_5,transport_mode_svd_fea_6,distance_mean,transport_mode_svd_fea_9
0,1.0,-0.094032,0.022361,1.0,0.187898,-0.361078,0.066919,0.067537,2960.666667,0.050871
1,2.0,-0.009575,0.416293,11.0,-0.182656,-0.144930,-0.097490,0.038642,48412.333333,-0.113302
2,2.0,0.646982,0.058310,2.0,0.015238,-0.035331,0.113422,0.012515,7163.666667,0.039171
3,1.0,-0.090497,0.022361,1.0,0.252328,-0.430623,0.097569,0.042598,3341.833333,0.020274
4,9.0,-0.069946,0.205183,9.0,-0.222302,0.076463,0.097680,-0.111863,26951.000000,0.252058
5,1.0,-0.090497,0.031623,1.0,0.252328,-0.430623,0.097569,0.042598,3626.166667,0.020274
6,1.0,-0.090497,0.022361,1.0,0.252328,-0.430623,0.097569,0.042598,2963.333333,0.020274
7,2.0,0.228207,0.111803,2.0,-0.303439,-0.016658,-0.258885,0.157420,16841.200000,0.229999
8,2.0,0.231946,0.101980,1.0,-0.194354,0.047922,-0.338281,-0.295221,10291.750000,-0.219847
9,1.0,-0.211129,0.110000,1.0,-0.115306,-0.361749,-0.012318,0.039733,10360.750000,-0.309252


In [24]:
fi = DF()
fi['name'] = feature_name
fi['score'] = lgb_model.feature_importances_
feature_name = fi.sort_values(by=['score'],ascending=False).head(10)['name'].values
fi.sort_values(by=['score'],ascending=False)

Unnamed: 0,name,score
110,Recommand_0_transport_mode,1.444737e+06
284,transport_mode_svd_fea_2,1.113954e+06
163,odl2_dis,3.902617e+05
39,price_inMin_transport_mode,2.947026e+05
263,price_svd_fea_1,2.904361e+05
285,transport_mode_svd_fea_3,2.521495e+05
287,transport_mode_svd_fea_5,1.702749e+05
288,transport_mode_svd_fea_6,1.557480e+05
2,distance_mean,1.536084e+05
291,transport_mode_svd_fea_9,1.252806e+05


In [27]:
y_test

array([7., 9., 7., ..., 2., 2., 1.])

In [28]:
y_test_all

array([7., 9., 7., ..., 2., 2., 1.])

In [36]:
kfc = pd.read_csv("baseline.csv")
rfl = pd.read_csv("../submit/auc_xgb_lgb.csv",header=None)
rfl.columns = ['sid','label']

In [41]:
jb = sub.merge(kfc,how='left',on='sid').merge(rfl,how='left',on='sid')[['label_x','recommend_mode','label_y']].values

In [50]:
rh = []
for i in jb:
    if (i[0]==i[1]) & (i[0]==i[2]):
        rh.append(i[0])
    elif (i[0]==i[2]):
        rh.append(i[0])
    elif (i[0]==i[1]):
        rh.append(i[0])
    elif i[1]==i[2]:
        rh.append(i[1])
    else:
        rh.append(i[1])

In [51]:
a = 0
b = 0
c = 0
for i in range(len(jb)):
    if rh[i]!=jb[i][0]:
        a+=1
    if rh[i]!=jb[i][1]:
        b+=1
    if rh[i]!=jb[i][2]:
        c+=1

print(a)
print(b)
print(c)

1706
1338
3834


In [54]:
sub['rh'] = rh

In [56]:
sub[['sid','rh']].to_csv("try_rh.csv",index=False,header=False)