In [1]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import geohash
import lightgbm as lgb
import time
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from scipy import stats

In [2]:
data_path = '../input/data_set_phase1/'

In [3]:
profiles = pd.read_csv(f'{data_path}profiles.csv')
test_plans = pd.read_csv(f'{data_path}test_plans.csv')
test_queries = pd.read_csv(f'{data_path}test_queries.csv')
train_clicks = pd.read_csv(f'{data_path}train_clicks.csv')
train_plans = pd.read_csv(f'{data_path}train_plans.csv')
train_queries = pd.read_csv(f'{data_path}train_queries.csv')

global fillna_value
fillna_value = 0-1
def fill_pid(x):
    global fillna_value
    result = fillna_value
    fillna_value = fillna_value-1
    return 0    #用0填充
train_queries['pid'] = train_queries['pid'].map(lambda x:x if not np.isnan(x) else fill_pid(x))
test_queries['pid'] = test_queries['pid'].map(lambda x:x if not np.isnan(x) else fill_pid(x))

In [4]:
print(train_plans['plan_time'].min())
print(train_plans['plan_time'].max())
print(test_plans['plan_time'].min())
print(test_plans['plan_time'].max())

2018-10-01 00:00:10
2018-11-30 23:59:17
2018-12-01 00:00:10
2018-12-07 23:59:31


In [5]:
train_len = train_plans.shape[0]
plans = pd.concat([train_plans,test_plans],ignore_index=True,sort=False)
queries = pd.concat([train_queries,test_queries],ignore_index=True,sort=False)
simple_set = queries.merge(plans,'left',['sid'])
simple_set['plan_time'] = simple_set['plan_time'].fillna('-1')
simple_set['plan_time'] = list(map(lambda x,y:x if y=='-1' else y,simple_set['req_time'],
                                   simple_set['plan_time']))
simple_set['plans'] = simple_set['plans'].fillna("[{'distance':3953,'price':300,'eta':1786,'transport_mode':0}]")

simple_set = simple_set.merge(train_clicks,'left',['sid']).fillna(0)
simple_set = simple_set.merge(profiles,'left',['pid']).fillna(0)

In [6]:
def geodistance(lng1,lat1,lng2,lat2):
    lng1, lat1, lng2, lat2 = map(radians, [lng1, lat1, lng2, lat2])
    dlon=lng2-lng1
    dlat=lat2-lat1
    a=sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 
    dis=2*asin(sqrt(a))*6371*1000
    return dis

def mhtdistance(lng1,lat1,lng2,lat2):
    lng1, lat1, lng2, lat2 = map(radians, [lng1, lat1, lng2, lat2])
    dlon=lng2-lng1
    dlat=lat2-lat1
    return (6371*1000)*(abs(dlon)+abs(dlat))

def get_most_common(x):
    return Counter(x).most_common(1)[0][0]

def get_most_common2(x):
    temp = Counter(x).most_common(2)
    if len(temp)>1:
        return temp[1][0]
    else:
        return 0
    
def get_mode(x):
    return stats.mode(x)[0][0]

def get_mode_count(x):
    return stats.mode(x)[1][0]
    
def get_most_common3(x):
    temp = Counter(x).most_common(3)
    if len(temp)>2:
        return temp[2][0]
    else:
        return 0
    
# 计算出发点到目的地的角度方向,参考的是wiki的内容
def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6378.137  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [7]:
simple_set['req_time_str'] = simple_set['req_time'].map(lambda x:x[:13])

simple_set['o_lng'] = simple_set['o'].map(lambda x:float(x.split(',')[0]))
simple_set['o_lat'] = simple_set['o'].map(lambda x:float(x.split(',')[1]))
simple_set['d_lng'] = simple_set['d'].map(lambda x:float(x.split(',')[0]))
simple_set['d_lat'] = simple_set['d'].map(lambda x:float(x.split(',')[1]))

simple_set['geodistance'] = list(map(geodistance,simple_set['o_lng'],simple_set['o_lat'],
                                 simple_set['d_lng'],simple_set['d_lat']))
simple_set['mhtdistance'] = list(map(mhtdistance,simple_set['o_lng'],simple_set['o_lat'],
                                 simple_set['d_lng'],simple_set['d_lat']))

simple_set['bearing_array'] = list(map(bearing_array,simple_set['o_lat'],simple_set['o_lng'],
                                 simple_set['d_lat'],simple_set['d_lng']))


simple_set['o_geohash'] = list(map(lambda x,y:geohash.encode(x,y,7),simple_set['o_lat'],
                                   simple_set['o_lng']))
simple_set['d_geohash'] = list(map(lambda x,y:geohash.encode(x,y,7),simple_set['d_lat'],
                                   simple_set['d_lng']))
simple_set['geodistance_rank'] = simple_set['geodistance'].rank()
simple_set['geodistance_id'] = simple_set['geodistance_rank']//(simple_set.shape[0]/5000)

base32 = {x:i+1 for i,x in enumerate(list('0123456789bcdefghjkmnpqrstuvwxyz') )}
print(base32)
def geohash2int(geohash_id):
    result = 0
    base = 1
    for each in geohash_id[::-1]:
        result = result + base32[each] * base
        base = base*32
    return result


simple_set['o_geohash_num'] = simple_set['o_geohash'].map(geohash2int)%10000000
simple_set['d_geohash_num'] = simple_set['d_geohash'].map(geohash2int)%10000000
simple_set['lujing'] = simple_set['o_geohash_num']*10000000+simple_set['d_geohash_num']
simple_set['juedui_lujing'] = list(map(lambda x,y:x*10000000+y if x>y else y*10000000+x,
                     simple_set['o_geohash_num'], simple_set['d_geohash_num']))
def get_plans_list(x):
    result = eval(x)
    for i,each in enumerate(result):
        result[i]['price'] = 0 if result[i]['price'] =='' else int(each['price'])
    return result

def price_del_0(x):
    sums = 0
    geshu = 0
    for each in x:
        sums = sums+each
        if each !=0:
            geshu = geshu+each
    if geshu !=0:
        a = sums/geshu
    else:
        a = 0
        
    result = []
    for each in x:
        if each ==0:
            result.append(a)
        else:
            result.append(each)
    return result
    
simple_set['plans_list'] = simple_set['plans'].map(get_plans_list)

simple_set['plans_price_list'] = simple_set['plans_list'].map(lambda x:[k['price'] for k in x])
simple_set['plans_price_del_0_list'] = simple_set['plans_price_list'].map(price_del_0)

simple_set['plans_distance_list'] = simple_set['plans_list'].map(lambda x:[k['distance'] for k in x])
simple_set['plans_eta_list'] = simple_set['plans_list'].map(lambda x:[k['eta'] for k in x])
simple_set['plans_mode_list'] = simple_set['plans_list'].map(lambda x:[k['transport_mode'] for k in x])

simple_set['plans_distance_/_eta_list'] = simple_set['plans_list'].map(lambda x:[k['distance']/k['eta'] \
                                                                               for k in x])
simple_set['plans_price_/_distance_list'] = simple_set['plans_list'].map(lambda x:[k['price']/ \
                                                                      k['distance'] for k in x])

simple_set['plans_price_/_eta_list'] = simple_set['plans_list'].map(lambda x:[k['price']/k['eta'] \
                                                                               for k in x])

simple_set['plans_price_*_eta_list'] = simple_set['plans_list'].map(lambda x:[k['price']*k['eta'] \
                                                                               for k in x])
simple_set['plans_distance_list/_geodistance'] = list(map(lambda x,y:[i/(y+0.001) for i in x],
                                            simple_set['plans_distance_list'],simple_set['geodistance']))
simple_set['plans_distance_list/_mhtdistance'] = list(map(lambda x,y:[i/(y+0.001) for i in x],
                                            simple_set['plans_distance_list'],simple_set['mhtdistance']))
simple_set['plans_distance_list/max'] = simple_set['plans_distance_list'].map(lambda x:
                                                                [i/np.max(x) for i in x])
simple_set['plans_price_list/max'] = simple_set['plans_price_list'].map(lambda x:
                                                                [i/np.max(x) for i in x])
simple_set['plans_eta_list/max'] = simple_set['plans_eta_list'].map(lambda x:
                                                                [i/np.max(x) for i in x])


{'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'j': 18, 'k': 19, 'm': 20, 'n': 21, 'p': 22, 'q': 23, 'r': 24, 's': 25, 't': 26, 'u': 27, 'v': 28, 'w': 29, 'x': 30, 'y': 31, 'z': 32}




In [8]:
pid_geodistance = simple_set.groupby(['pid'],as_index = False)['geodistance'].agg({
    'pid_count':'count','pid_geodistance_mean':'mean','pid_geodistance_sum':'sum',
    'pid_geodistance_max':'max','pid_geodistance_min':'min','pid_geodistance_std':'std'
})
simple_set = simple_set.merge(pid_geodistance,'left',['pid'])


In [9]:
simple_set['plans_distance_list/pid_geodistance_mean'] = list(map(lambda x,y:[i/(y+0.001) for i in x],
                    simple_set['plans_distance_list'],simple_set['pid_geodistance_mean']))
#用户推荐的前几个分别是啥

In [10]:
list_col = ['plans_price_list','plans_distance_list','plans_eta_list','plans_mode_list',
           'plans_distance_/_eta_list','plans_price_/_distance_list','plans_price_/_eta_list',
           'plans_price_*_eta_list','plans_distance_list/_geodistance','plans_distance_list/_mhtdistance',
           'plans_distance_list/max','plans_price_list/max','plans_eta_list/max',
            'plans_distance_list/pid_geodistance_mean','plans_price_del_0_list']

def get_list_feat(simple_set,list_feat_col):
    simple_set[f'{list_feat_col}_max'] = simple_set[list_feat_col].map(lambda x:np.max(x))
    simple_set[f'{list_feat_col}_min'] = simple_set[list_feat_col].map(lambda x:np.min(x))
    simple_set[f'{list_feat_col}_argmax'] = simple_set[list_feat_col].map(lambda x:np.argmax(x))
    simple_set[f'{list_feat_col}_argmin'] = simple_set[list_feat_col].map(lambda x:np.argmin(x))
    simple_set[f'{list_feat_col}_std'] = simple_set[list_feat_col].map(lambda x:np.std(x))
    simple_set[f'{list_feat_col}_mean'] = simple_set[list_feat_col].map(lambda x:np.mean(x))
    simple_set[f'{list_feat_col}_1th'] = simple_set[list_feat_col].map(lambda x:x[0])
    simple_set[f'{list_feat_col}_2th'] = simple_set[list_feat_col].map(lambda x:0 if len(x)<2 else x[1])
    simple_set[f'{list_feat_col}_3th'] = simple_set[list_feat_col].map(lambda x:0 if len(x)<3 else x[2])
#     simple_set[f'{list_feat_col}_4th'] = simple_set[list_feat_col].map(lambda x:x[3])

    simple_set[f'{list_feat_col}_max_mode_cate'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmax'],simple_set['plans_mode_list']))
    simple_set[f'{list_feat_col}_min_mode_cate'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmin'],simple_set['plans_mode_list']))
    
    simple_set[f'{list_feat_col}_max_eta'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmax'],simple_set['plans_eta_list']))
    simple_set[f'{list_feat_col}_min_eta'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmin'],simple_set['plans_eta_list']))
    
    simple_set[f'{list_feat_col}_max_distance'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmax'],simple_set['plans_distance_list']))
    simple_set[f'{list_feat_col}_min_distance'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmin'],simple_set['plans_distance_list']))
    
    simple_set[f'{list_feat_col}_max_price'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmax'],simple_set['plans_price_list']))
    simple_set[f'{list_feat_col}_min_price'] = list(map(lambda x,y:y[x],
                        simple_set[f'{list_feat_col}_argmin'],simple_set['plans_price_list']))
    
for each in list_col:
    get_list_feat(simple_set,each)
simple_set['plans_mode_list_4th'] = simple_set['plans_mode_list'].map(lambda x:0 if len(x)<4 else x[3])
simple_set['plans_mode_list_5th'] = simple_set['plans_mode_list'].map(lambda x:0 if len(x)<5 else x[4])
simple_set['plans_mode_list_6th'] = simple_set['plans_mode_list'].map(lambda x:0 if len(x)<6 else x[5])
simple_set['plans_mode_list_7th'] = simple_set['plans_mode_list'].map(lambda x:0 if len(x)<7 else x[6])
simple_set['mode_list_encode12_cate'] = simple_set['plans_mode_list_1th']*12+simple_set['plans_mode_list_2th']
simple_set['mode_list_encode123'] = simple_set['mode_list_encode12_cate']*12+simple_set['plans_mode_list_3th']
simple_set['mode_list_encode1234'] = simple_set['mode_list_encode123']*12+simple_set['plans_mode_list_4th']

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [11]:
def get_rank(x,z):
    for kk,each in enumerate(x):
        if each ==z:
            return kk
    return 10

for i in range(11):
    simple_set[f'plans_mode_list_has_{i+1}'] = simple_set['plans_mode_list'].map(
        lambda x:Counter(x).get(i+1)).fillna(0)
    simple_set[f'plans_mode_list_{i+1}_rank'] = simple_set['plans_mode_list'].map(lambda x:get_rank(x,i+1))

In [12]:
plan_time = pd.to_datetime(simple_set['plan_time'])
req_time = pd.to_datetime(simple_set['req_time'])

simple_set['plan_time_req_time_Pvalue'] = plan_time.map(lambda x:x.value)//10**9-req_time.map(
    lambda x:x.value)//10**9
simple_set['plan_time_month'] = req_time.dt.month
simple_set['plan_time_day'] = req_time.dt.day
simple_set['plan_time_dayofyear'] = req_time.dt.dayofyear
simple_set['plan_time_hour'] = req_time.dt.hour
simple_set['plan_time_minute'] = req_time.dt.minute
simple_set['plan_hour_minute'] = simple_set['plan_time_hour']*60+simple_set['plan_time_minute']
simple_set['plan_time_dayofweek'] = req_time.dt.dayofweek
simple_set['plan_time_weekend'] = simple_set['plan_time_dayofweek'].map(lambda x:0 if x<2 else 1)
simple_set['hour_time'] = simple_set['plan_time_hour']+ simple_set['plan_time_dayofyear']*24
simple_set['guoqing'] = simple_set['plan_time'].map(lambda x:0 if x<'2018-10-08 00:00:00' else 1)
hour_range_dict = {0:1,1:1,2:1,3:1,4:1,5:1,6:2,7:2,8:2,9:2,10:3,11:3,12:4,13:4,14:4,
                   15:5,16:5,17:5,18:6,19:6,20:7,21:7,22:7,23:7}
simple_set['hour_range'] = simple_set['plan_time_hour'].map(hour_range_dict)
simple_set['plan_time_hour_12'] = simple_set['plan_time_hour'].map(
         lambda x:x+12 if x<12 else x-12)

In [13]:
agg_list = ['mean','sum','min','max','std','skew']
common_list = [get_mode,get_mode_count,get_most_common2,get_most_common3]
def groupby_merge(simple_set,on_col_list,feat_col,aggs,name= ''):
    on_col_str = '_'.join(on_col_list)
    agg_dict = {f'{name}{on_col_str}_on_{feat_col}_{x}':x for x in aggs}
    feat = simple_set.groupby(on_col_list,as_index = False)[feat_col].agg(
                                agg_dict)
    simple_set = simple_set.merge(feat,'left',on_col_list)
    return simple_set

#    hour_time plan_time_hour geodistance pid plan_time_hour_12 o_geohash_num
#  d_geohash_num juedui_lujing lujing geodistance_id
on_list = [['hour_time'],['plan_time_hour'],
#            ['geodistance_id'],
           ['pid'],
#                 ['o_geohash_num'],['d_geohash_num'],
#            ['plan_time_hour','o_geohash_num'],
#            ['pid','o_geohash_num'],
           ['plan_time_hour','d_geohash_num'],
           ['o','d'],
#            ['o'],['d'],
#            ['pid','d_geohash_num'],
#            ['pid','lujing'], 
           ['pid','plan_time_hour'],
#           ['juedui_lujing'],['lujing'],['lujing','plan_time_hour']
          ]
num_list = ['plans_price_/_eta_list_1th',
           ]

for each in on_list:
    simple_set = groupby_merge(simple_set,each,'geodistance',['count'])
    print(each,'count')
    simple_set = groupby_merge(simple_set,each,'plans_mode_list_1th',
                               common_list,'cate_')

for on_each in on_list:
    for num_each in num_list:
        if num_each in on_each:
            continue
        if num_each == 'plan_time_hour_12' and 'plan_time_hour' in on_each:
            continue
        if  num_each == 'geodistance' and 'juedui_lujing' in on_each:
            continue
        if  num_each == 'geodistance' and 'lujing' in on_each:
            continue
        simple_set = groupby_merge(simple_set,on_each,num_each,agg_list)
        print(on_each,num_each)


['hour_time'] count
['plan_time_hour'] count
['geodistance_id'] count
['pid'] count
['pid', 'o_geohash_num'] count
['plan_time_hour', 'd_geohash_num'] count
['o', 'd'] count
['o'] count
['d'] count
['pid', 'plan_time_hour'] count
['juedui_lujing'] count
['lujing'] count
['lujing', 'plan_time_hour'] count


In [14]:
simple_set

Unnamed: 0,sid,pid,req_time,o,d,plan_time,plans,click_time,click_mode,p0,...,lujing_on_geodistance_count,cate_lujing_on_plans_mode_list_1th_<function get_mode at 0x000001CC1C301840>,cate_lujing_on_plans_mode_list_1th_<function get_mode_count at 0x000001CC1C301AE8>,cate_lujing_on_plans_mode_list_1th_<function get_most_common2 at 0x000001CC1C301A60>,cate_lujing_on_plans_mode_list_1th_<function get_most_common3 at 0x000001CC1C301BF8>,lujing_plan_time_hour_on_geodistance_count,cate_lujing_plan_time_hour_on_plans_mode_list_1th_<function get_mode at 0x000001CC1C301840>,cate_lujing_plan_time_hour_on_plans_mode_list_1th_<function get_mode_count at 0x000001CC1C301AE8>,cate_lujing_plan_time_hour_on_plans_mode_list_1th_<function get_most_common2 at 0x000001CC1C301A60>,cate_lujing_plan_time_hour_on_plans_mode_list_1th_<function get_most_common3 at 0x000001CC1C301BF8>
0,3000821,0.0,2018-11-02 17:54:30,"116.29,39.97","116.32,39.96",2018-11-02 17:54:30,"[{""distance"": 5219, ""price"": 300, ""eta"": 1367,...",2018-11-02 17:54:32,9.0,0.0,...,4,2,3,9,0,2,2,1,2,0
1,3085857,210736.0,2018-11-16 10:53:10,"116.39,39.84","116.33,39.79",2018-11-16 10:53:10,"[{""distance"": 13864, ""price"": 600, ""eta"": 3221...",2018-11-16 10:53:32,1.0,0.0,...,1,7,1,0,0,1,7,1,0,0
2,2944522,0.0,2018-10-06 10:33:58,"116.31,39.93","116.27,40.00",2018-10-06 10:33:58,"[{""distance"": 12294, ""price"": 400, ""eta"": 2472...",2018-10-06 10:34:23,9.0,0.0,...,5,3,2,3,2,2,9,2,0,0
3,559931,202427.0,2018-11-23 14:54:11,"116.27,39.88","116.39,39.90",2018-11-23 14:54:11,"[{""distance"": 14853, ""price"": 1700, ""eta"": 291...",2018-11-23 14:54:18,1.0,1.0,...,9,7,4,10,11,3,7,2,10,0
4,2819352,172251.0,2018-10-30 11:48:41,"116.34,39.96","116.37,39.86",2018-10-30 11:48:41,"[{""distance"": 12882, ""price"": 600, ""eta"": 3211...",2018-10-30 11:49:04,7.0,0.0,...,44,7,21,2,10,3,2,2,7,0
5,2754575,135606.0,2018-10-18 07:37:32,"116.54,39.73","116.48,39.76",2018-10-18 07:37:32,"[{""distance"": 8217, ""price"": """", ""eta"": 1743, ...",2018-10-18 07:38:25,1.0,0.0,...,1,3,1,0,0,1,3,1,0,0
6,2224795,189023.0,2018-11-20 14:34:03,"116.34,39.73","116.79,40.35",2018-11-20 14:34:03,"[{""distance"": 99097, ""price"": """", ""eta"": 6596,...",2018-11-20 14:34:48,1.0,1.0,...,1,3,1,0,0,1,3,1,0,0
7,1598541,156954.0,2018-10-03 09:19:47,"116.26,40.22","116.24,40.22",2018-10-03 09:19:47,"[{""distance"": 1484, ""price"": 200, ""eta"": 881, ...",2018-10-03 09:19:55,1.0,1.0,...,34,1,27,6,5,7,1,7,0,0
8,676316,135045.0,2018-11-15 15:25:52,"116.39,39.91","116.39,39.90",2018-11-15 15:25:52,"[{""distance"": 1511, ""price"": """", ""eta"": 1294, ...",2018-11-15 15:25:55,5.0,1.0,...,174,5,96,1,6,13,5,7,6,1
9,1068802,176605.0,2018-11-08 23:13:57,"116.29,40.02","116.27,39.95",2018-11-08 23:13:57,"[{""distance"": 10278, ""price"": """", ""eta"": 790, ...",2018-11-08 23:14:06,3.0,0.0,...,1,3,1,0,0,1,3,1,0,0


2018-10-01 00:00:10 
2018-11-30 23:59:17 
2018-12-01 00:00:10 
2018-12-07 23:59:31 

In [15]:
text_dict = {}
simple_set['plans_mode_str_list'] = simple_set['plans_mode_list'].map(
    lambda x:list(map(str,x)))

# mode_pid = simple_set['']
def list2str(x,y):
    result = []
    for i,each in enumerate(x):
        result.append(x[i]+'_'+y) 
    return ' '.join(result)
        
text_dict['mode_pid_text'] = list(map(list2str ,simple_set['plans_mode_str_list'],
                                      simple_set['pid'].map(int).map(str)))
text_dict['mode_geodistance_id_text'] = list(map(list2str,
                                        simple_set['plans_mode_str_list'],
                            simple_set['geodistance_id'].map(int).map(str)))
text_dict['mode_o_geohash_num_text'] = list(map(list2str ,
                                                simple_set['plans_mode_str_list'],
                            simple_set['o_geohash_num'].map(int).map(str)))

text_dict['mode_d_geohash_num_text'] = list(map(list2str ,
                                                simple_set['plans_mode_str_list'],
                            simple_set['d_geohash_num'].map(int).map(str)))
text_dict['mode_lujing_text'] = list(map(list2str ,
                                                simple_set['plans_mode_str_list'],
                            simple_set['lujing'].map(str)))

text_dict['mode_plan_time_hour_text'] = list(map(list2str ,
                                                simple_set['plans_mode_str_list'],
                            simple_set['plan_time_hour'].map(str)))
def mode_eta2str(x,y):
    result = []
    for i,each in enumerate(x):
        result.append(x[i]+'_'+str(y[i]//60))
    return ' '.join(result)

def mode_distance2str(x,y):
    result = []
    for i,each in enumerate(x):
        result.append(x[i]+'_'+str(y[i]//1000))
    return ' '.join(result)

text_dict['mode_eta_text'] = list(map(mode_eta2str ,
                                      simple_set['plans_mode_str_list'],
                                    simple_set['plans_eta_list']))
text_dict['mode_distance_text'] = list(map(mode_distance2str ,
                                           simple_set['plans_mode_str_list'],
                                    simple_set['plans_distance_list']))
def mode_cixu2str(x):
    result = []
    for i,each in enumerate(x):
        result.append(x[i]+'_'+str(i))
    return ' '.join(result)
    
text_dict['mode_cixu_text'] = list(map(mode_cixu2str ,
                                       simple_set['plans_mode_str_list']))
def mode2str(x):
    result = []
    for i,each in enumerate(x):
        result.append(x[i]+'_'+'mode')
    return ' '.join(result)
text_dict['mode_text'] = list(map(mode2str ,
                                simple_set['plans_mode_str_list']))

tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
svd_enc = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)

svd_feat_list = []
for each in text_dict:
    each_tfidf_vec = tfidf_enc.fit_transform(text_dict[each])
    each_mode_svd = svd_enc.fit_transform(each_tfidf_vec)
    
    each_mode_svd = pd.DataFrame(each_mode_svd)
    each_mode_svd.columns = [f'svd_{each}_{i}' for i in range(10)]
    svd_feat_list.append(each_mode_svd)
    
    
    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [16]:
del simple_set['click_time']

In [17]:
simple_set = pd.concat([simple_set]+svd_feat_list, axis=1)

In [None]:
str_col = simple_set.select_dtypes(include=['object']).columns.tolist()
ignore_col = ['click_mode','sid']
pre_col = [x for x in simple_set.columns if x not in str_col+ignore_col]
print('This columns NOT use in lgb:',pre_col)
simple_set.to_csv('xuan_feat.csv',index =False)

In [18]:
# str_col = simple_set.select_dtypes(include=['object']).columns.tolist()
# ignore_col = ['click_mode','sid']
# pre_col = [x for x in simple_set.columns if x not in str_col+ignore_col]

# train_index = (simple_set.req_time < '2018-11-16')
# train_x     = simple_set[train_index][pre_col].reset_index(drop=True)
# train_y     = simple_set[train_index].click_mode.reset_index(drop=True)

# valid_index = (simple_set.req_time > '2018-11-16') & (simple_set.req_time < '2018-12-01')
# valid_x     = simple_set[valid_index][pre_col].reset_index(drop=True)
# valid_y     = simple_set[valid_index].click_mode.reset_index(drop=True)

# test_index = (simple_set.req_time > '2018-12-01')
# test_x     = simple_set[test_index][pre_col].reset_index(drop=True)

In [28]:
# cate_list = [f'plans_mode_list_{x+1}th' for x in range(7)]+\
#   ['mode_list_encode12_cate'] + [x for x in pre_col if 'cate' in x]
# # cate_list
# cate_list = []

In [30]:
# def f1_weighted(labels,preds):
#     preds = np.argmax(preds.reshape(12, -1), axis=0)
#     score = f1_score(y_true=labels, y_pred=preds, average='weighted')
#     return 'f1_weighted', score, True

# lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=61, reg_alpha=0, reg_lambda=0.01,
#     max_depth=-1, n_estimators=2000, objective='multiclass',
#     subsample=0.8, colsample_bytree=0.8, subsample_freq=1,min_child_samples = 50,  
#                             learning_rate=0.05, random_state=2019, metric="None",n_jobs=-1)
# eval_set = [(valid_x, valid_y)]
# lgb_model.fit(train_x, train_y, eval_set=eval_set, eval_metric=f1_weighted,
#               categorical_feature=cate_list, verbose=10, early_stopping_rounds=100)

# # [158]	valid_0's f1_weighted: 0.689149      0.69155313
# # [391]	valid_0's f1_weighted: 0.689904
# # [226]	valid_0's f1_weighted: 0.691119
# # [444]	valid_0's f1_weighted: 0.691219
# # [207]	valid_0's f1_weighted: 0.692387      0.69396139
# # [243]	valid_0's f1_weighted: 0.691971
# # [285]	valid_0's f1_weighted: 0.691804 0.69361978   



Training until validation scores don't improve for 100 rounds.
[10]	valid_0's f1_weighted: 0.688597
[20]	valid_0's f1_weighted: 0.689423
[30]	valid_0's f1_weighted: 0.68933
[40]	valid_0's f1_weighted: 0.689762
[50]	valid_0's f1_weighted: 0.689988
[60]	valid_0's f1_weighted: 0.690168
[70]	valid_0's f1_weighted: 0.69028
[80]	valid_0's f1_weighted: 0.69053
[90]	valid_0's f1_weighted: 0.690954
[100]	valid_0's f1_weighted: 0.691099
[110]	valid_0's f1_weighted: 0.691231
[120]	valid_0's f1_weighted: 0.691302
[130]	valid_0's f1_weighted: 0.691151
[140]	valid_0's f1_weighted: 0.691484
[150]	valid_0's f1_weighted: 0.691566
[160]	valid_0's f1_weighted: 0.691461
[170]	valid_0's f1_weighted: 0.691544
[180]	valid_0's f1_weighted: 0.691669
[190]	valid_0's f1_weighted: 0.69168
[200]	valid_0's f1_weighted: 0.691564
[210]	valid_0's f1_weighted: 0.69153
[220]	valid_0's f1_weighted: 0.691537
[230]	valid_0's f1_weighted: 0.6915
[240]	valid_0's f1_weighted: 0.691562
[250]	valid_0's f1_weighted: 0.691667
[26

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        learning_rate=0.05, max_depth=-1, metric='None',
        min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=127,
        objective='multiclass', random_state=2019, reg_alpha=0.2,
        reg_lambda=0.01, silent=True, subsample=0.5,
        subsample_for_bin=200000, subsample_freq=1)

In [31]:
# imp = pd.DataFrame()
# imp['fea'] = pre_col
# imp['imp'] = lgb_model.feature_importances_ 
# imp = imp.sort_values('imp',ascending = False)
# imp.to_csv('../output/imp_baseline.csv')
# imp

Unnamed: 0,fea,imp
379,hour_time,2203
85,pid_geodistance_std,2132
385,cate_hour_time_on_plans_mode_list_1th_<functio...,2015
383,hour_time_on_geodistance_count,1907
376,plan_hour_minute,1840
420,cate_o_on_plans_mode_list_1th_<function get_mo...,1752
296,plans_eta_list/max_1th,1674
375,plan_time_minute,1642
73,bearing_array,1641
477,svd_mode_o_geohash_num_text_9,1590


In [22]:
# all_train_x = simple_set[simple_set.req_time < '2018-12-01'][pre_col].reset_index(drop=True)
# all_train_y = simple_set[simple_set.req_time < '2018-12-01'].click_mode.reset_index(drop=True)
# print(lgb_model.best_iteration_)
# lgb_model.n_estimators   = lgb_model.best_iteration_
# lgb_model.fit(all_train_x, all_train_y,categorical_feature=[])
# print('fit over')
# result  = pd.DataFrame()
# result['sid'] = simple_set[test_index]['sid']
# result['recommend_mode'] = lgb_model.predict(test_x)
# result['recommend_mode'] = result['recommend_mode'].astype(int)
# print(len(result))
# print(result['recommend_mode'].value_counts())
# result[['sid', 'recommend_mode']].to_csv('../output/baseline.csv', index=False)

9




fit over


  if diff:


94358
2     33977
7     22369
1     18264
5     11067
10     3551
0      1974
9      1686
11      443
3       399
8       320
6       260
4        48
Name: recommend_mode, dtype: int64


In [23]:
# result['plans'] = simple_set[test_index]['plans']
# a = result[result['plans'] == "[{'distance':3953,'price':300,'eta':1786,'transport_mode':0}]"]

In [24]:
# a.recommend_mode.value_counts()

0    1787
Name: recommend_mode, dtype: int64

In [25]:
# simple_set.to_csv('../cache/feat_pid_1.csv',index= False)

In [26]:
# a = pd.read_csv('../output/sub_04-27-01-51.csv')
# f = result.merge(a,'left',['sid'])
# (f['recommend_mode'] - f['recommended_mode']).value_counts()

 0.0     87396
-2.0       325
 4.0       275
-3.0       271
 1.0       214
 6.0       176
-1.0       152
 8.0       134
-7.0       110
 7.0        74
-8.0        72
 2.0        55
 3.0        50
-4.0        50
 5.0        50
-6.0        34
 9.0        32
-5.0        31
-9.0         8
 10.0        3
-10.0        2
dtype: int64

In [27]:
#转化率特征