In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as DF
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt
import json
from sklearn.metrics import f1_score
import geohash

In [2]:
# Folder holding the phase-1 CSV files; file names are appended with `+`, so keep the trailing slash.
input_dir = '../input/data_set_phase1/'
# The three settings below are not referenced in the visible part of the notebook;
# presumably they configure later training cells (fold count, RNG seed, test-mode switch) - TODO confirm.
cv = 5
random_seed = 2019
for_test = True

In [3]:
import time
# Start of the wall-clock timer that is reported once loading has finished.
t1 = time.time()
print("Now Input Data...")
# Profile table; merged onto the feature frame by `pid` later in the notebook.
profiles = pd.read_csv(input_dir+'profiles.csv')
# For the remaining tables the timestamp column is parsed to datetime while reading.
train_clicks = pd.read_csv(input_dir+'train_clicks.csv',parse_dates=['click_time'])
train_plans = pd.read_csv(input_dir+'train_plans.csv',parse_dates=['plan_time'])
train_queries = pd.read_csv(input_dir+'train_queries.csv',parse_dates=['req_time'])
test_plans = pd.read_csv(input_dir+'test_plans.csv',parse_dates=['plan_time'])
test_queries = pd.read_csv(input_dir+'test_queries.csv',parse_dates=['req_time'])
print("Use Time {}".format(time.time()-t1))

Now Input Data...
Use Time 7.701779365539551


In [4]:
# Sanity check: (rows, columns) of every table that was just loaded (train tables first, then test tables).
print(profiles.shape,train_clicks.shape,train_plans.shape,train_queries.shape)
print(test_plans.shape,test_queries.shape)

(63090, 67) (453336, 3) (491054, 3) (500000, 5)
(92571, 3) (94358, 5)


In [5]:
def jsonLoads(strs,key):
    '''Pull one field out of every element of a JSON-encoded list.

    strs: JSON text that is expected to decode to a list of dicts.
    key:  dictionary key to read from every element.

    Returns the list of values found under `key`, in input order.
    Returns the sentinel list [-1] when `strs` is not a string (for example
    a missing value), is not valid JSON, does not decode to a sequence of
    mappings, or when an element has no entry for `key`.
    '''
    try:
        return [item[key] for item in json.loads(strs)]
    # Only the failures that parsing and lookup can produce are mapped to the
    # sentinel; KeyboardInterrupt / SystemExit are no longer swallowed.
    # JSONDecodeError is a ValueError; LookupError covers KeyError and IndexError.
    except (TypeError, ValueError, LookupError):
        return [-1]
    
def time_fun(x):
    """Convert a datetime-like object to a local-time POSIX timestamp.

    x: any object exposing timetuple() (datetime, pandas Timestamp, ...).

    Returns time.mktime(x.timetuple()) as a float, or -1 when `x` has no
    usable timetuple() (for example None or a missing timestamp) or the
    value cannot be represented by mktime.
    """
    try:
        return time.mktime(x.timetuple())
    # Narrowed from a bare except so that KeyboardInterrupt / SystemExit
    # still propagate.
    except (AttributeError, TypeError, ValueError, OverflowError):
        return -1
    
def flatten_data(col):
    """
    Flatten one list-valued column of the global `plans` frame into long form.

    col: name of a column of `plans` whose cells are lists (one entry per plan).

    Returns a DataFrame with columns [col, 'sid'] holding one row per list
    entry. Rows are ordered by list position first (all first entries, then
    all second entries, ...) and keep the positional row number of their
    `plans` row as index, so the index contains duplicates. Rows with a
    missing value are dropped, which removes the padding created for
    shorter lists.
    """
    # One column per list position; shorter lists are padded with NaN.
    wide = pd.DataFrame(list(plans[col].values))
    value_columns = list(wide.columns)
    # Attach sid by position so a non-default index on `plans` cannot misalign it.
    wide['sid'] = plans['sid'].values
    parts = []
    for position in value_columns:
        part = wide.loc[:, [position, 'sid']].copy()
        part.columns = [col, 'sid']
        parts.append(part)
    if not parts:
        # No list positions at all: same empty result as before.
        return pd.DataFrame()
    # A single concat replaces the repeated concat inside the loop, which
    # re-copied all accumulated rows on every iteration.
    dis = pd.concat(parts, axis=0, sort=False)
    return dis.dropna()

In [6]:
# Stack train and test so that every feature below is computed over both at once.
plans = pd.concat([train_plans,test_plans],axis=0).reset_index(drop = True)
queries = pd.concat([train_queries,test_queries],axis=0).reset_index(drop = True)
# Right join keeps every training query, including those without a click record.
train_clicks = train_clicks.merge(train_queries[['sid']],how='right',on='sid')
# Queries without a click get click_mode 0 (this also fills their missing click_time with 0).
train_clicks.fillna(0,inplace=True)
# train_clicks.dropna(inplace=True)
print(train_clicks.shape)

# `data` is the sid-level skeleton: training sids with their click_mode, followed by the test sids.
data = train_clicks[['sid','click_mode']].copy()
test_id = test_queries[['sid']].copy()
# Test rows have no click_mode, so fillna(-1) marks them with click_mode == -1.
data = pd.concat([data,test_id],axis=0,sort=False).fillna(-1).reset_index(drop = True)
# Re-order plans and queries to follow `data`; sids without a plan row get NaN plan fields.
plans = data[['sid']].merge(plans,on='sid',how='left').reset_index(drop = True)
queries = data[['sid']].merge(queries,on='sid',how='left').reset_index(drop = True)

(500000, 3)


In [7]:
from tqdm import tqdm

# Add one list-valued column per plan attribute, parsed from the JSON text in plans['plans'];
# rows that cannot be parsed (such as sids without plans) get the sentinel list [-1] from jsonLoads.
for i in tqdm(['distance','price','eta','transport_mode']):
    plans[i] = plans['plans'].apply(jsonLoads, key=i)

100%|██████████| 4/4 [00:18<00:00,  4.70s/it]


In [8]:
# Long-format version (one row per sid and plan position) of every plan attribute.
distance = flatten_data(col = 'distance')
price = flatten_data(col = 'price')
# Empty-string prices become NaN; done after flattening, so these rows (and their index) are kept.
price.replace('',np.nan,inplace=True)
eta = flatten_data(col = 'eta')
transport_mode = flatten_data(col = 'transport_mode')

"""transport_mode_rank"""
# Position of each plan inside its sid's plan list (0 = first plan listed).
plans['transport_mode_rank'] = plans['transport_mode'].apply(lambda x:np.arange(len(x)))
transport_mode_rank = flatten_data(col = 'transport_mode_rank')

# Column-wise concat aligns on the index, so this relies on all five frames coming out of
# flatten_data with an identical index (same rows, same order).
plans_df = pd.concat([distance,transport_mode_rank.iloc[:,0],eta.iloc[:,0],transport_mode.iloc[:,0],price.iloc[:,0]],axis=1)

# Only needed by the commented-out merge at the end of this cell.
transport_mode_list = plans[['sid','transport_mode']].copy()
transport_mode_list.columns = ['sid','transport_mode_list']
# Attach the plan timestamp of the sid to every plan row.
plans_df = plans_df.merge(plans[['sid','plan_time']], on='sid',how='left')
# plans_df = plans_df.merge(transport_mode_list, on='sid',how='left')

In [9]:
# Expand the sid-level skeleton to one row per plan, then attach the query columns.
data = data.merge(plans_df, on='sid',how='left')
data = data.merge(queries, on='sid',how='left')
# Pairwise ratios of eta, distance and price; a missing price gives NaN ratios.
data['ep'] = data['eta'] / data['price'] # eta per unit of price
data['dp'] = data['distance'] / data['price'] # distance per unit of price
data['de'] = data['distance'] / data['eta'] # distance per unit of eta
data['ed'] = data['eta'] / data['distance'] # eta per unit of distance
data['pe'] = data['price'] / data['eta']  # price per unit of eta
data['pd'] = data['price'] / data['distance'] # price per unit of distance

# Rank of each plan within its sid (1 = smallest value, ties share the lowest rank, NaN stays NaN).
data['price_rank'] = data[['sid','price']].groupby(['sid'])['price'].rank(method='min')
data['distance_rank'] = data[['sid','distance']].groupby(['sid'])['distance'].rank(method='min')
data['eta_rank'] = data[['sid','eta']].groupby(['sid'])['eta'].rank(method='min')

In [10]:
# Get Label
# label = 1 when the plan's transport mode equals the clicked mode, otherwise 0.
data['label'] = list(map(lambda x,y:1 if x==y else 0,data['transport_mode'],data['click_mode']))
# Test rows (click_mode == -1) have no ground truth; mark them with label -1.
data['label'] = list(map(lambda x,y:-1 if x==-1 else y,data['click_mode'],data['label']))
print(data['label'].value_counts())

# Offline validation: relabelling the duplicates scored better than leaving them;
# dropping them instead of relabelling has not been compared yet.
have_0 = data[data['label']==0]
# Explicit copy: have_1 is modified below, and writing into a filtered slice of `data`
# is what triggered pandas' SettingWithCopyWarning.
have_1 = data[data['label']==1].copy()
have_test = data[data['label']==-1]
# A sid can list the clicked mode more than once. Keep only the last such row (in current
# row order) as the positive and relabel the earlier ones as negatives.
have_1.loc[have_1.duplicated(subset=['sid','click_mode','label'],keep='last'),'label'] = 0

data = pd.concat([have_0.reset_index(drop=True),have_1.reset_index(drop=True),have_test.reset_index(drop=True)],axis=0).reset_index(drop=True)
print(data['label'].value_counts())

# Restore the per-sid plan order after the concat above.
data = data.sort_values(by=['sid','transport_mode_rank'])
data.head()


 0    1782333
 1     503435
-1     431590
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


 0    1832432
 1     453336
-1     431590
Name: label, dtype: int64


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,ep,dp,de,ed,pe,pd,price_rank,distance_rank,eta_rank,label
1608081,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,8.97,14.49,1.615385,0.619048,0.111483,0.069013,1.0,3.0,4.0,0
1608082,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,,,3.317848,0.3014,,,,2.0,1.0,0
1608083,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,,,2.9363,0.340565,,,,5.0,2.0,0
1608084,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.850588,1.979412,2.327109,0.429718,1.175657,0.505201,3.0,5.0,3.0,0
1608085,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,10.79,14.785,1.37025,0.729794,0.092678,0.067636,1.0,4.0,5.0,0


In [11]:
from math import radians, atan, tan, sin, acos, cos

def get_ktime_feature(k,data,i):
    """Return, for every sid, the row holding its k-th smallest value of column `i`.

    k:    1-based order of the row to pick (1 = smallest value of `i` per sid).
    data: DataFrame with a 'sid' column and a column named `i`; it is not modified.
          Rows are removed by index label, so the index is expected to be unique.
    i:    name of the column that defines the ordering inside each sid.

    Sids with fewer than k rows are absent from the result.
    """
    remaining = data.copy()
    # Peel off the current per-sid minimum k-1 times ...
    for _ in range(k - 1):
        smallest = remaining.sort_values(by=['sid', i]).drop_duplicates(subset=['sid'], keep='first')
        remaining = remaining.loc[~remaining.index.isin(smallest.index)]
    # ... so the minimum of what is left is the k-th smallest row of each sid.
    return remaining.sort_values(by=['sid', i]).drop_duplicates(subset=['sid'], keep='first')

from scipy import stats

def get_mode(x):
    """Return the most frequent value of the 1-D collection `x` (as computed by scipy.stats.mode)."""
    # Older SciPy returns length-1 arrays for 1-D input while SciPy >= 1.11 returns
    # scalars, on which the former `[0][0]` indexing fails; atleast_1d handles both.
    return np.atleast_1d(stats.mode(x)[0])[0]

def get_mode_count(x):
    """Return how many times the most frequent value occurs in the 1-D collection `x`."""
    # Older SciPy returns length-1 arrays for 1-D input while SciPy >= 1.11 returns
    # scalars, on which the former `[1][0]` indexing fails; atleast_1d handles both.
    return np.atleast_1d(stats.mode(x)[1])[0]

def getDistance(latA, lonA, latB, lonB):
    """Distance in metres between two points given in degrees, with an ellipsoid correction.

    latA, lonA: latitude and longitude of the first point, in degrees.
    latB, lonB: latitude and longitude of the second point, in degrees.

    Returns the distance as a float. When the formula degenerates (identical
    points give a 0/0 division, and rounding can push the acos argument
    outside [-1, 1]) the placeholder 0.0000001 is returned instead, so callers
    never receive an exact zero from that path.
    """
    ra = 6378140     # equatorial radius, metres
    rb = 6356755     # polar radius, metres
    flatten = (ra - rb) / ra   # flattening of the ellipsoid
    # degrees -> radians
    radLatA = radians(latA)
    radLonA = radians(lonA)
    radLatB = radians(latB)
    radLonB = radians(lonB)

    try:
        # Latitudes rescaled by the axis ratio before the spherical angle is taken.
        pA = atan(rb / ra * tan(radLatA))
        pB = atan(rb / ra * tan(radLatB))
        # Central angle between the two points.
        x = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(radLonA - radLonB))
        # Flattening correction terms.
        c1 = (sin(x) - x) * (sin(pA) + sin(pB))**2 / cos(x / 2)**2
        c2 = (sin(x) + x) * (sin(pA) - sin(pB))**2 / sin(x / 2)**2
        dr = flatten / 8 * (c1 - c2)
        distance = ra * (x + dr)
        return distance   # metres
    # Only the numeric failures described above are mapped to the placeholder;
    # a bare except would also hide KeyboardInterrupt and programming errors.
    except (ZeroDivisionError, ValueError):
        return 0.0000001
    
# Split the "x,y" coordinate strings of origin (o) and destination (d) into float columns ox/oy and dx/dy.
for i in tqdm(['o','d']):
    data[i+'x'] = data[i].apply(lambda x:float(x.split(',')[0]))
    data[i+'y'] = data[i].apply(lambda x:float(x.split(',')[1]))
# Euclidean distance between origin and destination in raw coordinate units.
data['odl2_dis'] = np.sqrt((data['dx']-data['ox'])**2+(data['dy']-data['oy'])**2)

# Sphere distance and its derived features
sphere_dis = []
# getDistance takes (latA, lonA, latB, lonB): the *y columns are passed as latitude, the *x columns as longitude.
for i in tqdm(data[['oy','ox','dy','dx']].values):
    sphere_dis.append(getDistance(i[0],i[1],i[2],i[3]))
data['sphere_dis'] = sphere_dis

# Calendar features of the request time.
data['req_time_dow'] = data['req_time'].dt.dayofweek
# NOTE(review): Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0
# (replacement: dt.isocalendar().week) - check the pandas version before upgrading.
data['req_time_woy'] = data['req_time'].dt.weekofyear
data['req_is_weekend'] = (data['req_time'].dt.weekday>=5).astype(int)
# Fractional hour of day (hour + minute/60).
data['req_time_hour'] = data['req_time'].dt.hour+data['req_time'].dt.minute/60

# 8-character geohash of each end point, called as encode(y, x, 8).
data['o_geohash'] = list(map(lambda x,y:geohash.encode(x,y,8),data['oy'],data['ox']))
data['d_geohash'] = list(map(lambda x,y:geohash.encode(x,y,8),data['dy'],data['dx']))
# The 32 geohash characters mapped to 1..32; read by geohash2int.
base32 = {x:i+1 for i,x in enumerate(list('0123456789bcdefghjkmnpqrstuvwxyz') )}
print(base32)
def geohash2int(geohash_id):
    """Encode a geohash string as an integer.

    Every character is looked up in the module-level `base32` table and the
    string is read as a base-32 number with the first character as the most
    significant digit. An empty string gives 0; a character missing from
    `base32` raises KeyError.
    """
    value = 0
    # Horner evaluation, most significant character first.
    for symbol in geohash_id:
        value = value * 32 + base32[symbol]
    return value
    
# Replace the geohash strings by their integer encoding so both columns become numeric.
data['o_geohash'] = data['o_geohash'].map(geohash2int)
data['d_geohash'] = data['d_geohash'].map(geohash2int)

data.head()

100%|██████████| 2/2 [00:05<00:00,  2.71s/it]
100%|██████████| 2717358/2717358 [00:09<00:00, 293305.68it/s]


{'0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'j': 18, 'k': 19, 'm': 20, 'n': 21, 'p': 22, 'q': 23, 'r': 24, 's': 25, 't': 26, 'u': 27, 'v': 28, 'w': 29, 'x': 30, 'y': 31, 'z': 32}


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,dx,dy,odl2_dis,sphere_dis,req_time_dow,req_time_woy,req_is_weekend,req_time_hour,o_geohash,d_geohash
1608081,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
1608082,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
1608083,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
1608084,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609
1608085,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,116.41,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609


In [12]:
print("Now Add Columns")
# Re-order rows so that all labelled rows come first and the unlabelled (label == -1) rows last.
train_set = data[data['label']!=-1].reset_index(drop=True)
test_set = data[data['label']==-1].reset_index(drop=True)
data = pd.concat([train_set,test_set],axis=0).reset_index(drop=True)
# Number of rows (plans) that belong to each sid.
data['length_sid'] = data['sid'].map(data.groupby('sid').size())
# Collects the names of the aggregate columns produced by the statistics cells that follow.
add_columns = []
del train_set;del test_set;
data.head()

Now Add Columns


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,dy,odl2_dis,sphere_dis,req_time_dow,req_time_woy,req_is_weekend,req_time_hour,o_geohash,d_geohash,length_sid
0,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609,6
1,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609,6
2,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609,6
3,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609,6
4,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,39.91,0.022361,2038.849069,5,45,1,12.783333,1028829263933,1028829262609,6


In [13]:
# Global plan statistics: pairwise interactions among price, eta and distance
# (for every value of column i, summary statistics of column j over all plans).

# One row per distinct (distance, price, eta) combination; the aggregates are merged onto it.
stat_1 = data[['distance','price','eta']].copy().drop_duplicates()

for i in tqdm(['price','eta','distance']):
    for j in ['price','eta','distance']:
        if i!=j:
            # Resulting column names look like "<i>_<j>_mean".
            now = data[[i,j]].groupby([i])[j].agg(['mean','min','max','std']).add_prefix("{}_{}_".format(i,j))
            add_columns.extend(now.columns)
            now = now.reset_index()
            stat_1 = stat_1.merge(now,on=i,how='left')

stat_1.head()

100%|██████████| 3/3 [00:03<00:00,  1.28s/it]


Unnamed: 0,distance,price,eta,price_eta_mean,price_eta_min,price_eta_max,price_eta_std,price_distance_mean,price_distance_min,price_distance_max,...,eta_distance_max,eta_distance_std,distance_price_mean,distance_price_min,distance_price_max,distance_price_std,distance_eta_mean,distance_eta_min,distance_eta_max,distance_eta_std
0,2898.0,200.0,1794.0,2181.607738,445.0,12933.0,883.943658,4627.434087,331.0,53331.0,...,33308.0,6138.437704,667.241379,200.0,1800.0,570.437792,1399.0,411.0,2729.0,760.618301
1,2714.0,,818.0,,,,,,,,...,10988.0,1959.392855,613.043478,200.0,2000.0,536.303537,1372.166667,259.0,2544.0,723.450851
2,3365.0,,1146.0,,,,,,,,...,19477.0,3433.232057,617.1875,200.0,2700.0,628.567201,1604.537879,441.0,3101.0,832.760455
3,3365.0,1700.0,1446.0,2959.530882,1.0,18978.0,2632.387731,17047.997839,1.0,118436.0,...,26273.0,4543.045634,617.1875,200.0,2700.0,628.567201,1604.537879,441.0,3101.0,832.760455
4,2957.0,200.0,2158.0,2181.607738,445.0,12933.0,883.943658,4627.434087,331.0,53331.0,...,35249.0,6726.342007,794.545455,200.0,2000.0,584.496675,1490.685315,366.0,2762.0,812.454934


In [14]:
# Global plan statistics crossed with origin / destination.

# One row per distinct (o, d, sphere_dis, odl2_dis) combination; the aggregates are merged onto it.
stat_2 = data[['o','d','sphere_dis','odl2_dis']].copy().drop_duplicates()

for i in tqdm(['o','d','sphere_dis','odl2_dis']):
    # Number of rows per value of i; unlike the other aggregates this count is merged straight into `data`.
    now = data[[i]].groupby(i)[i].agg(['count']).add_prefix("{}_".format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    data = data.merge(now,on=i,how='left')
    for j in ['price','eta','distance']:
        if i!=j:  # always true here, the i and j name sets do not overlap
            now = data[[i,j]].groupby([i])[j].agg(['mean','min','max','std']).add_prefix("{}_{}_".format(i,j))
            add_columns.extend(now.columns)
            now = now.reset_index()
            stat_2 = stat_2.merge(now,on=i,how='left')
            
# Statistics of every numeric plan attribute per (origin, destination) pair.
for i in tqdm(['price','eta','distance','ep','de','ed','pd','dp','pe','transport_mode_rank']):
    now = data[['o','d',i]].groupby(['o','d'])[i].agg(['mean','min','max','std']).add_prefix('o_d_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_2 = stat_2.merge(now,on=['o','d'],how='left')

# For transport_mode: distinct count, std, most frequent mode and its frequency per (o, d) pair.
for i in tqdm(['transport_mode']):
    now = data[['o','d',i]].groupby(['o','d'])[i].agg(['nunique','std',get_mode,get_mode_count]).add_prefix('o_d_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_2 = stat_2.merge(now,on=['o','d'],how='left')

stat_2.head()

100%|██████████| 4/4 [00:14<00:00,  3.57s/it]
100%|██████████| 10/10 [00:07<00:00,  1.37it/s]
100%|██████████| 1/1 [01:16<00:00, 76.92s/it]


Unnamed: 0,o,d,sphere_dis,odl2_dis,o_price_mean,o_price_min,o_price_max,o_price_std,o_eta_mean,o_eta_min,...,o_d_pe_max,o_d_pe_std,o_d_transport_mode_rank_mean,o_d_transport_mode_rank_min,o_d_transport_mode_rank_max,o_d_transport_mode_rank_std,o_d_transport_mode_nunique,o_d_transport_mode_std,o_d_transport_mode_get_mode,o_d_transport_mode_get_mode_count
0,"116.39,39.92","116.41,39.91",2038.849069,0.022361,1496.918793,-1.0,39300.0,2150.502351,2382.48189,-1.0,...,1.845907,0.540284,2.438247,0.0,5.0,1.689052,7,1.905665,1.0,230.0
1,"116.58,40.08","116.20,39.91",37542.352624,0.416293,4780.547438,-1.0,29400.0,4098.637746,4295.699065,-1.0,...,4.38247,1.440461,2.272727,0.0,5.0,1.678744,6,3.2891,1.0,2.0
2,"116.32,39.89","116.27,39.86",5421.442823,0.05831,2316.077006,-1.0,46000.0,3234.432902,2856.174286,-1.0,...,1.985371,0.939906,1.0,0.0,2.0,0.894427,3,0.894427,2.0,2.0
3,"116.44,39.88","116.45,39.86",2379.751807,0.022361,2064.739955,-1.0,25900.0,3182.439933,2895.299024,-1.0,...,1.943199,0.857354,2.5,0.0,5.0,1.783765,6,2.081666,1.0,3.0
4,"116.29,40.07","116.44,39.93",20142.33964,0.205183,3140.0,200.0,18200.0,3975.443645,4272.906367,1.0,...,2.606232,1.232724,2.0,0.0,4.0,1.581139,5,2.915476,2.0,1.0


In [15]:
# Cross statistics per pid.

# One row per distinct pid (including the missing pid); the aggregates are merged onto it.
stat_3 = data[['pid']].copy().drop_duplicates()
# Summary statistics of every numeric plan attribute over all rows of a pid.
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank','ep','pe','dp','pd','de','ed']):
    now = data[['pid',i]].groupby('pid')[i].agg(['mean','min','max','std','nunique']).add_prefix('pid_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_3 = stat_3.merge(now,on='pid',how='left')
    
# For transport_mode: distinct count, std, most frequent mode and its frequency per pid.
for i in tqdm(['transport_mode']):
    now = data[['pid',i]].groupby('pid')[i].agg(['nunique','std',get_mode,get_mode_count]).add_prefix('pid_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_3 = stat_3.merge(now,on='pid',how='left')

stat_3.head()

100%|██████████| 12/12 [00:13<00:00,  1.29s/it]
100%|██████████| 1/1 [00:16<00:00, 16.26s/it]


Unnamed: 0,pid,pid_price_mean,pid_price_min,pid_price_max,pid_price_std,pid_price_nunique,pid_distance_mean,pid_distance_min,pid_distance_max,pid_distance_std,...,pid_de_nunique,pid_ed_mean,pid_ed_min,pid_ed_max,pid_ed_std,pid_ed_nunique,pid_transport_mode_nunique,pid_transport_mode_std,pid_transport_mode_get_mode,pid_transport_mode_get_mode_count
0,,,,,,,,,,,...,,,,,,,,,,
1,199899.0,2455.292955,-1.0,39100.0,3725.382502,189.0,17892.809577,-1.0,127421.0,16630.349162,...,5220.0,0.485554,0.021277,300.0,6.884106,5220.0,12.0,2.583876,3.0,1154.0
2,177401.0,1200.0,300.0,2100.0,1272.792206,2.0,7163.666667,6157.0,7667.0,871.798906,...,3.0,0.163607,0.121169,0.209355,0.044186,3.0,3.0,1.0,2.0,1.0
3,176887.0,2233.333333,200.0,15700.0,3596.235226,21.0,15690.723684,1315.0,51856.0,16171.928992,...,74.0,0.326021,0.067742,0.909901,0.25478,74.0,10.0,2.623076,1.0,15.0
4,106441.0,2260.577264,-1.0,43900.0,3517.309254,154.0,15754.034956,-1.0,137466.0,15732.879822,...,3069.0,0.696335,0.061026,300.0,9.92793,3069.0,12.0,2.531815,3.0,678.0


In [16]:
# Crossed with time: statistics per day of week of the request.
# NOTE(review): the saved output of this cell lists *_skew columns that this code does not
# compute, so the output presumably comes from an earlier version of the cell.

# One row per distinct day of week; the aggregates are merged onto it.
stat_4 = data[['req_time_dow']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank','ep','pe','dp','pd','de','ed']):
    now = data[['req_time_dow',i]].groupby('req_time_dow')[i].agg(['mean','min','max','std']).add_prefix('req_time_dow_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_4 = stat_4.merge(now,on='req_time_dow',how='left')
    
# For transport_mode: distinct count, std, most frequent mode, its frequency and the median per day of week.
for i in tqdm(['transport_mode']):
    now = data[['req_time_dow',i]].groupby('req_time_dow')[i].agg(['nunique','std',get_mode,get_mode_count,'median']).add_prefix('req_time_dow_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_4 = stat_4.merge(now,on='req_time_dow',how='left')

stat_4.head()

100%|██████████| 12/12 [00:01<00:00,  6.34it/s]
100%|██████████| 1/1 [00:00<00:00,  1.19it/s]


Unnamed: 0,req_time_dow,req_time_dow_price_mean,req_time_dow_price_min,req_time_dow_price_max,req_time_dow_price_std,req_time_dow_price_skew,req_time_dow_distance_mean,req_time_dow_distance_min,req_time_dow_distance_max,req_time_dow_distance_std,...,req_time_dow_ed_mean,req_time_dow_ed_min,req_time_dow_ed_max,req_time_dow_ed_std,req_time_dow_ed_skew,req_time_dow_transport_mode_nunique,req_time_dow_transport_mode_std,req_time_dow_transport_mode_get_mode,req_time_dow_transport_mode_get_mode_count,req_time_dow_transport_mode_median
0,5,2251.099188,-1.0,56700.0,3469.793625,3.461063,16323.44075,-1.0,197384.0,15748.967414,...,0.695829,0.003425,313.0,9.629143,25.511272,12,2.569966,3.0,96416.0,4.0
1,2,2297.873134,-1.0,67700.0,3583.591998,3.504058,16601.894487,-1.0,221816.0,16023.421256,...,0.679489,0.00578,300.0,9.479721,25.891866,12,2.583507,3.0,87617.0,4.0
2,0,2284.402606,-1.0,92300.0,3568.677244,3.672837,16996.235748,-1.0,212163.0,16171.888743,...,0.786222,0.005333,380.0,10.756952,22.86682,12,2.598859,3.0,73807.0,4.0
3,3,2269.170812,-1.0,60200.0,3500.53198,3.45005,16513.472944,-1.0,225864.0,15813.227829,...,0.647176,0.012346,300.0,9.158196,27.067911,12,2.588712,3.0,78991.0,4.0
4,4,2335.253068,-1.0,57300.0,3635.498315,3.41919,16653.500207,-1.0,190162.0,16038.723011,...,0.803855,0.002924,300.0,10.848778,22.584649,12,2.586353,3.0,86750.0,4.0


In [17]:
# Crossed with time: statistics per request hour.
# req_time_hour is fractional (hour + minute/60), so every distinct minute of the day is its own group.
# NOTE(review): the saved output of this cell lists *_skew columns that this code does not
# compute, so the output presumably comes from an earlier version of the cell.

# One row per distinct req_time_hour value; the aggregates are merged onto it.
stat_5 = data[['req_time_hour']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','transport_mode_rank','ep','pe','dp','pd','de','ed']):
    now = data[['req_time_hour',i]].groupby('req_time_hour')[i].agg(['mean','min','max','std']).add_prefix('req_time_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_5 = stat_5.merge(now,on='req_time_hour',how='left')
    
# For transport_mode: distinct count, std, most frequent mode, its frequency and the median per req_time_hour.
for i in tqdm(['transport_mode']):
    now = data[['req_time_hour',i]].groupby('req_time_hour')[i].agg(['nunique','std',get_mode,get_mode_count,'median']).add_prefix('req_time_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_5 = stat_5.merge(now,on='req_time_hour',how='left')

stat_5.head()

100%|██████████| 12/12 [00:05<00:00,  2.13it/s]
100%|██████████| 1/1 [00:01<00:00,  1.76s/it]


Unnamed: 0,req_time_hour,req_time_hour_price_mean,req_time_hour_price_min,req_time_hour_price_max,req_time_hour_price_std,req_time_hour_price_skew,req_time_hour_distance_mean,req_time_hour_distance_min,req_time_hour_distance_max,req_time_hour_distance_std,...,req_time_hour_ed_mean,req_time_hour_ed_min,req_time_hour_ed_max,req_time_hour_ed_std,req_time_hour_ed_skew,req_time_hour_transport_mode_nunique,req_time_hour_transport_mode_std,req_time_hour_transport_mode_get_mode,req_time_hour_transport_mode_get_mode_count,req_time_hour_transport_mode_median
0,12.783333,2042.011882,-1.0,42100.0,3226.440225,3.828206,15461.429173,-1.0,141419.0,15690.178033,...,0.844704,0.048629,300.0,12.245852,23.196574,12,2.559418,3.0,692.0,4.0
1,11.166667,2026.408917,-1.0,24000.0,3024.618348,3.035398,16210.524687,-1.0,93463.0,14961.230032,...,0.903567,0.053174,300.0,11.931266,19.941594,12,2.573411,3.0,666.0,4.0
2,12.266667,2184.874016,-1.0,24400.0,3280.022142,3.20041,16706.460222,-1.0,99399.0,16353.819814,...,0.527603,0.047131,300.0,8.098656,35.089841,12,2.568545,3.0,629.0,4.0
3,23.85,2651.691293,-1.0,30400.0,4019.230929,2.979104,20559.770197,-1.0,94783.0,17366.526501,...,0.686053,0.05564,240.0,10.160963,23.582218,12,2.685473,1.0,126.0,4.0
4,13.7,2067.039409,-1.0,30200.0,3085.534911,3.269696,15132.171191,-1.0,112750.0,14511.324692,...,0.636128,0.054853,300.0,8.612217,26.817793,12,2.555609,3.0,676.0,4.0


In [18]:
# Space-time cross: statistics per (req_time_hour, origin, destination) combination.
# One row per distinct combination; the aggregates are merged onto it.
stat_6 = data[['req_time_hour','o','d']].copy().drop_duplicates()
for i in tqdm(['price','distance','eta','sphere_dis','odl2_dis','ep','pe','dp','pd','de','ed']):
    
    # Resulting column names look like "od_hour_<i>_mean".
    now = data[['req_time_hour','o','d',i]].groupby(['req_time_hour','o','d'])[i].agg(['mean','min','max','std','median']).add_prefix('od_hour_{}_'.format(i))
    add_columns.extend(now.columns)
    now = now.reset_index()
    stat_6 = stat_6.merge(now,on=['req_time_hour','o','d'],how='left')
    
stat_6.head()

100%|██████████| 11/11 [00:15<00:00,  1.43s/it]


Unnamed: 0,req_time_hour,o,d,od_hour_price_mean,od_hour_price_min,od_hour_price_max,od_hour_price_std,od_hour_price_median,od_hour_distance_mean,od_hour_distance_min,...,od_hour_de_mean,od_hour_de_min,od_hour_de_max,od_hour_de_std,od_hour_de_median,od_hour_ed_mean,od_hour_ed_min,od_hour_ed_max,od_hour_ed_std,od_hour_ed_median
0,12.783333,"116.39,39.92","116.41,39.91",700.0,200.0,1700.0,866.025404,200.0,2960.666667,2465.0,...,2.110733,1.097507,3.317848,0.895078,1.971247,0.55528,0.3014,0.911156,0.239789,0.524383
1,11.166667,"116.58,40.08","116.20,39.91",5780.0,3000.0,14100.0,4766.235412,3200.0,48412.333333,47429.0,...,8.133766,6.607133,10.301694,1.730758,7.483773,0.127332,0.097071,0.151352,0.024856,0.133771
2,12.266667,"116.32,39.89","116.27,39.86",1200.0,300.0,2100.0,1272.792206,1200.0,7163.666667,6157.0,...,6.422645,4.776571,8.25296,1.745502,6.238405,0.163607,0.121169,0.209355,0.044186,0.160297
3,23.85,"116.44,39.88","116.45,39.86",566.666667,200.0,1300.0,635.085296,200.0,3341.833333,2714.0,...,3.637031,1.115037,9.086247,3.2384,2.277493,0.51534,0.110056,0.896831,0.358395,0.553313
4,13.7,"116.29,40.07","116.44,39.93",2850.0,600.0,9200.0,4237.530728,800.0,26951.0,24642.0,...,6.162776,4.577962,7.226393,1.046511,6.125594,0.166544,0.138382,0.218438,0.031811,0.163249


In [19]:
# Two-level statistics per sid: aggregate j inside every (sid, i) group first,
# then aggregate those group statistics again over the sid.
stat_7 = data[['sid']].drop_duplicates()

for i in tqdm(['transport_mode','transport_mode_rank']):
    for j in ['price','eta','distance','odl2_dis','sphere_dis']:
        if i!=j:  # always true here, the i and j name sets do not overlap
            # Level 1: statistics of j per (sid, i).
            tmp = data[['sid',i,j]].groupby(['sid',i])[j].agg(['mean','min','max','std']).add_prefix("double_sid_{}_{}_".format(i,j)).reset_index()
            # Iterating a DataFrame yields its column names; keep only the level-1 statistic columns.
            tmp_fea = [n for n in tmp if n not in ['sid',i]]
            # Level 2: statistics of the level-1 columns per sid.
            tmp = tmp.groupby(['sid'])[tmp_fea].agg(['mean','std','max','min'])
            # Flatten the two-level column index into "<level-1 name>_<level-2 statistic>".
            tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
            stat_7 = stat_7.merge(tmp.reset_index(),on='sid',how='left')
            
stat_7.head()

100%|██████████| 2/2 [00:39<00:00, 19.45s/it]


Unnamed: 0,sid,double_sid_transport_mode_price_mean_mean,double_sid_transport_mode_price_mean_std,double_sid_transport_mode_price_mean_max,double_sid_transport_mode_price_mean_min,double_sid_transport_mode_price_min_mean,double_sid_transport_mode_price_min_std,double_sid_transport_mode_price_min_max,double_sid_transport_mode_price_min_min,double_sid_transport_mode_price_max_mean,...,double_sid_transport_mode_rank_sphere_dis_min_max,double_sid_transport_mode_rank_sphere_dis_min_min,double_sid_transport_mode_rank_sphere_dis_max_mean,double_sid_transport_mode_rank_sphere_dis_max_std,double_sid_transport_mode_rank_sphere_dis_max_max,double_sid_transport_mode_rank_sphere_dis_max_min,double_sid_transport_mode_rank_sphere_dis_std_mean,double_sid_transport_mode_rank_sphere_dis_std_std,double_sid_transport_mode_rank_sphere_dis_std_max,double_sid_transport_mode_rank_sphere_dis_std_min
0,2,950.0,1060.660172,1700.0,200.0,950.0,1060.660172,1700.0,200.0,950.0,...,2038.849069,2038.849069,2038.849069,0.0,2038.849069,2038.849069,,,,
1,10,5780.0,4766.235412,14100.0,3000.0,5780.0,4766.235412,14100.0,3000.0,5780.0,...,37542.352624,37542.352624,37542.352624,0.0,37542.352624,37542.352624,,,,
2,21,1200.0,1272.792206,2100.0,300.0,1200.0,1272.792206,2100.0,300.0,1200.0,...,5421.442823,5421.442823,5421.442823,0.0,5421.442823,5421.442823,,,,
3,25,750.0,777.817459,1300.0,200.0,750.0,777.817459,1300.0,200.0,750.0,...,2379.751807,2379.751807,2379.751807,0.0,2379.751807,2379.751807,,,,
4,34,2850.0,4237.530728,9200.0,600.0,2850.0,4237.530728,9200.0,600.0,2850.0,...,20142.33964,20142.33964,20142.33964,0.0,20142.33964,20142.33964,,,,


In [20]:
# Per-sid summary of every plan attribute, plus two ratios between those summaries.
stat_8 = data[['sid']].drop_duplicates()

for i in tqdm(['distance','price','eta','ep','pe','dp','pd','de','ed']):
    tmp = data[['sid',i]].groupby(['sid'])[i].agg(['mean','min','max','std'])
    tmp['mean_max_ratio'] = tmp['mean'] / tmp['max'] 
    tmp['min_mean_ratio'] = tmp['min'] / tmp['mean']
    # Resulting column names look like "sid_<i>_mean".
    tmp = tmp.add_prefix('sid_'+i+'_')
    # tmp still has sid as its index; the merge matches the 'sid' column against that index level.
    stat_8 = stat_8.merge(tmp,on='sid',how='left')
    
stat_8.head()

100%|██████████| 9/9 [00:05<00:00,  1.68it/s]


Unnamed: 0,sid,sid_distance_mean,sid_distance_min,sid_distance_max,sid_distance_std,sid_distance_mean_max_ratio,sid_distance_min_mean_ratio,sid_price_mean,sid_price_min,sid_price_max,...,sid_de_max,sid_de_std,sid_de_mean_max_ratio,sid_de_min_mean_ratio,sid_ed_mean,sid_ed_min,sid_ed_max,sid_ed_std,sid_ed_mean_max_ratio,sid_ed_min_mean_ratio
0,2,2960.666667,2465.0,3365.0,357.054991,0.879842,0.832583,700.0,200.0,1700.0,...,3.317848,0.895078,0.636175,0.519965,0.55528,0.3014,0.911156,0.239789,0.609424,0.542789
1,10,48412.333333,47429.0,49758.0,989.208101,0.972956,0.979688,5780.0,3000.0,14100.0,...,10.301694,1.730758,0.789556,0.812309,0.127332,0.097071,0.151352,0.024856,0.841298,0.76235
2,21,7163.666667,6157.0,7667.0,871.798906,0.934351,0.859476,1200.0,300.0,2100.0,...,8.25296,1.745502,0.778223,0.743708,0.163607,0.121169,0.209355,0.044186,0.781481,0.740608
3,25,3341.833333,2714.0,3898.0,503.026209,0.85732,0.812129,566.666667,200.0,1300.0,...,9.086247,3.2384,0.400279,0.306579,0.51534,0.110056,0.896831,0.358395,0.574624,0.213561
4,34,26951.0,24642.0,33507.0,3725.872985,0.804339,0.914326,2850.0,600.0,9200.0,...,7.226393,1.046511,0.852815,0.742841,0.166544,0.138382,0.218438,0.031811,0.762432,0.830901


In [21]:
# Global statistics per transport mode (one row per distinct mode).
# NOTE(review): the stat_10 cell repeats this code with 'transport_mode_rank' as the key.
stat_9 = data[['transport_mode']].drop_duplicates()

for i in tqdm(['transport_mode']):

    # mean/min/max/std of every plan attribute and ratio over all rows of the mode.
    tmp = data[[i,'price','eta','distance','ep','pe','ed','de','dp','pd']].groupby([i]).agg({'price' : ['mean','min','max','std'],
                                                                                             'eta' : ['mean','min','max','std'],
                                                                                             'distance' : ['mean','min','max','std'],
                                                                                             'ep' : ['mean','min','max','std'],
                                                                                             'pe' : ['mean','min','max','std'],
                                                                                             'dp' : ['mean','min','max','std'],
                                                                                             'pd' : ['mean','min','max','std'],
                                                                                             'de' : ['mean','min','max','std'],
                                                                                             'ed' : ['mean','min','max','std'],})

    # Flatten the (column, statistic) pairs into "<i>_<column>_<statistic>" names.
    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_9 = stat_9.merge(tmp,how='left',on=i)

    # Same statistics for the two origin-destination distances.
    tmp = data[[i,'sphere_dis','odl2_dis']].groupby([i]).agg({'sphere_dis' : ['mean','min','max','std'],
                                                              'odl2_dis' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_9 = stat_9.merge(tmp,how='left',on=i)
    
stat_9.shape

100%|██████████| 1/1 [00:01<00:00,  1.44s/it]


(12, 54)

In [22]:
# Global statistics per plan position (one row per distinct transport_mode_rank).
# NOTE(review): this repeats the stat_9 cell with a different grouping key.
stat_10 = data[['transport_mode_rank']].drop_duplicates()

for i in tqdm(['transport_mode_rank']):

    # mean/min/max/std of every plan attribute and ratio over all rows of the position.
    tmp = data[[i,'price','eta','distance','ep','pe','ed','de','dp','pd']].groupby([i]).agg({'price' : ['mean','min','max','std'],
                                                                                             'eta' : ['mean','min','max','std'],
                                                                                             'distance' : ['mean','min','max','std'],
                                                                                             'ep' : ['mean','min','max','std'],
                                                                                             'pe' : ['mean','min','max','std'],
                                                                                             'dp' : ['mean','min','max','std'],
                                                                                             'pd' : ['mean','min','max','std'],
                                                                                             'de' : ['mean','min','max','std'],
                                                                                             'ed' : ['mean','min','max','std'],})

    # Flatten the (column, statistic) pairs into "<i>_<column>_<statistic>" names.
    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_10 = stat_10.merge(tmp,how='left',on=i)

    # Same statistics for the two origin-destination distances.
    tmp = data[[i,'sphere_dis','odl2_dis']].groupby([i]).agg({'sphere_dis' : ['mean','min','max','std'],
                                                              'odl2_dis' : ['mean','min','max','std'],})

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    tmp = tmp.add_prefix(i+'_').reset_index()
    stat_10 = stat_10.merge(tmp,how='left',on=i)
    
stat_10.shape

100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


(7, 54)

In [23]:
# More Feature

# Ratio of the plan's route distance to the origin-destination sphere distance.
data['distance_sphere_ratio'] = data['distance'] / data['sphere_dis']
# eta and price divided by that ratio.
data['ori_eta'] = data['eta'] / data['distance_sphere_ratio']
data['ori_price'] = data['price'] / data['distance_sphere_ratio']


In [24]:
# profiles & plans
# 60 + 4*5 = 80

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Attach the profile columns to every row; rows whose pid has no profile get NaN.
data = data.merge(profiles,on='pid',how='left')
print(data.shape)

# Per transport mode: std, distinct count and row count of the pids seen with that mode,
# plus the most frequent pid and how often it occurs.
tmp = data[['transport_mode','pid']].groupby(['transport_mode'])['pid'].agg(['std','nunique','count',get_mode,get_mode_count]).add_prefix('transport_mode_pid_').reset_index()
data = data.merge(tmp,how='left',on='transport_mode')

data.shape

(2717358, 108)


(2717358, 113)

In [25]:
# Drop every column whose standard deviation is exactly 0 (constant columns).
# `data.columns` is evaluated once before the loop, so deleting columns while
# iterating is safe here.
for i in tqdm(data.columns):
    try:
        now = data[i].std()
        if now==0:
            del data[i]
    except Exception:
        # Columns whose dtype has no usable std() are kept unchanged.
        # Catching `Exception` instead of using a bare `except:` keeps that
        # best-effort behaviour but no longer swallows KeyboardInterrupt /
        # SystemExit, so the scan can be interrupted.
        continue
        
data.shape

100%|██████████| 113/113 [00:01<00:00, 65.20it/s]


(2717358, 113)

In [26]:
N_COM = 10

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

def tokenize(data):
    """Normalise an iterable of text documents.

    Each document is split with ``word_tokenize``; tokens that are not purely
    alphabetic are discarded, the remaining ones are lower-cased and
    lemmatised with ``WordNetLemmatizer``, and the result is re-joined with
    single spaces.

    Returns a list with one cleaned string per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned_docs = []
    for doc in data:
        words = (token.lower() for token in word_tokenize(doc) if token.isalpha())
        cleaned_docs.append(" ".join(lemmatizer.lemmatize(word) for word in words))
    return cleaned_docs

# `vct` is created here but not used anywhere in this cell.
vct = CountVectorizer(lowercase=False)
svd = TruncatedSVD(n_components=N_COM, random_state=2019)
# Character n-grams (1 to 6 characters, word-boundary aware) weighted by TF-IDF.
tfvec = TfidfVectorizer(ngram_range=(1, 6),analyzer='char_wb')

# For each plan attribute: stringify the per-sid value in `plans`, vectorise it,
# reduce to N_COM SVD components and join those components onto `data` by sid.
# Both `tfvec` and `svd` are re-fitted from scratch for every attribute.
for i in tqdm(['distance','price','eta','transport_mode']):
    # NOTE(review): astype(str) runs before fillna, so any missing value has
    # already been turned into a string by the time fillna('NAN') is applied.
    x = plans[i].astype(str).fillna('NAN').values
    x = tfvec.fit_transform(x)
    x = svd.fit_transform(x)
    svd_feas = pd.DataFrame(x)
    svd_feas.columns = ['{}_svd_fea_{}'.format(i,j) for j in range(N_COM)]
    # Rows of the SVD output are in `plans` row order, so sid can be attached positionally.
    svd_feas['sid'] = plans['sid'].values
    data = data.merge(svd_feas, on='sid', how='left')

print(data.shape)

100%|██████████| 4/4 [03:06<00:00, 47.70s/it]

(2717358, 153)





In [27]:
# Statistics (mean/min/max/std/skew) of the plan attributes and their pairwise
# ratio columns, computed per (transport_mode, transport_mode_rank) pair.
tmp = data[['transport_mode','transport_mode_rank','price','eta','distance','ep','pe','dp','pd','de','ed']].groupby(['transport_mode','transport_mode_rank']).\
                    agg({'price' : ['mean','min','max','std','skew'],
                        'eta' : ['mean','min','max','std','skew'],
                        'distance' : ['mean','min','max','std','skew'],
                        'ep' : ['mean','min','max','std','skew'],
                        'pe' : ['mean','min','max','std','skew'],
                        'dp' : ['mean','min','max','std','skew'],
                        'pd' : ['mean','min','max','std','skew'],
                        'de' : ['mean','min','max','std','skew'],
                        'ed' : ['mean','min','max','std','skew'],})

# Flatten the two-level column index into '<column>_<stat>' names, prefix them
# and broadcast the group statistics back onto every row of the group.
tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
tmp = tmp.add_prefix('transport_mode_and_rank_').reset_index()
data = data.merge(tmp,how='left',on=['transport_mode','transport_mode_rank'])
# Calendar month of the request time.
data['req_time_month'] = data['req_time'].dt.month
data.shape

(2717358, 199)

In [28]:
from sklearn.model_selection import KFold

cv_num = 5

def label_mean(data,feat_set,cols,label_name):
    """Mean-encode one key column.

    Computes the mean of ``feat_set[label_name]`` for every distinct value of
    ``feat_set[cols]`` and looks that mean up for each row of ``data``.

    Parameters
    ----------
    data : DataFrame
        Rows to encode; must contain the column ``cols``.
    feat_set : DataFrame
        Rows the means are computed from; must contain ``cols`` and ``label_name``.
    cols : str
        Name of the key column.
    label_name : str
        Name of the column to average.

    Returns
    -------
    numpy.ndarray
        One value per row of ``data``, in row order. Keys that do not occur in
        ``feat_set`` (and groups whose mean is NaN) are encoded as 0.
    """
    # The original dict-renaming form `SeriesGroupBy.agg({'feats': 'mean'})` is
    # deprecated and rejected by current pandas; mean() + rename produces the
    # same two-column frame (key, 'feats') on old and new versions alike.
    cols_label = (feat_set.groupby(cols, as_index=False)[label_name]
                          .mean()
                          .rename(columns={label_name: 'feats'}))
    # Left join keeps `data`'s row order; keyword arguments avoid relying on
    # the positional order of merge()'s parameters.
    return data[[cols]].merge(cols_label, how='left', on=cols)['feats'].fillna(0).values

def cv_feat(feat_set,test_set,cv_num,f,f_params):
    """Build an out-of-fold feature for ``feat_set`` and a fold-averaged one for ``test_set``.

    ``feat_set`` is split into ``cv_num`` shuffled folds (fixed seed 2019). For
    every fold, ``f(rows_to_encode, rows_to_fit_on, *f_params)`` is evaluated
    twice: once for the held-out rows of ``feat_set`` and once for all of
    ``test_set``; both times it is fitted on the remaining folds only.

    Parameters
    ----------
    feat_set : DataFrame
        Rows that receive out-of-fold values.
    test_set : DataFrame
        Rows that receive the average of the ``cv_num`` per-fold values.
    cv_num : int
        Number of folds.
    f : callable
        Called as ``f(rows_to_encode, rows_to_fit_on, *f_params)``; must return
        one value per row of ``rows_to_encode``.
    f_params : sequence
        Extra positional arguments forwarded to ``f``.

    Returns
    -------
    list
        ``len(feat_set) + len(test_set)`` values: the ``feat_set`` values in
        ``feat_set`` row order followed by the ``test_set`` values.
    """
    result = np.zeros(feat_set.shape[0])
    label_fold = np.zeros(test_set.shape[0])
    kf = KFold(n_splits=cv_num, shuffle=True, random_state=2019)
    for train_fold, test_fold in kf.split(feat_set):
        # KFold yields *positional* row numbers. The original selected rows with
        # label-based `.loc`, which is only correct when feat_set's index is
        # exactly 0..n-1; `.iloc` gives the same rows in that case and the
        # right rows for any other index.
        fit_rows = feat_set.iloc[train_fold]
        result[test_fold] = f(feat_set.iloc[test_fold], fit_rows, *f_params)
        label_fold += f(test_set, fit_rows, *f_params)
    label_fold = label_fold / cv_num
    return list(result) + list(label_fold)

# Rows with a known label vs. rows marked with label == -1.
feat_set = data[data['label']!=-1].copy()
test_set = data[data['label']==-1].copy()

# One binary indicator per click_mode value 0 and 1; rows requested after
# 2018-12-01 get -1 instead of an indicator.
for each in tqdm(range(2)):
    feat_set[f'label_{each}'] = feat_set['click_mode'].map(lambda x:0 if x!=each else 1)
    feat_set.loc[feat_set['req_time']>'2018-12-01',f'label_{each}'] = -1

# Cross-validated mean of each indicator, keyed by origin ('o') and by
# destination ('d').
# NOTE(review): cv_feat returns a plain list (feat_set values first, test_set
# values last) that is assigned positionally, so this assumes every
# label != -1 row of `data` precedes every label == -1 row -- confirm.
for i in tqdm(range(0,2)):
    for col in ['o','d']:
        data[f'{col}_label_{i}_cv'] = cv_feat(feat_set,test_set,cv_num,
                    label_mean,[col,f'label_{i}'])
        
# Release the two copies once the features are attached.
del feat_set;del test_set;
data.head()

100%|██████████| 2/2 [00:02<00:00,  1.00s/it]
100%|██████████| 2/2 [01:11<00:00, 35.90s/it]


Unnamed: 0,sid,click_mode,distance,transport_mode_rank,eta,transport_mode,price,plan_time,pid,req_time,...,transport_mode_and_rank_ed_mean,transport_mode_and_rank_ed_min,transport_mode_and_rank_ed_max,transport_mode_and_rank_ed_std,transport_mode_and_rank_ed_skew,req_time_month,o_label_0_cv,d_label_0_cv,o_label_1_cv,d_label_1_cv
0,2,0.0,2898.0,0.0,1794.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.400001,0.050039,1.532258,0.169702,0.517836,11,0.056512,0.070533,0.312437,0.101475
1,2,0.0,2714.0,1.0,818.0,6.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.302156,0.111111,1.0,0.045328,14.547107,11,0.056304,0.069266,0.313953,0.101579
2,2,0.0,3365.0,2.0,1146.0,3.0,,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.538588,0.006192,206.0,3.999677,24.564294,11,0.057183,0.070244,0.310694,0.101463
3,2,0.0,3365.0,3.0,1446.0,4.0,1700.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.276119,0.067084,3.128049,0.100917,1.690846,11,0.056304,0.069266,0.313953,0.101579
4,2,0.0,2957.0,4.0,2158.0,1.0,200.0,2018-11-10 12:47:12,,2018-11-10 12:47:12,...,0.441789,0.085401,1.816794,0.219685,1.158395,11,0.056512,0.070533,0.312437,0.101475


In [29]:
# Join the ten pre-computed statistic tables onto `data`, each on its own key
# column(s). All joins are left joins, so rows without a match keep NaN.
data = data.merge(stat_1,on=['distance','price','eta'],how='left').\
            merge(stat_2,on=['o','d','sphere_dis','odl2_dis'],how='left').\
            merge(stat_3,on='pid',how='left').\
            merge(stat_4,on='req_time_dow',how='left').\
            merge(stat_5,on='req_time_hour',how='left').\
            merge(stat_6,on=['o','d','req_time_hour'],how='left').\
            merge(stat_7,on=['sid'],how='left').\
            merge(stat_8,on=['sid'],how='left').\
            merge(stat_9,on=['transport_mode'],how='left').\
            merge(stat_10,on=['transport_mode_rank'],how='left')

# Number of missing values in each row, counted over every column present so far.
data['nan_total'] = data.isnull().sum(axis=1)

data.shape

(2717358, 889)

In [30]:
# Collect the names of columns whose standard deviation is at most 0.1; they
# are excluded from the feature list later on.
to_del = []
for i in tqdm(data.columns):
    try:
        now = data[i].std()
        if now<=0.1:
            to_del.append(i)
    except Exception:
        # Columns whose dtype has no usable std() (or whose std cannot be
        # compared with 0.1) are simply not flagged. Catching `Exception`
        # instead of using a bare `except:` keeps that best-effort behaviour
        # without swallowing KeyboardInterrupt / SystemExit.
        continue
        
len(to_del)

100%|██████████| 889/889 [00:08<00:00, 104.52it/s]


132

In [31]:
import gc
gc.collect()

105

In [32]:
# Work on a copy so that the categorical casts below leave `data` untouched.
all_data = data.copy()

# Model inputs: every column except the target / time / plan-mode columns and
# the low-variance columns collected in `to_del`, in column order.
feature_name = [
    column for column in all_data.columns
    if column not in ['label','click_mode','plan_time','req_time','transport_mode']
    and column not in to_del
]
cate_feature = ['sid','o','d','req_time_woy','req_is_weekend','pid','o_geohash','d_geohash','ox','oy','dx','dy']

# Cast the id-like / location-like columns to pandas' category dtype.
for cat_col in tqdm(cate_feature):
    all_data[cat_col] = all_data[cat_col].astype('category')

print(len(cate_feature),' ',len(feature_name))

100%|██████████| 12/12 [00:41<00:00,  6.46s/it]

12   752





In [33]:
%%time
# Cache the full feature frame; the file name carries the number of columns.
# NOTE(review): the ../cache_data directory must already exist.
data.to_pickle("../cache_data/{}_raw_data.pickle".format(data.shape[1]))
# all_data.to_pickle("../cache_data/{}_all_data.pickle".format(all_data.shape[1]))

CPU times: user 11.5 s, sys: 15.5 s, total: 27 s
Wall time: 38.7 s


In [34]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def f1_macro(preds, labels):
    """Evaluation callback returning ``('f1_score', score, True)``.

    ``labels`` is reshaped to 12 rows and reduced with argmax over those rows
    to obtain one class id per column; the support-weighted F1 between these
    class ids and ``preds`` is the score. The trailing ``True`` is returned
    unchanged as the third tuple element.

    NOTE(review): despite the name, the averaging mode is 'weighted'.
    """
    class_ids = labels.reshape(12, -1).argmax(axis=0)
    weighted_f1 = f1_score(y_true=class_ids, y_pred=preds, average='weighted')
    return 'f1_score', weighted_f1, True

def get_f1_score(y, pred):
    """Frequency-weighted one-vs-rest F1 over the 12 classes 0..11.

    ``pred`` must offer ``.tolist()`` yielding one list of scores per sample;
    the predicted class of a sample is the position of its first maximum.
    For every class the binary F1 is weighted by that class's share of ``y``.
    The total is printed with four decimals and returned.
    """
    predicted = [scores.index(max(scores)) for scores in pred.tolist()]
    class_counts = Counter(y)
    n_samples = len(y)
    weighted_parts = []
    for cls in range(12):
        truth_flags = [1 if cls == label else 0 for label in y]
        guess_flags = [1 if cls == guess else 0 for guess in predicted]
        weighted_parts.append(f1_score(truth_flags, guess_flags) * class_counts[cls] / n_samples)
    score = np.sum(weighted_parts)
    print('f1-score = {:.4f}'.format(score))
    return score

# Define F1 Train

In [35]:
# Time-based split on req_time:
#   fit rows        : before 2018-11-23
#   validation rows : after 2018-11-23 and before 2018-12-01
#   `train`         : everything before 2018-12-01 (used for the final refit)
#   test rows       : after 2018-12-01
# NOTE(review): all comparisons are strict, so a request stamped exactly
# 2018-11-23 00:00:00 or 2018-12-01 00:00:00 lands in none of the sets.
train_index = (all_data.req_time < '2018-11-23')
X_train     = all_data[train_index][feature_name].reset_index(drop=True)
train_label = all_data[train_index].label.reset_index(drop=True)

valid_index = (all_data.req_time > '2018-11-23') & (all_data.req_time < '2018-12-01')
X_val       = all_data[valid_index][feature_name].reset_index(drop=True)
y_val       = all_data[valid_index].label.reset_index(drop=True)

# `train_index` is rebound here to the wider mask.
train_index = (all_data.req_time < '2018-12-01')
train       = all_data[train_index].reset_index(drop=True)

test_index = (all_data.req_time > '2018-12-01')
X_test     = all_data[test_index][feature_name].reset_index(drop=True)

In [36]:
from collections import Counter

# Offline
# Binary classifier over the rows of X_train; the built-in metric is disabled
# (metric="None") and AUC is supplied through eval_metric at fit time.
lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=128, reg_alpha=0.1, reg_lambda=2,
        max_depth=-1, n_estimators=3000, objective='binary',
        subsample=0.7, colsample_bytree=0.5, subsample_freq=1,min_child_samples=20,
        learning_rate=0.01, random_state=2019 , n_jobs=40, metric="None", importance_type='gain'
    )

eval_set = [(X_val, y_val)]
lgb_model.fit(X_train, train_label, eval_set=eval_set,verbose=10,early_stopping_rounds=40,eval_metric='auc')
# Tree budget for a later refit: best early-stopping iteration plus 10%.
iters = int(lgb_model.best_iteration_*1.1)
y_test = lgb_model.predict_proba(X_test)  
# Gain-based feature importances; the 100 highest-ranked names are printed.
fi = DF()
fi['name'] = feature_name
fi['score'] = lgb_model.feature_importances_
print(list(fi.sort_values(by=['score'],ascending=False)['name'])[:100])

# Validation rows scored with the positive-class probability; for every sid the
# highest-scoring row's transport_mode becomes the prediction.
offline = data[valid_index][['sid','transport_mode']].copy().reset_index(drop=True)
offline['label'] = lgb_model.predict_proba(X_val)[:,1]
offline = offline.sort_values(['sid','label'],ascending=False)
offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
# Replaces every -1 found in any column of `offline` with 0.
offline.replace(-1,0,inplace=True)
offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
print(f1_score(offline['click_mode'].values,offline['transport_mode'].values,average='weighted'))
# Applied after the score printed above: predictions whose best probability is
# below 0.2 are switched to mode 0.
offline.loc[offline['label']<0.2,'transport_mode'] = 0

# 0.14 6895

from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score,recall_score,precision_score

# Relative frequency of each clicked mode among the evaluated sids.
dic_ = offline['click_mode'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    """Print per-class metrics and return the frequency-weighted F1 over classes 0..11.

    For every class ``i`` one line is printed with: the class id, its share of
    ``y_true``, and the one-vs-rest F1, precision and recall. The weighted sum
    of the F1 values is printed last and returned.

    Parameters
    ----------
    y_pred : array-like of predicted class ids (supports ``== i`` element-wise).
    y_true : array-like of true class ids (supports ``== i`` element-wise).

    Returns
    -------
    float
        Sum over classes of (share of the class in ``y_true``) * (F1 of the class).
    """
    # The class shares are derived from y_true itself instead of the module-level
    # `dic_`, so the result cannot be skewed by a stale global. Missing values in
    # y_true are ignored by value_counts, exactly as they were when `dic_` was
    # built from the same column.
    class_share = pd.Series(y_true).value_counts(normalize=True)
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        # A class that never occurs in y_true contributes weight 0 instead of
        # raising KeyError.
        share = class_share.get(i, 0.0)
        f1 = f1_score(y_true=yt, y_pred=yp)
        f_score += share * f1
        print(i,share,f1, precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
    return f_score
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

Training until validation scores don't improve for 40 rounds.
[10]	valid_0's auc: 0.919866
[20]	valid_0's auc: 0.920298
[30]	valid_0's auc: 0.920596
[40]	valid_0's auc: 0.92089
[50]	valid_0's auc: 0.921146
[60]	valid_0's auc: 0.921405
[70]	valid_0's auc: 0.921534
[80]	valid_0's auc: 0.921648
[90]	valid_0's auc: 0.921694
[100]	valid_0's auc: 0.92176
[110]	valid_0's auc: 0.92189
[120]	valid_0's auc: 0.921926
[130]	valid_0's auc: 0.921982
[140]	valid_0's auc: 0.922068
[150]	valid_0's auc: 0.922134
[160]	valid_0's auc: 0.922184
[170]	valid_0's auc: 0.922283
[180]	valid_0's auc: 0.922347
[190]	valid_0's auc: 0.922402
[200]	valid_0's auc: 0.922496
[210]	valid_0's auc: 0.922571
[220]	valid_0's auc: 0.92267
[230]	valid_0's auc: 0.922765
[240]	valid_0's auc: 0.922816
[250]	valid_0's auc: 0.922918
[260]	valid_0's auc: 0.922997
[270]	valid_0's auc: 0.923071
[280]	valid_0's auc: 0.92315
[290]	valid_0's auc: 0.923236
[300]	valid_0's auc: 0.923318
[310]	valid_0's auc: 0.923372
[320]	valid_0's auc: 0

In [36]:
# Refit on all rows before 2018-12-01 (`train`) with a fixed tree budget `iters`.
# NOTE(review): `iters` was derived from a run with learning_rate=0.01,
# reg_lambda=2 and subsample=0.7, whereas this model uses learning_rate=0.1,
# reg_lambda=10 and subsample=0.6 -- confirm the budget is still appropriate.
lgb_model = lgb.LGBMClassifier(
        boosting_type="gbdt", num_leaves=128, reg_alpha=0.1, reg_lambda=10,
        max_depth=-1, n_estimators=iters, objective='binary',
        subsample=0.6, colsample_bytree=0.5, subsample_freq=1,min_child_samples=20,
        learning_rate=0.1, random_state=2019 , n_jobs=-1, metric="None", importance_type='gain'
    )

lgb_model.fit(train[feature_name], train['label'])
y_test_all = lgb_model.predict_proba(X_test)

In [39]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def auc(y,pred):
    """Return the ROC AUC of scores ``pred`` against true labels ``y`` (thin wrapper around ``roc_auc_score``)."""
#     fpr, tpr, thresholds = metrics.roc_curve(y, pred)
    return roc_auc_score(y, pred)

# From here on the split is by label instead of by date: label == -1 rows are
# scored as the test set, all other rows are used for cross-validation.
# These assignments rebind X_test / train / X_train from the earlier time split.
X_test = all_data[all_data['label']==-1].drop(['transport_mode','label','click_mode'],axis=1)[feature_name]
train = all_data[all_data['label']!=-1].drop(['transport_mode','click_mode'],axis=1)
X_train = train.drop('label',axis=1)[feature_name]
y_train = train['label']

K = 5
# Stratified on the row-level label; rows are shuffled individually.
# NOTE(review): rows that share a sid can end up in different folds -- confirm
# that this is acceptable for the out-of-fold scores.
skf = StratifiedKFold(n_splits = K, shuffle = True ,random_state=1998)

# Accumulators: averaged test prediction, two running AUC means, and the
# out-of-fold prediction for every training row.
lgb_pred_te_all = 0
lgb_auc_mean = 0
lgb_auc_mean2 = 0
cv_pred = np.zeros(X_train.shape[0])

# One LightGBM model per fold. Note that the loop rebinds train_index,
# test_index, X_val and y_val, which were also set by the time-split cell.
for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
#     if i==2:
#     if i!=10:
    y_tr, y_val = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()
    X_tr, X_val= X_train.iloc[train_index,:].copy(), X_train.iloc[test_index,:].copy()
    print( "\nFold ", i)
    # The parameter dict is identical for every fold.
    lgb_params = {
                        'task': 'train',
                        'learning_rate': 0.01,
#                         'max_bin' : 200,
                        'min_sum_hessian_in_leaf': 10,
#                         'tree_learner' : 'voting',
                        'num_leaves' : 128,
#                         'max_depth': 5, 
                        'boosting_type': 'gbdt',
                        'objective': 'xentlambda',
#                         'is_unbalance':True,
                        'colsample_bytree' : 0.5,
                        'metric':'auc',
#                         'lambda_l1': 0.0001,
                        'lambda_l2': 3,
                        'bagging_freq': 1,
                        'verbose': 1,
                        'random_state': 2019,
                        'num_threads':40,}
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val)
#     watchlist = {(lgb_train, 'train'), (lgb_val, 'eval')}
    # valid_sets order matters: the held-out fold is the second set, which is
    # why its score is read back under the key 'valid_1' below.
    lgb_model = lgb.train( 
                           lgb_params, 
                           lgb_train, 
                           num_boost_round=3000, 
                           valid_sets = [lgb_train,lgb_val], 
                           verbose_eval=50, 
#                           feval=gini_lgb, 
                           early_stopping_rounds=50
                         )
    print( " Best iteration = ", lgb_model.best_iteration )

    # Out-of-fold prediction: each training row is written exactly once, by the
    # model of the fold that held it out.
    pred = lgb_model.predict(X_val)
    cv_pred[test_index] = pred
    lgb_auc_mean = auc(y_val,pred) / K + lgb_auc_mean
    lgb_auc_mean2 = lgb_model.best_score['valid_1']['auc'] / K +lgb_auc_mean2
    print( " auc_LGB = ",lgb_model.best_score['valid_1']['auc'])
    print( " auc = ", auc(y_val,pred) )
    # Test prediction at the best iteration; dividing by K here makes
    # lgb_pred_te_all the mean over the K fold models once the loop ends.
    pred_te = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
    lgb_pred_te_all = lgb_pred_te_all + pred_te / K
print( " mean_auc_LGB = ", lgb_auc_mean2 )
print( " mean_auc = ", lgb_auc_mean )


Fold  0




Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.91623	valid_1's auc: 0.915396
[100]	training's auc: 0.918535	valid_1's auc: 0.916179
[150]	training's auc: 0.920358	valid_1's auc: 0.916614
[200]	training's auc: 0.922353	valid_1's auc: 0.91708
[250]	training's auc: 0.924515	valid_1's auc: 0.917596
[300]	training's auc: 0.926569	valid_1's auc: 0.918011
[350]	training's auc: 0.928401	valid_1's auc: 0.918355
[400]	training's auc: 0.930194	valid_1's auc: 0.918471
[450]	training's auc: 0.931776	valid_1's auc: 0.918408
Early stopping, best iteration is:
[403]	training's auc: 0.930298	valid_1's auc: 0.918475
 Best iteration =  403
 auc_LGB =  0.9184752825976616
 auc =  0.9184752825976615

Fold  1




Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.916599	valid_1's auc: 0.913992
[100]	training's auc: 0.918888	valid_1's auc: 0.914762
[150]	training's auc: 0.920685	valid_1's auc: 0.915234
[200]	training's auc: 0.922659	valid_1's auc: 0.915702
[250]	training's auc: 0.924808	valid_1's auc: 0.91622
[300]	training's auc: 0.92683	valid_1's auc: 0.916632
[350]	training's auc: 0.928636	valid_1's auc: 0.916983
[400]	training's auc: 0.930419	valid_1's auc: 0.917111
[450]	training's auc: 0.93198	valid_1's auc: 0.917056
Early stopping, best iteration is:
[403]	training's auc: 0.930521	valid_1's auc: 0.917117
 Best iteration =  403
 auc_LGB =  0.9171170990547305
 auc =  0.9171170990547305

Fold  2




Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.916442	valid_1's auc: 0.914362
[100]	training's auc: 0.918614	valid_1's auc: 0.915006
[150]	training's auc: 0.92048	valid_1's auc: 0.915401
[200]	training's auc: 0.92247	valid_1's auc: 0.915851
[250]	training's auc: 0.924645	valid_1's auc: 0.916368
[300]	training's auc: 0.926708	valid_1's auc: 0.916798
[350]	training's auc: 0.928563	valid_1's auc: 0.917183
[400]	training's auc: 0.930351	valid_1's auc: 0.917326
[450]	training's auc: 0.931918	valid_1's auc: 0.917267
Early stopping, best iteration is:
[409]	training's auc: 0.930662	valid_1's auc: 0.917328
 Best iteration =  409
 auc_LGB =  0.9173277050210599
 auc =  0.9173277050210599

Fold  3




Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.916449	valid_1's auc: 0.914675
[100]	training's auc: 0.918739	valid_1's auc: 0.915463
[150]	training's auc: 0.920596	valid_1's auc: 0.915942
[200]	training's auc: 0.922598	valid_1's auc: 0.916407
[250]	training's auc: 0.924708	valid_1's auc: 0.916871
[300]	training's auc: 0.92675	valid_1's auc: 0.917275
[350]	training's auc: 0.928584	valid_1's auc: 0.917602
[400]	training's auc: 0.930372	valid_1's auc: 0.917736
[450]	training's auc: 0.931935	valid_1's auc: 0.917696
Early stopping, best iteration is:
[421]	training's auc: 0.931061	valid_1's auc: 0.917752
 Best iteration =  421
 auc_LGB =  0.9177519709268566
 auc =  0.9177519709268566

Fold  4




Training until validation scores don't improve for 50 rounds.
[50]	training's auc: 0.916494	valid_1's auc: 0.914329
[100]	training's auc: 0.918813	valid_1's auc: 0.915053
[150]	training's auc: 0.920606	valid_1's auc: 0.915449
[200]	training's auc: 0.922622	valid_1's auc: 0.9159
[250]	training's auc: 0.9248	valid_1's auc: 0.916357
[300]	training's auc: 0.926849	valid_1's auc: 0.916763
[350]	training's auc: 0.928695	valid_1's auc: 0.91708
[400]	training's auc: 0.930484	valid_1's auc: 0.917187
Early stopping, best iteration is:
[399]	training's auc: 0.930444	valid_1's auc: 0.91719
 Best iteration =  399
 auc_LGB =  0.9171899650319528
 auc =  0.9171899650319528
 mean_auc_LGB =  0.9175724045264524
 mean_auc =  0.9175724045264522


In [40]:
from sklearn.metrics import roc_auc_score,precision_score,recall_score

# One row per (sid, candidate mode) of the labelled data, scored with the
# out-of-fold predictions.
offline = all_data[all_data['label']!=-1][['sid','transport_mode']].copy().reset_index(drop=True)
# NOTE(review): each entry of cv_pred is a single out-of-fold prediction, so
# dividing by `cv` only rescales the scores; every threshold below is
# expressed on this rescaled value.
offline['label'] = cv_pred / cv
offline = offline.sort_values(['sid','label'],ascending=False)

# sids that have a mode-3 candidate whose score exceeds 0.04.
tmp = offline[offline['transport_mode']==3]
tmp = tmp[tmp['label']>0.04]
tmp = list(offline[offline.index.isin(tmp.index)]['sid'].values)

# sids that have a mode-4 candidate whose score exceeds 0.03.
tmp1 = offline[offline['transport_mode']==4]
tmp1 = tmp1[tmp1['label']>0.03]
tmp1 = list(offline[offline.index.isin(tmp1.index)]['sid'].values)

# sids that have a mode-8 candidate whose score exceeds 0.06.
tmp2 = offline[offline['transport_mode']==8]
tmp2 = tmp2[tmp2['label']>0.06]
tmp2 = list(offline[offline.index.isin(tmp2.index)]['sid'].values)

# sids that have a mode-6 candidate whose score exceeds 0.05.
tmp3 = offline[offline['transport_mode']==6]
tmp3 = tmp3[tmp3['label']>0.05]
tmp3 = list(offline[offline.index.isin(tmp3.index)]['sid'].values)

print('Mobi 3: ',len(tmp))
print('Mobi 4: ',len(tmp1))
print('Mobi 8: ',len(tmp2))
print('Mobi 6: ',len(tmp3))
# Replaces every -1 found in any column of `offline` with 0.
offline.replace(-1,0,inplace=True)
# Rows are sorted by score within each sid, so head(1) keeps the best candidate.
offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
# Relative frequency of each clicked mode among the evaluated sids.
dic_ = offline['click_mode'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    """Print per-class metrics and return the frequency-weighted F1 over classes 0..11.

    For every class ``i`` one line is printed with: the class id, its share of
    ``y_true``, and the one-vs-rest F1, precision and recall. The weighted sum
    of the F1 values is printed last and returned.

    Parameters
    ----------
    y_pred : array-like of predicted class ids (supports ``== i`` element-wise).
    y_true : array-like of true class ids (supports ``== i`` element-wise).

    Returns
    -------
    float
        Sum over classes of (share of the class in ``y_true``) * (F1 of the class).
    """
    # The class shares are derived from y_true itself instead of the module-level
    # `dic_`, so the result cannot be skewed by a stale global. Missing values in
    # y_true are ignored by value_counts, exactly as they were when `dic_` was
    # built from the same column.
    class_share = pd.Series(y_true).value_counts(normalize=True)
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        # A class that never occurs in y_true contributes weight 0 instead of
        # raising KeyError.
        share = class_share.get(i, 0.0)
        f1 = f1_score(y_true=yt, y_pred=yp)
        f_score += share * f1
        print(i,share,f1, precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
    return f_score
# Score of the plain "best candidate per sid" prediction.
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

# Post-processing: a sid whose best score is below 0.04 is predicted as mode 0.
offline.loc[offline['label']<0.04,'transport_mode'] = 0

# Per-mode overrides. They are applied in the order 8, 6, 4, 3, so when a sid
# appears in several lists the assignment executed last (mode 3) wins.
offline.loc[offline['sid'].isin(tmp2),'transport_mode'] = 8

offline.loc[offline['sid'].isin(tmp3),'transport_mode'] = 6

offline.loc[offline['sid'].isin(tmp1),'transport_mode'] = 4

offline.loc[offline['sid'].isin(tmp),'transport_mode'] = 3

# Score after post-processing.
dic_ = offline['click_mode'].value_counts(normalize = True)
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

Mobi 3:  15880
Mobi 4:  7394
Mobi 8:  2348
Mobi 6:  8945
0 0.093328 0.32174069411976264 1.0 0.19171095491170925
1 0.140738 0.6734605838183899 0.6012525396860836 0.765379641603547
2 0.272982 0.8853876292184616 0.8451537272872596 0.9296437127722706
3 0.049252 0.08576075706047612 0.47815333882934874 0.047104686104117596
4 0.025212 0.01931774626293599 0.2870159453302961 0.009995240361732508
5 0.09496 0.829505874591301 0.7689913834973287 0.900358045492839
6 0.023726 0.26503198294243063 0.36044657097288674 0.2095591334401079
7 0.156418 0.770146791442297 0.7056631892697467 0.8476006597706146
8 0.003714 0.33960489181561615 0.30146137787056365 0.3887991383952612
9 0.097728 0.728182465623345 0.6573906162418807 0.8160609037328095
10 0.029764 0.5469285937012784 0.47300809021819074 0.6482327644133853
11 0.012178 0.611111111111111 0.5219211536225142 0.7370668418459517
0.6728835746160603
0 0.093328 0.322937087863126 0.9591361422824476 0.19415395165438026
1 0.140738 0.6646626271340662 0.63215723480089

In [111]:
# How many sids are shared between the per-mode override lists
# (tmp: mode 3, tmp1: mode 4, tmp2: mode 8, tmp3: mode 6).
a1 = list(set(tmp).intersection(set(tmp1)))
a2 = list(set(tmp).intersection(set(tmp2)))
a3 = list(set(tmp).intersection(set(tmp3)))
a4 = list(set(tmp).intersection(set(tmp1), set(tmp2)))
a5 = list(set(tmp).intersection(set(tmp1), set(tmp2), set(tmp3)))
len(a1),len(a2),len(a3),len(a4),len(a5)

(736, 119, 106, 1, 0)

In [41]:
# Same offline evaluation as the previous cell, but candidates are ranked by a
# score discounted by their position in the plan list.
offline = all_data[all_data['label']!=-1][['sid','transport_mode_rank','transport_mode']].copy().reset_index(drop=True)
# NOTE(review): cv_pred holds one out-of-fold prediction per row; dividing by
# `cv` only rescales it, and all thresholds below are on the rescaled value.
offline['label'] = cv_pred / cv
# Rank 0 keeps its score, rank 1 is halved, rank 2 is divided by 3, and so on.
offline['weighted_label'] = offline['label'] / ((offline['transport_mode_rank']+1))
offline = offline.sort_values(['sid','weighted_label'],ascending=False)

# sids that have a mode-3 candidate whose weighted score exceeds 0.04.
tmp = offline[offline['transport_mode']==3]
tmp = tmp[tmp['weighted_label']>0.04]
tmp = list(offline[offline.index.isin(tmp.index)]['sid'].values)

# sids that have a mode-4 candidate whose weighted score exceeds 0.03.
tmp1 = offline[offline['transport_mode']==4]
tmp1 = tmp1[tmp1['weighted_label']>0.03]
tmp1 = list(offline[offline.index.isin(tmp1.index)]['sid'].values)

# sids that have a mode-8 candidate whose weighted score exceeds 0.02.
tmp2 = offline[offline['transport_mode']==8]
tmp2 = tmp2[tmp2['weighted_label']>0.02]
tmp2 = list(offline[offline.index.isin(tmp2.index)]['sid'].values)

# sids that have a mode-6 candidate whose weighted score exceeds 0.05.
tmp3 = offline[offline['transport_mode']==6]
tmp3 = tmp3[tmp3['weighted_label']>0.05]
tmp3 = list(offline[offline.index.isin(tmp3.index)]['sid'].values)

print('Mobi 3: ',len(tmp))
print('Mobi 4: ',len(tmp1))
print('Mobi 8: ',len(tmp2))
print('Mobi 6: ',len(tmp3))

# Replaces every -1 found in any column of `offline` with 0.
offline.replace(-1,0,inplace=True)
# Rows are sorted by weighted score within each sid, so head(1) keeps the best candidate.
offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','weighted_label']].copy()
offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
# Relative frequency of each clicked mode among the evaluated sids.
dic_ = offline['click_mode'].value_counts(normalize = True)
def get_weighted_fscore(y_pred, y_true):
    """Print per-class metrics and return the frequency-weighted F1 over classes 0..11.

    For every class ``i`` one line is printed with: the class id, its share of
    ``y_true``, and the one-vs-rest F1, precision and recall. The weighted sum
    of the F1 values is printed last and returned.

    Parameters
    ----------
    y_pred : array-like of predicted class ids (supports ``== i`` element-wise).
    y_true : array-like of true class ids (supports ``== i`` element-wise).

    Returns
    -------
    float
        Sum over classes of (share of the class in ``y_true``) * (F1 of the class).
    """
    # The class shares are derived from y_true itself instead of the module-level
    # `dic_`, so the result cannot be skewed by a stale global. Missing values in
    # y_true are ignored by value_counts, exactly as they were when `dic_` was
    # built from the same column.
    class_share = pd.Series(y_true).value_counts(normalize=True)
    f_score = 0
    for i in range(12):
        yt = y_true == i
        yp = y_pred == i
        # A class that never occurs in y_true contributes weight 0 instead of
        # raising KeyError.
        share = class_share.get(i, 0.0)
        f1 = f1_score(y_true=yt, y_pred=yp)
        f_score += share * f1
        print(i,share,f1, precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
    print(f_score)
    return f_score
# Score of the plain "best weighted candidate per sid" prediction.
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

# Post-processing: a sid whose best weighted score is below 0.01 is predicted as mode 0.
offline.loc[offline['weighted_label']<0.01,'transport_mode'] = 0

# Only the mode-8 override is active in this variant; the overrides for modes
# 6, 4 and 3 are switched off below.
offline.loc[offline['sid'].isin(tmp2),'transport_mode'] = 8

# offline.loc[offline['sid'].isin(tmp3),'transport_mode'] = 6

# offline.loc[offline['sid'].isin(tmp1),'transport_mode'] = 4

# offline.loc[offline['sid'].isin(tmp),'transport_mode'] = 3

# Score after post-processing.
get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

Mobi 3:  12717
Mobi 4:  7228
Mobi 8:  3248
Mobi 6:  8757
0 0.093328 0.32174069411976264 1.0 0.19171095491170925
1 0.140738 0.663728906196625 0.63913632715359 0.6902897582742401
2 0.272982 0.885940748715026 0.8521684398603102 0.9225003846407456
3 0.049252 0.2146206104678978 0.2611479799743858 0.18216519126126857
4 0.025212 0.14914536378762672 0.14250614250614252 0.15643344439155957
5 0.09496 0.835576101703432 0.7990773666506488 0.8755686604886268
6 0.023726 0.2925404577228765 0.33857918652332925 0.25752339205934416
7 0.156418 0.7679096126341239 0.721758498886314 0.8203659425385825
8 0.003714 0.36300102774922915 0.2935505319148936 0.4754981152396338
9 0.097728 0.7289253048780487 0.681938106103822 0.7828667321545514
10 0.029764 0.5447159156390456 0.4767981438515081 0.6351968821394974
11 0.012178 0.6088309379167284 0.5545957619111891 0.674823452126786
0.6822297919793219
0 0.093328 0.32175930088288707 0.9997765113420494 0.19173238470769757
1 0.140738 0.663701274298794 0.6391703734898534 0.6

0.6822847664516694

In [89]:
# offline = all_data[all_data['label']!=-1][['sid','transport_mode']].copy().reset_index(drop=True)
# offline['label'] = cv_pred / cv
# offline = offline.sort_values(['sid','label'],ascending=False)
# offline = offline.groupby('sid',as_index=False).head(1)[['sid','transport_mode','label']].copy()
# offline.replace(-1,0,inplace=True)
# offline = offline.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
# print(f1_score(offline['click_mode'].values,offline['transport_mode'].values,average='weighted'))
# offline.loc[offline['label']<0.2,'transport_mode'] = 0

# # 0.14 6895

# from sklearn.metrics import accuracy_score
# from sklearn.metrics import accuracy_score,recall_score,precision_score

# dic_ = offline['click_mode'].value_counts(normalize = True)
# def get_weighted_fscore(y_pred, y_true):
#     f_score = 0
#     for i in range(12):
#         yt = y_true == i
#         yp = y_pred == i
#         f_score += dic_[i] * f1_score(y_true=yt, y_pred= yp)
#         print(i,dic_[i],f1_score(y_true=yt, y_pred= yp), precision_score(y_true=yt, y_pred= yp),recall_score(y_true=yt, y_pred= yp))
#     print(f_score)
# get_weighted_fscore(y_true =offline['click_mode'] , y_pred = offline['transport_mode'])

# # 67438 67456

0.67430226246268
0 0.093328 0.25351949760707904 0.1635400409145458 0.563625064289388
1 0.140738 0.48945982820576883 0.7556142276543356 0.3619633645497307
2 0.272982 0.8830355524436752 0.8575047629566226 0.9101332688602178
3 0.049252 0.019438531667938474 0.8864468864468864 0.009827012101031431
4 0.025212 0.00015859170565379432 0.2 7.932730445819452e-05
5 0.09496 0.8286614627314481 0.8107443678716801 0.8473883740522326
6 0.023726 0.004036326942482342 0.8275862068965517 0.00202309702436146
7 0.156418 0.7109099499455633 0.7652877697841727 0.6637471390760654


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


8 0.003714 0.0 0.0 0.0
9 0.097728 0.6689661311139793 0.7330049741539062 0.6152177472167649
10 0.029764 0.004017946829170294 0.5882352941176471 0.002015858083590915
11 0.012178 0.3098842386464826 0.6873589164785553 0.2000328461159468
0.5938149145614366


In [41]:
# Build the submission from the label == -1 rows with the same post-processing
# as the offline evaluation (rank-discounted score, 0.01 floor, mode-8 override).
sub = all_data[all_data['label']==-1][['sid','transport_mode_rank','transport_mode']].copy().reset_index(drop=True)
# NOTE(review): lgb_pred_te_all is already the mean over the K fold models; the
# extra division by `cv` puts it on the same rescaled range as `cv_pred / cv`
# in the offline cells, which is what the thresholds below were chosen on.
sub['label'] = lgb_pred_te_all / cv
sub['weighted_label'] = sub['label'] / ((sub['transport_mode_rank']+1))
sub = sub.sort_values(['sid','weighted_label'],ascending=False)

# sids that have a mode-8 candidate whose weighted score exceeds 0.02.
tmp2 = sub[sub['transport_mode']==8]
tmp2 = tmp2[tmp2['weighted_label']>0.02]
tmp2 = list(sub[sub.index.isin(tmp2.index)]['sid'].values)

# Replaces every -1 found in any column of `sub` with 0.
sub.replace(-1,0,inplace=True)
# Best weighted candidate per sid.
sub = sub.groupby('sid',as_index=False).head(1)[['sid','transport_mode','weighted_label']].copy()
# NOTE(review): this only adds a click_mode column looked up in train_clicks;
# nothing below reads it -- presumably a leftover from the offline cell.
sub = sub.merge(train_clicks[['sid','click_mode']],how='left',on='sid')
sub.loc[sub['weighted_label']<0.01,'transport_mode'] = 0
sub.loc[sub['sid'].isin(tmp2),'transport_mode'] = 8

In [42]:
sub[['sid','transport_mode']].to_csv('../submit/baseline_6841.csv',index = False,header=None)

In [79]:
1

1

In [None]:
# Positions at which the two test predictions (y_test vs. y_test_all) disagree.
ana_ans = [pos for pos in range(len(y_test)) if y_test[pos] != y_test_all[pos]]

len(ana_ans)

In [30]:
# Majority vote per row of cv_pred: every entry of the row is truncated to an
# int and the most frequent value wins (np.argmax returns the smallest value on ties).
submit = []
for votes in cv_pred:
    counts = np.bincount([int(vote) for vote in votes])
    submit.append(np.argmax(counts))

In [31]:
np.mean(cv_score)

0.6814643765842411

In [29]:
# Submission frame: sid of the rows selected by the mask `choose`, paired with
# the predictions in y_test_all.
# NOTE(review): `choose` is not defined in the visible cells -- it must come
# from an earlier (possibly deleted) cell; confirm before re-running.
sub = DF()
sub['sid'] = all_data[choose]['sid'].values
sub['label'] = y_test_all

In [30]:
sub.to_csv('../submit/带最后14天.csv',index = False,header=None)

In [None]:
# Load two earlier header-less submission files for comparison.
kfc1 = pd.read_csv('../submit/688不带最后7天.csv',header=None)
kfc1.columns = ['sid','k1']
# NOTE(review): unlike kfc1, kfc2 keeps pandas' default integer column names.
kfc2 = pd.read_csv('../submit/688带了最后7天.csv',header=None)

In [58]:
# The original cell ended the assignment with a dangling `=` and put its
# right-hand side on the next line, which is a SyntaxError. The selection is
# now assigned in one statement and previewed explicitly.
to_stack = all_data[feature_name]
to_stack.head(10)

Unnamed: 0,Recommand_0_transport_mode,transport_mode_svd_fea_2,odl2_dis,price_inMin_transport_mode,price_svd_fea_1,transport_mode_svd_fea_3,transport_mode_svd_fea_5,transport_mode_svd_fea_6,distance_mean,transport_mode_svd_fea_9
0,1.0,-0.094032,0.022361,1.0,0.187898,-0.361078,0.066919,0.067537,2960.666667,0.050871
1,2.0,-0.009575,0.416293,11.0,-0.182656,-0.144930,-0.097490,0.038642,48412.333333,-0.113302
2,2.0,0.646982,0.058310,2.0,0.015238,-0.035331,0.113422,0.012515,7163.666667,0.039171
3,1.0,-0.090497,0.022361,1.0,0.252328,-0.430623,0.097569,0.042598,3341.833333,0.020274
4,9.0,-0.069946,0.205183,9.0,-0.222302,0.076463,0.097680,-0.111863,26951.000000,0.252058
5,1.0,-0.090497,0.031623,1.0,0.252328,-0.430623,0.097569,0.042598,3626.166667,0.020274
6,1.0,-0.090497,0.022361,1.0,0.252328,-0.430623,0.097569,0.042598,2963.333333,0.020274
7,2.0,0.228207,0.111803,2.0,-0.303439,-0.016658,-0.258885,0.157420,16841.200000,0.229999
8,2.0,0.231946,0.101980,1.0,-0.194354,0.047922,-0.338281,-0.295221,10291.750000,-0.219847
9,1.0,-0.211129,0.110000,1.0,-0.115306,-0.361749,-0.012318,0.039733,10360.750000,-0.309252


In [24]:
# Rank the features by the importance stored on `lgb_model`.
# NOTE(review): this needs an estimator exposing `feature_importances_` and a
# `feature_name` list of matching length, so it is tied to the state left by
# an earlier cell -- confirm which model is current before re-running.
fi = DF()
fi['name'] = feature_name
fi['score'] = lgb_model.feature_importances_
# Rebinds `feature_name` to the 10 highest-ranked names (as a numpy array), so
# every later use of `feature_name` sees only those 10 features and running
# this cell a second time breaks the length match above.
feature_name = fi.sort_values(by=['score'],ascending=False).head(10)['name'].values
fi.sort_values(by=['score'],ascending=False)

Unnamed: 0,name,score
110,Recommand_0_transport_mode,1.444737e+06
284,transport_mode_svd_fea_2,1.113954e+06
163,odl2_dis,3.902617e+05
39,price_inMin_transport_mode,2.947026e+05
263,price_svd_fea_1,2.904361e+05
285,transport_mode_svd_fea_3,2.521495e+05
287,transport_mode_svd_fea_5,1.702749e+05
288,transport_mode_svd_fea_6,1.557480e+05
2,distance_mean,1.536084e+05
291,transport_mode_svd_fea_9,1.252806e+05


In [27]:
y_test

array([7., 9., 7., ..., 2., 2., 1.])

In [28]:
y_test_all

array([7., 9., 7., ..., 2., 2., 1.])

In [36]:
# Two further prediction files to blend with `sub`: `kfc` is read with its own
# header row, `rfl` is header-less and gets its column names assigned here.
kfc = pd.read_csv("baseline.csv")
rfl = pd.read_csv("../submit/auc_xgb_lgb.csv",header=None)
rfl.columns = ['sid','label']

In [41]:
# One row per sid of `sub` with three candidate answers, in this column order:
# [0] label_x (sub's own 'label'), [1] recommend_mode (presumably supplied by
# `kfc` -- confirm), [2] label_y (rfl's 'label'). sids missing from kfc or rfl get NaN.
jb = sub.merge(kfc,how='left',on='sid').merge(rfl,how='left',on='sid')[['label_x','recommend_mode','label_y']].values

In [50]:
# Blend the three answers per row of `jb` (columns: own, baseline, blend).
# The first answer is kept whenever at least one of the other two agrees with
# it; in every other case the second answer is used.
rh = []
for own, baseline, blend in jb:
    confirmed = (own == blend) or (own == baseline)
    rh.append(own if confirmed else baseline)

In [51]:
# Number of rows where the blended answer differs from each of the three
# source answers (columns 0, 1 and 2 of `jb`).
a = sum(1 for pos, row in enumerate(jb) if rh[pos] != row[0])
b = sum(1 for pos, row in enumerate(jb) if rh[pos] != row[1])
c = sum(1 for pos, row in enumerate(jb) if rh[pos] != row[2])

print(a)
print(b)
print(c)

1706
1338
3834


In [54]:
sub['rh'] = rh

In [56]:
sub[['sid','rh']].to_csv("try_rh.csv",index=False,header=False)