In [1]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
import geohash
import lightgbm as lgb
import time
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.model_selection import StratifiedKFold,KFold

In [2]:
feat = pd.read_csv('../cache/feat_pid_0.csv')

In [3]:
feat.loc[feat['req_time']>'2018-12-01','click_mode'] = -1
test_set = feat.loc[feat['req_time']>'2018-12-01']
test_set['plan_time_dayofyear'].min()

335

In [4]:
def label_mean(data,feat_set,cols,label_name):
    cols_label = feat_set.groupby([cols],as_index=False)[label_name].agg({'feats':'mean'})
    return data[[cols]].merge(cols_label,'left',[cols])['feats'].fillna(0).values

def cv_feat(feat_set,test_set,cv_num,f,f_params): 
    result = np.zeros((feat_set.shape[0], 1))
    label_fold = np.zeros((test_set.shape[0]))
    kf = KFold(n_splits=cv_num,
                         shuffle=True,
                         random_state=520).split(feat_set)
    for k, (train_fold, test_fold) in enumerate(kf):
        result[test_fold, 0] = f(feat_set.loc[test_fold, :], feat_set.loc[train_fold, :],*f_params)
        label_fold += f(test_set, feat_set.loc[train_fold, :],*f_params)
    label_fold = label_fold/cv_num
    result = [x[0] for x in list(result)] + list(label_fold)
    return result

In [5]:
feat['o_id'] = LabelEncoder().fit_transform(feat['o'])
feat['d_id'] = LabelEncoder().fit_transform(feat['d'])
feat['od_id'] = LabelEncoder().fit_transform(feat['o'].map(str)+\
                                             feat['d'].map(str))
feat['plan_hour_minute'] = feat['plan_time_hour']*60+\
                            feat['plan_time_minute']
feat['o_hour_id'] = LabelEncoder().fit_transform(feat['o'].map(str)+\
                                             feat['plan_time_hour'].map(str))
feat['d_hour_id'] = LabelEncoder().fit_transform(feat['d'].map(str)+\
                                             feat['plan_time_hour'].map(str))

feat['pid_o_id'] = LabelEncoder().fit_transform(feat['pid'].map(str)+\
                                             feat['o'].map(str))
feat['pid_d_id'] = LabelEncoder().fit_transform(feat['pid'].map(str)+\
                                             feat['d'].map(str))
feat['pid_hour_id'] = LabelEncoder().fit_transform(feat['pid'].map(str)+\
                                             feat['plan_time_hour'].map(str))

In [6]:
for i in range(0,12):
    feat[f'plans_mode_list_1th_{i}'] = feat['plans_mode_list_1th'].map(
        lambda x:1 if x==i else 0)

In [7]:
for each in range(12):
    feat[f'label_{each}'] = feat['click_mode'].map(lambda x:0 if x!=each else 1)
    feat.loc[feat['req_time']>'2018-12-01',f'label_{each}'] = -1

In [8]:
feat_set = feat[feat['label_0'] != -1]
test_set = feat[feat['label_0'] == -1]
cv_num = 4

In [9]:
feat_set[['plans_mode_list_1th','click_mode']]
feat_set['yizhi'] = feat_set['plans_mode_list_1th']-feat_set['click_mode']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
def get_time_des(feat,start_date,end_date,id_feat):
    everyday_des = {}
    for i in range(start_date,end_date):
        history = feat[feat['plan_time_dayofyear']==i]
        everyday_list = []
        each_label = history.groupby(id_feat,as_index = False)[f'label_0'].agg({
                f'{id_feat}_label_0_{i}_his_sum':'sum',
                f'{id_feat}_label_0_{i}_his_count':'count',
            })
        for j in range(1,12):
            tmp = history.groupby(id_feat,as_index = False)[f'label_{j}'].agg({
                f'{id_feat}_label_{j}_{i}_his_sum':'sum',
                f'{id_feat}_label_{j}_{i}_his_count':'count',
            })
            each_label = each_label.merge(tmp,'left',[id_feat])
        everyday_des[str(i)] = each_label

    # everyday_des['274'][0]
    history_des = pd.DataFrame(columns=[id_feat])
    for i in range(start_date,end_date):
        history_des = history_des.merge(everyday_des[str(i)],
                                              'outer',[id_feat]).fillna(0)
    return history_des

def reshape_his_feat(start_date,end_date,id_feat,history_des):
    result = []
    for i in range(history_des.shape[0]):
        for j in range(start_date,end_date):
            eachline =  [history_des.loc[i,id_feat]]
            eachline.append(j)
            for k in range(0,12):
                eachline.append(history_des.loc[i,
                        f'{id_feat}_label_{k}_{j}_after_mean'])
                eachline.append(history_des.loc[i,
                        f'{id_feat}_label_{k}_{j}_day7_mean'])
            result.append(eachline)
    columns_name = [id_feat,'date_huachuang']
    for kk in range(0,12):
        columns_name.append(f'{id_feat}_after_label_{kk}')
        columns_name.append(f'{id_feat}_day7_label_{kk}')
    result = pd.DataFrame(result,columns = columns_name)  
    return result

def compute_his_feat(start_date,end_date,id_feat,history_des):
    for k in range(0,12):
        for i in range(start_date,end_date):
            name = f'{id_feat}_label_{k}_{i}'
            yes_name = f'{id_feat}_label_{k}_{i-1}'
            if i == start_date:
                history_des[f'{name}_after_count'] = 0
                history_des[f'{name}_after_sum'] = 0
                history_des[f'{name}_day7_count'] = 0
                history_des[f'{name}_day7_sum'] = 0
#                 history_des[f'{all_name}_mean'] = 0
#                 history_des[f'{day7_name}_mean'] = 0
            else:
#         {id_feat}_label_{j}_{i}_his_sum'
                history_des[f'{name}_after_count'] = history_des[f'{yes_name}_after_count']+\
                            history_des[f'{yes_name}_his_count']
                history_des[f'{name}_after_sum'] = history_des[f'{yes_name}_after_sum']+\
                            history_des[f'{yes_name}_his_sum']
                history_des[f'{name}_day7_count'] = history_des[f'{yes_name}_day7_count']+\
                            history_des[f'{yes_name}_his_count']
                history_des[f'{name}_day7_sum'] = history_des[f'{yes_name}_day7_sum']+\
                            history_des[f'{yes_name}_his_sum']
                day7_before = f'{id_feat}_label_{k}_{i-8}'
                if f'{day7_before}_his_sum' in history_des.columns:
                    history_des[f'{name}_day7_count'] = history_des[f'{name}_day7_count']-\
                                history_des[f'{day7_before}_his_count']
                    history_des[f'{name}_day7_sum'] = history_des[f'{name}_day7_sum']-\
                                history_des[f'{day7_before}_his_sum']
            history_des[f'{name}_after_mean'] = history_des[f'{name}_after_sum']/\
                    (history_des[f'{name}_after_count']+0.0001)
            history_des[f'{name}_day7_mean'] = history_des[f'{name}_day7_sum']/\
                    (history_des[f'{name}_day7_count']+0.0001)

    return reshape_his_feat(start_date,end_date,id_feat,history_des)

In [11]:
# 计算出发点到目的地的角度方向,参考的是wiki的内容
def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6378.137  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))



In [12]:
id_feat = ['geodistance_id',
           'plans_mode_list_1th',
           'pid',
           'o_id','d_id',
#       'od_id','juedui_lujing',
           'plan_hour_minute',
           'o_hour_id',
           'd_hour_id',
#            'pid_o_id','pid_d_id','pid_hour_id'
          ]
# feat['date_huachuang'] = list(map(lambda x,y:x if y!=-1 else 335,
#                             feat['plan_time_dayofyear'],feat['label_0']))
# for each in id_feat:
#     history_des = get_time_des(feat,274,335,each)
#     print(1)
#     tmep = compute_his_feat(274,335+1,each,history_des)
#     print(3)
#     del history_des
#     feat = feat.merge(tmep,'left',[each,'date_huachuang']).fillna(0)
#     print(each)
# del feat['date_huachuang']
    

In [13]:
# plan_time_hour
for i in range(12):
    for col in ['geodistance_id','plans_mode_list_1th','pid','o_id','d_id',
                'od_id','juedui_lujing','plan_hour_minute','o_hour_id',
                'd_hour_id','pid_o_id','pid_d_id','pid_hour_id']:
        feat[f'{col}_label_{i}_cv'] = cv_feat(feat_set,test_set,cv_num,
                    label_mean,[col,f'label_{i}'])
        

In [14]:
def get_deepwalk_feat(simple_set,col1,col2,flag=True):
    if os.path.exists(f'../cache/deepwalk_{col1}_{col2}.csv') and flag:
        col1_emb = pd.read_csv(f'../cache/deepwalk_{col1}_{col2}1.csv')
        col2_emb = pd.read_csv(f'../cache/deepwalk_{col1}_{col2}2.csv')
        return col1_emb,col2_emb
    else:
        temp = simple_set.groupby([col1,col2],
            as_index=False)[col1].agg({'count'}).reset_index()
        lbl1,lbl2 = LabelEncoder(),LabelEncoder()
        temp[col1] = lbl1.fit_transform(temp[col1])
        temp[col2] = lbl2.fit_transform(temp[col2]) + (temp[col1].max()+1)
        k = f'../cache/{col1}_{col2}_graph.csv'
        temp.to_csv(k,index=False,header=False,sep=' ')
        os.system(f"deepwalk --input {k} --format edgelist --representation-size 8 --output ../cache/{col1}_{col2}_deepwalk.csv --workers 10")
        kfc= pd.read_csv(f'../cache/{col1}_{col2}_deepwalk.csv',delimiter=" ", 
            names=['col'] + [f"{col1}_{col2}_{col2}_deepwalk_{j}" for j in range(8)], 
                         skiprows=1)
        
        kfc1= pd.read_csv(f'../cache/{col1}_{col2}_deepwalk.csv',delimiter=" ", 
            names=['col'] + [f"{col1}_{col2}_{col1}_deepwalk_{j}" for j in range(8)], 
                         skiprows=1)

        col2_emb= kfc[~kfc['col'].isin(temp[col1].unique())]
        col2_emb[col2] = lbl2.inverse_transform(list(col2_emb['col']-temp[col1].max()-1))
        col2_emb[col2] = col2_emb[col2].map(int)
        del col2_emb['col']
        
        col1_emb= kfc1[~kfc1['col'].isin(temp[col2].unique())]
        col1_emb[col1] = lbl1.inverse_transform(col1_emb['col'])
        col1_emb[col1] = col1_emb[col1].map(int)
        del col1_emb['col']
        
        col1_emb.to_csv(f'../cache/deepwalk_{col1}_{col2}1.csv')
        col2_emb.to_csv(f'../cache/deepwalk_{col1}_{col2}2.csv')
        return col1_emb,col2_emb

In [15]:
# col1_emb,col2_emb = get_deepwalk_feat(feat,'o_id','d_id')      
# feat = feat.merge(col1_emb,'left',['o_id'])
# feat = feat.merge(col2_emb,'left',['d_id'])

# col1_emb,col2_emb = get_deepwalk_feat(feat,'mode_list_encode1234','lujing')      
# feat = feat.merge(col1_emb,'left',['mode_list_encode1234'])
# feat = feat.merge(col2_emb,'left',['lujing'])

In [16]:
simple_set = feat

In [17]:
str_col = simple_set.select_dtypes(include=['object']).columns.tolist()
ignore_col = ['click_mode','sid'] + [f'label_{i}' for i in range(12)]
# [x for x in pre_col if 'after' in x]
pre_col = [x for x in simple_set.columns if x not in str_col+ignore_col]
pre_col = [x for x in pre_col if 'day7' not in x]   #删除after特征

train_index = (simple_set.req_time < '2018-11-23')
train_x     = simple_set[train_index][pre_col].reset_index(drop=True)
train_y     = simple_set[train_index].click_mode.reset_index(drop=True)

valid_index = (simple_set.req_time > '2018-11-23') & (simple_set.req_time < '2018-12-01')
valid_x     = simple_set[valid_index][pre_col].reset_index(drop=True)
valid_y     = simple_set[valid_index].click_mode.reset_index(drop=True)

test_index = (simple_set.req_time > '2018-12-01')
test_x     = simple_set[test_index][pre_col].reset_index(drop=True)

In [18]:
cate_list = [f'plans_mode_list_{x+1}th' for x in range(7)]+\
  ['mode_list_encode12_cate'] + [x for x in pre_col if 'max_mode_cate' in x] 
#     _max_mode_cate
cate_list = []

In [19]:
def f1_weighted(labels,preds):
    preds = np.argmax(preds.reshape(12, -1), axis=0)
    score = f1_score(y_true=labels, y_pred=preds, average='weighted')
    return 'f1_weighted', score, True

lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=61, reg_alpha=0, reg_lambda=0.01,
    max_depth=-1, n_estimators=2000, objective='multiclass',
    subsample=0.8, colsample_bytree=0.8, subsample_freq=1,min_child_samples = 50,  
                            learning_rate=0.05, random_state=2019, metric="None",n_jobs=-1)
eval_set = [(valid_x, valid_y)]
lgb_model.fit(train_x, train_y, eval_set=eval_set, eval_metric=f1_weighted,
              categorical_feature=cate_list, verbose=10, early_stopping_rounds=100)
# [186]	valid_0's f1_weighted: 0.69201
# [237]	valid_0's f1_weighted: 0.697271



Training until validation scores don't improve for 100 rounds.
[10]	valid_0's f1_weighted: 0.690482
[20]	valid_0's f1_weighted: 0.690521
[30]	valid_0's f1_weighted: 0.691488
[40]	valid_0's f1_weighted: 0.692299
[50]	valid_0's f1_weighted: 0.692651
[60]	valid_0's f1_weighted: 0.693265
[70]	valid_0's f1_weighted: 0.693725
[80]	valid_0's f1_weighted: 0.694405
[90]	valid_0's f1_weighted: 0.694636
[100]	valid_0's f1_weighted: 0.695113
[110]	valid_0's f1_weighted: 0.695265
[120]	valid_0's f1_weighted: 0.695484
[130]	valid_0's f1_weighted: 0.695757
[140]	valid_0's f1_weighted: 0.696157
[150]	valid_0's f1_weighted: 0.696329
[160]	valid_0's f1_weighted: 0.696741
[170]	valid_0's f1_weighted: 0.696672
[180]	valid_0's f1_weighted: 0.696755
[190]	valid_0's f1_weighted: 0.696943
[200]	valid_0's f1_weighted: 0.696674
[210]	valid_0's f1_weighted: 0.69684
[220]	valid_0's f1_weighted: 0.696973
[230]	valid_0's f1_weighted: 0.696949
[240]	valid_0's f1_weighted: 0.696945
[250]	valid_0's f1_weighted: 0.6970

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
        learning_rate=0.05, max_depth=-1, metric='None',
        min_child_samples=50, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=61,
        objective='multiclass', random_state=2019, reg_alpha=0,
        reg_lambda=0.01, silent=True, subsample=0.8,
        subsample_for_bin=200000, subsample_freq=1)

In [20]:
# print("Distance Eta Price -- In Min State 统计")
# for i in tqdm(['distance','eta','price']):
#     for j in range(1,3):
#         tmp = get_ktime_feature(j,data,i)
#         now = ['sid','distance','eta','price','sphere_dis','odl2_dis','distance_sphere_ratio',
#                'transport_mode_rank','transport_mode','req_time_dow','req_time_hour']
#         now = [j for j in now if i not in j]
#         tmp = tmp[now].set_index('sid').add_prefix("{}_inMin_{}_".format(i,j-1)).reset_index()
#         feature = feature.merge(tmp,on='sid',how='left')
        
# print(feature.shape)

In [21]:
imp = pd.DataFrame()
imp['fea'] = pre_col
imp['imp'] = lgb_model.feature_importances_ 
imp = imp.sort_values('imp',ascending = False)
imp.to_csv('../output/imp_baseline_label.csv')
imp

Unnamed: 0,fea,imp
276,hour_time,939
675,o_id_label_3_cv,930
466,pid_d_geohash_num_on_plan_time_hour_std,888
223,plans_eta_list/max_1th,882
636,o_id_label_0_cv,782
135,plans_distance_/_eta_list_1th,753
349,hour_time_on_geodistance_sum,753
674,pid_label_3_cv,736
221,plans_eta_list/max_std,727
437,plan_time_hour_o_geohash_num_on_geodistance_skew,727


In [22]:
all_train_x = simple_set[simple_set.req_time < '2018-12-01'][pre_col].reset_index(drop=True)
all_train_y = simple_set[simple_set.req_time < '2018-12-01'].click_mode.reset_index(drop=True)
print(lgb_model.best_iteration_)
lgb_model.n_estimators   = lgb_model.best_iteration_
lgb_model.fit(all_train_x, all_train_y,categorical_feature=[])
print('fit over')
result  = pd.DataFrame()
result['sid'] = simple_set[test_index]['sid']
result['recommend_mode'] = lgb_model.predict(test_x)
result['recommend_mode'] = result['recommend_mode'].astype(int)
print(len(result))
print(result['recommend_mode'].value_counts())
result[['sid', 'recommend_mode']].to_csv('../output/baseline.csv', index=False)

260




fit over


  if diff:


94358
2     34081
7     22243
1     17252
5     10745
10     3437
0      3404
9      1778
3       574
11      333
6       257
8       212
4        42
Name: recommend_mode, dtype: int64


In [23]:
test_pred = lgb_model.predict_proba(test_x)

In [34]:
test_pred_pd = pd.DataFrame(test_pred,columns=[f'proba{i}' for i in range(12)])

In [45]:
a = pd.concat([result[['sid', 'recommend_mode']].reset_index(drop=True),test_pred_pd],axis=1)

In [47]:
a.to_csv('../output/result_xuan.csv', index=False)

In [24]:
from sklearn.metrics import precision_score,recall_score

offline = []
for i in test_pred:
    now = np.argmax(i)
    if i[3]>0.25:
        offline.append(3)
    elif i[6]>0.3:
        offline.append(6)
    elif i[4]>0.2:
        offline.append(4)
    elif i[8]>0.2:
        offline.append(8)
    elif i[0]>0.29:
        offline.append(0)
    else:
        offline.append(now)
result['recommend_mode'] = offline
result[['sid', 'recommend_mode']].to_csv('../output/gogogo.csv', index=False)


In [25]:
test_simple = simple_set[test_index].reset_index(drop=True)
result['plans_mode_list'] = test_simple['plans_mode_list'].values
result['is_in_plans'] = list(map(lambda x,y:1 if str(x) in y else 0,
                                 result['recommend_mode'],
                               result['plans_mode_list'] ))
result['plans_mode_list_1th'] = test_simple['plans_mode_list_1th'].values
result['yizhi'] = result['plans_mode_list_1th']-result['recommend_mode']

In [26]:
result['is_in_plans'].value_counts()

1    92042
0     2316
Name: is_in_plans, dtype: int64

In [27]:
result['recommend_mode'] = list(map(lambda x,y,z:0 if y==0 else x,
                                    result['recommend_mode'] ,
                                   result['is_in_plans'] ,
                                   result['plans_mode_list_1th'] ))

In [28]:
feat_set['click_mode'].value_counts()/feat_set.shape[0]

2.0     0.272982
7.0     0.156418
1.0     0.140738
9.0     0.097728
5.0     0.094960
0.0     0.093328
3.0     0.049252
10.0    0.029764
4.0     0.025212
6.0     0.023726
11.0    0.012178
8.0     0.003714
Name: click_mode, dtype: float64

In [29]:
# result['yizhi'] = result['plans_mode_list_1th']-result['recommend_mode']
result['recommend_mode'].value_counts()/result.shape[0]

2     0.359991
7     0.230929
1     0.173054
5     0.110865
0     0.044501
10    0.036086
9     0.017879
3     0.012569
8     0.005257
11    0.003201
6     0.003158
4     0.002512
Name: recommend_mode, dtype: float64

In [30]:
result[['sid', 'recommend_mode']].to_csv('../output/baseline_xiuzheng.csv', index=False)

In [31]:
result

Unnamed: 0,sid,recommend_mode,plans_mode_list,is_in_plans,plans_mode_list_1th,yizhi
500000,1126541,2,"[2, 3, 4, 9, 6, 7]",1,2,0
500001,453685,1,"[1, 6, 3, 4, 1, 5]",1,1,0
500002,2066073,7,"[7, 3, 4, 10, 1]",1,7,0
500003,452746,2,"[4, 3, 6, 2, 1, 5]",1,4,2
500004,1431672,5,"[5, 3, 6, 1]",1,5,0
500005,1362848,1,"[1, 3, 4, 6, 11]",1,1,0
500006,2043270,5,"[5, 6, 3]",1,5,0
500007,1404698,2,"[2, 3, 4]",1,2,0
500008,1365075,1,"[1, 3, 4, 8, 1]",1,1,0
500009,1344781,1,"[1, 3, 4, 6, 9, 1]",1,1,0
