In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
import datetime, time, gc
from utils import distance, haversine, standard, pad_seq 
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter 
from sklearn.metrics import mean_squared_error as mse 
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

# Raise pandas' display limits so long/wide frames are shown without truncation
# (up to 500 rows and 500 columns).
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
%%time

# Read the training CSV directly out of the zip archive. The context managers
# close both the member handle and the archive even if parsing raises.
with ZipFile('data/Round2_train_717.zip') as myzip:
    with myzip.open('Round2_train_717.csv') as f:
        train_df = pd.read_csv(f)
print(train_df)

# This runs before the rename below, so these are column names of the raw CSV.
train_df.drop_duplicates(['loadingOrder', 'label'], inplace=True)

# Columns that are not used downstream.
del_cols = ['arrive_time_index', 'arrive_time', 'direction', 'geo_hash3', 'selcte_index']  # 'or_last_index', 'begin_year', 
train_df.drop(del_cols, axis=1, inplace=True) 

# NOTE: the assignment below renames the remaining columns purely by POSITION.
# It therefore relies on the raw CSV's column order matching this list exactly;
# pandas only checks that the lengths agree, not that the names line up.
columns = ['TRANSPORT_TRACE', 'begin_port_name', 'begin_port_position',
        'begin_port_position_hash3', 'begin_port_position_hash4',
        'begin_port_position_hash5', 'carrierName', 'end_port_name',
        'end_port_position', 'end_port_position_hash3',
        'end_port_position_hash4', 'end_port_position_hash5', 'loadingOrder',
        'vesselMMSI', 'test_index', 'geo_hash3', 'geo_hash5',
        'geo_hash4', 'speed', 'longitude',
        'latitude', 'timestamp', 'direction', 'label']
train_df.columns = columns

test_df = pd.read_csv("data/Round2_test_717.csv")

       TRANSPORT_TRACE               arrive_time  arrive_time_index  \
0          HKHKG-CLVAP  2019-02-17T06:19:14.000Z               4569   
1          HKHKG-CLVAP  2019-02-17T06:19:14.000Z               4569   
2          HKHKG-CLVAP  2019-02-17T06:19:14.000Z               4569   
3          HKHKG-CLVAP  2019-02-17T06:19:14.000Z               4569   
4          HKHKG-CLVAP  2019-02-17T06:19:14.000Z               4569   
...                ...                       ...                ...   
153910       HKG-MXZLO  2020-01-24T07:23:14.000Z              54010   
153911     CNSHK-MXZLO  2020-02-09T02:51:02.000Z              49879   
153912       HKG-MXZLO  2020-01-24T07:23:14.000Z              54010   
153913     HKHKG-MXZLO  2020-02-21T12:30:48.000Z              31274   
153914     HKHKG-MXZLO  2020-02-21T12:30:48.000Z              31274   

       begin_port_name           begin_port_position  \
0                HKHKG  114.13970900000001 22.419915   
1                HKHKG  114.1397090

In [3]:
# Sanity check: (rows, columns) of the training frame after the column drop/rename.
train_df.shape 

(153915, 24)

In [4]:
# Keep only the first row seen for every (loadingOrder, test_index) pair.
train_df = train_df.drop_duplicates(subset=['loadingOrder', 'test_index'], keep='first')
print(train_df.shape)

(46656, 24)


In [5]:
# Keep only rows whose label is above the -100 cut-off. The index is reset in
# the same statement; the original called reset_index() and discarded the
# result, which left the filtered (gappy) index in place.
train_df = train_df[train_df['label'] > -100].reset_index(drop=True)
print(train_df.shape)

# Down-sample to at most 10000 trajectories. A fixed random_state makes the
# subset (and everything trained on it) reproducible across kernel restarts,
# and min() avoids a ValueError when fewer than 10000 rows are available.
train_df = train_df.sample(min(10000, len(train_df)), random_state=2020)
train_df = train_df.reset_index(drop=True)
train_df.head()

(46457, 24)


Unnamed: 0,TRANSPORT_TRACE,begin_port_name,begin_port_position,begin_port_position_hash3,begin_port_position_hash4,begin_port_position_hash5,carrierName,end_port_name,end_port_position,end_port_position_hash3,end_port_position_hash4,end_port_position_hash5,loadingOrder,vesselMMSI,test_index,geo_hash3,geo_hash5,geo_hash4,speed,longitude,latitude,timestamp,direction,label
0,CNSHK-SGSIN,CNSHK,113.86305800000001 22.559462,ws0,ws0b,ws0br,JCMFTA,SGSIN,103.70461999999999 1.3031409999999999,w21,w21x,w21xr,CNSHK-SGSINSJ545912828494,G8796550152,JE319829909965,web web web web web web web web web web web we...,webzj webyv webyv webyv webyv webyv webyv weby...,webz weby weby weby weby weby weby weby weby w...,28 28 28 28 29 28 29 30 31 31 31 31 34 36 36 3...,113.796075 113.798207 113.799868 113.802293 11...,22.326255 22.317537 22.310838 22.301148 22.290...,2019-08-27T15:32:39.000Z 2019-08-27T15:34:40.0...,16530 16780 16690 16710 16970 17360 17270 1683...,20.024167
1,CNYTN-BRSSZ,CNYTN,114.275347 22.5777,ws1,ws12,ws122,OIEQNT,BRSSZ,-46.28402 -23.954513000000002,6gx,6gxp,6gxpd,UP336086570849,E2703817036,-1,wec wec wec wec wec wec wec wec wec wec wec we...,weck3 weck3 weck0 wec7b wec7b wec7b wec5x wec5...,weck weck weck wec7 wec7 wec7 wec5 wec5 wec5 w...,31 31 30 30 30 30 26 26 25 25 25 26 26 26 26 3...,114.316612 114.314653 114.289602 114.278773 11...,21.85632 21.8529 21.809027 21.791225 21.781673...,2019-06-14T19:26:34.000Z 2019-06-14T19:27:23.0...,20670 20880 20790 20910 21020 20930 21250 2125...,382.026944
2,CNSHK-MYTPP,CNSHK,113.86305800000001 22.559462,ws0,ws0b,ws0br,RWHZVZ,MYTPP,103.545456 1.399416,w21,w21x,w21xu,CNSHK-MYTPPAH734641844883,J8003619600,BN604096720961,wec wec wec wec wec wec wec wec wec wec wec we...,wecjy wecjy wecjz wecjz wecjz wecmb wecmb wecm...,wecj wecj wecj wecj wecj wecm wecm wecm wecm w...,14 19 19 19 22 25 26 26 26 26 26 26 26 26 26 2...,114.206735 114.211468 114.228033 114.2286 114....,22.147101 22.140785 22.12277000000001 22.1227 ...,2020-03-31T06:46:50.000Z 2020-03-31T06:49:49.0...,14400 14900 10010 10070 13900 9710 9690 9620 9...,82.502222
3,CNNSA-SGSIN-AEJEA,CNNSA,113.653433 22.694213,ws0,ws0c,ws0c4,OYSCFP,AEJEA,55.04979 25.022073000000002,thr,thrn,thrnk,CS960403482025,M7759940892,GR378851973317,w26 w26 w26 w26 w26 w26 w26 w26 w26 w26 w26 w2...,w26eg w26ef w26ef w26ef w26ed w26ed w26ed w26e...,w26e w26e w26e w26e w26e w26e w26e w26e w26e w...,26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 2...,104.901955 104.896648 104.895168 104.885593 10...,2.100855 2.090117 2.08721 2.067875 2.061433 2....,2019-09-30T15:23:27.000Z 2019-09-30T15:26:27.0...,20660 20650 20640 20620 20640 20510 20350 2034...,218.456944
4,CNSHK-MYTPP,CNSHK,113.86305800000001 22.559462,ws0,ws0b,ws0br,OYSCFP,MYTPP,103.545456 1.399416,w21,w21x,w21xu,CNSHK-MYTPPYU706608412593,J3014600292,JE319829909965,web web web web web web web web web web web we...,webzx webzx webzx webzx webzx webzx webzx webz...,webz webz webz webz webz webz webz webz webz w...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,113.89441200000002 113.894697 113.894858 113.8...,22.455908 22.45567 22.455615 22.455782 22.4556...,2019-04-22T23:41:49.000Z 2019-04-22T23:56:48.0...,11800 31200 34700 22300 22300 32300 35200 3400...,-8.970278


In [6]:
# Run a garbage-collection pass before inspecting free memory below.
gc.collect()
!free -m

              total        used        free      shared  buff/cache   available
Mem:          96166       22198       70654          18        3314       73804
Swap:             0           0           0


In [7]:
%%time

def feature_engineering(df, is_train=True):
    """Expand every trajectory row of ``df`` into one feature row per trajectory point.

    Each input row carries space-separated sequences (``timestamp``,
    ``longitude``, ``latitude``, ``speed``, ``direction``, ``geo_hash3/4/5``)
    together with per-order scalars (port positions, carrier, vessel, ...).
    For a row with ``xlen`` points the function emits ``xlen`` output rows made of:

    * 24 per-point numeric series (elapsed time, raw signals, their first
      differences, distances to the begin/end port and derived speeds),
    * 26 per-trajectory numeric scalars repeated on every point,
    * 20 categorical values repeated on every point.

    Output columns are named purely by position: ``num_0`` .. ``num_49`` and
    ``cat_0`` .. ``cat_19``. Reordering any of the lists below therefore
    silently changes what each column means.

    Parameters
    ----------
    df : DataFrame
        One row per trajectory with the columns read below.
    is_train : bool
        When True the row's ``label`` is repeated for every point and returned
        in a ``label`` column; when False the row's ``loadingOrder`` is
        returned in a ``lo`` column instead.

    Returns
    -------
    DataFrame
        ``num_*`` and ``cat_*`` columns plus ``label`` (train) or ``lo`` (test).

    Notes
    -----
    ``distance`` and ``haversine`` come from the project's ``utils`` module;
    their units and argument conventions are not visible here (presumably
    lon/lat pairs -- TODO confirm against utils).
    """
    numerical_fea = []
    categorical_fea = []
    lo = []
    if is_train:
        label = []
        
    for idx in tqdm(range(df.shape[0])):
        line = df.iloc[idx]

        # NOTE(review): the format hard-codes a ".000Z" suffix, so a timestamp
        # with non-zero milliseconds would fail to parse.
        timestamp = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z") for x in line['timestamp'].split(' ')]
        
        # Time features
        timestamp_hours = np.array([(x-timestamp[0]).total_seconds()/3600.0 for x in timestamp])  # hours elapsed since the first point
        timestamp_diff1 = np.diff(timestamp_hours, prepend=timestamp_hours[0])    # first-order difference of elapsed time (first element is 0)
        xlen = len(timestamp_hours)   # number of points in this trajectory

        # Port positions are stored as two space-separated floats.
        begin_port = [float(x) for x in line['begin_port_position'].split(' ')]
        end_port = [float(x) for x in line['end_port_position'].split(' ')]

        # The four raw per-point signals and their first-order differences
        lon = [float(x) for x in line['longitude'].split(' ')]  # longitude
        lat  = [float(x) for x in line['latitude'].split(' ')]  # latitude
        speed = [float(x) for x in line['speed'].split(' ')]
        direction = [float(x) for x in line['direction'].split(' ')]
        lon_diff1 = np.diff(lon, prepend=lon[0])
        lat_diff1 = np.diff(lat, prepend=lat[0])
        speed_diff1 = np.diff(speed, prepend=speed[0])
        direction_diff1 = np.diff(direction, prepend=direction[0])

        # Distance features
        begin_distance = [distance(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)]  # distance from the begin port to each point
        begin_dis_cumsum = np.cumsum(begin_distance)   # running sum of those distances
        begin_dis_diff1 = np.diff(begin_distance, prepend=begin_distance[0])   # first-order difference of the distance

        end_distance = [distance(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]    # distance from the end port to each point
        end_dis_cumsum = np.cumsum(end_distance)    # running sum of those distances
        end_dis_diff1 = np.diff(end_distance, prepend=end_distance[0]) 

        begin_haversine = [haversine(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)] 
        begin_hav_cumsum = np.cumsum(begin_haversine)   # running sum of the haversine distances
        begin_hav_diff1 = np.diff(begin_haversine, prepend=begin_haversine[0])   # first-order difference of the haversine distance

        end_haversine = [haversine(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]
        end_hav_cumsum = np.cumsum(end_haversine)   # running sum of the haversine distances
        end_hav_diff1 = np.diff(end_haversine, prepend=end_haversine[0])   # first-order difference of the haversine distance

        # Speed features. Zero time gaps are replaced by the mean gap (on a
        # copy) before dividing, and 1e-8 guards against a remaining zero.
        timestamp_diff1_tmp = timestamp_diff1.copy()
        timestamp_diff1_tmp[timestamp_diff1_tmp==0] = np.mean(timestamp_diff1_tmp)
        diff_speed = begin_hav_diff1 / (timestamp_diff1_tmp + 1e-8)    # differenced distance / differenced time = speed 1
        
        timestamp_hours_tmp = timestamp_hours.copy()
        timestamp_hours_tmp[timestamp_hours_tmp == 0] = np.mean(timestamp_hours_tmp)
        cumsum_speed = begin_hav_cumsum / (timestamp_hours_tmp + 1e-8)  # cumulative distance / elapsed time = speed 2

        # Simple per-trajectory numeric features
        speed_0_nums = (np.array(speed) == 0).sum()
        speed_0_rate = speed_0_nums / xlen 
        speed_0_time = timestamp_diff1[np.array(speed) == 0].sum()
        hash3_nunique = len(set(line['geo_hash3'].split(' ')))
        hash4_nunique = len(set(line['geo_hash4'].split(' ')))
        hash5_nunique = len(set(line['geo_hash5'].split(' ')))
        hash3_rate, hash4_rate, hash5_rate = hash3_nunique/xlen,  hash4_nunique/xlen, hash5_nunique/xlen 
        first_begin_dis, first_begin_hav = distance(begin_port[0], begin_port[1], lon[0], lat[0]), haversine(begin_port[0], begin_port[1], lon[0], lat[0])
        last_begin_dis, last_begin_hav = distance(begin_port[0], begin_port[1], lon[-1], lat[-1]), haversine(begin_port[0], begin_port[1], lon[-1], lat[-1])
        first_end_dis, first_end_hav = distance(end_port[0], end_port[1], lon[0], lat[0]), haversine(end_port[0], end_port[1], lon[0], lat[0])
        last_end_dis, last_end_hav = distance(end_port[0], end_port[1], lon[-1], lat[-1]), haversine(end_port[0], end_port[1], lon[-1], lat[-1])
        # NOTE(review): in each pair below the un-suffixed name holds the RATIO
        # and the "*_rate" name holds the DIFFERENCE. The begin-port pairs use
        # last/first while the end-port pairs use first/last. The divisions are
        # unguarded, so a zero denominator would raise or yield inf depending
        # on the type returned by distance()/haversine() -- verify.
        fbd_lbd, fbd_lbd_rate = last_begin_dis/first_begin_dis, last_begin_dis-first_begin_dis
        fbh_lbh, fbh_lbh_rate = last_begin_hav/first_begin_hav, last_begin_hav-first_begin_hav
        fed_led, fed_led_rate = first_end_dis/last_end_dis, first_end_dis-last_end_dis
        fed_leh, fed_leh_rate = first_end_hav/last_end_hav, first_end_hav-last_end_hav
        
        # Categorical features
        carrierName = line['carrierName']
        vesselMMSI = line['vesselMMSI']
        begin_hash3, begin_hash4, begin_hash5 = line['begin_port_position_hash3'], line['begin_port_position_hash4'], line['begin_port_position_hash5']
        end_hash3, end_hash4, end_hash5 = line['end_port_position_hash3'], line['end_port_position_hash4'], line['end_port_position_hash5']
        first_hash3, first_hash4, first_hash5 = line['geo_hash3'].split(' ')[0], line['geo_hash4'].split(' ')[0], line['geo_hash5'].split(' ')[0]
        last_hash3, last_hash4, last_hash5 = line['geo_hash3'].split(' ')[-1], line['geo_hash4'].split(' ')[-1], line['geo_hash5'].split(' ')[-1]
        first_year, first_month, first_day = timestamp[0].year, timestamp[0].month, timestamp[0].day
        last_year, last_month, last_day = timestamp[-1].year, timestamp[-1].month, timestamp[-1].day

        
        # Per-point numeric series: 24 sequences stacked, then transposed so
        # that each row is one trajectory point. These become num_0 .. num_23.
        num_feas1 = np.array([timestamp_hours, timestamp_diff1, 
                              lon, lat, speed, direction, lon_diff1, lat_diff1, speed_diff1, direction_diff1, 
                              begin_distance, begin_dis_cumsum, begin_dis_diff1, 
                              end_distance, end_dis_cumsum, end_dis_diff1, 
                              begin_haversine, begin_hav_cumsum, begin_hav_diff1, 
                              end_haversine, end_hav_cumsum, end_hav_diff1, 
                              diff_speed, cumsum_speed,
                             ]).T
        
        
        # Per-trajectory numeric scalars (26 values), repeated for every point.
        # These become num_24 .. num_49.
        num_feas2 = np.array([[xlen, speed_0_nums, speed_0_rate, speed_0_time, hash3_nunique, hash4_nunique, hash5_nunique, 
                     hash3_rate, hash4_rate, hash5_rate, first_begin_dis, first_begin_hav,  last_begin_dis, last_begin_hav,
                     first_end_dis, first_end_hav, last_end_dis, last_end_hav, fbd_lbd, fbd_lbd_rate, 
                     fbh_lbh, fbh_lbh_rate, fed_led, fed_led_rate, fed_leh, fed_leh_rate]])
        num_feas2 = np.repeat(num_feas2, xlen, axis=0)
        
        num_feas = np.concatenate((num_feas1, num_feas2), axis=1)

        # Categorical values (20 per trajectory), repeated for every point.
        # Strings and ints share one np.array here, so the year/month/day
        # integers are stored alongside the string codes in a single array.
        cat_feas = np.array([[carrierName, vesselMMSI, begin_hash3, begin_hash4, begin_hash5, 
                     end_hash3, end_hash4, end_hash5, first_hash3, first_hash4, first_hash5, 
                     last_hash3, last_hash4, last_hash5, first_year, first_month, first_day, 
                     last_year, last_month, last_day]])
        cat_feas = np.repeat(cat_feas, xlen, axis=0)
        
        numerical_fea.append(num_feas)
        categorical_fea.append(cat_feas)
        # The trajectory-level target / order id is repeated once per point.
        if is_train:
            label.extend([line['label']] * xlen)
        if not is_train:
            lo.extend([line['loadingOrder']] * xlen)
    print(len(lo))
    # Stack the per-trajectory blocks into two tall arrays (one row per point).
    numerical_fea = np.concatenate(numerical_fea, axis=0)
    categorical_fea = np.concatenate(categorical_fea, axis=0)
    num_cols = ['num_{}'.format(str(i)) for i in range(numerical_fea.shape[1])]
    cat_cols = ['cat_{}'.format(str(i)) for i in range(categorical_fea.shape[1])]
    num_df = pd.DataFrame(numerical_fea)
    print(num_df.shape)
    cat_df = pd.DataFrame(categorical_fea)
    print(cat_df.shape)
    fea_df = pd.concat([num_df, cat_df], axis=1)
    
    # Columns are named by position only (num_i / cat_i).
    fea_df.columns = num_cols + cat_cols
    if is_train:
        fea_df['label'] = label
    if not is_train:
        fea_df['lo'] = lo
    print(fea_df.shape)
    return fea_df

# Build the per-point feature tables: train rows carry 'label', test rows carry 'lo'.
train_fea = feature_engineering(df=train_df, is_train=True)
test_fea = feature_engineering(df=test_df, is_train=False)

100%|██████████| 10000/10000 [02:34<00:00, 64.93it/s]


0
(6258637, 50)
(6258637, 20)


  3%|▎         | 7/219 [00:00<00:03, 66.66it/s]

(6258637, 71)


100%|██████████| 219/219 [00:01<00:00, 126.76it/s]


58682
(58682, 50)
(58682, 20)
(58682, 71)
CPU times: user 3min 14s, sys: 16.3 s, total: 3min 31s
Wall time: 3min 30s


In [8]:
# Pull the per-row loadingOrder ids out of the test features: pop() returns the
# column and removes it from the frame in one step.
test_lo = test_fea.pop('lo').values

In [10]:
# Preview the first expanded rows (one row per trajectory point).
train_fea.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19,label
0,0.0,0.0,113.796075,22.326255,28.0,16530.0,0.0,0.0,0.0,0.0,0.242636,0.242636,0.0,23.319708,23.319708,0.0,26.829636,26.829636,0.0,2579.832071,2579.832071,0.0,0.0,0.402698,396.0,46.0,0.116162,73.063333,18.0,61.0,138.0,0.045455,0.15404,0.348485,0.242636,26.829636,23.69291,2620.687378,23.319708,2579.832071,0.180716,20.090423,97.64796,23.450274,97.678826,2593.857742,129.040957,23.138993,128.411039,2559.741648,JCMFTA,G8796550152,ws0,ws0b,ws0br,w21,w21x,w21xr,web,webz,webzj,w21,w21x,w21xh,2019,8,27,2019,9,1,20.024167
1,0.033611,0.033611,113.798207,22.317537,28.0,16780.0,0.002132,-0.008718,0.0,250.0,0.250466,0.493102,0.00783,23.312772,46.63248,-0.006936,27.714236,54.543873,0.8846,2579.061369,5158.89344,-0.770702,26.318666,1622.792424,396.0,46.0,0.116162,73.063333,18.0,61.0,138.0,0.045455,0.15404,0.348485,0.242636,26.829636,23.69291,2620.687378,23.319708,2579.832071,0.180716,20.090423,97.64796,23.450274,97.678826,2593.857742,129.040957,23.138993,128.411039,2559.741648,JCMFTA,G8796550152,ws0,ws0b,ws0br,w21,w21x,w21xr,web,webz,webzj,w21,w21x,w21xh,2019,8,27,2019,9,1,20.024167
2,0.061667,0.028056,113.799868,22.310838,28.0,16690.0,0.001661,-0.006699,0.0,-90.0,0.256528,0.749631,0.006062,23.307453,69.939933,-0.005319,28.398343,82.942216,0.684107,2578.470304,7737.363743,-0.591065,24.383994,1345.008686,396.0,46.0,0.116162,73.063333,18.0,61.0,138.0,0.045455,0.15404,0.348485,0.242636,26.829636,23.69291,2620.687378,23.319708,2579.832071,0.180716,20.090423,97.64796,23.450274,97.678826,2593.857742,129.040957,23.138993,128.411039,2559.741648,JCMFTA,G8796550152,ws0,ws0b,ws0br,w21,w21x,w21xr,web,webz,webzj,w21,w21x,w21xh,2019,8,27,2019,9,1,20.024167
3,0.098889,0.037222,113.802293,22.301148,28.0,16710.0,0.002425,-0.00969,0.0,20.0,0.265365,1.014996,0.008836,23.29977,93.239703,-0.007683,29.394381,112.336597,0.996038,2577.616531,10314.980274,-0.853773,26.759232,1135.987946,396.0,46.0,0.116162,73.063333,18.0,61.0,138.0,0.045455,0.15404,0.348485,0.242636,26.829636,23.69291,2620.687378,23.319708,2579.832071,0.180716,20.090423,97.64796,23.450274,97.678826,2593.857742,129.040957,23.138993,128.411039,2559.741648,JCMFTA,G8796550152,ws0,ws0b,ws0br,w21,w21x,w21xr,web,webz,webzj,w21,w21x,w21xh,2019,8,27,2019,9,1,20.024167
4,0.141944,0.043056,113.804803,22.290102,29.0,16970.0,0.00251,-0.011046,1.0,260.0,0.275587,1.290583,0.010223,23.290904,116.530608,-0.008866,30.544136,142.880733,1.149755,2576.631867,12891.612141,-0.984664,26.703976,1006.596093,396.0,46.0,0.116162,73.063333,18.0,61.0,138.0,0.045455,0.15404,0.348485,0.242636,26.829636,23.69291,2620.687378,23.319708,2579.832071,0.180716,20.090423,97.64796,23.450274,97.678826,2593.857742,129.040957,23.138993,128.411039,2559.741648,JCMFTA,G8796550152,ws0,ws0b,ws0br,w21,w21x,w21xr,web,webz,webzj,w21,w21x,w21xh,2019,8,27,2019,9,1,20.024167


In [11]:
# Label-encode the categorical columns over train and test together so both
# splits share one code book.
from sklearn.preprocessing import LabelEncoder

cat_feas = ['cat_{}'.format(str(i)) for i in range(20)]
all_feas = ['num_{}'.format(str(i)) for i in range(50)] + ['cat_{}'.format(str(i)) for i in range(20)]

# Test rows get a sentinel label so they can be separated again after encoding.
test_fea['label'] = -1e8
data_fea = pd.concat([train_fea, test_fea], ignore_index=True)

for col in cat_feas:
    print(col)
    le = LabelEncoder()
    encoded = le.fit_transform(data_fea[col])
    data_fea[col] = encoded
    data_fea[col] = data_fea[col].astype('category')

# Split back into train / test using the sentinel.
is_test_row = data_fea['label'] == -1e8
train_feas = data_fea[~is_test_row].reset_index(drop=True)
test_feas = data_fea[is_test_row].reset_index(drop=True)
del test_feas['label']
print(train_feas.shape, test_feas.shape)

print(len(all_feas), len(cat_feas))

cat_0
cat_1
cat_2
cat_3
cat_4
cat_5
cat_6
cat_7
cat_8
cat_9
cat_10
cat_11
cat_12
cat_13
cat_14
cat_15
cat_16
cat_17
cat_18
cat_19
(6258637, 71) (58682, 70)
70 20


In [12]:
# Log file named "<MMDD>_lgb_round2.log", created in the working directory.
file_name = datetime.date.today().strftime('%m%d')+"_{}.log".format("lgb_round2")


def write_log(w):
    """Print ``w`` prefixed with the current HH:MM:SS and append it to the log file.

    Each entry written to ``file_name`` is followed by an 80-dash separator line.
    """
    stamp = datetime.datetime.now().strftime('%H:%M:%S')
    info = "{} : {}\n".format(stamp, w)
    print(info)
    separator = "-"*80+"\n"
    with open(file_name, 'a') as log_file:
        log_file.write(info + separator)

In [18]:
%%time

def mse_score_eval(preds, valid):
    """Custom LightGBM evaluation callback reporting mean squared error.

    ``valid`` is the dataset being evaluated; its labels are read through
    ``get_label()``. Returns ``(name, value, is_higher_better)`` with
    ``is_higher_better`` set to False.
    """
    return 'mse_score', mse(y_true=valid.get_label(), y_pred=preds), False

def build_model(train, test, label, seed=2020, is_shuffle=True):
    """Train a 5-fold LightGBM regressor and return test predictions plus importances.

    Parameters
    ----------
    train : DataFrame
        Training features (one row per trajectory point).
    test : DataFrame
        Test features with the same columns as ``train``.
    label : array-like
        Target aligned positionally with the rows of ``train``.
    seed : int
        Seed for the fold shuffling (only used when ``is_shuffle`` is True).
    is_shuffle : bool
        Whether KFold shuffles rows before splitting.

    Returns
    -------
    (result, imp)
        ``result`` has one row per test row with ``loadingOrder`` (taken from
        the notebook-level ``test_lo``) and the fold-averaged ``label``
        prediction. ``imp`` holds per-fold gain/split importances.

    Notes
    -----
    Relies on the notebook-level names ``cat_feas``, ``test_lo`` and
    ``write_log``.

    NOTE(review): folds are drawn over individual rows. The feature step
    repeats each trajectory's label on every one of its points, so points of
    the same order can land in both the training and the validation part of a
    fold; the logged validation MSE is then likely optimistic. The commented
    GroupKFold lines below hint at the intended fix, which needs a per-row
    group id that is not passed in here.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    # Work on a plain array so fold indices are applied by POSITION. Indexing a
    # pandas Series with label[train_idx] does label-based lookup and only
    # works by accident when the index is a fresh RangeIndex.
    label = np.asarray(label)

    imp = pd.DataFrame()  # feature importances
    # Use the columns actually passed in instead of the global feature list, so
    # the importance vectors below always line up with 'feat'.
    imp['feat'] = list(train.columns)

    train_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 5
    # Kfold
    # fold = GroupKFold(n_splits=n_splits)
    # scikit-learn rejects a random_state when shuffle is False, so only pass
    # the seed when it has an effect.
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle,
                 random_state=seed if is_shuffle else None)
    # groups = train_df['loadingOrder'].values
    kf_way = fold.split(train)
    # params
    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 128,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.75,
        'bagging_freq': 5,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 45,
        'verbose': 1,
    }
    # Make sure the model directory exists before spending time on training;
    # otherwise the first save_model call fails after a full fold has run.
    os.makedirs("data/lgb_save_model", exist_ok=True)

    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        write_log("fold {}".format(n_fold))
        train_x, train_y = train.iloc[train_idx], label[train_idx]
        valid_x, valid_y = train.iloc[valid_idx], label[valid_idx]
        # Wrap the fold data for LightGBM
        n_train = lgb.Dataset(train_x, label=train_y)
        n_valid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=n_train,
            categorical_feature=cat_feas,
            num_boost_round=10000,
            valid_sets=[n_train, n_valid],
            early_stopping_rounds=50,
            verbose_eval=100,
            feval=mse_score_eval
        )
        clf.save_model("data/lgb_save_model/lgb_model_fold_{}.txt".format(n_fold))
        # Out-of-fold predictions for this fold's validation rows.
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        
        write_log("val_mse = {}".format(mse(valid_y, train_pred[valid_idx])))
        
        # Average the test predictions over the folds.
        test_pred += clf.predict(test, num_iteration=clf.best_iteration)/fold.n_splits

        # NOTE: n_fold already starts at 1, so these columns are named
        # gain2..gain6 / split2..split6. The naming is kept unchanged so that
        # previously saved importance files stay comparable.
        imp['gain' + str(n_fold + 1)] = clf.feature_importance(importance_type='gain')
        imp['split' + str(n_fold + 1)] = clf.feature_importance(importance_type='split')
        
    # Despite the log text, this is the out-of-fold MSE over all training rows.
    write_log("train mse: {}".format(mse(label, train_pred)))
    result = pd.DataFrame({
        'loadingOrder': test_lo, 
        'label': test_pred,
    })
    return result, imp

# Train on the selected feature columns, then persist the row-level
# predictions and the per-fold feature importances.
result, imp = build_model(
    train=train_feas[all_feas],
    test=test_feas[all_feas],
    label=train_feas['label'],
    is_shuffle=True,
)

result.to_pickle("result/result1.pkl")
imp.to_pickle("result/fea1.pkl")

05:32:18 : fold 1



New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.76645	training's mse_score: 7.76645	valid_1's l2: 7.87819	valid_1's mse_score: 7.87819
[200]	training's l2: 0.88367	training's mse_score: 0.88367	valid_1's l2: 0.91191	valid_1's mse_score: 0.91191
[300]	training's l2: 0.261811	training's mse_score: 0.261811	valid_1's l2: 0.27579	valid_1's mse_score: 0.27579
[400]	training's l2: 0.116321	training's mse_score: 0.116321	valid_1's l2: 0.126286	valid_1's mse_score: 0.126286
[500]	training's l2: 0.0628978	training's mse_score: 0.0628978	valid_1's l2: 0.0704122	valid_1's mse_score: 0.0704122
[600]	training's l2: 0.0391837	training's mse_score: 0.0391837	valid_1's l2: 0.0456047	valid_1's mse_score: 0.0456047
[700]	training's l2: 0.0270194	training's mse_score: 0.0270194	valid_1's l2: 0.0329438	valid_1's mse_score: 0.0329438
[800]	training's l2: 0.0200369	training's mse_score: 0.0200369	valid_1's l2: 0.0255682	valid_1's mse_score: 0.0255682
[900]	training's l2: 

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.61152	training's mse_score: 7.61152	valid_1's l2: 7.73715	valid_1's mse_score: 7.73715
[200]	training's l2: 0.838927	training's mse_score: 0.838927	valid_1's l2: 0.872319	valid_1's mse_score: 0.872319
[300]	training's l2: 0.255287	training's mse_score: 0.255287	valid_1's l2: 0.273443	valid_1's mse_score: 0.273443
[400]	training's l2: 0.11327	training's mse_score: 0.11327	valid_1's l2: 0.125347	valid_1's mse_score: 0.125347
[500]	training's l2: 0.0634868	training's mse_score: 0.0634868	valid_1's l2: 0.0730478	valid_1's mse_score: 0.0730478
[600]	training's l2: 0.0406871	training's mse_score: 0.0406871	valid_1's l2: 0.0483801	valid_1's mse_score: 0.0483801
[700]	training's l2: 0.0285209	training's mse_score: 0.0285209	valid_1's l2: 0.0348753	valid_1's mse_score: 0.0348753
[800]	training's l2: 0.0213791	training's mse_score: 0.0213791	valid_1's l2: 0.0268942	valid_1's mse_score: 0.0268942
[900]	training's 

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.57211	training's mse_score: 7.57211	valid_1's l2: 7.69138	valid_1's mse_score: 7.69138
[200]	training's l2: 0.856811	training's mse_score: 0.856811	valid_1's l2: 0.887193	valid_1's mse_score: 0.887193
[300]	training's l2: 0.276125	training's mse_score: 0.276125	valid_1's l2: 0.29151	valid_1's mse_score: 0.29151
[400]	training's l2: 0.129384	training's mse_score: 0.129384	valid_1's l2: 0.139922	valid_1's mse_score: 0.139922
[500]	training's l2: 0.0739951	training's mse_score: 0.0739951	valid_1's l2: 0.0824458	valid_1's mse_score: 0.0824458
[600]	training's l2: 0.0474552	training's mse_score: 0.0474552	valid_1's l2: 0.054804	valid_1's mse_score: 0.054804
[700]	training's l2: 0.032217	training's mse_score: 0.032217	valid_1's l2: 0.0387333	valid_1's mse_score: 0.0387333
[800]	training's l2: 0.0239899	training's mse_score: 0.0239899	valid_1's l2: 0.0299886	valid_1's mse_score: 0.0299886
[900]	training's l2: 

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.58933	training's mse_score: 7.58933	valid_1's l2: 7.71042	valid_1's mse_score: 7.71042
[200]	training's l2: 0.821247	training's mse_score: 0.821247	valid_1's l2: 0.863664	valid_1's mse_score: 0.863664
[300]	training's l2: 0.254903	training's mse_score: 0.254903	valid_1's l2: 0.278471	valid_1's mse_score: 0.278471
[400]	training's l2: 0.11129	training's mse_score: 0.11129	valid_1's l2: 0.126664	valid_1's mse_score: 0.126664
[500]	training's l2: 0.0620959	training's mse_score: 0.0620959	valid_1's l2: 0.0742385	valid_1's mse_score: 0.0742385
[600]	training's l2: 0.039634	training's mse_score: 0.039634	valid_1's l2: 0.0497496	valid_1's mse_score: 0.0497496
[700]	training's l2: 0.0277944	training's mse_score: 0.0277944	valid_1's l2: 0.0359048	valid_1's mse_score: 0.0359048
[800]	training's l2: 0.0211655	training's mse_score: 0.0211655	valid_1's l2: 0.0281669	valid_1's mse_score: 0.0281669
[900]	training's l2

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.8021	training's mse_score: 7.8021	valid_1's l2: 7.84561	valid_1's mse_score: 7.84561
[200]	training's l2: 0.884535	training's mse_score: 0.884535	valid_1's l2: 0.899985	valid_1's mse_score: 0.899985
[300]	training's l2: 0.273097	training's mse_score: 0.273097	valid_1's l2: 0.28729	valid_1's mse_score: 0.28729
[400]	training's l2: 0.121622	training's mse_score: 0.121622	valid_1's l2: 0.133073	valid_1's mse_score: 0.133073
[500]	training's l2: 0.0678807	training's mse_score: 0.0678807	valid_1's l2: 0.076865	valid_1's mse_score: 0.076865
[600]	training's l2: 0.0423408	training's mse_score: 0.0423408	valid_1's l2: 0.0500798	valid_1's mse_score: 0.0500798
[700]	training's l2: 0.0303102	training's mse_score: 0.0303102	valid_1's l2: 0.0372377	valid_1's mse_score: 0.0372377
[800]	training's l2: 0.0235225	training's mse_score: 0.0235225	valid_1's l2: 0.0299305	valid_1's mse_score: 0.0299305
[900]	training's l2: 

In [20]:
# Collapse the per-point predictions to a single mean prediction per loadingOrder.
result = result.groupby('loadingOrder')['label'].mean().reset_index()

In [21]:
# Manual overrides for two specific orders.
# NOTE(review): the values 620 / 623 are hard-coded with no stated source -- confirm.
result.loc[result['loadingOrder'].eq('JV646262964847'), 'label'] = 620
result.loc[result['loadingOrder'].eq('HB956271385453'), 'label'] = 623

In [22]:
# train mse:  256.9597223109227
# train mse:  256.2317758079061

# Output format of the submission timestamps. The original value was
# "%Y/%m%d  %H:%M:%S" (missing the slash between month and day), which did not
# match the format used when the submission columns are written.
sub_FORMAT = "%Y/%m/%d  %H:%M:%S"
# Format of the individual timestamps inside the space-separated trajectory string.
UTC_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"


def get_time(x, y):
    """Return the last timestamp in ``x`` shifted forward by ``y`` hours.

    Parameters
    ----------
    x : str
        Space-separated timestamps in ``UTC_FORMAT``; only the last one is used.
    y : number
        Offset in hours (may be fractional or negative).

    Returns
    -------
    datetime.datetime
        The shifted time. Callers format it themselves; the previously
        computed-but-unused formatted string has been removed.
    """
    last_stamp = datetime.datetime.strptime(x.split(' ')[-1], UTC_FORMAT)
    # Split the offset into whole days and the remaining seconds; the seconds
    # part is truncated to an integer.
    day = y // 24
    seconds = int((y - day*24)*3600)
    return last_stamp + datetime.timedelta(days=day, seconds=seconds)

# result = pd.DataFrame({
#         'loadingOrder': test_loadingOrder, 
#         'label': test_pred,
#     })

# Attach each order's trajectory timestamps BY KEY. `result` has been regrouped
# (and re-ordered) by loadingOrder, so assigning test_df['timestamp'] directly
# would pair rows by index position and can attach the wrong trajectory.
# NOTE(review): if test_df holds several rows per loadingOrder, the first one
# is used here -- confirm that is the intended reference trajectory.
order_timestamps = test_df[['loadingOrder', 'timestamp']].drop_duplicates('loadingOrder')
result = (
    result
    .drop(columns=['timestamp'], errors='ignore')   # keeps the cell re-runnable
    .merge(order_timestamps, on='loadingOrder', how='left')
)
# ETA = last observed timestamp + predicted number of hours.
result['ETA'] = [get_time(ts, hours) for ts, hours in zip(result['timestamp'], result['label'])]

In [23]:
# Raw test file whose rows are copied into the submission.
test_data = pd.read_csv("data/R2_ATest 0711.csv") 
def get_data(data, mode='train'):
    """Parse the datetime columns of ``data`` in place and return it.

    Parameters
    ----------
    data : DataFrame
        Must contain ``timestamp``; in ``'test'`` mode also ``onboardDate``.
    mode : {'train', 'test'}
        In ``'test'`` mode the original ``timestamp`` strings are preserved in
        ``temp_timestamp`` and ``onboardDate`` is parsed as well.

    Returns
    -------
    DataFrame
        The same object that was passed in (it is modified, not copied).

    Raises
    ------
    AssertionError
        If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    assert mode=='train' or mode=='test'
    # The deprecated `infer_datetime_format` keyword is no longer passed:
    # recent pandas infers the format by default and warns when it is given.
    if mode=='test':
        # Keep the raw strings so the submission can emit them unchanged.
        data['temp_timestamp'] = data['timestamp']
        data['onboardDate'] = pd.to_datetime(data['onboardDate'])
    
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    return data

test_data = get_data(test_data, mode='test')

test_data = test_data.sort_values(['loadingOrder', 'timestamp']).reset_index(drop=True)

# Bring in the predicted arrival time of each order.
test_data = test_data.merge(result[['loadingOrder', 'ETA']], on='loadingOrder', how='left')
# The .dt accessor leaves missing values (orders without a prediction) as NaN
# instead of raising, which a per-element x.strftime(...) would do on NaT.
test_data['ETA'] = pd.to_datetime(test_data['ETA']).dt.strftime('%Y/%m/%d  %H:%M:%S')
test_data = test_data.drop(['direction','TRANSPORT_TRACE'], axis=1)
test_data['onboardDate'] = test_data['onboardDate'].dt.strftime('%Y/%m/%d  %H:%M:%S')
# `pd.datetime` was removed from pandas; use the standard-library module that
# the notebook already imports.
test_data['creatDate'] = datetime.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')
# Restore the original timestamp strings saved by get_data().
test_data['timestamp'] = test_data['temp_timestamp']
# Arrange the columns in submission order
result = test_data[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]



In [None]:
# Write the submission file without the DataFrame index column.
result.to_csv('result/round2_lgb_0722.csv', index=False)