In [1]:
import os
import requests
# Fetch temporary credentials from the URL published in the
# QCLOUD_CONTAINER_INSTANCE_CREDENTIALS_URL environment variable.
cred_url = os.environ["QCLOUD_CONTAINER_INSTANCE_CREDENTIALS_URL"]
# A timeout keeps the cell from hanging indefinitely if the endpoint is unreachable.
r = requests.get(cred_url, timeout=10)
# Surface HTTP errors directly instead of failing later on a missing JSON key.
r.raise_for_status()
# Decode the response body once rather than re-parsing it for every field.
_credentials = r.json()
secretId = _credentials["TmpSecretId"]
secretKey = _credentials["TmpSecretKey"]
token = _credentials["Token"]

In [4]:
import os
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
from ti.utils import get_temporary_secret_and_token

# Local destination path for the downloaded file; adjust as needed.
local_file = "/home/tione/notebook/data/train_final_626.zip"

# Bucket that holds the data file; change it to your own bucket
# (see the Tencent Cloud Object Storage console for the bucket name).
bucket = "hc-02-1258788535"

# Object key of the data file inside the bucket
# (see the Tencent Cloud Object Storage console for the object path).
data_key = "train_final_626.zip"

# Obtain temporary credentials and build an HTTPS client for the region
# named in the REGION environment variable.
secret_id, secret_key, token = get_temporary_secret_and_token()
config = CosConfig(
    Region=os.environ.get('REGION'),
    SecretId=secret_id,
    SecretKey=secret_key,
    Token=token,
    Scheme='https',
)
client = CosS3Client(config)

# Fetch the object and stream its body to the local file.
response = client.get_object(Bucket=bucket, Key=data_key)
response['Body'].get_stream_to_file(local_file)

In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
import datetime, time 
from utils import distance, haversine, standard, pad_seq 
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter 

# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
%%time

# Read the training CSV straight out of the zip archive. The context managers
# close both the archive and the member handle even if parsing raises.
with ZipFile('data/train_final_626.zip') as myzip, myzip.open('train_final_626.csv') as f:
    train_df = pd.read_csv(f)
print(train_df)

# Keep a single row per (loadingOrder, label) pair.
train_df = train_df.drop_duplicates(['loadingOrder', 'label'])
test_df = pd.read_csv("data/test_create_final.csv")

# Columns removed from the training frame before renaming.
del_cols = ['arrive_time_index', 'arrive_time', 'direction', 'geo_hash3', 'selcte_index']
train_df = train_df.drop(del_cols, axis=1)

# New names for the remaining columns. The assignment below is positional, so
# it relies on the CSV column order (after the drop) matching this list.
# NOTE(review): 'direction' and 'geo_hash3' are dropped above and then re-used
# here as names for other columns -- confirm the mapping against the CSV header.
columns = ['TRANSPORT_TRACE', 'begin_port_name', 'begin_port_position',
        'begin_port_position_hash3', 'begin_port_position_hash4',
        'begin_port_position_hash5', 'carrierName', 'end_port_name',
        'end_port_position', 'end_port_position_hash3',
        'end_port_position_hash4', 'end_port_position_hash5', 'loadingOrder',
        'vesselMMSI', 'test_index', 'geo_hash3', 'geo_hash5',
        'geo_hash4', 'speed', 'longitude',
        'latitude', 'timestamp', 'direction', 'label']

train_df.columns = columns

                TRANSPORT_TRACE               arrive_time  arrive_time_index  \
0                   CNSHK-AEJEA  2020-02-07T17:48:03.000Z              10506   
1                   CNSHK-AEJEA  2020-02-07T17:48:03.000Z              10506   
2                   CNSHK-AEJEA  2020-02-07T17:48:03.000Z              10506   
3                   CNSHK-AEJEA  2020-02-07T17:48:03.000Z              10506   
4                   CNSHK-AEJEA  2020-02-07T17:48:03.000Z              10506   
...                         ...                       ...                ...   
146130  CNSHK-MYTPP-MUPLU-ZADUR  2019-11-25T06:45:05.000Z               2466   
146131  CNSHK-MYTPP-MUPLU-ZADUR  2019-11-25T06:45:05.000Z               2466   
146132  CNSHK-MYTPP-MUPLU-ZADUR  2019-11-25T06:45:05.000Z               2466   
146133  CNSHK-MYTPP-MUPLU-ZADUR  2019-11-25T06:45:05.000Z               2466   
146134  CNSHK-MYTPP-MUPLU-ZADUR  2019-11-25T06:45:05.000Z               2466   

       begin_port_name           begin_

In [3]:
# Show the first row of the training frame.
train_df.head(1)

Unnamed: 0,TRANSPORT_TRACE,begin_port_name,begin_port_position,begin_port_position_hash3,begin_port_position_hash4,begin_port_position_hash5,carrierName,end_port_name,end_port_position,end_port_position_hash3,...,test_index,geo_hash3,geo_hash5,geo_hash4,speed,longitude,latitude,timestamp,direction,label
0,CNSHK-AEJEA,CNSHK,113.86305800000001 22.559462,ws0,ws0b,ws0br,OYSCFP,AEJEA,55.04979 25.022073000000002,thr,...,225,web web web web web web web web web web,webzx webzx webzx webzr webzr webzr webzr webz...,webz webz webz webz webz webz webz webz webz webz,11 10 10 7 7 9 11 15 21 22,113.88364299999999 113.885625 113.885680000000...,22.437238 22.418201999999997 22.417943 22.4042...,2020-01-24T05:38:21.000Z 2020-01-24T05:48:51.0...,19100 16700 16700 17100 16500 16500 16600 1600...,347.458611


In [4]:
# Show the first row of the test frame.
test_df.head(1)

Unnamed: 0,loadingOrder,timestamp,direction,speed,TRANSPORT_TRACE,carrierName,vesselMMSI,longitude,latitude,geo_hash5,...,begin_port_name,end_port_name,begin_port_position,end_port_position,begin_port_position_hash4,begin_port_position_hash5,begin_port_position_hash3,end_port_position_hash4,end_port_position_hash5,end_port_position_hash3
0,AE223035353902,2019-07-03T21:16:48.000Z 2019-07-03T21:34:48.0...,29070.0 34550.0 30160.0 33540.0 35140.0 35300....,24 25 32 33 36 37 40 40 42 43 42 41 40 40 39 3...,CNYTN-MXZLO,OIEQNT,C2075927370,120.09385800000001 120.035707 119.9818 119.970...,22.58132 22.617522 22.658465 22.668688 22.7568...,wsj2k wsj2e wsj2f wsj2f wsj33 wsj3c wsj7b wsj7...,...,CNYTN,MXZLO,114.275347 22.5777,-104.305571 19.085960999999998,ws12,ws122,ws1,9emu,9emud,9em


In [5]:
%%time

def get_stats(values):
    """Summarise a numeric sequence as a flat list of 14 statistics.

    The entries are, in order: mean, max, min, standard deviation, range,
    sum, median, non-zero count, distinct count, non-zero ratio, distinct
    ratio, skewness, kurtosis, and the most frequent value (when several
    values share the top count, the one encountered first wins).
    """
    n = len(values)
    hi, lo = np.max(values), np.min(values)
    nonzero = np.count_nonzero(values)
    distinct = len(set(values))
    # most_common(1) yields the first-seen value among those with the top count.
    mode = Counter(values).most_common(1)[0][0]
    return [
        np.mean(values), hi, lo, np.std(values), hi - lo,
        np.sum(values), np.median(values), nonzero, distinct,
        nonzero / n, distinct / n,
        skew(values), kurtosis(values),
        mode,
    ]

def feature_engineering(df, is_train=True):
    """Turn every row of ``df`` into one flat feature vector.

    Each row is read for space-separated sequences in the ``timestamp``,
    ``longitude``, ``latitude``, ``speed``, ``direction`` and
    ``geo_hash3``/``geo_hash4``/``geo_hash5`` columns, for the space-separated
    ``begin_port_position``/``end_port_position`` values, and for the port
    hash, ``carrierName`` and ``vesselMMSI`` columns.

    Args:
        df: Frame with one sample per row.
        is_train: When true, each row's ``label`` value is collected and
            attached to the result as a ``label`` column.

    Returns:
        A DataFrame whose columns are ``num_0..`` (numeric features) followed
        by ``cat_0..`` (categorical features), plus ``label`` when
        ``is_train`` is true. The shapes of the numeric block, the categorical
        block and the final frame are printed along the way.
    """
    numerical_fea = []
    categorical_fea = []
    if is_train:
        label = []
    # Rows are processed one at a time; tqdm only reports progress.
    for idx in tqdm(range(df.shape[0])):
        line = df.iloc[idx]

        # Timestamps must match this exact pattern (literal ".000Z" suffix).
        timestamp = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z") for x in line['timestamp'].split(' ')]
        
        # Time features
        timestamp_hours = np.array([(x-timestamp[0]).total_seconds()/3600.0 for x in timestamp])  # hours elapsed since the first point
        timestamp_diff1 = np.diff(timestamp_hours, prepend=timestamp_hours[0])    # first difference of time; the prepended value makes the first entry 0
        # print(timestamp_diff1[:20])
        xlen = len(timestamp_hours)   # number of points in the sequence

        # Port positions: the two space-separated numbers are passed, in order,
        # as the first two arguments of distance()/haversine() below.
        # NOTE(review): the sample rows look like "longitude latitude" -- confirm.
        begin_port = [float(x) for x in line['begin_port_position'].split(' ')]
        end_port = [float(x) for x in line['end_port_position'].split(' ')]

        # The four raw per-point sequences and their first differences
        lon = [float(x) for x in line['longitude'].split(' ')]
        lat  = [float(x) for x in line['latitude'].split(' ')]
        speed = [float(x) for x in line['speed'].split(' ')]
        direction = [float(x) for x in line['direction'].split(' ')]
        lon_diff1 = np.diff(lon, prepend=lon[0])
        lat_diff1 = np.diff(lat, prepend=lat[0])
        speed_diff1 = np.diff(speed, prepend=speed[0])
        direction_diff1 = np.diff(direction, prepend=direction[0])

        # Categorical features
        carrierName = line['carrierName']
        vesselMMSI = line['vesselMMSI']
        begin_hash3, begin_hash4, begin_hash5 = line['begin_port_position_hash3'], line['begin_port_position_hash4'], line['begin_port_position_hash5']
        end_hash3, end_hash4, end_hash5 = line['end_port_position_hash3'], line['end_port_position_hash4'], line['end_port_position_hash5']
        first_hash3, first_hash4, first_hash5 = line['geo_hash3'].split(' ')[0], line['geo_hash4'].split(' ')[0], line['geo_hash5'].split(' ')[0]
        last_hash3, last_hash4, last_hash5 = line['geo_hash3'].split(' ')[-1], line['geo_hash4'].split(' ')[-1], line['geo_hash5'].split(' ')[-1]
        first_year, first_month, first_day = timestamp[0].year, timestamp[0].month, timestamp[0].day
        last_year, last_month, last_day = timestamp[-1].year, timestamp[-1].month, timestamp[-1].day

        # Distance features (distance() and haversine() come from the local utils module)
        begin_distance = [distance(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)]  # distance from the begin port to each point
        begin_dis_cumsum = np.cumsum(begin_distance)   # running sum of those distances
        begin_dis_diff1 = np.diff(begin_distance, prepend=begin_distance[0])   # first difference of those distances

        end_distance = [distance(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]    # distance from the end port to each point
        end_dis_cumsum = np.cumsum(end_distance)    # running sum of those distances
        end_dis_diff1 = np.diff(end_distance, prepend=end_distance[0])  # first difference of those distances

        begin_haversine = [haversine(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)] 
        begin_hav_cumsum = np.cumsum(begin_haversine)   # running sum
        begin_hav_diff1 = np.diff(begin_haversine, prepend=begin_haversine[0])   # first difference

        end_haversine = [haversine(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]
        end_hav_cumsum = np.cumsum(end_haversine)   # running sum
        end_hav_diff1 = np.diff(end_haversine, prepend=end_haversine[0])   # first difference

        # Speed features
        # Zero time steps (including the prepended first one) are replaced by
        # the mean step before being used as a denominator.
        timestamp_diff1_tmp = timestamp_diff1.copy()
        timestamp_diff1_tmp[timestamp_diff1_tmp==0] = np.mean(timestamp_diff1_tmp)
        diff_speed = begin_hav_diff1 / (timestamp_diff1_tmp + 1e-8)    # distance delta / time delta = speed
        
        # Same treatment for elapsed time: zero entries become the mean elapsed time.
        timestamp_hours_tmp = timestamp_hours.copy()
        timestamp_hours_tmp[timestamp_hours_tmp == 0] = np.mean(timestamp_hours_tmp)
        cumsum_speed = begin_hav_cumsum / (timestamp_hours_tmp + 1e-8)  # cumulative distance / elapsed time

        # Simple scalar features
        speed_0_nums = (np.array(speed) == 0).sum()
        speed_0_rate = speed_0_nums / xlen 
        speed_0_time = timestamp_diff1[np.array(speed) == 0].sum()
        hash3_nunique = len(set(line['geo_hash3'].split(' ')))
        hash4_nunique = len(set(line['geo_hash4'].split(' ')))
        hash5_nunique = len(set(line['geo_hash5'].split(' ')))
        hash3_rate, hash4_rate, hash5_rate = hash3_nunique/xlen,  hash4_nunique/xlen, hash5_nunique/xlen 
        first_begin_dis, first_begin_hav = distance(begin_port[0], begin_port[1], lon[0], lat[0]), haversine(begin_port[0], begin_port[1], lon[0], lat[0])
        last_begin_dis, last_begin_hav = distance(begin_port[0], begin_port[1], lon[-1], lat[-1]), haversine(begin_port[0], begin_port[1], lon[-1], lat[-1])
        first_end_dis, first_end_hav = distance(end_port[0], end_port[1], lon[0], lat[0]), haversine(end_port[0], end_port[1], lon[0], lat[0])
        last_end_dis, last_end_hav = distance(end_port[0], end_port[1], lon[-1], lat[-1]), haversine(end_port[0], end_port[1], lon[-1], lat[-1])
        # NOTE(review): in the four lines below the un-suffixed name receives the
        # ratio and the "*_rate" name receives the difference, i.e. the names are
        # swapped relative to their values. Only the naming is affected; the
        # feature order in num_feas6 follows these assignments.
        # NOTE(review): the denominators are not guarded against zero -- confirm
        # what distance()/haversine() can return for a point on the port.
        fbd_lbd, fbd_lbd_rate = last_begin_dis/first_begin_dis, last_begin_dis-first_begin_dis
        fbh_lbh, fbh_lbh_rate = last_begin_hav/first_begin_hav, last_begin_hav-first_begin_hav
        fed_led, fed_led_rate = first_end_dis/last_end_dis, first_end_dis-last_end_dis
        fed_leh, fed_leh_rate = first_end_hav/last_end_hav, first_end_hav-last_end_hav


        # Assemble the statistical features; the concatenation order fixes the
        # positional num_<i> column names, so it must not be reordered.
        num_feas1 = get_stats(timestamp_hours) + get_stats(timestamp_diff1) + get_stats(lon) + get_stats(lat) + get_stats(speed) + get_stats(direction)
        num_feas2 = get_stats(begin_distance)+get_stats(begin_dis_cumsum)+get_stats(begin_dis_diff1)+get_stats(end_distance)+get_stats(end_dis_cumsum)+get_stats(end_dis_diff1)
        num_feas3 = get_stats(begin_haversine)+get_stats(begin_hav_cumsum)+get_stats(begin_hav_diff1)+get_stats(end_haversine)+get_stats(end_hav_cumsum)+get_stats(end_hav_diff1)
        num_feas4 = get_stats(diff_speed) + get_stats(cumsum_speed)
        num_feas5 = get_stats(lon_diff1) + get_stats(lat_diff1) + get_stats(speed_diff1) + get_stats(direction_diff1)
        num_feas6 = [xlen, speed_0_nums, speed_0_rate, speed_0_time, hash3_nunique, hash4_nunique, hash5_nunique, 
                     hash3_rate, hash4_rate, hash5_rate, first_begin_dis, first_begin_hav,  last_begin_dis, last_begin_hav,
                     first_end_dis, first_end_hav, last_end_dis, last_end_hav, fbd_lbd, fbd_lbd_rate, 
                     fbh_lbh, fbh_lbh_rate, fed_led, fed_led_rate, fed_leh, fed_leh_rate]
        # print(len(num_feas1+num_feas2+num_feas3))
        # break 

        # Assemble the categorical features (order fixes the cat_<i> column names)
        cat_feas1 = [carrierName, vesselMMSI, begin_hash3, begin_hash4, begin_hash5, 
                     end_hash3, end_hash4, end_hash5, first_hash3, first_hash4, first_hash5, 
                     last_hash3, last_hash4, last_hash5, first_year, first_month, first_day, 
                     last_year, last_month, last_day]
        
        numerical_fea.append(num_feas1 + num_feas2 + num_feas3 + num_feas4 + num_feas5 + num_feas6)
        categorical_fea.append(cat_feas1)
        if is_train:
            label.append(line['label'])
    
    numerical_fea = np.array(numerical_fea)
    # NOTE(review): the categorical rows mix strings with integer date parts;
    # NumPy presumably coerces the whole array to strings -- confirm that
    # downstream code expects string-valued categories.
    categorical_fea = np.array(categorical_fea)
    # print(numerical_fea.shape)
    num_cols = ['num_{}'.format(str(i)) for i in range(numerical_fea.shape[1])]
    cat_cols = ['cat_{}'.format(str(i)) for i in range(categorical_fea.shape[1])]
    num_df = pd.DataFrame(numerical_fea)
    print(num_df.shape)
    cat_df = pd.DataFrame(categorical_fea)
    print(cat_df.shape)
    # Place the numeric and categorical blocks side by side, then name the columns.
    fea_df = pd.concat([num_df, cat_df], axis=1)
    
    fea_df.columns = num_cols + cat_cols
    if is_train:
        fea_df['label'] = label
    print(fea_df.shape)
    return fea_df

# Build the feature tables for both splits. In the run recorded in this
# notebook the training pass over 146135 rows took roughly 1h36m.
test_fea = feature_engineering(test_df, is_train=False)
train_fea = feature_engineering(train_df)

100%|██████████| 228/228 [00:05<00:00, 43.97it/s]
  0%|          | 6/146135 [00:00<40:39, 59.90it/s]

(228, 362)
(228, 20)
(228, 382)


100%|██████████| 146135/146135 [1:36:13<00:00, 25.31it/s]  


(146135, 362)
(146135, 20)
(146135, 383)
CPU times: user 1h 36min 31s, sys: 26.4 s, total: 1h 36min 57s
Wall time: 1h 36min 29s


In [6]:
# Persist both feature tables as pickle files under data/.
train_fea.to_pickle('data/train_fea.pkl')
test_fea.to_pickle('data/test_fea.pkl')

In [9]:
# Show the first rows of the engineered training features.
train_fea.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,num_84,num_85,num_86,num_87,num_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,num_120,num_121,num_122,num_123,num_124,num_125,num_126,num_127,num_128,num_129,num_130,num_131,num_132,num_133,num_134,num_135,num_136,num_137,num_138,num_139,num_140,num_141,num_142,num_143,num_144,num_145,num_146,num_147,num_148,num_149,num_150,num_151,num_152,num_153,num_154,num_155,num_156,num_157,num_158,num_159,num_160,num_161,num_162,num_163,num_164,num_165,num_166,num_167,num_168,num_169,num_170,num_171,num_172,num_173,num_174,num_175,num_176,num_177,num_178,num_179,num_180,num_181,num_182,num_183,num_184,num_185,num_186,num_187,num_188,num_189,num_190,num_191,num_192,num_193,num_194,num_195,num_196,num_197,num_198,num_199,num_200,num_201,num_202,num_203,num_204,num_205,num_206,num_207,num_208,num_209,num_210,num_211,num_212,num_213,num_214,num_215,num_216,num_217,num_218,num_219,num_220,num_221,num_222,num_223,num_224,num_225,num_226,num_227,num_228,num_229,num_230,num_231,num_232,num_233,num_234,num_235,num_236,num_237,num_238,num_239,num_240,num_241,num_242,num_243,num_244,num_245,num_246,num_247,num_248,num_249,num_250,num_251,num_252,num_253,num_254,num_255,num_256,num_257,num_258,num_259,num_260,num_261,num
_262,num_263,num_264,num_265,num_266,num_267,num_268,num_269,num_270,num_271,num_272,num_273,num_274,num_275,num_276,num_277,num_278,num_279,num_280,num_281,num_282,num_283,num_284,num_285,num_286,num_287,num_288,num_289,num_290,num_291,num_292,num_293,num_294,num_295,num_296,num_297,num_298,num_299,num_300,num_301,num_302,num_303,num_304,num_305,num_306,num_307,num_308,num_309,num_310,num_311,num_312,num_313,num_314,num_315,num_316,num_317,num_318,num_319,num_320,num_321,num_322,num_323,num_324,num_325,num_326,num_327,num_328,num_329,num_330,num_331,num_332,num_333,num_334,num_335,num_336,num_337,num_338,num_339,num_340,num_341,num_342,num_343,num_344,num_345,num_346,num_347,num_348,num_349,num_350,num_351,num_352,num_353,num_354,num_355,num_356,num_357,num_358,num_359,num_360,num_361,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19,label
0,0.424361,0.703056,0.0,0.224311,0.703056,4.243611,0.526111,9.0,10.0,0.9,1.0,-0.583415,-1.031987,0.0,0.070306,0.175,0.0,0.068578,0.175,0.703056,0.043056,9.0,10.0,0.9,1.0,0.664976,-1.296856,0.0,113.891127,113.90379,113.883643,0.005706,0.020147,1138.911267,113.891168,10.0,10.0,1.0,1.0,0.794366,0.019517,113.883643,22.398996,22.437238,22.368845,0.019567,0.068393,223.989957,22.393654,10.0,10.0,1.0,1.0,0.411714,-0.646674,22.437238,12.3,22.0,7.0,5.080354,15.0,123.0,10.5,10.0,7.0,1.0,0.7,0.936854,-0.539103,11.0,16360.0,19100.0,12900.0,1456.845908,6200.0,163600.0,16550.0,10.0,8.0,1.0,0.8,-0.680156,1.45275,16700.0,0.162923,0.19492,0.123945,0.02022,0.070975,1.62923,0.168173,10.0,10.0,1.0,1.0,-0.354575,-0.661659,0.123945,0.839575,1.62923,0.123945,0.481059,1.505284,8.395749,0.819407,10.0,10.0,1.0,1.0,0.117177,-1.217487,0.123945,0.007097,0.019106,0.0,0.006184,0.019106,0.070975,0.006781,9.0,10.0,0.9,1.0,0.472278,-0.974732,0.0,58.899778,58.913775,58.890607,0.006525,0.023168,588.997777,58.900053,10.0,10.0,1.0,1.0,0.639501,-0.183323,58.890607,323.93084,588.997777,58.890607,169.178531,530.10717,3239.308403,323.924008,10.0,10.0,1.0,1.0,0.000117,-1.224226,58.890607,0.002317,0.007103,0.0,0.002162,0.007103,0.023168,0.002074,9.0,10.0,0.9,1.0,0.835126,-0.158711,0.0,18.076548,21.604948,13.754233,2.237825,7.850715,180.765476,18.661805,10.0,10.0,1.0,1.0,-0.362637,-0.659869,13.754233,93.16899,180.765476,13.754233,53.376818,167.011243,931.689896,90.938382,10.0,10.0,1.0,1.0,0.116748,-1.217779,13.754233,0.785071,2.123354,0.0,0.685748,2.123354,7.850715,0.748316,9.0,10.0,0.9,1.0,0.486352,-0.949392,0.0,5950.311027,5952.452782,5948.446385,1.122927,4.006396,59503.110267,5950.4717,10.0,10.0,1.0,1.0,0.178106,-0.587356,5948.446385,32723.567998,59503.110267,5948.446385,17091.460486,53554.663881,327235.679977,32722.405196,10.0,10.0,1.0,1.0,0.000194,-1.224231,5948.446385,0.40064,0.933701,0.0,0.331239,0.933701,4.006396,0.398129,9.0,10.0,0.9,1.0,0.139539,-1.567365,0.0,11.103698,20.091522,0.0,5.477718,20.091522
,111.03698,11.036105,9.0,10.0,0.9,1.0,-0.194659,-0.168979,0.0,194.010006,257.114067,32.411624,63.933165,224.702443,1940.100055,200.378354,10.0,10.0,1.0,1.0,-1.364979,1.343228,32.411624,0.002015,0.006698,0.0,0.002005,0.006698,0.020147,0.001577,9.0,10.0,0.9,1.0,1.08623,0.345334,0.0,-0.006839,0.0,-0.019036,0.006072,0.019036,-0.068393,-0.006362,9.0,10.0,0.9,1.0,-0.574443,-0.792865,0.0,1.1,6.0,-3.0,2.426932,9.0,11.0,0.5,7.0,7.0,0.7,0.7,0.428972,-0.247393,0.0,-620.0,400.0,-2600.0,990.757286,3000.0,-6200.0,-250.0,7.0,7.0,0.7,0.7,-1.156879,-0.176764,0.0,10.0,0.0,0.0,0.0,1.0,1.0,2.0,0.1,0.1,0.2,0.123945,13.754233,0.19492,21.604948,58.890607,5948.446385,58.913775,5952.452782,1.572631,0.070975,1.570785,7.850715,0.999607,-0.023168,0.999327,-4.006396,OYSCFP,Z9282756147,ws0,ws0b,ws0br,thr,thrn,thrnk,web,webz,webzx,web,webz,webzr,2020,1,24,2020,1,24,347.458611
1,0.328611,0.675,0.0,0.237891,0.675,3.286111,0.244722,9.0,10.0,0.9,1.0,0.277956,-1.377707,0.0,0.0675,0.180833,0.0,0.058152,0.180833,0.675,0.051667,9.0,10.0,0.9,1.0,0.717382,-0.688981,0.0,113.888163,113.895048,113.883643,0.003917,0.011405,1138.881629,113.886662,10.0,10.0,1.0,1.0,0.663077,-1.084148,113.886747,22.411781,22.442002,22.382253,0.020095,0.059749,224.11781,22.416631,10.0,10.0,1.0,1.0,-0.105228,-1.271443,22.442002,12.0,19.0,6.0,3.605551,13.0,120.0,11.0,10.0,6.0,1.0,0.6,0.704043,-0.033136,11.0,17420.0,22000.0,15600.0,1756.018223,6400.0,174200.0,16950.0,10.0,8.0,1.0,0.8,1.682256,1.869524,17000.0,0.14981,0.180073,0.119825,0.020397,0.060248,1.4981,0.14465,10.0,10.0,1.0,1.0,0.138621,-1.284495,0.119825,0.766081,1.4981,0.119825,0.439852,1.378275,7.660814,0.735827,10.0,10.0,1.0,1.0,0.158811,-1.193505,0.119825,0.006025,0.011833,0.0,0.004259,0.011833,0.060248,0.005775,9.0,10.0,0.9,1.0,0.127343,-1.326413,0.0,58.896249,58.90444,58.890607,0.004732,0.013833,588.96249,58.894043,10.0,10.0,1.0,1.0,0.637461,-1.136372,58.893499,323.917004,588.96249,58.893499,169.166566,530.068991,3239.170044,323.90928,10.0,10.0,1.0,1.0,0.000106,-1.224221,58.893499,0.001094,0.003849,-0.002892,0.001734,0.006741,0.01094,0.001187,9.0,10.0,0.9,1.0,-0.741828,0.544593,0.0,16.623707,19.977025,13.285739,2.263171,6.691286,166.23707,16.054867,10.0,10.0,1.0,1.0,0.13392,-1.282786,13.285739,85.008315,166.23707,13.285739,48.813525,152.951331,850.083147,81.654678,10.0,10.0,1.0,1.0,0.158598,-1.19369,13.285739,0.669129,1.315997,0.0,0.472052,1.315997,6.691286,0.640853,9.0,10.0,0.9,1.0,0.123963,-1.320343,0.0,5949.641704,5951.191583,5948.446385,0.954851,2.745198,59496.417037,5949.276869,10.0,10.0,1.0,1.0,0.449436,-1.289147,5948.614434,32720.378186,59496.417037,5948.614434,17089.265795,53547.802603,327203.781864,32718.872776,10.0,10.0,1.0,1.0,0.000204,-1.224214,5948.614434,0.257715,0.662201,-0.168049,0.247542,0.830249,2.577149,0.263156,9.0,10.0,0.9,1.0,-0.077384,-1.021338,0.0,10.38569,19.353773,0.0,4.819666,19.35377
3,103.856899,10.381589,9.0,10.0,0.9,1.0,-0.309005,0.454782,0.0,269.090439,509.653828,40.429973,112.456059,469.223854,2690.904388,257.568431,10.0,10.0,1.0,1.0,0.157187,0.845341,40.429973,0.00083,0.003352,-0.003104,0.001634,0.006456,0.008301,0.000879,9.0,10.0,0.9,1.0,-0.94435,1.020347,0.0,-0.005975,0.0,-0.011846,0.004171,0.011846,-0.059749,-0.005693,9.0,10.0,0.9,1.0,-0.101834,-1.286014,0.0,0.8,7.0,-4.0,2.95973,11.0,8.0,0.0,7.0,6.0,0.7,0.6,0.727575,0.051792,0.0,-640.0,100.0,-2900.0,912.359578,3000.0,-6400.0,-350.0,8.0,8.0,0.8,0.8,-1.574628,1.132852,0.0,10.0,0.0,0.0,0.0,1.0,1.0,2.0,0.1,0.1,0.2,0.119825,13.285739,0.180073,19.977025,58.893499,5948.614434,58.90444,5951.191583,1.502803,0.060248,1.503644,6.691286,0.999814,-0.01094,0.999567,-2.577149,OYSCFP,Z9282756147,ws0,ws0b,ws0br,thr,thrn,thrnk,web,webz,webzx,web,webz,webzr,2020,1,24,2020,1,24,347.539722
2,0.24225,0.539167,0.0,0.171423,0.539167,2.4225,0.206111,9.0,10.0,0.9,1.0,0.597904,-0.700449,0.0,0.053917,0.241944,0.0,0.069113,0.241944,0.539167,0.029167,9.0,10.0,0.9,1.0,1.862432,2.59648,0.0,113.88653,113.89092,113.883908,0.002437,0.007012,1138.865298,113.885726,10.0,10.0,1.0,1.0,0.844685,-0.6499,113.884668,22.416553,22.439743,22.394583,0.014117,0.04516,224.165526,22.417687,10.0,10.0,1.0,1.0,-0.138866,-0.930759,22.439743,10.1,13.0,7.0,1.972308,6.0,101.0,10.5,10.0,6.0,1.0,0.6,-0.29561,-1.111716,12.0,17460.0,21500.0,16600.0,1373.462777,4900.0,174600.0,17100.0,10.0,7.0,1.0,0.7,2.474133,4.52596,17300.0,0.144827,0.167217,0.121654,0.0143,0.045563,1.448266,0.143576,10.0,10.0,1.0,1.0,0.16685,-0.938891,0.121654,0.756183,1.448266,0.121654,0.422626,1.326613,7.561833,0.737862,10.0,10.0,1.0,1.0,0.108618,-1.205189,0.121654,0.004556,0.015159,0.0,0.004648,0.015159,0.045563,0.002839,9.0,10.0,0.9,1.0,0.99387,0.057721,0.0,58.894405,58.899765,58.891148,0.00303,0.008617,588.944045,58.893549,10.0,10.0,1.0,1.0,0.759149,-0.754453,58.891521,323.911057,588.944045,58.891521,169.161667,530.052524,3239.110571,323.906778,10.0,10.0,1.0,1.0,6.3e-05,-1.224231,58.891521,0.000824,0.004284,-0.000373,0.001271,0.004658,0.008243,0.000353,9.0,10.0,0.9,1.0,1.859479,2.618214,0.0,16.072977,18.555868,13.495988,1.587148,5.05988,160.729775,15.935806,10.0,10.0,1.0,1.0,0.162876,-0.937811,13.495988,83.921335,160.729775,13.495988,46.905803,147.233787,839.213346,81.889354,10.0,10.0,1.0,1.0,0.108536,-1.205266,13.495988,0.505988,1.679306,0.0,0.515695,1.679306,5.05988,0.314732,9.0,10.0,0.9,1.0,0.986162,0.032884,0.0,5949.339569,5950.419877,5948.474555,0.646772,1.945322,59493.395688,5949.226403,10.0,10.0,1.0,1.0,0.476232,-0.935648,5948.474555,32719.562191,59493.395688,5948.474555,17088.372274,53544.921133,327195.621911,32718.688424,10.0,10.0,1.0,1.0,0.000128,-1.224226,5948.474555,0.194532,0.793681,0.0,0.226713,0.793681,1.945322,0.138105,9.0,10.0,0.9,1.0,1.67229,2.042603,0.0,9.437115,13.240124,0.0,3.595419,13.240124,94.
37115,10.487301,9.0,10.0,0.9,1.0,-1.622458,1.972446,0.0,331.743283,436.691246,55.71099,105.124463,380.980255,3317.432831,357.033212,10.0,10.0,1.0,1.0,-1.629544,1.950204,55.71099,0.000625,0.003631,-0.00076,0.001121,0.004391,0.006252,0.000273,9.0,10.0,0.9,1.0,1.695399,2.434424,0.0,-0.004516,0.0,-0.014765,0.00458,0.014765,-0.04516,-0.002782,9.0,10.0,0.9,1.0,-0.941225,-0.113156,0.0,-0.5,1.0,-2.0,0.921954,3.0,-5.0,0.0,5.0,4.0,0.5,0.4,-0.382818,-0.83737,0.0,-490.0,200.0,-4200.0,1257.338459,4400.0,-4900.0,0.0,7.0,6.0,0.7,0.6,-2.511074,4.60692,0.0,10.0,0.0,0.0,0.0,1.0,1.0,2.0,0.1,0.1,0.2,0.121654,13.495988,0.167217,18.555868,58.891521,5948.474555,58.899765,5950.419877,1.374529,0.045563,1.374917,5.05988,0.99986,-0.008243,0.999673,-1.945322,OYSCFP,Z9282756147,ws0,ws0b,ws0br,thr,thrn,thrnk,web,webz,webzx,web,webz,webzr,2020,1,24,2020,1,24,347.648056
3,0.577306,1.193056,0.0,0.399034,1.193056,5.773056,0.541389,9.0,10.0,0.9,1.0,0.064948,-1.319347,0.0,0.119306,0.255,0.0,0.082982,0.255,1.193056,0.090694,9.0,10.0,0.9,1.0,0.369298,-1.175605,0.0,113.944396,114.062595,113.885798,0.063969,0.176797,1139.443958,113.913454,10.0,10.0,1.0,1.0,0.673267,-1.063404,113.885798,22.376291,22.417422,22.345992,0.02674,0.07143,223.762906,22.373155,10.0,10.0,1.0,1.0,0.225159,-1.562723,22.417422,17.6,26.0,6.0,8.187796,20.0,176.0,20.5,10.0,6.0,1.0,0.6,-0.224605,-1.745905,25.0,12680.0,17000.0,7000.0,3938.73076,10000.0,126800.0,13550.0,10.0,9.0,1.0,0.9,-0.145562,-1.794491,8900.0,0.206236,0.282015,0.143849,0.04941,0.138166,2.062363,0.197786,10.0,10.0,1.0,1.0,0.200095,-1.52874,0.143849,0.995233,2.062363,0.143849,0.614153,1.918514,9.952326,0.910791,10.0,10.0,1.0,1.0,0.291797,-1.155655,0.143849,0.013817,0.043616,0.0,0.012087,0.043616,0.138166,0.009914,9.0,10.0,0.9,1.0,1.268876,1.009301,0.0,58.954011,59.07281,58.893633,0.064835,0.179177,589.540106,58.923435,10.0,10.0,1.0,1.0,0.650933,-1.096986,58.893633,324.072361,589.540106,58.893633,169.336794,530.646472,3240.723611,323.962129,10.0,10.0,1.0,1.0,0.001417,-1.22394,58.893633,0.017918,0.047079,0.0,0.017783,0.047079,0.179177,0.011793,9.0,10.0,0.9,1.0,0.444742,-1.445957,0.0,22.592584,30.191259,15.965986,5.131981,14.225272,225.925844,21.879705,10.0,10.0,1.0,1.0,0.14339,-1.574385,15.965986,109.809851,225.925844,15.965986,67.306547,209.959858,1098.098507,101.006215,10.0,10.0,1.0,1.0,0.274867,-1.169497,15.965986,1.422527,4.709397,0.0,1.286321,4.709397,14.225272,0.937347,9.0,10.0,0.9,1.0,1.444406,1.51417,0.0,5956.260952,5968.454995,5949.241351,6.968692,19.213643,59562.609522,5953.413094,10.0,10.0,1.0,1.0,0.535429,-1.259204,5949.241351,32740.381758,59562.609522,5949.241351,17109.025039,53613.36817,327403.817583,32728.322239,10.0,10.0,1.0,1.0,0.001517,-1.223987,5949.241351,1.921364,4.974976,0.0,1.82164,4.974976,19.213643,1.364446,9.0,10.0,0.9,1.0,0.53836,-1.304986,0.0,10.056052,18.468222,0.0,4.499539,18.46
8222,100.560518,10.886431,9.0,10.0,0.9,1.0,-0.470609,0.761919,0.0,213.02347,615.665554,27.656041,143.008201,588.009514,2130.234701,182.059357,10.0,10.0,1.0,1.0,2.003491,3.569208,27.656041,0.01768,0.047399,0.0,0.017747,0.047399,0.176797,0.01146,9.0,10.0,0.9,1.0,0.439594,-1.437563,0.0,-0.005725,0.008168,-0.034705,0.011618,0.042873,-0.057252,-0.003739,9.0,10.0,0.9,1.0,-1.313789,1.242433,0.0,1.6,9.0,-4.0,3.954744,13.0,16.0,0.5,7.0,5.0,0.7,0.5,0.977039,-0.119282,0.0,-820.0,1400.0,-4900.0,1687.483333,6300.0,-8200.0,-250.0,8.0,9.0,0.8,0.9,-1.188763,0.825804,0.0,10.0,0.0,0.0,0.0,2.0,2.0,5.0,0.2,0.2,0.5,0.143849,15.965986,0.282015,30.191259,58.893633,5949.241351,59.07281,5968.454995,1.960495,0.138166,1.890974,14.225272,0.996967,-0.179177,0.996781,-19.213643,OYSCFP,Z9282756147,ws0,ws0b,ws0br,thr,thrn,thrnk,web,webz,webzx,wec,wecp,wecp5,2020,1,24,2020,1,24,346.785556
4,0.574111,1.213611,0.0,0.411656,1.213611,5.741111,0.635139,9.0,10.0,0.9,1.0,0.146462,-1.29311,0.0,0.121361,0.419444,0.0,0.141485,0.419444,1.213611,0.050556,9.0,10.0,0.9,1.0,1.254065,-0.030471,0.0,113.919004,114.018568,113.884312,0.048599,0.134256,1139.190036,113.894858,10.0,10.0,1.0,1.0,1.305314,-0.019026,113.884312,22.388259,22.439178,22.346138,0.033013,0.09304,223.882586,22.383119,10.0,10.0,1.0,1.0,0.138604,-1.527021,22.439178,18.0,25.0,10.0,6.21289,15.0,180.0,18.0,10.0,7.0,1.0,0.7,-0.017513,-1.808693,25.0,14490.0,20900.0,8500.0,3787.202134,12400.0,144900.0,15750.0,10.0,9.0,1.0,0.9,-0.25096,-0.848522,8500.0,0.183456,0.263324,0.122147,0.047287,0.141177,1.834561,0.179191,10.0,10.0,1.0,1.0,0.443303,-1.071131,0.122147,0.876852,1.834561,0.122147,0.543844,1.712414,8.768518,0.809692,10.0,10.0,1.0,1.0,0.306692,-1.116581,0.122147,0.014118,0.049346,0.0,0.015008,0.049346,0.141177,0.007216,9.0,10.0,0.9,1.0,1.253348,0.432982,0.0,58.928111,59.029425,58.891191,0.049719,0.138234,589.28111,58.904211,10.0,10.0,1.0,1.0,1.27859,-0.060082,58.891191,323.983935,589.28111,58.891191,169.250074,530.38992,3239.839355,323.925051,10.0,10.0,1.0,1.0,0.000936,-1.223838,58.891191,0.013823,0.088249,0.0,0.025424,0.088249,0.138234,0.004861,9.0,10.0,0.9,1.0,2.441974,4.377987,0.0,20.213491,28.525769,13.552021,5.028947,14.973748,202.134914,19.879151,10.0,10.0,1.0,1.0,0.378618,-1.135962,13.552021,97.06472,202.134914,13.552021,59.985711,188.582892,970.6472,89.851523,10.0,10.0,1.0,1.0,0.295246,-1.129274,13.552021,1.497375,4.947231,0.0,1.548201,4.947231,14.973748,0.766414,9.0,10.0,0.9,1.0,1.113183,-0.004526,0.0,5953.391466,5964.478988,5948.455827,5.613899,16.023161,59533.914657,5951.147312,10.0,10.0,1.0,1.0,1.124217,-0.286607,5948.455827,32729.356058,59533.914657,5948.455827,17099.482831,53585.45883,327293.560578,32722.255964,10.0,10.0,1.0,1.0,0.001086,-1.223859,5948.455827,1.602316,9.080791,0.0,2.583937,9.080791,16.023161,0.629763,9.0,10.0,0.9,1.0,2.344118,4.061461,0.0,12.403801,18.745322,0.0,5.268351,18
.745322,124.038008,12.174158,9.0,10.0,0.9,1.0,-0.922095,0.582302,0.0,173.851806,319.112971,23.605224,73.393805,295.507747,1738.518065,162.100591,10.0,10.0,1.0,1.0,-0.0002,0.405613,23.605224,0.013426,0.087758,0.0,0.025361,0.087758,0.134256,0.004601,9.0,10.0,0.9,1.0,2.452751,4.412462,0.0,-0.009222,0.000824,-0.031436,0.009487,0.03226,-0.092216,-0.006661,9.0,10.0,0.9,1.0,-1.040942,0.350687,0.0,1.3,6.0,-1.0,2.368544,7.0,13.0,0.0,6.0,6.0,0.6,0.6,0.950064,-0.613324,0.0,-1240.0,0.0,-3800.0,1504.792344,3800.0,-12400.0,-500.0,8.0,7.0,0.8,0.7,-0.839691,-1.152512,0.0,10.0,0.0,0.0,0.0,2.0,2.0,4.0,0.2,0.2,0.4,0.122147,13.552021,0.263324,28.525769,58.891191,5948.455827,59.029425,5964.478988,2.155791,0.141177,2.104909,14.973748,0.997658,-0.138234,0.997314,-16.023161,OYSCFP,Z9282756147,ws0,ws0b,ws0br,thr,thrn,thrnk,web,webz,webzx,wec,wecp,wecp4,2020,1,24,2020,1,24,346.967778


In [10]:
# imp = pd.read_pickle("imp.pkl")
# del_cols = imp['feat'].values.tolist()[-50:]

# Keep only the numeric features whose standard deviation over the training
# set exceeds 0.01. The candidate columns are taken from the frame itself
# (every column named "num_*", in frame order) rather than from a hard-coded
# count, so the selection stays correct if the number of features changes.
use_cols = [
    col for col in train_fea.columns
    if str(col).startswith('num_') and np.std(train_fea[col].values) > 0.01
]
print(len(use_cols))

338


In [11]:
# label_encoder
from sklearn.preprocessing import LabelEncoder

# Names of the 20 categorical feature columns, built once and reused below.
cat_feas = ['cat_{}'.format(str(i)) for i in range(20)]

# Mark the test rows with a sentinel label so both splits can be stacked,
# encoded against one shared vocabulary, and separated again afterwards.
test_fea['label'] = -1e8
data_fea = pd.concat([train_fea, test_fea], ignore_index=True)

# Integer-encode every categorical column, then store it with category dtype.
for col in cat_feas:
    print(col)
    le = LabelEncoder()
    data_fea[col] = le.fit_transform(data_fea[col])
    data_fea[col] = data_fea[col].astype('category')

# Split the stacked frame back into train and test using the sentinel label.
is_test = data_fea['label'] == -1e8
train_feas = data_fea[~is_test].reset_index(drop=True)
test_feas = data_fea[is_test].reset_index(drop=True).drop(columns=['label'])
print(train_feas.shape, test_feas.shape)

# Model inputs: the selected numeric columns followed by the categorical ones.
all_feas = use_cols + cat_feas
print(len(all_feas), len(cat_feas))

cat_0
cat_1
cat_2
cat_3
cat_4
cat_5
cat_6
cat_7
cat_8
cat_9
cat_10
cat_11
cat_12
cat_13
cat_14
cat_15
cat_16
cat_17
cat_18
cat_19
(146135, 383) (228, 382)
358 20


In [15]:
from sklearn.metrics import mean_squared_error as mse 
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

In [14]:
# !pip install lightgbm

In [16]:
def mse_score_eval(preds, valid):
    """Custom evaluation function reporting mean squared error.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``valid``.
    valid : object exposing ``get_label()``
        Dataset whose true labels are compared against ``preds``.

    Returns
    -------
    tuple
        ``('mse_score', <mse value>, False)``; the trailing ``False`` is the
        third element of the tuple handed back to the caller (used as a
        ``feval`` for ``lgb.train`` in this notebook).
    """
    truth = valid.get_label()
    return 'mse_score', mse(y_true=truth, y_pred=preds), False

def build_model(train, test, label, seed=2020, is_shuffle=True):
    """Train a 10-fold LightGBM regressor; return test predictions and importances.

    Relies on notebook globals: ``cat_feas`` (categorical column names),
    ``test_df`` (source of the ``loadingOrder`` column of the result),
    ``mse_score_eval``, ``mse``, ``lgb`` and ``KFold``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; its columns are the model features.
    test : pandas.DataFrame
        Test features, passed to ``clf.predict`` for every fold.
    label : array-like
        Regression target, positionally aligned with the rows of ``train``.
    seed : int, default 2020
        Random state of the K-fold shuffle. Only applied when ``is_shuffle``
        is true, because recent scikit-learn rejects a ``random_state``
        combined with ``shuffle=False``.
    is_shuffle : bool, default True
        Whether ``KFold`` shuffles the rows before splitting.

    Returns
    -------
    result : pandas.DataFrame
        Columns ``loadingOrder`` (from the global ``test_df``) and ``label``
        (test prediction averaged over the folds).
    imp : pandas.DataFrame
        Column ``feat`` plus one ``gain<k>`` and one ``split<k>`` column per
        fold ``k`` (1-based).
    """
    # Positional indexing below needs a plain array, not an index-aligned Series.
    label = np.asarray(label)

    # Feature importance table; names are taken from the frame actually used
    # for training so they always line up with clf.feature_importance().
    imp = pd.DataFrame()
    imp['feat'] = list(train.columns)

    oof_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 10
    # K-fold split. A GroupKFold on train_df['loadingOrder'] was left
    # commented out in the original as an alternative:
    # fold = GroupKFold(n_splits=n_splits)
    # groups = train_df['loadingOrder'].values
    fold = KFold(
        n_splits=n_splits,
        shuffle=is_shuffle,
        random_state=seed if is_shuffle else None,
    )
    kf_way = fold.split(train)
    # LightGBM parameters
    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 128,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.75,
        'bagging_freq': 5,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': 1,
    }
    # Train one model per fold
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        print("fold ", n_fold)
        train_x, train_y = train.iloc[train_idx], label[train_idx]
        valid_x, valid_y = train.iloc[valid_idx], label[valid_idx]
        # Wrap the fold data for LightGBM
        n_train = lgb.Dataset(train_x, label=train_y)
        n_valid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=n_train,
            categorical_feature=cat_feas,
            num_boost_round=10000,
            valid_sets=[n_train, n_valid],
            early_stopping_rounds=50,
            verbose_eval=100,
            feval=mse_score_eval
        )
        # Out-of-fold prediction for the held-out rows; the test prediction is
        # accumulated as the mean over all folds.
        oof_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        test_pred += clf.predict(test, num_iteration=clf.best_iteration)/fold.n_splits

        # Column suffix equals the 1-based fold number printed above.
        imp['gain' + str(n_fold)] = clf.feature_importance(importance_type='gain')
        imp['split' + str(n_fold)] = clf.feature_importance(importance_type='split')

    # The value printed here is computed on the out-of-fold predictions.
    print("train mse: ", mse(label, oof_pred))
    # NOTE(review): loadingOrder is read from the global test_df and is assumed
    # to be row-aligned with `test` - confirm against the caller.
    result = pd.DataFrame({
        'loadingOrder': test_df['loadingOrder'],
        'label': test_pred,
    })
    return result, imp

# Fit the cross-validated model on the selected feature columns.
result, imp = build_model(
    train_feas[all_feas],
    test_feas[all_feas],
    train_feas['label'].values,
    is_shuffle=True,
)

fold  1


New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 345.008	training's mse_score: 345.008	valid_1's l2: 741.237	valid_1's mse_score: 741.237
[200]	training's l2: 129.463	training's mse_score: 129.463	valid_1's l2: 509.409	valid_1's mse_score: 509.409
[300]	training's l2: 72.1607	training's mse_score: 72.1607	valid_1's l2: 445.352	valid_1's mse_score: 445.352
[400]	training's l2: 45.6524	training's mse_score: 45.6524	valid_1's l2: 413.386	valid_1's mse_score: 413.386
[500]	training's l2: 30.953	training's mse_score: 30.953	valid_1's l2: 394.182	valid_1's mse_score: 394.182
[600]	training's l2: 21.9024	training's mse_score: 21.9024	valid_1's l2: 381.571	valid_1's mse_score: 381.571
[700]	training's l2: 16.2897	training's mse_score: 16.2897	valid_1's l2: 373.756	valid_1's mse_score: 373.756
[800]	training's l2: 12.3894	training's mse_score: 12.3894	valid_1's l2: 368.148	valid_1's mse_score: 368.148
[900]	training's l2: 9.70774	training's mse_score: 9.70774	va

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 343.027	training's mse_score: 343.027	valid_1's l2: 636.608	valid_1's mse_score: 636.608
[200]	training's l2: 127.677	training's mse_score: 127.677	valid_1's l2: 429.565	valid_1's mse_score: 429.565
[300]	training's l2: 70.2632	training's mse_score: 70.2632	valid_1's l2: 369.95	valid_1's mse_score: 369.95
[400]	training's l2: 44.3122	training's mse_score: 44.3122	valid_1's l2: 340.874	valid_1's mse_score: 340.874
[500]	training's l2: 30.3029	training's mse_score: 30.3029	valid_1's l2: 322.832	valid_1's mse_score: 322.832
[600]	training's l2: 21.4444	training's mse_score: 21.4444	valid_1's l2: 310.887	valid_1's mse_score: 310.887
[700]	training's l2: 15.9209	training's mse_score: 15.9209	valid_1's l2: 302.568	valid_1's mse_score: 302.568
[800]	training's l2: 11.9572	training's mse_score: 11.9572	valid_1's l2: 296.47	valid_1's mse_score: 296.47
[900]	training's l2: 9.23305	training's mse_score: 9.23305	vali

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 339.876	training's mse_score: 339.876	valid_1's l2: 558.776	valid_1's mse_score: 558.776
[200]	training's l2: 126.162	training's mse_score: 126.162	valid_1's l2: 380.524	valid_1's mse_score: 380.524
[300]	training's l2: 70.0654	training's mse_score: 70.0654	valid_1's l2: 332.33	valid_1's mse_score: 332.33
[400]	training's l2: 44.0111	training's mse_score: 44.0111	valid_1's l2: 306.332	valid_1's mse_score: 306.332
[500]	training's l2: 29.6406	training's mse_score: 29.6406	valid_1's l2: 289.853	valid_1's mse_score: 289.853
[600]	training's l2: 20.9211	training's mse_score: 20.9211	valid_1's l2: 279.836	valid_1's mse_score: 279.836
[700]	training's l2: 15.3597	training's mse_score: 15.3597	valid_1's l2: 273.34	valid_1's mse_score: 273.34
[800]	training's l2: 11.5019	training's mse_score: 11.5019	valid_1's l2: 267.978	valid_1's mse_score: 267.978
[900]	training's l2: 8.93244	training's mse_score: 8.93244	vali

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 347.97	training's mse_score: 347.97	valid_1's l2: 603.605	valid_1's mse_score: 603.605
[200]	training's l2: 127.954	training's mse_score: 127.954	valid_1's l2: 395.362	valid_1's mse_score: 395.362
[300]	training's l2: 69.5366	training's mse_score: 69.5366	valid_1's l2: 335.321	valid_1's mse_score: 335.321
[400]	training's l2: 43.9892	training's mse_score: 43.9892	valid_1's l2: 307.97	valid_1's mse_score: 307.97
[500]	training's l2: 29.7762	training's mse_score: 29.7762	valid_1's l2: 291.728	valid_1's mse_score: 291.728
[600]	training's l2: 21.2434	training's mse_score: 21.2434	valid_1's l2: 280.945	valid_1's mse_score: 280.945
[700]	training's l2: 15.7181	training's mse_score: 15.7181	valid_1's l2: 274.184	valid_1's mse_score: 274.184
[800]	training's l2: 11.979	training's mse_score: 11.979	valid_1's l2: 269.374	valid_1's mse_score: 269.374
[900]	training's l2: 9.28179	training's mse_score: 9.28179	valid_

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 345.766	training's mse_score: 345.766	valid_1's l2: 632.017	valid_1's mse_score: 632.017
[200]	training's l2: 127.554	training's mse_score: 127.554	valid_1's l2: 425.104	valid_1's mse_score: 425.104
[300]	training's l2: 69.5681	training's mse_score: 69.5681	valid_1's l2: 369.946	valid_1's mse_score: 369.946
[400]	training's l2: 43.3878	training's mse_score: 43.3878	valid_1's l2: 344.16	valid_1's mse_score: 344.16
[500]	training's l2: 29.3949	training's mse_score: 29.3949	valid_1's l2: 328.469	valid_1's mse_score: 328.469
[600]	training's l2: 20.8474	training's mse_score: 20.8474	valid_1's l2: 318.596	valid_1's mse_score: 318.596
[700]	training's l2: 15.3277	training's mse_score: 15.3277	valid_1's l2: 311.386	valid_1's mse_score: 311.386
[800]	training's l2: 11.6153	training's mse_score: 11.6153	valid_1's l2: 306.876	valid_1's mse_score: 306.876
[900]	training's l2: 9.09031	training's mse_score: 9.09031	va

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 339.875	training's mse_score: 339.875	valid_1's l2: 600.162	valid_1's mse_score: 600.162
[200]	training's l2: 124.702	training's mse_score: 124.702	valid_1's l2: 386.998	valid_1's mse_score: 386.998
[300]	training's l2: 69.0369	training's mse_score: 69.0369	valid_1's l2: 333.078	valid_1's mse_score: 333.078
[400]	training's l2: 43.438	training's mse_score: 43.438	valid_1's l2: 306.387	valid_1's mse_score: 306.387
[500]	training's l2: 29.2943	training's mse_score: 29.2943	valid_1's l2: 290.172	valid_1's mse_score: 290.172
[600]	training's l2: 20.9821	training's mse_score: 20.9821	valid_1's l2: 279.564	valid_1's mse_score: 279.564
[700]	training's l2: 15.4628	training's mse_score: 15.4628	valid_1's l2: 272.934	valid_1's mse_score: 272.934
[800]	training's l2: 11.6717	training's mse_score: 11.6717	valid_1's l2: 268.21	valid_1's mse_score: 268.21
[900]	training's l2: 9.00817	training's mse_score: 9.00817	vali

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 345.651	training's mse_score: 345.651	valid_1's l2: 596.414	valid_1's mse_score: 596.414
[200]	training's l2: 126.825	training's mse_score: 126.825	valid_1's l2: 375.579	valid_1's mse_score: 375.579
[300]	training's l2: 69.8285	training's mse_score: 69.8285	valid_1's l2: 319.238	valid_1's mse_score: 319.238
[400]	training's l2: 43.8131	training's mse_score: 43.8131	valid_1's l2: 293.802	valid_1's mse_score: 293.802
[500]	training's l2: 29.3089	training's mse_score: 29.3089	valid_1's l2: 278.558	valid_1's mse_score: 278.558
[600]	training's l2: 20.7557	training's mse_score: 20.7557	valid_1's l2: 269.28	valid_1's mse_score: 269.28
[700]	training's l2: 15.1973	training's mse_score: 15.1973	valid_1's l2: 263.051	valid_1's mse_score: 263.051
[800]	training's l2: 11.4974	training's mse_score: 11.4974	valid_1's l2: 258.476	valid_1's mse_score: 258.476
[900]	training's l2: 8.95596	training's mse_score: 8.95596	va

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 337.92	training's mse_score: 337.92	valid_1's l2: 655.822	valid_1's mse_score: 655.822
[200]	training's l2: 125.207	training's mse_score: 125.207	valid_1's l2: 426.246	valid_1's mse_score: 426.246
[300]	training's l2: 69.002	training's mse_score: 69.002	valid_1's l2: 364.581	valid_1's mse_score: 364.581
[400]	training's l2: 43.3076	training's mse_score: 43.3076	valid_1's l2: 334.532	valid_1's mse_score: 334.532
[500]	training's l2: 29.1311	training's mse_score: 29.1311	valid_1's l2: 316.593	valid_1's mse_score: 316.593
[600]	training's l2: 20.5512	training's mse_score: 20.5512	valid_1's l2: 305.203	valid_1's mse_score: 305.203
[700]	training's l2: 15.2334	training's mse_score: 15.2334	valid_1's l2: 297.73	valid_1's mse_score: 297.73
[800]	training's l2: 11.5289	training's mse_score: 11.5289	valid_1's l2: 292.156	valid_1's mse_score: 292.156
[900]	training's l2: 8.91058	training's mse_score: 8.91058	valid_

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 347.167	training's mse_score: 347.167	valid_1's l2: 531.646	valid_1's mse_score: 531.646
[200]	training's l2: 128.309	training's mse_score: 128.309	valid_1's l2: 332.822	valid_1's mse_score: 332.822
[300]	training's l2: 70.6322	training's mse_score: 70.6322	valid_1's l2: 280.774	valid_1's mse_score: 280.774
[400]	training's l2: 44.4209	training's mse_score: 44.4209	valid_1's l2: 254.197	valid_1's mse_score: 254.197
[500]	training's l2: 30.0037	training's mse_score: 30.0037	valid_1's l2: 239.548	valid_1's mse_score: 239.548
[600]	training's l2: 21.001	training's mse_score: 21.001	valid_1's l2: 229.626	valid_1's mse_score: 229.626
[700]	training's l2: 15.4838	training's mse_score: 15.4838	valid_1's l2: 223.477	valid_1's mse_score: 223.477
[800]	training's l2: 11.7586	training's mse_score: 11.7586	valid_1's l2: 218.792	valid_1's mse_score: 218.792
[900]	training's l2: 9.15069	training's mse_score: 9.15069	va

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[100]	training's l2: 340.511	training's mse_score: 340.511	valid_1's l2: 746.031	valid_1's mse_score: 746.031
[200]	training's l2: 127.125	training's mse_score: 127.125	valid_1's l2: 510.166	valid_1's mse_score: 510.166
[300]	training's l2: 69.5334	training's mse_score: 69.5334	valid_1's l2: 444.792	valid_1's mse_score: 444.792
[400]	training's l2: 43.5897	training's mse_score: 43.5897	valid_1's l2: 413.319	valid_1's mse_score: 413.319
[500]	training's l2: 29.349	training's mse_score: 29.349	valid_1's l2: 395.485	valid_1's mse_score: 395.485
[600]	training's l2: 20.6591	training's mse_score: 20.6591	valid_1's l2: 383.79	valid_1's mse_score: 383.79
[700]	training's l2: 15.1698	training's mse_score: 15.1698	valid_1's l2: 375.985	valid_1's mse_score: 375.985
[800]	training's l2: 11.4515	training's mse_score: 11.4515	valid_1's l2: 370.687	valid_1's mse_score: 370.687
[900]	training's l2: 8.91971	training's mse_score: 8.91971	vali

In [18]:
# Collapse the per-fold importance columns into one 'gain' and one 'split'
# column. The row-wise mean replaces the former "sum / 10": it no longer
# depends on the number of folds, and re-running this cell (when only the
# averaged columns are left) reproduces the same values instead of dividing
# them by 10 again.
imp['gain'] = imp[[f for f in imp.columns if 'gain' in f]].mean(axis=1)
imp['split'] = imp[[f for f in imp.columns if 'split' in f]].mean(axis=1)

imp = imp[['feat', 'gain', 'split']]
imp

Unnamed: 0,feat,gain,split
0,num_0,7003265.0,820.4
1,num_1,41602350.0,1196.6
2,num_3,12617910.0,964.4
3,num_4,9811510.0,176.0
4,num_5,2397947.0,664.8
5,num_6,5525129.0,1222.4
6,num_7,376207.0,170.4
7,num_8,125111.0,39.6
8,num_9,72630.34,9.6
9,num_11,5326924.0,2567.2


In [19]:
# Order features from highest to lowest mean gain, renumber the rows, and
# persist the table to imp.pkl.
imp = imp.sort_values(by=['gain'], ascending=False).reset_index(drop=True)
imp.to_pickle("imp.pkl")

In [20]:
# Show the importance table after the descending sort by gain.
imp

Unnamed: 0,feat,gain,split
0,num_212,25439550000.0,1132.8
1,num_353,5001448000.0,647.6
2,num_352,2151259000.0,669.6
3,cat_6,1363523000.0,4315.6
4,num_128,1292548000.0,1136.0
5,cat_5,983923200.0,2939.7
6,cat_7,856513400.0,8352.6
7,cat_1,551575600.0,49841.1
8,cat_11,150589700.0,26706.0
9,num_360,137952800.0,709.6


In [21]:
# train mse:  1174.767318829244
# train mse:  273.7163487202298

# Timestamp layout for formatted output (year/month/day, two spaces, time).
# The separator between month and day was missing in the earlier pattern.
sub_FORMAT = "%Y/%m/%d  %H:%M:%S"
# Layout of the raw timestamp strings parsed by get_time.
UTC_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

def get_time(x, y):
    """Return the datetime that lies ``y`` hours after the timestamp ``x``.

    Parameters
    ----------
    x : str
        Timestamp text; only the part after the last space is parsed, and it
        must match ``UTC_FORMAT``.
    y : int or float
        Offset in hours. It is split into whole days plus a remainder that is
        truncated to whole seconds.

    Returns
    -------
    datetime.datetime
        The shifted (naive) datetime. Callers format it themselves.
    """
    stamp = x.split(' ')[-1]
    start = datetime.datetime.strptime(stamp, UTC_FORMAT)
    whole_days = y // 24
    # Remaining part of the last day, in seconds (fractions are dropped).
    leftover_seconds = int((y - whole_days * 24) * 3600)
    return start + datetime.timedelta(days=whole_days, seconds=leftover_seconds)

# result = pd.DataFrame({
#         'loadingOrder': test_loadingOrder, 
#         'label': test_pred,
#     })

# Carry over the reference timestamp of every test row, then shift it by the
# predicted number of hours to obtain the arrival time.
result['timestamp'] = test_df['timestamp']
result['ETA'] = [
    get_time(stamp, hours)
    for stamp, hours in zip(result['timestamp'], result['label'])
]

In [22]:
# Load the raw test-set records from CSV; the predicted ETA is merged onto them below.
test_data = pd.read_csv("data/B_testData0626.csv") 
def get_data(data, mode='train'):
    """Parse the time columns of ``data`` in place and return the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``timestamp`` column; in ``'test'`` mode also an
        ``onboardDate`` column.
    mode : {'train', 'test'}
        In ``'test'`` mode the original ``timestamp`` values are first copied
        to ``temp_timestamp`` and ``onboardDate`` is converted to datetime.
        In both modes ``timestamp`` is converted to datetime.

    Returns
    -------
    pandas.DataFrame
        The input frame itself (mutated), not a copy.

    Raises
    ------
    AssertionError
        If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    assert mode=='train' or mode=='test'
    if mode == 'test':
        # Preserve the unparsed timestamp values before they are overwritten.
        data['temp_timestamp'] = data['timestamp']
        data['onboardDate'] = pd.to_datetime(data['onboardDate'])

    # `infer_datetime_format` is omitted: it is deprecated since pandas 2.0
    # and does not change the parsed values.
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    return data

# Parse the time columns, then order the raw records by order id and time.
test_data = get_data(test_data, mode='test')
test_data = test_data.sort_values(['loadingOrder', 'timestamp']).reset_index(drop=True)

# Attach the predicted arrival time of each order to all of its records.
test_data = test_data.merge(result[['loadingOrder', 'ETA']], on='loadingOrder', how='left')

# Single definition of the timestamp layout used for every formatted column.
out_format = '%Y/%m/%d  %H:%M:%S'
test_data['ETA'] = test_data['ETA'].apply(lambda x: x.strftime(out_format))
test_data.drop(['direction','TRANSPORT_TRACE'],axis=1,inplace=True)
test_data['onboardDate'] = test_data['onboardDate'].apply(lambda x: x.strftime(out_format))
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0; the standard
# library class it aliased is used directly.
test_data['creatDate'] = datetime.datetime.now().strftime(out_format)
# Restore the original, unparsed timestamp values saved by get_data.
test_data['timestamp'] = test_data['temp_timestamp']
# Arrange the columns in submission order.
# NOTE(review): this rebinds `result`, so the cell cannot be re-run without
# first re-running the cells that build the prediction frame.
result = test_data[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]



In [23]:
# Preview the submission frame assembled in the previous cell.
result

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,AE223035353902,2019-07-03T21:16:48.000Z,120.093858,22.581320,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 03:43:57,2020/06/27 08:10:30
1,AE223035353902,2019-07-03T21:34:48.000Z,120.035707,22.617522,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 03:43:57,2020/06/27 08:10:30
2,AE223035353902,2019-07-03T21:51:18.000Z,119.981800,22.658465,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 03:43:57,2020/06/27 08:10:30
3,AE223035353902,2019-07-03T21:54:18.000Z,119.970845,22.668688,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 03:43:57,2020/06/27 08:10:30
4,AE223035353902,2019-07-03T22:11:08.000Z,119.953628,22.756897,OIEQNT,C2075927370,2019/07/02 04:12:48,2019/07/26 03:43:57,2020/06/27 08:10:30
...,...,...,...,...,...,...,...,...,...
34707,ZZ524449869421,2020-03-17T04:02:38.000Z,103.776707,1.252897,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/01 04:40:59,2020/06/27 08:10:30
34708,ZZ524449869421,2020-03-17T04:03:18.000Z,103.776312,1.253418,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/01 04:40:59,2020/06/27 08:10:30
34709,ZZ524449869421,2020-03-17T04:05:18.000Z,103.775175,1.254865,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/01 04:40:59,2020/06/27 08:10:30
34710,ZZ524449869421,2020-03-17T04:05:58.000Z,103.774803,1.255285,BHSOUA,P2595193878,2020/03/13 06:07:28,2020/04/01 04:40:59,2020/06/27 08:10:30


In [24]:
# Write the submission frame to CSV without the index column.
result.to_csv('data/B_lgb_10fold.csv', index=False)