In [48]:
import os
import requests
cred_url = os.environ["QCLOUD_CONTAINER_INSTANCE_CREDENTIALS_URL"]
r = requests.get(cred_url)
secretId = r.json()["TmpSecretId"]
secretKey = r.json()["TmpSecretKey"]
token = r.json()["Token"]

In [49]:
import os
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
from ti.utils import get_temporary_secret_and_token

#### 指定本地文件路径，可根据需要修改。
local_file = "/home/tione/notebook/data/Round2_trainb.zip"

#### 用户的存储桶，修改为存放所需数据文件的存储桶，存储桶获取参考腾讯云对象存储
bucket="hc1-1258788535"

#### 用户的数据，修改为对应的数据文件路径，文件路径获取参考腾讯云对象存储
data_key="Round2_trainb.zip"

#### 获取用户临时密钥
secret_id, secret_key, token = get_temporary_secret_and_token()
config = CosConfig(Region=os.environ.get('REGION'), SecretId=secret_id, SecretKey=secret_key, Token=token, Scheme='https')
client = CosS3Client(config)

###  获取文件到本地
response = client.get_object(
    Bucket=bucket,
    Key=data_key,
)
response['Body'].get_stream_to_file(local_file)

In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm 
import datetime, time 
from utils import distance, haversine, standard, pad_seq, geohash_encode
from scipy.stats import skew, kurtosis
from zipfile import ZipFile
from collections import Counter 
from sklearn.metrics import mean_squared_error as mse 
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# !pip install lightgbm
!free -m

              total        used        free      shared  buff/cache   available
Mem:          96166       21336       44895          18       29934       74666
Swap:             0           0           0


# 分析港口

In [26]:
port = pd.read_csv("data/port.csv")
print(port.shape)
port.head()
port['geo_hash4'] = port[["LONGITUDE", "LATITUDE"]].apply(lambda x: geohash_encode(x[0], x[1], precision=4), axis=1)

port['lonlat'] = port[["LONGITUDE", "LATITUDE"]].apply(lambda x: (x[0], x[1]), axis=1)
porthash2lonlat = dict(zip(port['geo_hash4'], port['lonlat']))
print(len(porthash2lonlat))

(2456, 10)
1226


In [1]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:          96166       18361       62098          18       15706       77641
Swap:             0           0           0


In [5]:
%%time

myzip=ZipFile('data/Round2_trainb.zip')
f=myzip.open('Round2_trainb.csv')
train_df=pd.read_csv(f)
print(train_df.shape)
print(train_df.head())
f.close()
myzip.close()

train_df.drop_duplicates(['loadingOrder', 'label'], inplace=True)

del_cols = ['arrive_time_index', 'arrive_time', 'direction',  'begin_year', 'geo_hash3', 'selcte_index'] # 'or_last_index',
train_df.drop(del_cols, axis=1, inplace=True) 
print(train_df.columns)

columns = ['TRANSPORT_TRACE', 'begin_port_name', 'begin_port_position',
        'begin_port_position_hash3', 'begin_port_position_hash4',
        'begin_port_position_hash5', 'carrierName', 'end_port_name',
        'end_port_position', 'end_port_position_hash3',
        'end_port_position_hash4', 'end_port_position_hash5', 'loadingOrder',
        'vesselMMSI', 'test_index', 'geo_hash3', 'geo_hash5',
        'geo_hash4', 'speed', 'longitude',
        'latitude', 'timestamp', 'direction', 'label']
train_df.columns = columns

test_df = pd.read_csv("data/Round2_testb.csv")
print(train_df.shape, test_df.shape)

(223603, 30)
  TRANSPORT_TRACE               arrive_time  arrive_time_index  \
0     CNYTN-MYTPP  2020-03-27T18:32:53.000Z               2072   
1     CNYTN-MYTPP  2020-03-27T18:32:53.000Z               2072   
2     CNYTN-MYTPP  2020-03-27T18:32:53.000Z               2072   
3     CNSHK-MYTPP  2020-04-13T07:57:43.000Z                653   
4     CNSHK-MYTPP  2020-04-13T07:57:43.000Z                653   

  begin_port_name           begin_port_position begin_port_position_hash3  \
0           CNYTN            114.275347 22.5777                       ws1   
1           CNYTN            114.275347 22.5777                       ws1   
2           CNYTN            114.275347 22.5777                       ws1   
3           CNSHK  113.86305800000001 22.559462                       ws0   
4           CNSHK  113.86305800000001 22.559462                       ws0   

  begin_port_position_hash4 begin_port_position_hash5 carrierName  \
0                      ws12                     ws122     

In [6]:
train_df = train_df[train_df['label'] > 0].reset_index(drop=True)
print(train_df.shape)

(216358, 24)


In [7]:
test_df.head(1)

Unnamed: 0,loadingOrder,timestamp,direction,speed,TRANSPORT_TRACE,carrierName,vesselMMSI,longitude,latitude,geo_hash5,geo_hash4,geo_hash3,begin_port_name,end_port_name,begin_port_position,end_port_position,begin_port_position_hash4,begin_port_position_hash5,begin_port_position_hash3,end_port_position_hash4,end_port_position_hash5,end_port_position_hash3,right_index,ans
0,AB674675500650,2020-06-01T00:01:50.000Z 2020-06-01T00:04:48.0...,19400.0 19600.0 19900.0 21200.0 19700.0 16900....,0 0 0 1 3 4 5 4 3 2 2 2 2 2 2 2 2 2 2 2 1 1 2 ...,CNSHK-HKHKG-TWKHH-CNNBG-CNSHA-CNTAO-KRPUS-MXZL...,RWHZVZ,V2180946969,-79.91805 -79.918133 -79.918217 -79.9183829999...,9.257433 9.2571 9.25685 9.256467 9.2557 9.2545...,d1xht d1xht d1xht d1xht d1xht d1xht d1xht d1xh...,d1xh d1xh d1xh d1xh d1xh d1xh d1xh d1xh d1xh d...,d1x d1x d1x d1x d1x d1x d1x d1x d1x d1x d1x d1...,CNSHK,DOCAU,113.86305800000001 22.559462,-69.638318 18.425821,ws0b,ws0br,ws0,d7q2,d7q2z,d7q,-1,


In [None]:
%%time

def get_stats(values):   # 统计特征
    xlen = len(values)
    vmax = np.max(values)
    vmin = np.min(values)
    return [np.mean(values), 
            vmax, vmin, np.std(values), 
            # vmax-vmin,
            # np.sum(values), 
            # np.median(values), 
            xlen-np.count_nonzero(values), 
            values[0], values[-1],
            #np.count_nonzero(values)/xlen, 
            #np.percentile(values, 25), np.percentile(values, 75), 
            len(set(values)), 
            #len(set(values))/xlen,
            skew(values), 
            # kurtosis(values), 
            #sorted(Counter(values).items(), key=lambda x: x[1], reverse=True)[0][0]
           ]

def feature_engineering(df, is_train=True):
    numerical_fea = []
    categorical_fea = []
    if is_train:
        label = []
    for idx in tqdm(range(df.shape[0])):
        line = df.iloc[idx]

        timestamp = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.000Z") for x in line['timestamp'].split(' ')]
        
        # 时间特征
        timestamp_hours = np.array([(x-timestamp[0]).total_seconds()/3600.0 for x in timestamp])  # 减去最初的时间
        timestamp_diff1 = np.diff(timestamp_hours, prepend=timestamp_hours[0])    # 时间一阶差分
        # print(timestamp_diff1[:20])
        xlen = len(timestamp_hours)   # 长度

        begin_port = [float(x) for x in line['begin_port_position'].split(' ')]
        end_port = [float(x) for x in line['end_port_position'].split(' ')]

        # 原始的四个特征
        lon = [float(x) for x in line['longitude'].split(' ')]  # 经度
        lat  = [float(x) for x in line['latitude'].split(' ')]  # 纬度
        speed = [float(x) for x in line['speed'].split(' ')]
        direction = [float(x) for x in line['direction'].split(' ')]
        lon_diff1 = np.diff(lon, prepend=lon[0])
        lat_diff1 = np.diff(lat, prepend=lat[0])
        speed_diff1 = np.diff(speed, prepend=speed[0])
        direction_diff1 = np.diff(direction, prepend=direction[0])

        # 类别特征
        carrierName = line['carrierName']
        vesselMMSI = line['vesselMMSI']
        begin_hash3, begin_hash4, begin_hash5 = line['begin_port_position_hash3'], line['begin_port_position_hash4'], line['begin_port_position_hash5']
        end_hash3, end_hash4, end_hash5 = line['end_port_position_hash3'], line['end_port_position_hash4'], line['end_port_position_hash5']
        first_hash3, first_hash4, first_hash5 = line['geo_hash3'].split(' ')[0], line['geo_hash4'].split(' ')[0], line['geo_hash5'].split(' ')[0]
        last_hash3, last_hash4, last_hash5 = line['geo_hash3'].split(' ')[-1], line['geo_hash4'].split(' ')[-1], line['geo_hash5'].split(' ')[-1]
        first_year, first_month, first_day = timestamp[0].year, timestamp[0].month, timestamp[0].day
        last_year, last_month, last_day = timestamp[-1].year, timestamp[-1].month, timestamp[-1].day

        # 距离特征
        begin_distance = [distance(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)]  # 与起始距离
        begin_dis_cumsum = np.cumsum(begin_distance)   # 累积距离
        begin_dis_diff1 = np.diff(begin_distance, prepend=begin_distance[0])   # 一阶差分距离

        end_distance = [distance(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]    # 与终止距离
        end_dis_cumsum = np.cumsum(end_distance)    # 累积距离
        end_dis_diff1 = np.diff(end_distance, prepend=end_distance[0]) 

        begin_haversine = [haversine(begin_port[0], begin_port[1], lon[i], lat[i]) for i in range(xlen)] 
        begin_hav_cumsum = np.cumsum(begin_haversine)   # 累积距离
        begin_hav_diff1 = np.diff(begin_haversine, prepend=begin_haversine[0])   # 一阶差分距离

        end_haversine = [haversine(end_port[0], end_port[1], lon[i], lat[i]) for i in range(xlen)]
        end_hav_cumsum = np.cumsum(end_haversine)   # 累积距离
        end_hav_diff1 = np.diff(end_haversine, prepend=end_haversine[0])   # 一阶差分距离

        # 速度特征
        timestamp_diff1_tmp = timestamp_diff1.copy()
        timestamp_diff1_tmp[timestamp_diff1_tmp==0] = np.mean(timestamp_diff1_tmp)
        diff_speed = begin_hav_diff1 / (timestamp_diff1_tmp + 1e-8)    # 差分路程 / 差分时间 = 速度1
        
        timestamp_hours_tmp = timestamp_hours.copy()
        timestamp_hours_tmp[timestamp_hours_tmp == 0] = np.mean(timestamp_hours_tmp)
        cumsum_speed = begin_hav_cumsum / (timestamp_hours_tmp + 1e-8)  # 累积路程 / 累积时间 = 速度2

        # 简单数值特征
        speed_0_nums = (np.array(speed) == 0).sum()
        speed_0_rate = speed_0_nums / xlen 
        speed_0_time = timestamp_diff1[np.array(speed) == 0].sum()
        hash3_nunique = len(set(line['geo_hash3'].split(' ')))
        hash4_nunique = len(set(line['geo_hash4'].split(' ')))
        hash5_nunique = len(set(line['geo_hash5'].split(' ')))
        hash3_rate, hash4_rate, hash5_rate = hash3_nunique/xlen,  hash4_nunique/xlen, hash5_nunique/xlen 
        first_begin_dis, first_begin_hav = distance(begin_port[0], begin_port[1], lon[0], lat[0]), haversine(begin_port[0], begin_port[1], lon[0], lat[0])
        last_begin_dis, last_begin_hav = distance(begin_port[0], begin_port[1], lon[-1], lat[-1]), haversine(begin_port[0], begin_port[1], lon[-1], lat[-1])
        first_end_dis, first_end_hav = distance(end_port[0], end_port[1], lon[0], lat[0]), haversine(end_port[0], end_port[1], lon[0], lat[0])
        last_end_dis, last_end_hav = distance(end_port[0], end_port[1], lon[-1], lat[-1]), haversine(end_port[0], end_port[1], lon[-1], lat[-1])
        fbd_lbd, fbd_lbd_rate = last_begin_dis/first_begin_dis, last_begin_dis-first_begin_dis
        fbh_lbh, fbh_lbh_rate = last_begin_hav/first_begin_hav, last_begin_hav-first_begin_hav
        fed_led, fed_led_rate = first_end_dis/last_end_dis, first_end_dis-last_end_dis
        fed_leh, fed_leh_rate = first_end_hav/last_end_hav, first_end_hav-last_end_hav


        # 正式添加统计特征
        num_feas1 = get_stats(timestamp_hours) + get_stats(timestamp_diff1) + get_stats(lon) + get_stats(lat) + get_stats(speed) + get_stats(direction)
        num_feas2 = get_stats(begin_distance)+get_stats(begin_dis_cumsum)+get_stats(begin_dis_diff1)+get_stats(end_distance)+get_stats(end_dis_cumsum)+get_stats(end_dis_diff1)
        num_feas3 = get_stats(begin_haversine)+get_stats(begin_hav_cumsum)+get_stats(begin_hav_diff1)+get_stats(end_haversine)+get_stats(end_hav_cumsum)+get_stats(end_hav_diff1)
        num_feas4 = get_stats(diff_speed) + get_stats(cumsum_speed)
        num_feas5 = get_stats(lon_diff1) + get_stats(lat_diff1) + get_stats(speed_diff1) + get_stats(direction_diff1)
        num_feas6 = [xlen, speed_0_nums, speed_0_rate, speed_0_time, hash3_nunique, hash4_nunique, hash5_nunique, 
                     hash3_rate, hash4_rate, hash5_rate, first_begin_dis, first_begin_hav,  last_begin_dis, last_begin_hav,
                     first_end_dis, first_end_hav, last_end_dis, last_end_hav, fbd_lbd, fbd_lbd_rate, 
                     fbh_lbh, fbh_lbh_rate, fed_led, fed_led_rate, fed_leh, fed_leh_rate]
        # print(len(num_feas1+num_feas2+num_feas3))
        # break 

        # 添加类别型特征
        cat_feas1 = [carrierName, vesselMMSI, begin_hash3, begin_hash4, begin_hash5, 
                     end_hash3, end_hash4, end_hash5, first_hash3, first_hash4, first_hash5, 
                     last_hash3, last_hash4, last_hash5, first_year, first_month, first_day, 
                     last_year, last_month, last_day]
        
        numerical_fea.append(num_feas1 + num_feas2 + num_feas3 + num_feas4 + num_feas5 + num_feas6)
        categorical_fea.append(cat_feas1)
        if is_train:
            label.append(line['label'])
    
    numerical_fea = np.array(numerical_fea)
    categorical_fea = np.array(categorical_fea)
    
    # print(numerical_fea.shape)
    num_cols = ['num_{}'.format(str(i)) for i in range(numerical_fea.shape[1])]
    cat_cols = ['cat_{}'.format(str(i)) for i in range(categorical_fea.shape[1])]
    num_df = pd.DataFrame(numerical_fea)
    print(num_df.shape)
    cat_df = pd.DataFrame(categorical_fea)
    print(cat_df.shape)
    fea_df = pd.concat([num_df, cat_df], axis=1)
    
    fea_df.columns = num_cols + cat_cols
    if is_train:
        fea_df['label'] = label
    print(fea_df.shape)
    return fea_df

test_fea = feature_engineering(test_df, is_train=False)
train_fea = feature_engineering(train_df)

100%|██████████| 239/239 [00:03<00:00, 61.04it/s]
  0%|          | 5/216358 [00:00<1:23:57, 42.95it/s]

(239, 242)
(239, 20)
(239, 262)


 10%|▉         | 21609/216358 [20:50<2:54:21, 18.62it/s] 

In [10]:
train_fea.to_pickle('data/train_fea_0809.pkl')
test_fea.to_pickle('data/test_fea_0809.pkl')

In [14]:
train_fea = pd.read_pickle("data/train_fea_0808.pkl")
test_fea = pd.read_pickle("data/test_fea_0808.pkl")

In [9]:
print(train_fea.shape, test_fea.shape)

(216358, 263) (239, 262)


In [11]:
test_fea.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,num_84,num_85,num_86,num_87,num_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,num_120,num_121,num_122,num_123,num_124,num_125,num_126,num_127,num_128,num_129,num_130,num_131,num_132,num_133,num_134,num_135,num_136,num_137,num_138,num_139,num_140,num_141,num_142,num_143,num_144,num_145,num_146,num_147,num_148,num_149,num_150,num_151,num_152,num_153,num_154,num_155,num_156,num_157,num_158,num_159,num_160,num_161,num_162,num_163,num_164,num_165,num_166,num_167,num_168,num_169,num_170,num_171,num_172,num_173,num_174,num_175,num_176,num_177,num_178,num_179,num_180,num_181,num_182,num_183,num_184,num_185,num_186,num_187,num_188,num_189,num_190,num_191,num_192,num_193,num_194,num_195,num_196,num_197,num_198,num_199,num_200,num_201,num_202,num_203,num_204,num_205,num_206,num_207,num_208,num_209,num_210,num_211,num_212,num_213,num_214,num_215,num_216,num_217,num_218,num_219,num_220,num_221,num_222,num_223,num_224,num_225,num_226,num_227,num_228,num_229,num_230,num_231,num_232,num_233,num_234,num_235,num_236,num_237,num_238,num_239,num_240,num_241,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19
0,10.72261,23.934722,0.0,7.181429,1.0,0.0,23.934722,440.0,0.216247,0.054397,0.105,0.0,0.025264,1.0,0.0,0.050278,135.0,0.718651,-79.893851,-79.885233,-79.919283,0.012923,0.0,-79.91805,-79.88525,143.0,-0.999897,9.348437,9.423383,9.252117,0.042348,0.0,9.257433,9.369717,150.0,-1.291235,1.765909,27.0,0.0,4.553942,332.0,0.0,0.0,25.0,3.201669,18094.318182,35900.0,0.0,10832.801178,8.0,19400.0,8600.0,144.0,0.104324,194.206778,194.238007,194.196732,0.015226,0.0,194.237128,194.196746,284.0,0.995461,42824.139686,85450.982434,194.237128,24667.116224,0.0,194.237128,85450.982434,440.0,-7.5e-05,-9.2e-05,0.000535,-0.0066,0.000514,17.0,0.0,-0.000125,421.0,-8.188465,13.695821,13.777164,13.655107,0.036076,0.0,13.774332,13.675256,284.0,1.275499,3023.387763,6026.161254,13.774332,1738.478208,0.0,13.774332,6026.161254,440.0,-0.002031,-0.000225,0.003715,-0.007548,0.000977,17.0,0.0,-0.000165,421.0,-2.745482,16163.510623,16172.501792,16155.100026,3.936102,0.0,16171.814312,16161.693447,284.0,1.2228,3564384.0,7111945.0,16171.814312,2052911.0,0.0,16171.814312,7111945.0,440.0,-0.000141,-0.023002,0.663457,-0.904947,0.150719,17.0,0.0,-0.007047,421.0,-1.068595,1497.031756,1506.255694,1492.270906,4.088396,0.0,1505.930056,1494.716519,284.0,1.280985,330486.012899,658693.97245,1505.930056,190020.826964,0.0,1505.930056,658693.97245,440.0,-0.002087,-0.025485,0.427914,-0.843205,0.111882,17.0,0.0,-0.018266,421.0,-2.635384,-0.686151,19.903695,-25.123987,4.504278,17.0,0.0,-0.140166,424.0,-0.972426,368053.796788,654141.290174,1508.197498,60733.324033,0.0,1508.197498,297139.219126,440.0,-0.033306,7.5e-05,0.006533,-0.000733,0.000518,44.0,0.0,0.000117,118.0,7.998333,0.000255,0.0089,-0.005433,0.001428,50.0,0.0,0.000117,146.0,1.546439,0.0,4.0,-3.0,0.625227,370.0,0.0,0.0,8.0,0.278968,-24.545455,35900.0,-35900.0,7847.656701,230.0,0.0,-1900.0,121.0,-0.236285,440.0,332.0,0.754545,20.261667,1.0,2.0,6.0,0.002273,0.004545,0.013636,194.237128,16171.814312,194.196746,16161.693447,13.774332,1505.930056,13.675256,1494.716519,0.999792,-0.040382,0.999374,-10.120864,1.007245,0.099076,1.007502,11.213537,RWHZVZ,V2180946969,ws0,ws0b,ws0br,d7q,d7q2,d7q2z,d1x,d1xh,d1xht,d1x,d1xj,d1xjq,2020,6,1,2020,6,1
1,0.358265,0.702222,0.0,0.205828,1.0,0.0,0.702222,94.0,-0.037904,0.00747,0.010278,0.0,0.001292,1.0,0.0,0.005833,41.0,-1.803981,113.894465,113.926597,113.88412,0.010867,0.0,113.89433,113.926597,94.0,1.419546,22.407347,22.452023,22.354962,0.030195,0.0,22.452023,22.354962,94.0,-0.259351,17.595745,25.0,7.0,6.311622,0.0,7.0,25.0,18.0,-0.018381,16865.957447,21480.0,12030.0,2791.08738,0.0,20660.0,12070.0,72.0,0.006104,0.155453,0.214144,0.111898,0.031456,0.0,0.111898,0.214144,94.0,0.363342,6.540801,14.612557,0.111898,4.197823,0.0,0.111898,14.612557,94.0,0.247656,0.001088,0.00238,0.0,0.000465,1.0,0.0,0.000954,94.0,0.419242,29.376293,29.419641,29.333121,0.027243,0.0,29.419641,29.333121,94.0,-0.130563,1396.109033,2761.37151,29.419641,797.095714,0.0,29.419641,2761.37151,94.0,-0.001128,-0.00092,0.0,-0.002104,0.000401,1.0,0.0,-0.000339,94.0,-0.497516,17.231886,23.658232,12.37105,3.477287,0.0,12.37105,23.658232,94.0,0.348904,725.2403,1619.797,12.37105,465.5742,0.0,12.37105,1619.797,94.0,0.246979,0.120076,0.263512,0.0,0.050677,1.0,0.0,0.101094,94.0,0.464102,3262.611007,3267.420064,3257.78913,3.030895,0.0,3267.420064,3257.78913,94.0,-0.135792,155055.804787,306685.434666,3267.420064,88527.699246,0.0,3267.420064,306685.434666,94.0,-0.00113,-0.102457,0.0,-0.234041,0.044408,1.0,0.0,-0.038797,94.0,-0.515504,15.990113,30.664511,0.0,6.512682,1.0,0.0,17.330429,94.0,0.266353,1946.606065,3432.233394,34.530408,309.037932,0.0,34.530408,2306.673337,94.0,-1.20961,0.000343,0.001878,-0.000706,0.000638,1.0,0.0,0.001167,94.0,0.434733,-0.001033,0.0,-0.002312,0.00042,1.0,0.0,-0.000638,91.0,-0.695649,0.191489,2.0,-1.0,0.570117,63.0,0.0,0.0,4.0,0.332291,-91.382979,660.0,-1160.0,254.74059,9.0,0.0,-20.0,43.0,-1.909555,94.0,0.0,0.0,0.0,2.0,2.0,4.0,0.021277,0.021277,0.042553,0.111898,12.37105,0.214144,23.658232,29.419641,3267.420064,29.333121,3257.78913,1.913745,0.102246,1.912387,11.287182,1.00295,0.08652,1.002956,9.630934,JONOCD,R7840559895,ws0,ws0b,ws0br,qqu,qquj,qquj0,web,webz,webzx,wec,wecp,wecp0,2020,6,9,2020,6,9
2,9.803589,20.427778,0.0,5.261265,1.0,0.0,20.427778,306.0,-0.022338,0.066757,0.574722,0.0,0.072407,1.0,0.0,0.256389,186.0,3.249324,13.896018,14.570038,11.66483,0.828014,0.0,14.534068,11.66483,244.0,-0.992935,36.087965,37.218727,35.725175,0.400028,0.0,35.821433,37.218727,245.0,1.108505,16.277778,28.0,0.0,12.876663,112.0,0.0,27.0,17.0,-0.42774,28926.176471,35370.0,3060.0,4025.971984,0.0,31200.0,30400.0,109.0,-3.761678,101.284839,103.649785,100.578063,0.874081,0.0,100.616694,103.649785,279.0,1.002806,15478.825847,30993.160773,100.616694,8932.160894,0.0,100.616694,30993.160773,306.0,0.009149,0.009912,0.159164,-0.005392,0.019755,27.0,0.0,0.070072,280.0,4.101643,10.858909,11.546217,8.612002,0.843952,0.0,11.508492,8.612002,279.0,-0.985157,1733.058668,3322.826061,11.508492,973.861121,0.0,11.508492,3322.826061,306.0,-0.079486,-0.009466,0.005203,-0.15424,0.01871,27.0,0.0,-0.063724,280.0,-4.125601,9421.974619,9550.384867,9380.046259,49.345269,0.0,9382.035434,9550.384867,279.0,0.918797,1442333.0,2883124.0,9382.035434,831515.1,0.0,9382.035434,2883124.0,306.0,0.00576,0.550162,8.803384,-0.374157,1.052213,27.0,0.0,3.525229,280.0,4.103781,972.49907,1036.260691,765.174972,78.448591,0.0,1032.653617,765.174972,279.0,-0.982911,155426.133864,297584.71547,1032.653617,87272.616117,0.0,1032.653617,297584.71547,306.0,-0.082315,-0.874113,0.511655,-14.418503,1.728868,27.0,0.0,-5.741955,280.0,-4.141719,8.796109,27.599794,-11.611756,8.087473,27.0,0.0,13.74954,280.0,-0.181011,143571.361619,276092.587068,957.000045,19214.996722,0.0,957.000045,141137.438609,306.0,0.299836,-0.009377,0.004848,-0.150337,0.018581,28.0,0.0,-0.065617,266.0,-4.106137,0.004566,0.075977,-0.007877,0.010047,29.0,0.0,0.036222,265.0,3.819327,0.088235,6.0,-2.0,0.612019,272.0,0.0,0.0,7.0,4.311584,-2.614379,32310.0,-20260.0,2591.617778,62.0,0.0,0.0,81.0,3.815569,306.0,112.0,0.366013,8.436944,4.0,17.0,72.0,0.013072,0.055556,0.235294,100.616694,9382.035434,103.649785,9550.384867,11.508492,1032.653617,8.612002,765.174972,1.030145,3.03309,1.017944,168.349433,1.336332,2.89649,1.349565,267.478645,RWHZVZ,A2177695011,ws1,ws12,ws122,snd,snd1,snd1j,sq6,sq67,sq67f,sq8,sq87,sq879,2020,6,2,2020,6,2
3,2.404613,4.941667,0.0,1.398167,1.0,0.0,4.941667,145.0,0.009879,0.03408,0.196944,0.0,0.014967,1.0,0.0,0.009444,91.0,8.72619,113.79824,113.854055,113.68783,0.051871,0.0,113.68783,113.80856,145.0,-0.841254,22.164712,22.63745,21.53508,0.313853,0.0,22.63745,21.53508,145.0,-0.141255,24.634483,33.0,1.0,7.91926,0.0,3.0,32.0,28.0,-1.182566,17132.413793,19290.0,9450.0,1566.270455,0.0,9680.0,18600.0,97.0,-1.638844,0.551205,1.169467,0.066372,0.314178,0.0,0.066372,1.169467,145.0,0.085563,27.120221,79.924729,0.066372,23.768712,0.0,0.066372,79.924729,145.0,0.621534,0.007608,0.055759,0.0,0.004904,1.0,0.0,0.001825,145.0,6.391174,58.818721,58.882166,58.686507,0.064511,0.0,58.686507,58.862145,145.0,-0.792791,4291.313337,8528.714583,58.686507,2462.484546,0.0,58.686507,8528.714583,145.0,0.001241,0.001211,0.005203,-0.002354,0.001549,1.0,0.0,-7.9e-05,145.0,0.292256,60.948718,129.876539,7.231513,34.928109,0.0,7.231513,129.876539,145.0,0.093566,2991.034,8837.564,7.231513,2627.202,0.0,7.231513,8837.564,145.0,0.625358,0.845828,6.221585,0.0,0.54748,1.0,0.0,0.20362,145.0,6.394921,5948.081593,5967.972416,5923.202916,13.798688,0.0,5923.202916,5967.972416,145.0,-0.409023,433641.291701,862471.830963,5923.202916,249016.56479,0.0,5923.202916,862471.830963,145.0,0.002757,0.308755,1.170522,0.0,0.178395,1.0,0.0,0.03803,145.0,0.833392,24.409942,34.786758,0.0,8.033871,1.0,0.0,21.559793,145.0,-1.206023,957.087937,1811.512449,3.00735,494.923087,0.0,3.00735,1788.377222,145.0,0.067854,0.000833,0.00475,-0.005717,0.001663,1.0,0.0,-0.00019,145.0,-0.147149,-0.007603,0.0,-0.057106,0.005056,1.0,0.0,-0.001867,144.0,-6.359164,0.2,3.0,-3.0,0.979796,81.0,0.0,0.0,7.0,-0.100302,61.517241,4260.0,-1600.0,483.149625,13.0,0.0,50.0,60.0,4.461524,145.0,0.0,0.0,0.0,2.0,7.0,28.0,0.013793,0.048276,0.193103,0.066372,7.231513,1.169467,129.876539,58.686507,5923.202916,58.862145,5967.972416,17.61999,1.103096,17.9598,122.645026,0.997016,-0.175638,0.992498,-44.7695,OYSCFP,G5028862507,ws0,ws0c,ws0c4,thr,thrn,thrnk,ws0,ws0b,ws0bg,web,webf,webft,2020,6,10,2020,6,10
4,12.274598,23.771944,0.0,6.918796,1.0,0.0,23.771944,49.0,-0.212228,0.485142,2.300556,0.0,0.480963,1.0,0.0,0.392222,47.0,2.235785,67.616718,71.262752,64.17128,2.052243,0.0,71.262752,64.17128,49.0,0.193172,9.03349,10.118587,7.87896,0.646035,0.0,7.87896,10.118587,49.0,-0.181295,34.102041,36.0,32.0,1.111074,0.0,35.0,34.0,5.0,0.244472,28784.897959,29010.0,28520.0,93.396728,0.0,28900.0,28680.0,22.0,-0.306269,48.198759,51.225464,45.058886,1.784534,0.0,45.058886,51.225464,49.0,-0.171382,1179.90677,2361.73919,45.058886,683.33727,0.0,45.058886,2361.73919,49.0,0.042449,0.125849,0.577369,0.0,0.123531,1.0,0.0,0.106314,49.0,2.20913,41.733547,45.440852,38.247125,2.081295,0.0,45.440852,38.247125,49.0,0.195914,1072.55505,2044.943793,45.440852,588.341931,0.0,45.440852,2044.943793,49.0,-0.05772,-0.146811,0.0,-0.676715,0.143414,1.0,0.0,-0.117871,49.0,-2.19321,5148.823129,5453.197608,4831.724906,179.921187,0.0,4831.724906,5453.197608,49.0,-0.177422,126194.1,252292.3,4831.724906,72988.98,0.0,4831.724906,252292.3,49.0,0.040062,12.683116,57.800175,0.0,12.445912,1.0,0.0,10.680706,49.0,2.197657,4405.845174,4798.473026,4037.648656,220.12051,0.0,4798.473026,4037.648656,49.0,0.198512,113235.94311,215886.413508,4798.473026,62110.184009,0.0,4798.473026,215886.413508,49.0,-0.057819,-15.527028,0.0,-71.692036,15.159525,1.0,0.0,-12.381196,49.0,-2.192439,25.676946,28.198641,0.0,3.855823,1.0,0.0,27.231262,49.0,-5.994629,10883.932359,29024.217656,393.636117,3629.068713,0.0,393.636117,10613.028893,49.0,2.494455,-0.144724,0.0,-0.66432,0.141584,1.0,0.0,-0.118347,49.0,-2.186283,0.045707,0.21288,0.0,0.044497,1.0,0.0,0.0348,49.0,2.264668,-0.020408,1.0,-2.0,0.622365,33.0,0.0,0.0,4.0,-0.494197,-4.489796,190.0,-380.0,105.270521,13.0,0.0,-10.0,25.0,-1.210841,49.0,0.0,0.0,0.0,7.0,24.0,46.0,0.142857,0.489796,0.938776,45.058886,4831.724906,51.225464,5453.197608,45.440852,4798.473026,38.247125,4037.648656,1.136856,6.166578,1.128623,621.472702,1.188085,7.193727,1.188433,760.824371,JCMFTA,T6889553248,ws0,ws0b,ws0br,stw,stwc,stwcz,t96,t96s,t96sv,t3v,t3v9,t3v9s,2020,6,1,2020,6,1


# 读取特征文件

In [4]:
# train_weight = train_fea['num_336'].values

In [17]:
print(train_fea.shape, test_fea.shape)
train_fea.head()

(199770, 263) (200, 262)


Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,num_84,num_85,num_86,num_87,num_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,num_120,num_121,num_122,num_123,num_124,num_125,num_126,num_127,num_128,num_129,num_130,num_131,num_132,num_133,num_134,num_135,num_136,num_137,num_138,num_139,num_140,num_141,num_142,num_143,num_144,num_145,num_146,num_147,num_148,num_149,num_150,num_151,num_152,num_153,num_154,num_155,num_156,num_157,num_158,num_159,num_160,num_161,num_162,num_163,num_164,num_165,num_166,num_167,num_168,num_169,num_170,num_171,num_172,num_173,num_174,num_175,num_176,num_177,num_178,num_179,num_180,num_181,num_182,num_183,num_184,num_185,num_186,num_187,num_188,num_189,num_190,num_191,num_192,num_193,num_194,num_195,num_196,num_197,num_198,num_199,num_200,num_201,num_202,num_203,num_204,num_205,num_206,num_207,num_208,num_209,num_210,num_211,num_212,num_213,num_214,num_215,num_216,num_217,num_218,num_219,num_220,num_221,num_222,num_223,num_224,num_225,num_226,num_227,num_228,num_229,num_230,num_231,num_232,num_233,num_234,num_235,num_236,num_237,num_238,num_239,num_240,num_241,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19,label
0,206.538927,318.100556,0.0,93.03552,1.0,0.0,318.100556,1847.0,-0.733368,0.172226,2.933333,0.0,0.321751,1.0,0.0,0.001667,833.0,2.866693,-51.166123,179.952767,-179.98355,143.08026,0.0,138.928747,-127.299227,1844.0,0.784626,-33.758206,2.374827,-46.395387,14.911222,0.0,2.374827,-46.395387,1826.0,0.980042,31.603682,37.0,26.0,2.332174,0.0,34.0,27.0,12.0,-0.475608,11305.793178,19800.0,7700.0,1398.682188,0.0,13000.0,9600.0,60.0,0.398637,203.189281,298.723384,31.879491,99.450596,0.0,31.879491,251.054388,1844.0,-0.784826,147665.488799,375290.602166,31.879491,121624.648293,0.0,31.879491,375290.602166,1847.0,0.323246,0.118665,214.74373,-0.794415,4.996663,4.0,0.0,-0.00044,1844.0,42.907552,125.630606,251.61721,57.238339,73.014296,0.0,213.527561,57.238339,1844.0,0.752005,149502.913337,232039.730195,213.527561,64987.140676,0.0,213.527561,232039.730195,1847.0,-0.755377,-0.084618,0.745881,-143.228561,3.333266,4.0,0.0,-0.000448,1844.0,-42.878719,10404.613694,13957.134113,3482.118511,3077.853066,0.0,3482.118511,13957.134113,1844.0,-0.756182,8058294.0,19217320.0,3482.118511,5818938.0,0.0,3482.118511,19217320.0,1847.0,0.296303,5.671367,102.216582,-0.149663,10.692107,4.0,0.0,0.036097,1844.0,2.949795,8444.361325,15351.176915,4878.157334,3086.034476,0.0,15351.176915,4878.157334,1844.0,0.749771,9362943.0,15596740.0,15351.176915,4334593.0,0.0,15351.176915,15596740.0,1847.0,-0.453485,-5.670287,0.150478,-99.930123,10.672759,4.0,0.0,-0.037197,1844.0,-2.920544,32.041307,213.592486,-193.256755,9.596072,4.0,0.0,21.658313,1844.0,-1.881558,35426.031477,2089297.0,16.859381,55223.70522,0.0,16.859381,60412.725335,1847.0,29.434367,-0.144141,0.86695,-359.936317,8.37462,4.0,0.0,0.00048,1291.0,-42.933134,-0.026405,0.0094,-0.490184,0.054604,20.0,0.0,-8e-05,1096.0,-3.262603,-0.00379,2.0,-3.0,0.442083,1512.0,0.0,0.0,6.0,-0.243737,-1.840823,7700.0,-7400.0,314.901297,716.0,0.0,100.0,26.0,0.63801,1847.0,0.0,0.0,0.0,97.0,378.0,694.0,0.052518,0.204656,0.375744,31.879491,3482.118511,251.054388,13957.134113,213.527561,15351.176915,57.238339,4878.157334,7.875106,219.174897,4.008231,10475.015601,3.730499,156.289222,3.146921,10473.01958,UQCRKD,N7236499100,wec,wecp,wecpt,66j,66jh,66jh1,x06,x06v,x06v9,1pv,1pv2,1pv2p,2019,1,28,2019,2,10,167.392778
1,170.110398,293.521944,0.0,71.628551,1.0,0.0,293.521944,724.0,-0.571283,0.405417,5.674167,0.0,0.751363,1.0,0.0,1.635,443.0,2.700581,-102.471751,179.676183,-179.740133,97.729186,0.0,165.115233,-97.176467,720.0,2.336092,-41.72094,-18.001283,-46.593317,7.328319,0.0,-18.001283,-43.284767,715.0,1.928291,30.404696,35.0,25.0,2.400053,0.0,33.0,30.0,11.0,-0.142094,10179.834254,13500.0,6800.0,1638.118775,0.0,12800.0,7200.0,63.0,0.27448,238.71626,298.521677,65.056724,60.446826,0.0,65.056724,221.295349,720.0,-2.121755,82986.092836,172830.572514,65.056724,53501.781049,0.0,65.056724,172830.572514,724.0,-0.011048,0.215799,214.90152,-1.914119,7.988685,5.0,0.0,-0.592228,720.0,26.806578,85.011885,251.343758,27.515473,58.340445,0.0,237.234789,27.515473,720.0,2.067558,40157.323188,61548.60455,237.234789,15867.405699,0.0,237.234789,61548.60455,724.0,-0.584037,-0.289668,0.981813,-143.204665,5.321507,5.0,0.0,-0.587619,720.0,-26.754055,12653.670083,16319.006227,7135.378414,2239.836737,0.0,7135.378414,16319.006227,720.0,-0.741001,4141401.0,9161257.0,7135.378414,2692428.0,0.0,7135.378414,9161257.0,724.0,0.16534,12.684569,153.957645,-0.048671,23.661146,5.0,0.0,48.473088,720.0,2.681615,6187.945128,11740.238638,2492.828915,2253.338761,0.0,11740.238638,2492.828915,720.0,0.742307,2691288.0,4480072.0,11740.238638,1262906.0,0.0,11740.238638,4480072.0,724.0,-0.373399,-12.772665,0.049169,-156.620879,23.852596,5.0,0.0,-49.180526,720.0,-2.694652,31.14152,220.001052,-8.760782,11.963403,5.0,0.0,29.647149,720.0,12.169299,21450.702233,32122.58,41.945575,7831.704426,0.0,41.945575,31211.489679,724.0,-0.172332,-0.362281,2.038567,-359.416316,13.35572,5.0,0.0,0.575916,645.0,-26.837201,-0.034922,0.29115,-0.79135,0.108053,10.0,0.0,0.1424,543.0,-3.671442,-0.004144,2.0,-4.0,0.576937,544.0,0.0,0.0,7.0,-0.604197,-7.734807,1100.0,-2000.0,218.788367,198.0,0.0,100.0,21.0,-1.828884,724.0,0.0,0.0,0.0,83.0,215.0,321.0,0.114641,0.296961,0.44337,65.056724,7135.378414,221.295349,16319.006227,237.234789,11740.238638,27.515473,2492.828915,3.401575,156.238625,2.287055,9183.627813,8.621868,209.719316,4.709605,9247.409723,UQCRKD,N7236499100,wec,wecp,wecpt,66j,66jh,66jh1,rsv,rsv3,rsv3t,3b6,3b6c,3b6cu,2019,2,1,2019,2,13,84.638333
2,162.051948,266.650278,0.0,85.272552,1.0,0.0,266.650278,816.0,-0.381424,0.326777,4.655,0.0,0.538075,1.0,0.0,0.003333,514.0,2.675703,-24.730741,179.952767,-179.9084,156.047524,0.0,142.2955,-141.230133,815.0,0.338082,-29.843383,-0.612717,-45.296917,14.576984,0.0,-0.612717,-45.296917,813.0,0.529841,32.579657,37.0,27.0,1.821314,0.0,36.0,28.0,11.0,-1.251328,11812.990196,14500.0,7600.0,1143.964503,0.0,13100.0,9800.0,43.0,-0.309728,186.544283,298.661132,36.376513,108.540329,0.0,36.376513,264.19562,815.0,-0.343606,54826.0816,152220.135083,36.376513,48734.136932,0.0,36.376513,152220.135083,816.0,0.516705,0.27919,214.681479,-1.005943,7.512701,2.0,0.0,-0.001235,815.0,28.484474,146.152051,251.61721,70.659936,73.578635,0.0,216.380715,70.659936,815.0,0.322317,74845.125219,119260.073351,216.380715,35049.807028,0.0,216.380715,119260.073351,816.0,-0.628484,-0.178579,1.170934,-143.305657,5.01662,2.0,0.0,-0.001266,815.0,-28.460311,9429.921328,12877.009723,3982.556328,2840.980219,0.0,3982.556328,12877.009723,815.0,-0.384077,3195238.0,7694816.0,3982.556328,2294518.0,0.0,3982.556328,7694816.0,816.0,0.34377,10.900065,153.881138,0.0,18.022761,2.0,0.0,0.102086,815.0,2.706692,9423.654783,14856.176755,5962.405171,2848.338205,0.0,14856.176755,5962.405171,815.0,0.377368,4508277.0,7689702.0,14856.176755,2190519.0,0.0,14856.176755,7689702.0,816.0,-0.384416,-10.89923,0.0,-155.420096,18.046769,2.0,0.0,-0.101273,815.0,-2.713605,32.886924,64.31735,0.0,3.619924,2.0,0.0,30.625816,815.0,-1.219252,17132.687839,28857.33,24.5758,5407.791194,0.0,24.5758,28857.332787,816.0,0.393818,-0.347458,1.190566,-359.861167,12.594209,2.0,0.0,0.0013,740.0,-28.506408,-0.05476,0.010383,-0.921483,0.096161,4.0,0.0,-8.4e-05,655.0,-3.138399,-0.009804,2.0,-2.0,0.528503,618.0,0.0,0.0,5.0,-0.010756,-4.044118,4700.0,-2600.0,264.84516,303.0,0.0,-300.0,21.0,5.394173,816.0,0.0,0.0,0.0,77.0,260.0,403.0,0.094363,0.318627,0.493873,36.376513,3982.556328,264.19562,12877.009723,216.380715,14856.176755,70.659936,5962.405171,7.262808,227.819107,3.233353,8894.453395,3.062283,145.720778,2.491642,8893.771584,UQCRKD,N7236499100,wec,wecp,wecpt,66j,66jh,66jh1,rpv,rpvh,rpvhw,0zg,0zgw,0zgw6,2019,1,28,2019,2,8,204.404167
3,70.902656,144.918333,0.0,51.283734,1.0,0.0,144.918333,334.0,-0.25745,0.433887,6.235278,0.0,0.896737,1.0,0.0,0.589722,265.0,3.754633,130.745032,147.863317,114.0921,12.023999,0.0,114.118883,147.863317,333.0,-0.273704,9.184683,22.335083,-5.99235,10.016222,0.0,22.335083,-5.99235,333.0,0.167835,32.269461,37.0,0.0,6.511103,8.0,5.0,32.0,22.0,-3.935059,12156.023952,24600.0,-1.0,2807.400234,0.0,12300.0,12400.0,46.0,-1.260296,21.25286,44.096922,0.087351,15.62451,0.0,0.087351,44.096922,333.0,-0.230916,2092.403234,7098.455324,0.087351,2217.78909,0.0,0.087351,7098.455324,334.0,0.739043,0.131765,2.295772,0.0,0.272541,2.0,0.0,0.176584,333.0,3.910759,207.11176,221.165323,193.804279,9.62724,0.0,193.837201,221.165323,333.0,-0.246315,33786.367708,69175.327735,193.837201,20006.527795,0.0,193.837201,69175.327735,334.0,0.058227,0.081821,1.549632,-0.016738,0.178834,2.0,0.0,0.131397,333.0,4.528994,2319.154987,4841.909855,9.67289,1713.52519,0.0,9.67289,4841.909855,333.0,-0.21749,227401.3,774597.8,9.67289,241780.1,0.0,9.67289,774597.8,334.0,0.745367,14.467775,244.925264,0.0,29.552964,2.0,0.0,19.509682,333.0,3.796736,16478.663203,18698.048822,13999.77019,1664.643577,0.0,18698.048822,13999.77019,333.0,0.195144,2916742.0,5503874.0,18698.048822,1586674.0,0.0,18698.048822,5503874.0,334.0,-0.129795,-14.066702,0.0,-229.08108,28.246885,2.0,0.0,-19.091057,333.0,-3.651397,32.762431,291.902142,0.0,15.423404,2.0,0.0,33.082833,333.0,14.023691,2181.708181,5345.064,0.136425,1599.138243,0.0,0.136425,5345.063994,334.0,0.491493,0.101031,1.951366,-0.015384,0.221146,2.0,0.0,0.144984,321.0,4.452994,-0.084813,0.0,-1.239,0.166554,2.0,0.0,-0.101983,317.0,-3.435398,0.080838,34.0,-34.0,3.788897,241.0,0.0,0.0,15.0,-0.162601,0.299401,13501.0,-13601.0,1670.122996,144.0,0.0,0.0,35.0,0.423018,334.0,8.0,0.023952,19.049722,40.0,114.0,164.0,0.11976,0.341317,0.491018,0.087351,9.67289,44.096922,4841.909855,193.837201,18698.048822,221.165323,13999.77019,504.82469,44.009571,500.564978,4832.236965,0.876436,-27.328123,1.335597,4698.278632,UQCRKD,N7236499100,wec,wecp,wecpt,66j,66jh,66jh1,wec,wecp,wecph,rqc,rqcj,rqcju,2019,1,23,2019,1,29,446.615278
4,73.166574,144.328611,0.0,51.206909,1.0,0.0,144.328611,334.0,-0.310828,0.432122,10.885,0.0,0.959703,1.0,0.0,0.586944,271.0,5.801128,131.284477,147.718333,114.0921,12.001479,0.0,114.118883,147.718333,333.0,-0.328701,8.745597,22.335083,-5.890367,10.022861,0.0,22.335083,-5.890367,333.0,0.217813,32.149701,37.0,0.0,6.88366,9.0,5.0,32.0,22.0,-3.712233,12276.38024,23500.0,-1.0,2890.911565,0.0,12300.0,12400.0,45.0,-1.416598,21.948692,43.920338,0.087351,15.610196,0.0,0.087351,43.920338,333.0,-0.283712,2209.36778,7330.863009,0.087351,2296.609215,0.0,0.087351,7330.863009,334.0,0.708698,0.131236,2.999444,-1.1e-05,0.290935,2.0,0.0,0.17255,333.0,5.45866,207.549596,221.033927,193.804279,9.61704,0.0,193.837201,221.033927,333.0,-0.300822,33860.331895,69321.565123,193.837201,20059.396425,0.0,193.837201,69321.565123,334.0,0.057536,0.081427,2.288879,-0.020865,0.19972,2.0,0.0,0.123828,333.0,6.800438,2395.465436,4822.400173,9.67289,1712.543543,0.0,9.67289,4822.400173,333.0,-0.269822,240157.1,800085.5,9.67289,250434.4,0.0,9.67289,800085.5,334.0,0.71504,14.409363,318.997712,-0.001366,31.361577,2.0,0.0,19.087907,333.0,5.266762,16404.694894,18698.048822,14018.861248,1664.603178,0.0,18698.048822,14018.861248,333.0,0.246642,2904499.0,5479168.0,18698.048822,1577640.0,0.0,18698.048822,5479168.0,334.0,-0.130005,-14.009544,0.001705,-286.070573,29.349314,2.0,0.0,-18.759219,333.0,-4.813733,32.018343,49.837824,-4.918998,6.619874,2.0,0.0,32.520806,333.0,-2.5311,2242.804179,5555.123,0.132204,1660.551499,0.0,0.132204,5543.498614,334.0,0.478923,0.100597,2.711926,-0.019134,0.243674,2.0,0.0,0.137716,323.0,6.463623,-0.084507,2.3e-05,-1.283496,0.165252,2.0,0.0,-0.10435,312.0,-3.775935,0.080838,34.0,-34.0,3.779403,239.0,0.0,0.0,17.0,-0.118584,0.299401,13501.0,-13601.0,1635.776852,142.0,0.0,-700.0,33.0,0.526997,334.0,9.0,0.026946,17.613333,40.0,119.0,170.0,0.11976,0.356287,0.508982,0.087351,9.67289,43.920338,4822.400173,193.837201,18698.048822,221.033927,14018.861248,502.803138,43.832987,498.548033,4812.727283,0.876957,-27.196726,1.333778,4679.187575,UQCRKD,N7236499100,wec,wecp,wecpt,66j,66jh,66jh1,wec,wecp,wecph,rqc,rqcn,rqcn3,2019,1,23,2019,1,29,447.205


In [24]:
test_fea.head()

Unnamed: 0,num_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,num_11,num_12,num_13,num_14,num_15,num_16,num_17,num_18,num_19,num_20,num_21,num_22,num_23,num_24,num_25,num_26,num_27,num_28,num_29,num_30,num_31,num_32,num_33,num_34,num_35,num_36,num_37,num_38,num_39,num_40,num_41,num_42,num_43,num_44,num_45,num_46,num_47,num_48,num_49,num_50,num_51,num_52,num_53,num_54,num_55,num_56,num_57,num_58,num_59,num_60,num_61,num_62,num_63,num_64,num_65,num_66,num_67,num_68,num_69,num_70,num_71,num_72,num_73,num_74,num_75,num_76,num_77,num_78,num_79,num_80,num_81,num_82,num_83,num_84,num_85,num_86,num_87,num_88,num_89,num_90,num_91,num_92,num_93,num_94,num_95,num_96,num_97,num_98,num_99,num_100,num_101,num_102,num_103,num_104,num_105,num_106,num_107,num_108,num_109,num_110,num_111,num_112,num_113,num_114,num_115,num_116,num_117,num_118,num_119,num_120,num_121,num_122,num_123,num_124,num_125,num_126,num_127,num_128,num_129,num_130,num_131,num_132,num_133,num_134,num_135,num_136,num_137,num_138,num_139,num_140,num_141,num_142,num_143,num_144,num_145,num_146,num_147,num_148,num_149,num_150,num_151,num_152,num_153,num_154,num_155,num_156,num_157,num_158,num_159,num_160,num_161,num_162,num_163,num_164,num_165,num_166,num_167,num_168,num_169,num_170,num_171,num_172,num_173,num_174,num_175,num_176,num_177,num_178,num_179,num_180,num_181,num_182,num_183,num_184,num_185,num_186,num_187,num_188,num_189,num_190,num_191,num_192,num_193,num_194,num_195,num_196,num_197,num_198,num_199,num_200,num_201,num_202,num_203,num_204,num_205,num_206,num_207,num_208,num_209,num_210,num_211,num_212,num_213,num_214,num_215,num_216,num_217,num_218,num_219,num_220,num_221,num_222,num_223,num_224,num_225,num_226,num_227,num_228,num_229,num_230,num_231,num_232,num_233,num_234,num_235,num_236,num_237,num_238,num_239,num_240,num_241,num_242,num_243,num_244,num_245,num_246,num_247,num_248,num_249,num_250,num_251,num_252,num_253,num_254,num_255,num_256,num_257,num_258,num_259,num_260,num_261,num_262,num_263,num_264,num_265,num_266,num_267,num_268,num_269,num_270,num_271,num_272,num_273,num_274,num_275,num_276,num_277,num_278,num_279,num_280,num_281,num_282,num_283,num_284,num_285,num_286,num_287,num_288,num_289,num_290,num_291,num_292,num_293,num_294,num_295,num_296,num_297,num_298,num_299,num_300,num_301,num_302,num_303,num_304,num_305,num_306,num_307,num_308,num_309,num_310,num_311,num_312,num_313,num_314,num_315,num_316,num_317,num_318,num_319,num_320,num_321,num_322,num_323,num_324,num_325,num_326,num_327,num_328,num_329,num_330,num_331,num_332,num_333,num_334,num_335,num_336,num_337,num_338,num_339,num_340,num_341,num_342,num_343,num_344,num_345,num_346,num_347,num_348,num_349,num_350,num_351,num_352,num_353,num_354,num_355,num_356,num_357,num_358,num_359,num_360,num_361,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11,cat_12,cat_13,cat_14,cat_15,cat_16,cat_17,cat_18,cat_19
0,124.754499,251.402778,0.0,82.613136,251.402778,41668.002778,141.875,333.0,0.997006,334.0,1.0,-0.333507,-1.216325,0.0,0.752703,21.722222,0.0,1.785728,21.722222,251.402778,0.1875,333.0,0.997006,248.0,0.742515,7.410639,72.379045,0.016667,138.465947,162.527333,113.895823,16.560653,48.63151,46247.626273,142.30872,334.0,1.0,333.0,0.997006,-0.368396,-1.257847,114.070615,1.960866,22.383463,-21.2233,14.342174,43.606763,654.929202,-0.92296,334.0,1.0,331.0,0.991018,0.231913,-1.259498,22.345372,26.08982,38.0,0.0,8.341747,38.0,8714.0,29.0,331.0,0.991018,33.0,0.098802,-1.710247,1.787737,30.0,13163.949102,31650.0,-1.0,2956.961801,31651.0,4396759.0,13350.0,334.0,1.0,231.0,0.691617,1.268326,12.547578,13350.0,32.097839,65.460995,0.179023,21.892607,65.281972,10720.678084,36.886038,334.0,1.0,333.0,0.997006,-0.312193,-1.261837,0.290722,3311.967506,10720.678084,0.179023,3334.005146,10720.499061,1106197.0,2284.175553,334.0,1.0,334.0,1.0,0.66816,-0.885724,0.179023,0.195455,6.724819,-0.00434,0.502955,6.729159,65.281972,0.046133,332.0,0.994012,333.0,0.997006,8.499139,94.541112,0.0,213.686645,234.467822,193.637283,13.717047,40.830539,71371.339341,216.347525,334.0,1.0,333.0,0.997006,-0.297913,-1.265109,193.796891,34497.018017,71371.339341,193.637283,20669.304686,71177.702057,11522000.0,33852.255328,334.0,1.0,334.0,1.0,0.077383,-1.205304,193.637283,0.122247,4.599295,-0.029198,0.335831,4.628493,40.830539,0.02885,332.0,0.994012,333.0,0.997006,9.31072,108.539626,0.0,3517.409263,7177.905455,19.857668,2406.077227,7158.047787,1174815.0,4038.787081,334.0,1.0,333.0,0.997006,-0.305262,-1.269829,31.101391,362159.398359,1174815.0,19.857668,365318.507689,1174795.0,120961200.0,248641.958635,334.0,1.0,334.0,1.0,0.671902,-0.881156,19.857668,21.431281,721.55636,-0.506574,54.272514,722.062934,7158.047787,4.853753,332.0,0.994012,333.0,0.997006,8.321586,91.606547,0.0,15318.148112,18712.243094,11684.2944,2353.566845,7027.948694,5116261.0,14828.871855,334.0,1.0,333.0,0.997006,0.278876,-1.275691,18702.190885,2788170.0,5116261.0,18712.243094,1471450.0,5097549.0,931248800.0,2899768.0,334.0,1.0,334.0,1.0,-0.190299,-1.146794,18712.243094,-21.041763,0.673109,-676.158948,51.226897,676.832058,-7027.948694,-4.931943,332.0,0.994012,333.0,0.997006,-7.948596,86.208942,0.0,25.002725,39.094542,-3.799302,10.054404,42.893844,8350.910107,28.988708,332.0,0.994012,333.0,0.997006,-1.381965,0.41684,0.0,2055.529914,4673.03784,0.159174,1431.518349,4672.878666,686547.0,1752.54161,334.0,1.0,334.0,1.0,0.316274,-1.302517,0.159174,0.145603,5.708295,-0.022887,0.414374,5.731182,48.63151,0.034294,332.0,0.994012,329.0,0.98503,9.427445,110.473057,0.0,-0.130559,0.006734,-3.575458,0.290997,3.582192,-43.606763,-0.030625,331.0,0.991018,331.0,0.991018,-6.723373,64.560055,0.0,0.0,30.0,-31.0,2.826309,61.0,0.0,0.0,130.0,0.389222,17.0,0.050898,0.102644,81.258444,0.0,-4.431138,29230.0,-11551.0,2263.030309,40781.0,-1480.0,-10.0,324.0,0.97006,135.0,0.404192,6.425104,90.601279,20.0,334.0,3.0,0.008982,24.008333,50.0,134.0,216.0,0.149701,0.401198,0.646707,0.179023,19.857668,65.460995,7177.905455,193.637283,18712.243094,234.467822,11684.2944,365.657136,65.281972,361.467687,7158.047787,0.825859,-40.830539,1.601487,7027.948694,JCMFTA,G9916514058,ws0,ws0b,ws0br,66j,66jh,66jh1,web,webz,webzr,rs5,rs5x,rs5x6,2019,6,23,2019,7,3
1,0.263158,0.555556,0.0,0.168705,0.555556,5.0,0.286111,18.0,0.947368,19.0,1.0,0.046544,-0.805578,0.0,0.02924,0.158333,0.0,0.042015,0.158333,0.555556,0.016667,18.0,0.947368,17.0,0.894737,2.387223,4.174924,0.013889,113.713763,113.737135,113.694225,0.013017,0.04291,2160.561505,113.715192,19.0,1.0,19.0,1.0,0.174725,-0.748582,113.694225,22.585272,22.630953,22.522195,0.033139,0.108758,429.120165,22.584013,19.0,1.0,19.0,1.0,-0.411982,-0.592195,22.630953,22.210526,27.0,13.0,4.640542,14.0,422.0,24.0,19.0,1.0,10.0,0.526316,-0.892495,-0.788214,24.0,15890.0,16260.0,15080.0,375.373498,1180.0,301910.0,16070.0,19.0,1.0,17.0,0.894737,-1.039568,-0.410269,16110.0,0.154714,0.183345,0.131322,0.016921,0.052024,2.939562,0.14989,19.0,1.0,19.0,1.0,0.421896,-1.035109,0.183345,1.636536,2.939562,0.183345,0.834417,2.756217,31.09418,1.671758,19.0,1.0,19.0,1.0,-0.120892,-1.17316,0.183345,-0.002738,0.0,-0.01842,0.004106,0.01842,-0.052024,-0.00162,18.0,0.947368,19.0,1.0,-2.98432,8.408745,0.0,23.518348,23.551386,23.471251,0.024461,0.080135,446.848621,23.517803,19.0,1.0,19.0,1.0,-0.463125,-0.552526,23.551386,235.311993,446.848621,23.551386,128.811067,423.297235,4470.928,235.361559,19.0,1.0,19.0,1.0,-0.001117,-1.206752,23.551386,-0.004218,0.0,-0.024811,0.006185,0.024811,-0.080135,-0.00215,18.0,0.947368,19.0,1.0,-2.449187,4.816301,0.0,15.978392,19.068419,13.580032,1.806638,5.488387,303.5894,15.425971,19.0,1.0,19.0,1.0,0.481268,-1.044424,19.068419,169.299039,303.5894,19.068419,86.041649,284.521,3216.682,173.066827,19.0,1.0,19.0,1.0,-0.12521,-1.169024,19.068419,-0.288862,0.0,-2.011541,0.442475,2.011541,-5.488387,-0.174648,18.0,0.947368,19.0,1.0,-3.108173,9.217077,0.0,2601.950964,2605.642565,2596.697901,2.730062,8.944665,49437.07,2601.887646,19.0,1.0,19.0,1.0,-0.460185,-0.554884,2605.642565,26033.85,49437.07,2605.642565,14250.99,46831.43,494643.2,26039.39,19.0,1.0,19.0,1.0,-0.001127,-1.206752,2605.642565,-0.470772,0.0,-2.76548,0.68999,2.76548,-8.944665,-0.239972,18.0,0.947368,19.0,1.0,-2.447013,4.799969,0.0,-9.817164,0.0,-13.685743,4.376354,13.685743,-186.526115,-11.51971,18.0,0.947368,19.0,1.0,1.396898,0.415583,0.0,818.125083,1949.802397,72.45999,495.165925,1877.342407,15544.38,639.838365,19.0,1.0,19.0,1.0,1.185155,0.183321,72.45999,0.002258,0.011475,0.0,0.00319,0.011475,0.04291,0.00115,18.0,0.947368,19.0,1.0,2.332416,3.902551,0.0,-0.005724,0.0,-0.032849,0.00832,0.032849,-0.108758,-0.00291,18.0,0.947368,19.0,1.0,-2.414104,4.550632,0.0,0.736842,6.0,0.0,1.331485,6.0,14.0,0.0,9.0,0.473684,3.0,0.157895,3.164748,9.899512,0.0,58.421053,540.0,-150.0,152.807521,690.0,1110.0,30.0,18.0,0.947368,18.0,0.947368,1.582344,2.847788,120.0,19.0,0.0,0.0,0.0,1.0,1.0,3.0,0.052632,0.052632,0.157895,0.183345,19.068419,0.131322,13.580032,23.551386,2605.642565,23.471251,2596.697901,0.716254,-0.052024,0.712174,-5.488387,1.003414,0.080135,1.003445,8.944665,OYSCFP,O8358265987,ws0,ws0b,ws0br,w21,w21x,w21xr,ws0,ws0b,ws0be,ws0,ws0b,ws0bh,2020,3,25,2020,3,25
2,90.203496,236.1,0.0,81.688298,236.1,8028.111111,71.069444,88.0,0.988764,89.0,1.0,0.415595,-1.293369,0.0,2.652809,20.15,0.0,3.666558,20.15,236.1,0.8,88.0,0.988764,87.0,0.977528,2.016864,4.842759,0.266667,83.676271,104.196312,43.50522,21.292861,60.691092,7447.188148,91.16672,89.0,1.0,89.0,1.0,-0.563946,-1.149071,104.196312,5.922134,14.363573,1.139347,4.658772,13.224226,527.069939,5.975087,89.0,1.0,89.0,1.0,0.438125,-1.279247,1.287627,24.629213,35.0,0.0,12.334643,35.0,2192.0,31.0,76.0,0.853933,17.0,0.191011,-1.163009,-0.247685,31.0,25863.146067,32700.0,840.0,6734.764345,31860.0,2301820.0,27630.0,89.0,1.0,81.0,0.910112,-2.833388,7.133491,28690.0,37.513283,71.320451,23.354758,16.371973,47.965693,3338.682207,28.178151,89.0,1.0,89.0,1.0,0.802822,-0.796754,23.354758,1299.590937,3338.682207,23.354758,921.159259,3315.327449,115663.6,1082.514386,89.0,1.0,89.0,1.0,0.565264,-0.784203,23.354758,0.53894,3.684584,-0.138532,0.844898,3.823116,47.965693,0.046807,88.0,0.988764,89.0,1.0,1.652686,1.927386,0.0,87.465183,107.882371,49.446294,20.970231,58.436077,7784.401318,94.00938,89.0,1.0,89.0,1.0,-0.524594,-1.224071,107.882371,4449.257354,7784.401318,107.882371,2315.542403,7676.518947,395983.9,4753.046861,89.0,1.0,89.0,1.0,-0.275493,-1.203217,107.882371,-0.656585,0.061475,-5.530352,1.005843,5.591827,-58.436077,-0.081691,88.0,0.988764,89.0,1.0,-2.066599,5.168943,0.0,4035.301876,7509.754686,2582.483147,1677.908827,4927.271539,359141.9,3060.486391,89.0,1.0,89.0,1.0,0.799187,-0.795169,2584.038495,141774.172918,359141.9,2584.038495,99215.5534,356557.8,12617900.0,119386.883174,89.0,1.0,89.0,1.0,0.536796,-0.817102,2584.038495,55.345126,405.121244,-23.957696,88.314034,429.07894,4925.716191,5.131031,88.0,0.988764,89.0,1.0,1.739547,2.467069,0.0,8668.419278,10658.60595,5038.529072,2036.726336,5620.076878,771489.3,9241.671275,89.0,1.0,89.0,1.0,-0.511027,-1.245246,10658.60595,439973.4,771489.3,10658.60595,229189.7,760830.7,39157630.0,469348.8,89.0,1.0,89.0,1.0,-0.271748,-1.202871,10658.60595,-63.146931,8.514042,-591.462892,100.239289,599.976934,-5620.076878,-8.165655,88.0,0.988764,89.0,1.0,-2.404546,7.631738,0.0,16.316123,34.712364,-3.203736,12.040157,37.9161,1452.134904,22.478142,88.0,0.988764,89.0,1.0,-0.311734,-1.47416,0.0,3939.172458,12957.009676,28.646767,3654.44868,12928.362909,350586.3,1594.67351,89.0,1.0,89.0,1.0,1.166375,-0.06133,28.646767,-0.681922,0.035412,-4.928093,1.025499,4.963505,-60.691092,-0.094608,88.0,0.988764,89.0,1.0,-1.766075,2.801754,0.0,0.126566,2.553037,-1.11816,0.409206,3.671197,11.264375,0.00019,88.0,0.988764,89.0,1.0,2.704092,13.941573,0.0,0.224719,14.0,-25.0,3.879513,39.0,20.0,0.0,47.0,0.52809,14.0,0.157303,-2.097137,21.367076,0.0,34.494382,28140.0,-30100.0,7746.033129,58240.0,3070.0,-10.0,87.0,0.977528,72.0,0.808989,0.165646,7.928698,-50.0,89.0,13.0,0.146067,12.369444,32.0,47.0,64.0,0.359551,0.52809,0.719101,23.354758,2584.038495,71.320451,7509.754686,107.882371,10658.60595,49.446294,5038.529072,3.053787,47.965693,2.906209,4925.716191,2.181809,58.436077,2.11542,5620.076878,RWHZVZ,J3697709469,wec,wecp,wecpt,spe,sper,sperp,w24,w24p,w24p7,sfn,sfnz,sfnzm,2019,4,1,2019,4,10
3,2.405103,4.749444,0.0,1.350957,4.749444,336.714444,2.398333,139.0,0.992857,140.0,1.0,-0.000815,-1.185854,0.0,0.033925,0.085278,0.0,0.008624,0.085278,4.749444,0.033333,139.0,0.992857,97.0,0.692857,1.651144,11.074902,0.033611,114.158114,114.3568,113.8841,0.160263,0.4727,15982.135992,114.18345,140.0,1.0,139.0,0.992857,-0.407987,-1.194876,114.338867,22.163189,22.4512,21.757183,0.195671,0.694017,3102.846506,22.185459,140.0,1.0,139.0,0.992857,-0.42442,-0.961499,22.34545,22.435714,31.0,8.0,5.21702,23.0,3141.0,25.0,140.0,1.0,22.0,0.157143,-1.094084,0.806499,25.0,15252.142857,21600.0,6200.0,3837.232408,15400.0,2135300.0,15700.0,140.0,1.0,73.0,0.521429,-0.398136,-0.660726,19400.0,0.498811,0.89776,0.112787,0.243411,0.784974,69.833597,0.492473,140.0,1.0,140.0,1.0,0.03865,-1.347019,0.112787,25.345909,69.833597,0.112787,20.715806,69.72081,3548.427,20.129016,140.0,1.0,140.0,1.0,0.568333,-0.936242,0.112787,0.005607,0.011382,0.0,0.001947,0.011382,0.784974,0.005405,139.0,0.992857,140.0,1.0,0.129312,0.053458,0.0,35.076764,35.236483,34.880098,0.112442,0.356385,4910.747019,35.096832,140.0,1.0,140.0,1.0,-0.4401,-0.918793,34.89803,2469.003964,4910.747019,34.89803,1418.466549,4875.848988,345660.6,2466.583283,140.0,1.0,140.0,1.0,0.003835,-1.201257,34.89803,0.001281,0.010874,-0.012834,0.004592,0.023708,0.179367,0.00124,139.0,0.992857,140.0,1.0,0.086541,-0.336835,0.0,53.987794,98.385768,12.468848,26.337129,85.91692,7558.291,53.054968,140.0,1.0,140.0,1.0,0.073324,-1.324678,12.468848,2743.594296,7558.291,12.468848,2234.951036,7545.822,384103.2,2182.515769,140.0,1.0,140.0,1.0,0.569864,-0.928382,12.468848,0.613692,1.357453,0.0,0.212925,1.357453,85.91692,0.60966,139.0,0.992857,140.0,1.0,0.052233,0.522885,0.0,3720.541199,3738.236078,3698.018495,13.025866,40.217583,520875.8,3723.450828,140.0,1.0,140.0,1.0,-0.446824,-1.013567,3699.511861,261826.1,520875.8,3699.511861,150449.9,517176.3,36655660.0,261541.3,140.0,1.0,140.0,1.0,0.004283,-1.201181,3699.511861,0.173748,1.158181,-1.205606,0.46372,2.363787,24.324741,0.185717,139.0,0.992857,140.0,1.0,0.085266,-0.406931,0.0,18.147133,29.686565,0.0,5.515235,29.686565,2540.598602,17.887586,139.0,0.992857,140.0,1.0,-0.447554,0.07078,0.0,925.765716,1591.405308,5.18433,383.592056,1586.220978,129607.2,909.985363,140.0,1.0,140.0,1.0,0.036072,-1.097673,5.18433,0.002652,0.011783,-0.00865,0.004182,0.020433,0.371267,0.002942,139.0,0.992857,135.0,0.964286,0.065088,-0.569894,0.004334,-0.004957,0.00455,-0.017916,0.003109,0.022466,-0.694017,-0.005709,139.0,0.992857,128.0,0.914286,0.1855,1.561972,-0.005833,0.107143,5.0,-6.0,1.80751,11.0,15.0,0.0,76.0,0.542857,12.0,0.085714,-0.138122,2.128949,0.0,-0.714286,4200.0,-3200.0,847.222051,7400.0,-100.0,0.0,108.0,0.771429,35.0,0.25,0.355876,7.273845,0.0,140.0,0.0,0.0,0.0,2.0,7.0,28.0,0.014286,0.05,0.2,0.112787,12.468848,0.89776,98.385768,34.89803,3699.511861,35.077397,3723.836602,7.959818,0.784974,7.890526,85.91692,0.994887,-0.179367,0.993468,-24.324741,OYSCFP,R7006912480,ws0,ws0b,ws0br,tf3,tf34,tf346,web,webz,webzx,wec,wec7,wec7b,2020,6,1,2020,6,1
4,6.433514,21.579444,0.0,6.209001,21.579444,7076.865833,4.0725,1099.0,0.999091,1100.0,1.0,1.156539,-0.056626,0.0,0.019618,0.220833,0.0,0.02993,0.220833,21.579444,0.007222,1099.0,0.999091,267.0,0.242727,4.047388,18.602322,0.006667,115.259583,118.345583,113.883967,1.497412,4.461616,126785.541396,114.636167,1100.0,1.0,1099.0,0.999091,1.088527,-0.391393,113.899117,22.51058,24.225433,22.091017,0.619247,2.134416,24761.637951,22.241484,1100.0,1.0,1042.0,0.947273,1.638274,1.356423,22.34535,25.831818,31.0,2.0,5.844336,29.0,28415.0,28.0,1100.0,1.0,30.0,0.027273,-2.4275,5.439051,29.0,10359.909091,35500.0,100.0,6025.037306,35400.0,11395900.0,8800.0,1100.0,1.0,179.0,0.162727,1.906299,4.975243,8700.0,1.522585,4.757987,0.104216,1.503368,4.653771,1674.843529,0.901873,1100.0,1.0,1100.0,1.0,1.145778,-0.219104,0.104216,414.881764,1674.843529,0.104216,438.730877,1674.739313,456369.9,257.776129,1100.0,1.0,1100.0,1.0,1.248261,0.567782,0.104216,0.004201,0.058609,-0.002032,0.007952,0.060641,4.621472,0.001643,1099.0,0.999091,1100.0,1.0,4.386792,21.15393,0.0,18.818394,19.999977,15.355725,1.539148,4.644252,20700.233945,19.56,1100.0,1.0,1100.0,1.0,-1.278111,0.019315,19.951021,10773.867415,20700.233945,19.951021,6082.37937,20680.282924,11851250.0,10924.322999,1100.0,1.0,1100.0,1.0,-0.071073,-1.232963,19.951021,-0.00417,0.001303,-0.062012,0.0087,0.063315,-4.587314,-0.001514,1099.0,0.999091,1100.0,1.0,-4.187029,19.814555,0.0,157.991027,490.783201,11.48617,153.9967,479.29703,173790.1,94.816735,1100.0,1.0,1100.0,1.0,1.148895,-0.202684,11.48617,43599.088827,173790.1,11.48617,45526.016043,173778.6,47959000.0,27596.866191,1100.0,1.0,1100.0,1.0,1.228749,0.525628,11.48617,0.43303,5.948814,-0.188911,0.811444,6.137725,476.33269,0.169518,1099.0,0.999091,1100.0,1.0,4.359112,20.918046,0.0,1932.458092,2045.151084,1576.557156,154.870745,468.593928,2125704.0,2009.836823,1100.0,1.0,1100.0,1.0,-1.32383,0.133919,2038.634608,1104835.0,2125704.0,2038.634608,624836.8,2123665.0,1215318000.0,1119408.0,1100.0,1.0,1100.0,1.0,-0.067686,-1.233872,2038.634608,-0.42007,0.174299,-6.21084,0.889543,6.38514,-462.077452,-0.141495,1099.0,0.999091,1100.0,1.0,-4.065661,18.903123,0.0,21.263809,34.291176,-5.071812,7.105219,39.362988,23390.189442,23.463462,1099.0,0.999091,1100.0,1.0,-1.782373,3.021821,0.0,5659.392651,9535.886838,1.785365,2204.631292,9534.101473,6225332.0,6108.614887,1100.0,1.0,1100.0,1.0,-0.244171,-1.004436,1.785365,0.003987,0.062133,-0.003383,0.008092,0.065516,4.386233,0.001866,1099.0,0.999091,572.0,0.52,4.598955,23.171088,0.001867,0.001603,0.029133,-0.004733,0.00434,0.033866,1.76375,6.6e-05,1087.0,0.988182,548.0,0.498182,2.777803,9.751525,0.0001,0.006364,2.0,-2.0,0.498138,4.0,7.0,0.0,201.0,0.182727,5.0,0.004545,-0.163355,5.290069,0.0,12.090909,35200.0,-35300.0,2279.762586,70500.0,13300.0,0.0,562.0,0.510909,34.0,0.030909,2.173448,213.287385,0.0,1100.0,0.0,0.0,0.0,7.0,27.0,141.0,0.006364,0.024545,0.128182,0.104216,11.48617,4.725688,487.818861,19.951021,2038.634608,15.363707,1576.557156,45.345108,4.621472,42.470105,476.33269,1.298581,4.587314,1.293093,462.077452,JONOCD,U9615709922,ws0,ws0b,ws0br,wvu,wvux,wvuxr,web,webz,webzz,wsk,wsk1,wsk1g,2020,6,16,2020,6,16


In [12]:
use_cols = []
# imp = pd.read_pickle("imp_2.pkl")
# del_cols = imp['feat'].values.tolist()[-30:]

for col in ['num_{}'.format(str(i)) for i in range(242)]:
    if np.std(train_fea[col].values) > 0.01:
        use_cols.append(col)
print(len(use_cols))

218


In [13]:
# label_encoder
from sklearn.preprocessing import LabelEncoder

test_fea['label'] = -1e8
data_fea = pd.concat([train_fea, test_fea])
data_fea = data_fea.reset_index(drop=True)

for col in ['cat_{}'.format(str(i)) for i in range(20)]:
    print(col)
    le = LabelEncoder()
    data_fea[col] = le.fit_transform(data_fea[col])
    data_fea[col] = data_fea[col].astype('category')

train_feas = data_fea[data_fea['label'] != -1e8].reset_index(drop=True)
test_feas = data_fea[data_fea['label'] == -1e8].reset_index(drop=True)
del test_feas['label']
print(train_feas.shape, test_feas.shape)

all_feas = use_cols + ['cat_{}'.format(str(i)) for i in range(20)]
cat_feas = ['cat_{}'.format(str(i)) for i in range(20)]
print(len(all_feas), len(cat_feas))

cat_0
cat_1
cat_2
cat_3
cat_4
cat_5
cat_6
cat_7
cat_8
cat_9
cat_10
cat_11
cat_12
cat_13
cat_14
cat_15
cat_16
cat_17
cat_18
cat_19
(216358, 263) (239, 262)
238 20


In [13]:
imp = pd.read_pickle("result/fea1.pkl")

imp['gain'] = imp[[f for f in imp.columns if 'gain' in f]].sum(axis=1)/5
imp['split'] = imp[[f for f in imp.columns if 'split' in f]].sum(axis=1)/5

imp = imp[['feat', 'gain', 'split']]
imp = imp.sort_values(by=['gain'], ascending=False)
imp = imp.reset_index(drop=True)
# imp.to_pickle("imp_2.pkl")
# imp

all_feas = imp['feat'].values[:-30].tolist()
print(len(all_feas))

182


In [20]:
print(len(all_feas))

238


In [14]:
# Log
file_name = datetime.date.today().strftime('%m%d')+"_{}.log".format("lgb_round2")
def write_log(w):
    t0 = datetime.datetime.now().strftime('%H:%M:%S')
    info = "{} : {}\n".format(t0, w)
    print(info)
    with open(file_name, 'a') as f:
        f.write(info)
        f.write("-"*80+"\n")

In [24]:
# del label

In [23]:
# label = train_feas['label'].values
# print(label[100:120])
# label[label < 0] = 0
# print(label[100:120])

In [None]:
%%time

def mse_score_eval(preds, valid):
    labels = valid.get_label()
    #weight = valid.data['num_336'].values
    scores = mse(y_true=labels, y_pred=preds)
    return 'mse_score', scores, False

def build_model(train, test, label, seed=2020, is_shuffle=True):
    imp = pd.DataFrame()  # 特征重要性
    imp['feat'] = all_feas

    train_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 10
    # Kfold
#     fold = GroupKFold(n_splits=n_splits)
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
#     groups = train_df['loadingOrder'].values
#     kf_way = fold.split(train, groups=groups)
    kf_way = fold.split(train)
    # params
    params = {
        'learning_rate': 0.02,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 128,
        'feature_fraction': 0.90,
        'bagging_fraction': 0.90,
        'bagging_freq': 5,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 45,
        'verbose': 1,
    }
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        write_log("fold {}".format(n_fold))
        train_x, train_y = train.iloc[train_idx], label[train_idx]
        valid_x, valid_y = train.iloc[valid_idx], label[valid_idx]
        # 数据加载
        n_train = lgb.Dataset(train_x, label=train_y, 
                             # free_raw_data=False
                             )
        n_valid = lgb.Dataset(valid_x, label=valid_y, 
                              #free_raw_data=False
                             )

        clf = lgb.train(
            params=params,
            train_set=n_train,
            categorical_feature=cat_feas,
            num_boost_round=15000,
            valid_sets=[n_train, n_valid],
            early_stopping_rounds=50,
            verbose_eval=400,
            feval=mse_score_eval
        )
        clf.save_model("data/lgb_save_model/lgb_model_fold_{}.txt".format(n_fold))
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        
        write_log("val_mse = {}".format(mse(valid_y, train_pred[valid_idx])))
        test_pred += clf.predict(test, num_iteration=clf.best_iteration)/fold.n_splits

        imp['gain' + str(n_fold + 1)] = clf.feature_importance(importance_type='gain')
        imp['split' + str(n_fold + 1)] = clf.feature_importance(importance_type='split')
        
    write_log("train mse: {}".format(mse(label, train_pred)))
    result = pd.DataFrame({
        'loadingOrder': test_df['loadingOrder'], 
        'label': test_pred,
    })
    return result, imp

result, imp = build_model(train_feas[all_feas], test_feas[all_feas], train_feas['label'].values, is_shuffle=True)

result.to_pickle("result/result1.pkl")
imp.to_pickle("result/fea1.pkl")
# 373.12617949583523  381.9896222463546   386.2948549012397

13:07:38 : fold 1



New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 186.504	training's mse_score: 186.504	valid_1's l2: 482.374	valid_1's mse_score: 482.374
[800]	training's l2: 80.2265	training's mse_score: 80.2265	valid_1's l2: 387.968	valid_1's mse_score: 387.968
[1200]	training's l2: 46.4664	training's mse_score: 46.4664	valid_1's l2: 359.216	valid_1's mse_score: 359.216
[1600]	training's l2: 30.183	training's mse_score: 30.183	valid_1's l2: 343.977	valid_1's mse_score: 343.977
[2000]	training's l2: 20.8254	training's mse_score: 20.8254	valid_1's l2: 335.03	valid_1's mse_score: 335.03
[2400]	training's l2: 14.9876	training's mse_score: 14.9876	valid_1's l2: 329.248	valid_1's mse_score: 329.248
[2800]	training's l2: 11.0898	training's mse_score: 11.0898	valid_1's l2: 325.369	valid_1's mse_score: 325.369
[3200]	training's l2: 8.54132	training's mse_score: 8.54132	valid_1's l2: 322.925	valid_1's mse_score: 322.925
[3600]	training's l2: 6.73894	training's mse_score: 6.738

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 182.335	training's mse_score: 182.335	valid_1's l2: 517.562	valid_1's mse_score: 517.562
[800]	training's l2: 78.5078	training's mse_score: 78.5078	valid_1's l2: 438.063	valid_1's mse_score: 438.063
[1200]	training's l2: 45.4944	training's mse_score: 45.4944	valid_1's l2: 415.71	valid_1's mse_score: 415.71
[1600]	training's l2: 29.5036	training's mse_score: 29.5036	valid_1's l2: 403.562	valid_1's mse_score: 403.562
[2000]	training's l2: 20.4228	training's mse_score: 20.4228	valid_1's l2: 396.499	valid_1's mse_score: 396.499
[2400]	training's l2: 14.881	training's mse_score: 14.881	valid_1's l2: 392.055	valid_1's mse_score: 392.055
[2800]	training's l2: 11.1638	training's mse_score: 11.1638	valid_1's l2: 389	valid_1's mse_score: 389
[3200]	training's l2: 8.56198	training's mse_score: 8.56198	valid_1's l2: 386.992	valid_1's mse_score: 386.992
[3600]	training's l2: 6.69854	training's mse_score: 6.69854	valid

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 185.709	training's mse_score: 185.709	valid_1's l2: 505.564	valid_1's mse_score: 505.564
[800]	training's l2: 80.1714	training's mse_score: 80.1714	valid_1's l2: 414.327	valid_1's mse_score: 414.327
[1200]	training's l2: 46.3423	training's mse_score: 46.3423	valid_1's l2: 387.358	valid_1's mse_score: 387.358
[1600]	training's l2: 30.094	training's mse_score: 30.094	valid_1's l2: 373.167	valid_1's mse_score: 373.167
[2000]	training's l2: 20.7803	training's mse_score: 20.7803	valid_1's l2: 364.455	valid_1's mse_score: 364.455
[2400]	training's l2: 14.9398	training's mse_score: 14.9398	valid_1's l2: 358.62	valid_1's mse_score: 358.62
[2800]	training's l2: 11.1013	training's mse_score: 11.1013	valid_1's l2: 354.645	valid_1's mse_score: 354.645
[3200]	training's l2: 8.54348	training's mse_score: 8.54348	valid_1's l2: 351.855	valid_1's mse_score: 351.855
[3600]	training's l2: 6.753	training's mse_score: 6.753	v

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 184.383	training's mse_score: 184.383	valid_1's l2: 420.38	valid_1's mse_score: 420.38
[800]	training's l2: 80.1731	training's mse_score: 80.1731	valid_1's l2: 345.788	valid_1's mse_score: 345.788
[1200]	training's l2: 46.5644	training's mse_score: 46.5644	valid_1's l2: 321.618	valid_1's mse_score: 321.618
[1600]	training's l2: 30.1377	training's mse_score: 30.1377	valid_1's l2: 309.65	valid_1's mse_score: 309.65
[2000]	training's l2: 20.7312	training's mse_score: 20.7312	valid_1's l2: 302.524	valid_1's mse_score: 302.524
[2400]	training's l2: 15.0302	training's mse_score: 15.0302	valid_1's l2: 297.186	valid_1's mse_score: 297.186
[2800]	training's l2: 11.2768	training's mse_score: 11.2768	valid_1's l2: 293.826	valid_1's mse_score: 293.826
[3200]	training's l2: 8.64511	training's mse_score: 8.64511	valid_1's l2: 291.358	valid_1's mse_score: 291.358
[3600]	training's l2: 6.78193	training's mse_score: 6.781

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 185.786	training's mse_score: 185.786	valid_1's l2: 467.579	valid_1's mse_score: 467.579
[800]	training's l2: 80.366	training's mse_score: 80.366	valid_1's l2: 379.917	valid_1's mse_score: 379.917
[1200]	training's l2: 46.6884	training's mse_score: 46.6884	valid_1's l2: 352.292	valid_1's mse_score: 352.292
[1600]	training's l2: 30.1513	training's mse_score: 30.1513	valid_1's l2: 338.311	valid_1's mse_score: 338.311
[2000]	training's l2: 20.7509	training's mse_score: 20.7509	valid_1's l2: 329.986	valid_1's mse_score: 329.986
[2400]	training's l2: 14.8302	training's mse_score: 14.8302	valid_1's l2: 324.285	valid_1's mse_score: 324.285
[2800]	training's l2: 11.1327	training's mse_score: 11.1327	valid_1's l2: 320.809	valid_1's mse_score: 320.809
[3200]	training's l2: 8.52407	training's mse_score: 8.52407	valid_1's l2: 318.217	valid_1's mse_score: 318.217
[3600]	training's l2: 6.7057	training's mse_score: 6.70

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 182.3	training's mse_score: 182.3	valid_1's l2: 483.787	valid_1's mse_score: 483.787
[800]	training's l2: 79.4612	training's mse_score: 79.4612	valid_1's l2: 403.701	valid_1's mse_score: 403.701
[1200]	training's l2: 46.4746	training's mse_score: 46.4746	valid_1's l2: 376.389	valid_1's mse_score: 376.389
[1600]	training's l2: 30.5377	training's mse_score: 30.5377	valid_1's l2: 363.342	valid_1's mse_score: 363.342
[2000]	training's l2: 21.2569	training's mse_score: 21.2569	valid_1's l2: 355.416	valid_1's mse_score: 355.416
[2400]	training's l2: 15.3021	training's mse_score: 15.3021	valid_1's l2: 350.309	valid_1's mse_score: 350.309
[2800]	training's l2: 11.3443	training's mse_score: 11.3443	valid_1's l2: 346.928	valid_1's mse_score: 346.928
[3200]	training's l2: 8.64707	training's mse_score: 8.64707	valid_1's l2: 344.349	valid_1's mse_score: 344.349
[3600]	training's l2: 6.74801	training's mse_score: 6.748

New categorical_feature is ['cat_0', 'cat_1', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15', 'cat_16', 'cat_17', 'cat_18', 'cat_19', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 50 rounds
[400]	training's l2: 185.685	training's mse_score: 185.685	valid_1's l2: 432.589	valid_1's mse_score: 432.589
[800]	training's l2: 80.2699	training's mse_score: 80.2699	valid_1's l2: 358.122	valid_1's mse_score: 358.122
[1200]	training's l2: 46.0735	training's mse_score: 46.0735	valid_1's l2: 332.257	valid_1's mse_score: 332.257
[1600]	training's l2: 30.0127	training's mse_score: 30.0127	valid_1's l2: 319.781	valid_1's mse_score: 319.781


In [24]:
result = pd.read_pickle("result/result1.pkl")

In [17]:
print(result.shape)
result

(239, 2)


Unnamed: 0,loadingOrder,label
0,AB674675500650,125.770627
1,AC710522369160,138.417223
2,AD584528667006,56.507135
3,AD852352572660,321.337189
4,AE378244933121,135.485514
5,AE563102255135,576.043202
6,AI823168582586,531.357053
7,AJ216944869611,494.341648
8,AM438554857021,482.605668
9,AO528516733940,90.210673


In [97]:
result

Unnamed: 0,loadingOrder,label
0,AC437723355280,416.230213
1,AR701843002140,89.97187
2,AT407433613767,315.436076
3,AV639724236766,260.494004
4,BA498843975994,65.139755
5,BB470439135271,705.406683
6,BE790761427541,376.34592
7,BE898272362291,524.9499
8,BE929753510083,499.783716
9,BK898009012594,444.691697


In [26]:
result.loc[result['loadingOrder'] == 'ZU783006492351', 'label'] = 215.39
# result.loc[result['loadingOrder'] == 'HB956271385453', 'label'] = 623

# 根据保存的模型直接预测

In [8]:
def mse_score_eval(preds, valid):
    labels = valid.get_label()
    scores = mse(y_true=labels, y_pred=preds)
    return 'mse_score', scores, False

def build_model(train, test, label, seed=2020, is_shuffle=True):
    imp = pd.DataFrame()  # 特征重要性
    imp['feat'] = all_feas

    train_pred = np.zeros((train.shape[0], ))
    test_pred = np.zeros((test.shape[0], ))
    n_splits = 10
    # Kfold
    # fold = GroupKFold(n_splits=n_splits)
    fold = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=seed)
    # groups = train_df['loadingOrder'].values
    kf_way = fold.split(train)
    # params
    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 128,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.75,
        'bagging_freq': 5,
        'seed': 8,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 24,
        'verbose': 1,
    }
    # train
    for n_fold, (train_idx, valid_idx) in enumerate(kf_way, start=1):
        print("fold ", n_fold)
        train_x, train_y = train.iloc[train_idx], label[train_idx]
        valid_x, valid_y = train.iloc[valid_idx], label[valid_idx]
        # 数据加载
        n_train = lgb.Dataset(train_x, label=train_y)
        n_valid = lgb.Dataset(valid_x, label=valid_y)

#         clf = lgb.train(
#             params=params,
#             train_set=n_train,
#             categorical_feature=cat_feas,
#             num_boost_round=10000,
#             valid_sets=[n_train, n_valid],
#             early_stopping_rounds=50,
#             verbose_eval=100,
#             feval=mse_score_eval
#         )
#         clf.save_model("data/lgb_model_fold_{}.txt".format(n_fold))
        clf = lgb.Booster(model_file="data/lgb_save_model/lgb_model_fold_{}.txt".format(n_fold))
        train_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        print("val_mse = {}".format(mse(valid_y, train_pred[valid_idx])))
        test_pred += clf.predict(test, num_iteration=clf.best_iteration)/fold.n_splits

        imp['gain' + str(n_fold + 1)] = clf.feature_importance(importance_type='gain')
        imp['split' + str(n_fold + 1)] = clf.feature_importance(importance_type='split')

    print("train mse: ", mse(label, train_pred))
    result = pd.DataFrame({
        'loadingOrder': test_df['loadingOrder'], 
        'label': test_pred,
    })
    return result, imp

result, imp = build_model(train_feas[all_feas], test_feas[all_feas], train_feas['label'].values, is_shuffle=True)

fold  1
val_mse = 896.7173560462967
fold  2
val_mse = 517.5055837664042
fold  3
val_mse = 690.3456844017744
fold  4
val_mse = 680.1628377551775
fold  5
val_mse = 654.7095421786684
fold  6
val_mse = 831.7647606373238
fold  7
val_mse = 1160.7401751905188
fold  8
val_mse = 771.1870089634364
fold  9
val_mse = 871.6475767371815
fold  10
val_mse = 612.6248600797542
train mse:  768.7405385756534


In [123]:
imp['gain'] = imp[[f for f in imp.columns if 'gain' in f]].sum(axis=1)/5
imp['split'] = imp[[f for f in imp.columns if 'split' in f]].sum(axis=1)/5

imp = imp[['feat', 'gain', 'split']]
imp = imp.sort_values(by=['gain'], ascending=False)
imp = imp.reset_index(drop=True)
# imp.to_pickle("imp_2.pkl")
imp

Unnamed: 0,feat,gain,split
0,num_353,35609800000.0,787.0
1,num_212,9941364000.0,1136.6
2,num_352,5077498000.0,725.8
3,cat_5,1816445000.0,3354.2
4,num_128,1718637000.0,1003.0
5,cat_6,952439300.0,4173.0
6,cat_1,903429100.0,28995.2
7,cat_7,354628200.0,7499.2
8,cat_11,245592700.0,15651.4
9,num_127,145671300.0,734.2


In [27]:
train_feas['label'].describe([0.01, 0.08])

count    130220.000000
mean        198.930673
std         198.848518
min       -1109.135556
1%          -44.315000
8%            0.544311
50%         121.855833
max        1388.678333
Name: label, dtype: float64

In [31]:
test_df.head(1)

Unnamed: 0,loadingOrder,timestamp,direction,speed,TRANSPORT_TRACE,carrierName,vesselMMSI,longitude,latitude,geo_hash5,geo_hash4,geo_hash3,begin_port_name,end_port_name,begin_port_position,end_port_position,begin_port_position_hash4,begin_port_position_hash5,begin_port_position_hash3,end_port_position_hash4,end_port_position_hash5,end_port_position_hash3,right_index,ans
0,AC437723355280,2019-06-23T02:53:18.000Z 2019-06-23T03:01:28.0...,16060 11910 8620 9110 9010 9010 9030 8960 9080...,26 27 23 23 23 23 23 23 23 20 17 17 18 19 20 2...,CNSHK-CLVAP,JCMFTA,G9916514058,113.895823 113.917878 113.97771200000001 113.9...,22.383463 22.360128 22.345253 22.34531 22.3453...,webzr wecp0 wecp1 wecp1 wecp1 wecp1 wecp1 wecp...,webz wecp wecp wecp wecp wecp wecp wecp wecp w...,web wec wec wec wec wec wec wec wec wec wec we...,CNSHK,CLVAP,113.86305800000001 22.559462,-71.642993 -33.030843,ws0b,ws0br,ws0,66jh,66jh1,66j,-1,


In [66]:
result = pd.read_pickle("result/result1.pkl")

In [67]:
test_df = pd.read_csv("data/Round2_test_802.csv")
test_df2 = pd.read_csv("data/Round2_test_717.csv")

In [68]:
test_df[test_df['loadingOrder']=="BR663094574600"]

Unnamed: 0,loadingOrder,timestamp,direction,speed,TRANSPORT_TRACE,carrierName,vesselMMSI,longitude,latitude,geo_hash5,geo_hash4,geo_hash3,begin_port_name,end_port_name,begin_port_position,end_port_position,begin_port_position_hash4,begin_port_position_hash5,begin_port_position_hash3,end_port_position_hash4,end_port_position_hash5,end_port_position_hash3,right_index,ans
13,BR663094574600,2020-06-01T00:02:56.000Z,33380,3,CNSHK-MYTPP-MUPLU-ZADUR,RWHZVZ,C5703488476,31.238667,-29.73064,kdwny,kdwn,kdw,CNSHK,ZADUR,113.86305800000001 22.559462,31.050079999999998 -29.868304,ws0b,ws0br,ws0,kdwn,kdwn4,kdw,0,2020-06-01T00:07:59.000Z


In [69]:
test_df2[test_df2['loadingOrder']=="BR663094574600"] 

Unnamed: 0,loadingOrder,timestamp,direction,speed,TRANSPORT_TRACE,carrierName,vesselMMSI,longitude,latitude,geo_hash5,geo_hash4,geo_hash3,begin_port_name,end_port_name,begin_port_position,end_port_position,begin_port_position_hash4,begin_port_position_hash5,begin_port_position_hash3,end_port_position_hash4,end_port_position_hash5,end_port_position_hash3
14,BR663094574600,2020-06-01T00:02:56.000Z 2020-06-01T00:07:59.0...,33380 14950 15090 15390 15620 15070 34030 3446...,3 2 0 3 0 7 1 2 1 2 1 2 3 0 1 2 6 0 4 1 0 4 4 ...,CNSHK-MYTPP-MUPLU-ZADUR,RWHZVZ,C5703488476,31.238667 31.23872 31.23868 31.238675 31.23858...,-29.73064 -29.730666999999997 -29.730573 -29.7...,kdwny kdwny kdwny kdwny kdwny kdwny kdwny kdwn...,kdwn kdwn kdwn kdwn kdwn kdwn kdwn kdwn kdwn k...,kdw kdw kdw kdw kdw kdw kdw kdw kdw kdw kdw kd...,CNSHK,ZADUR,113.86305800000001 22.559462,31.050079999999998 -29.868304,ws0b,ws0br,ws0,kdwn,kdwn4,kdw


In [18]:
# train mse:  256.9597223109227
# train mse:  256.2317758079061

sub_FORMAT = "%Y/%m%d  %H:%M:%S"
UTC_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

def get_time(x, y):
    x = x.split(' ')[-1]
    x = datetime.datetime.strptime(x, UTC_FORMAT)
    day = y // 24 
    seconds = int((y - (y//24)*24)*3600)
    res = x + datetime.timedelta(days=day, seconds=seconds)
    str_res = res.strftime(sub_FORMAT)
    return res 

# result = pd.DataFrame({
#         'loadingOrder': test_loadingOrder, 
#         'label': test_pred,
#     })

result['timestamp'] = test_df['timestamp']
result['ETA'] = list(map(lambda x, y: get_time(x, y), result['timestamp'], result['label']))

In [19]:
result.shape 

(239, 4)

In [20]:
test_data = pd.read_csv("data/Btest0711_ALL.csv") 
def get_data(data, mode='train'):
    
    assert mode=='train' or mode=='test'
#     if mode=='train':
#         data['vesselNextportETA'] = pd.to_datetime(data['vesselNextportETA'], infer_datetime_format=True) # 转换时间
    if mode=='test':
        data['temp_timestamp'] = data['timestamp']
        data['onboardDate'] = pd.to_datetime(data['onboardDate'], infer_datetime_format=True)
    
    data['timestamp'] = pd.to_datetime(data['timestamp'], infer_datetime_format=True)
    return data

test_data = get_data(test_data, mode='test')

test_data.sort_values(['loadingOrder', 'timestamp'], inplace=True)
test_data = test_data.reset_index(drop=True)
print(test_data.shape)

test_data = test_data.merge(result[['loadingOrder', 'ETA']], on='loadingOrder', how='left')
print(test_data.shape)

test_data['ETA'] = test_data['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
test_data.drop(['direction','TRANSPORT_TRACE'],axis=1,inplace=True)
test_data['onboardDate'] = test_data['onboardDate'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))
test_data['creatDate'] = pd.datetime.now().strftime('%Y/%m/%d  %H:%M:%S')
test_data['timestamp'] = test_data['temp_timestamp']
# 整理columns顺序
result = test_data[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]

(57071, 11)
(57071, 12)




In [21]:
result.to_csv('result/round2B_lgb_hc.csv', index=False)

In [23]:
tmp[['loadingOrder', 'onboardDate', 'ETA']].to_csv("result/res.csv", index=None)

In [168]:
# 合并结果
res1 = pd.read_csv("result/round2_lgb_0805_lr001.csv")
res1.sort_values(['loadingOrder', 'timestamp'], inplace=True)
res1 = res1.reset_index(drop=True)
res2 = pd.read_csv("result/round2_804_newdata_more_label.csv")
res2.sort_values(['loadingOrder', 'timestamp'], inplace=True)
res2 = res2.reset_index(drop=True)
print(res1.shape, res2.shape)

res1['ETA'] = pd.to_datetime(res1['ETA'], infer_datetime_format=True)
res2['ETA'] = pd.to_datetime(res2['ETA'], infer_datetime_format=True)

res1['diff'] = (res1['ETA'] - pd.to_datetime('2019/01/01 00:00:00',infer_datetime_format=True)).dt.total_seconds()/3600
res2['diff'] = (res2['ETA'] - pd.to_datetime('2019/01/01 00:00:00',infer_datetime_format=True)).dt.total_seconds()/3600

res1['diff'] = res1['diff'] * 0.3 + res2['diff'] * 0.7

sub_FORMAT = "%Y/%m%d  %H:%M:%S"
UTC_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

x = pd.to_datetime('2019/01/01 00:00:00',infer_datetime_format=True)
def get_time(y):
    day = y // 24 
    seconds = int((y - (y//24)*24)*3600)
    res = x + datetime.timedelta(days=day, seconds=seconds)
    str_res = res.strftime(sub_FORMAT)
    return res 

res1['ETA'] = res1['diff'].apply(lambda x: get_time(x))
del res1['diff']

res1['ETA'] = res1['ETA'].apply(lambda x:x.strftime('%Y/%m/%d  %H:%M:%S'))

(72847, 9) (72847, 9)


In [169]:
result['ETA'] = res1['ETA']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [174]:
result.to_csv("result/round2_lgb_ronghe0808.csv", index=False)

In [170]:
res1.drop_duplicates('loadingOrder')

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,AC437723355280,2019-06-23T02:53:18.000Z,113.895823,22.383463,JCMFTA,G9916514058,2019/06/23 02:53:18,2019/07/20 20:51:08,2020/08/05 14:09:39
334,AR701843002140,2020-03-25T15:55:58.000Z,113.694225,22.630953,OYSCFP,O8358265987,2020/03/25 15:49:58,2020/03/29 03:25:32,2020/08/05 14:09:39
372,AT407433613767,2019-04-01T00:14:18.000Z,104.196312,1.287627,RWHZVZ,J3697709469,2019/04/01 00:14:18,2019/04/24 10:48:44,2020/08/05 14:09:39
550,AV639724236766,2020-06-01T19:13:22.000Z,113.894683,22.4512,OYSCFP,R7006912480,2020/06/01 19:13:22,2020/06/12 22:33:21,2020/08/05 14:09:39
690,BA498843975994,2020-06-16T02:23:12.000Z,113.899117,22.461683,JONOCD,U9615709922,2020/06/16 02:23:12,2020/06/19 20:28:43,2020/08/05 14:09:39
1790,BB470439135271,2020-04-28T00:00:19.000Z,129.496267,34.140347,OIEQNT,Q2654292540,2020/04/28 00:00:19,2020/05/28 08:07:55,2020/08/05 14:09:39
3867,BE790761427541,2020-06-10T13:59:56.000Z,113.883267,22.4441,OYSCFP,I9946011301,2020/06/10 13:59:56,2020/06/26 16:00:00,2020/08/05 14:09:39
4029,BE898272362291,2019-11-19T05:07:38.000Z,114.273142,22.56341,OYSCFP,C2075927370,2019/11/19 05:07:38,2019/12/12 15:37:20,2020/08/05 14:09:39
4121,BE929753510083,2019-07-06T23:04:08.000Z,114.285403,22.576393,OYSCFP,B8183659437,2019/07/06 23:04:08,2019/07/31 00:36:07,2020/08/05 14:09:39
4210,BK898009012594,2019-09-05T07:01:18.000Z,114.274107,22.564852,OIEQNT,E8197708075,2019/09/05 07:01:18,2019/09/27 05:03:11,2020/08/05 14:09:39


In [172]:
# result.drop_duplicates('loadingOrder')

In [157]:
res1 = pd.read_csv("result/round2_lgb_0805_lr001.csv")
res1.drop_duplicates('loadingOrder')

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,AC437723355280,2019-06-23T02:53:18.000Z,113.895823,22.383463,JCMFTA,G9916514058,2019/06/23 02:53:18,2019/07/20 22:31:16,2020/08/05 14:09:39
334,AR701843002140,2020-03-25T15:55:58.000Z,113.694225,22.630953,OYSCFP,O8358265987,2020/03/25 15:49:58,2020/03/29 10:27:36,2020/08/05 14:09:39
372,AT407433613767,2019-04-01T00:14:18.000Z,104.196312,1.287627,RWHZVZ,J3697709469,2019/04/01 00:14:18,2019/04/23 23:46:27,2020/08/05 14:09:39
550,AV639724236766,2020-06-01T19:13:22.000Z,113.894683,22.4512,OYSCFP,R7006912480,2020/06/01 19:13:22,2020/06/12 20:27:58,2020/08/05 14:09:39
690,BA498843975994,2020-06-16T02:23:12.000Z,113.899117,22.461683,JONOCD,U9615709922,2020/06/16 02:23:12,2020/06/19 17:06:21,2020/08/05 14:09:39
1790,BB470439135271,2020-04-28T00:00:19.000Z,129.496267,34.140347,OIEQNT,Q2654292540,2020/04/28 00:00:19,2020/05/28 09:14:58,2020/08/05 14:09:39
3867,BE790761427541,2020-06-10T13:59:56.000Z,113.883267,22.4441,OYSCFP,I9946011301,2020/06/10 13:59:56,2020/06/26 16:20:44,2020/08/05 14:09:39
4029,BE898272362291,2019-11-19T05:07:38.000Z,114.273142,22.56341,OYSCFP,C2075927370,2019/11/19 05:07:38,2019/12/12 18:48:47,2020/08/05 14:09:39
4121,BE929753510083,2019-07-06T23:04:08.000Z,114.285403,22.576393,OYSCFP,B8183659437,2019/07/06 23:04:08,2019/07/30 13:05:19,2020/08/05 14:09:39
4210,BK898009012594,2019-09-05T07:01:18.000Z,114.274107,22.564852,OIEQNT,E8197708075,2019/09/05 07:01:18,2019/09/26 02:54:08,2020/08/05 14:09:39


In [140]:
res2.head()

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate,diff
0,AC437723355280,2019-06-23T02:53:18.000Z,113.895823,22.383463,JCMFTA,G9916514058,2019/06/23 02:53:18,2019-07-20 20:08:14,2020/08/04 21:29:26,4820.137222
1,AC437723355280,2019-06-23T03:01:28.000Z,113.917878,22.360128,JCMFTA,G9916514058,2019/06/23 02:53:18,2019-07-20 20:08:14,2020/08/04 21:29:26,4820.137222
2,AC437723355280,2019-06-23T03:16:38.000Z,113.977712,22.345253,JCMFTA,G9916514058,2019/06/23 02:53:18,2019-07-20 20:08:14,2020/08/04 21:29:26,4820.137222
3,AC437723355280,2019-06-23T03:17:08.000Z,113.980273,22.34531,JCMFTA,G9916514058,2019/06/23 02:53:18,2019-07-20 20:08:14,2020/08/04 21:29:26,4820.137222
4,AC437723355280,2019-06-23T03:18:08.000Z,113.984088,22.34534,JCMFTA,G9916514058,2019/06/23 02:53:18,2019-07-20 20:08:14,2020/08/04 21:29:26,4820.137222


# 分析一下结果

In [6]:
import pandas as pd
# res1 = pd.read_csv("data/B_lgb_10fold_old_add_fea.csv")
res2 = pd.read_csv("data/B_lgb_10fold_old.csv")
res1.drop_duplicates('loadingOrder', inplace=True)
res2.drop_duplicates('loadingOrder', inplace=True)

res1['onboardDate'] = pd.to_datetime(res1['onboardDate'], infer_datetime_format=True)
res1['ETA'] =  pd.to_datetime(res1['ETA'], infer_datetime_format=True)
res2['onboardDate'] = pd.to_datetime(res2['onboardDate'], infer_datetime_format=True)
res2['ETA'] =  pd.to_datetime(res2['ETA'], infer_datetime_format=True)

res1['diff'] = (res1['ETA'] - res1['onboardDate']).dt.total_seconds()
res2['diff'] = (res2['ETA'] - res2['onboardDate']).dt.total_seconds()

In [7]:
res1['diff'].corr(res2['diff'])

0.9994296946716894

In [29]:
result.shape

(57071, 9)

In [65]:
res1 = pd.read_csv("result/round2B_lgb_hc.csv")
# res1 = result
# res2 = pd.read_csv("result/nn_merge3.csv")
res2 = pd.read_csv("result/b2_807.csv")

In [66]:
res1 = res1.drop_duplicates("loadingOrder")
res2 = res2.drop_duplicates("loadingOrder")

In [67]:
res1 = res1[['loadingOrder', 'timestamp', 'ETA']].reset_index(drop=True)
res2 = res2[['loadingOrder', 'timestamp', 'ETA']].reset_index(drop=True)

In [68]:
res1 = res1.merge(res2, how='left', on='loadingOrder')

In [69]:
del res1['timestamp_y']

In [70]:
res2

Unnamed: 0,loadingOrder,timestamp,ETA
0,NJ169522947117,2019-12-30T06:23:18.000Z,2020/01/01 19:19:59
1,SX540070026140,2020-03-11T05:59:18.000Z,2020/03/13 20:19:54
2,ZV919459607351,2020-06-01T00:09:13.000Z,2020/06/07 09:42:46
3,AE378244933121,2020-06-01T00:06:13.000Z,2020/06/08 05:09:05
4,FQ684650477699,2020-06-01T00:00:43.000Z,2020/06/25 12:23:58
5,WG447072368140,2020-06-01T00:00:12.000Z,2020/06/18 07:32:15
6,ZB832047068480,2020-06-01T00:10:23.000Z,2020/06/02 19:07:00
7,JU877708983002,2020-06-01T00:07:12.000Z,2020/06/09 20:52:12
8,TI932873678567,2020-06-08T08:12:02.000Z,2020/06/18 20:51:11
9,KX661536528233,2020-06-01T00:05:39.000Z,2020/06/22 07:35:53


In [71]:
res1['ETA_x'] = pd.to_datetime(res1['ETA_x'], infer_datetime_format=True)
res1['ETA_y'] = pd.to_datetime(res1['ETA_y'], infer_datetime_format=True)

In [72]:
count = test_data.groupby('loadingOrder').agg({'timestamp':'count'}).reset_index()
res1 = res1.merge(count, how='left', on='loadingOrder')

res1['diff'] = ((res1['ETA_x'] - res1['ETA_y']).dt.total_seconds()/3600) ** 2 * res1['timestamp'] / 57071

In [73]:
res1['diff'] = round(res1['diff'], 3)

In [74]:
res2.sort_values(['loadingOrder', 'timestamp'], inplace=True)
res2

Unnamed: 0,loadingOrder,timestamp,ETA
32,AB674675500650,2020-06-01T00:01:50.000Z,2020/06/09 03:41:28
59,AC710522369160,2020-06-09T23:17:33.000Z,2020/06/15 13:44:48
110,AD584528667006,2020-06-02T03:33:34.000Z,2020/06/04 16:50:39
76,AD852352572660,2020-06-10T18:56:34.000Z,2020/06/24 09:08:29
3,AE378244933121,2020-06-01T00:06:13.000Z,2020/06/08 05:09:05
54,AE563102255135,2019-12-26T00:24:48.000Z,2020/01/31 09:36:18
179,AI823168582586,2019-08-27T02:51:28.000Z,2019/09/20 09:30:58
155,AJ216944869611,2019-05-13T03:00:08.000Z,2019/06/06 13:02:37
234,AM438554857021,2019-06-24T08:17:38.000Z,2019/07/17 07:34:21
95,AO528516733940,2020-03-22T12:30:18.000Z,2020/03/26 13:09:48


In [76]:
res1[res1['diff'] > 5]

Unnamed: 0,loadingOrder,timestamp_x,ETA_x,ETA_y,timestamp,diff
0,AB674675500650,2020-06-01T00:01:50.000Z,2020-06-07 05:44:09,2020-06-09 03:41:28,440,16.282
5,AE563102255135,2019-12-26T00:24:48.000Z,2020-01-30 08:01:53,2020-01-31 09:36:18,934,10.703
28,CM821215596820,2020-02-07T08:01:58.000Z,2020-03-13 23:33:52,2020-03-15 08:45:37,280,5.406
58,FX975491217543,2020-06-01T00:01:50.000Z,2020-06-10 11:29:01,2020-06-13 07:21:33,440,35.519
105,KO849768330117,2020-06-04T15:56:36.000Z,2020-06-09 22:34:02,2020-06-13 18:29:15,88,13.028
133,NX668481428339,2020-06-01T00:07:20.000Z,2020-06-25 04:22:06,2020-06-28 09:46:44,175,18.375
175,SF782540171392,2020-01-14T02:21:58.000Z,2020-02-07 21:19:44,2020-02-07 05:12:21,1468,6.687
178,SU860129068725,2019-09-01T01:09:08.000Z,2019-09-21 18:39:11,2019-09-24 13:28:32,99,7.746
190,UB681657714929,2020-06-03T07:08:36.000Z,2020-06-25 06:04:51,2020-06-27 03:13:07,145,5.176
200,UU854609945095,2019-02-07T00:24:28.000Z,2019-03-03 10:00:16,2019-02-28 21:54:15,936,59.24


In [39]:
test_data[test_data['loadingOrder'] == 'OS590961425176'].shape

(190, 11)

In [51]:
res1['diff'].sum()

1061.188

In [62]:
def cal(hour, count):
    return hour**2 * count / 72847

cal(3.6, 334) * 72847

4328.64