# 特征工程第二部分
硬件环境：MacOS 10.15

软件环境：python3.7

对74个文件进行合并，将同一份订单的数据合并，并进行二次预处理

In [1]:
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

NJOBS = 4

## 1 数据合并

In [3]:
features = []
for file in os.listdir("feature200_new"):
    features.append(pd.read_csv("feature200_new/"+file, index_col=0))
train_data = pd.concat(features)

In [4]:
train_data = train_data.sort_index()

In [6]:
train_data.head()

Unnamed: 0,anchorTime,lastLatitude,lastLongitude,latitude,loadingOrder,longitude,manDis,maxDirdiff,maxSpeed,meanDirdiff,meanSpeed,medianDirdiff,medianSpeed,minDirdiff,minSpeed,number,timeInterval
0.0,79119.0,22.337142,114.116107,-25.831867,ZQ464072113491,-106.724293,15867.2253,148.067527,189.0,5.058631,35.494018,3.353012,37.0,0.008196,0.0,6029.0,1676766.0
2.0,0.0,19.059193,-104.2963,31.14795,RZ321715965702,131.908943,11822.441815,165.6,37.0,12.516673,29.914302,12.511329,30.0,0.011097,1.0,7013.0,1454420.0
19.0,226071.0,22.46245,113.877652,26.506663,UZ994694469634,-127.468163,11460.17736,179.741933,211.107228,47.363293,24.048482,22.942156,31.0,0.12019,0.0,3644.0,1850996.0
20.0,0.0,6.435218,3.389182,6.318798,LX197978822803,3.424328,13.515385,151.8,13.0,50.729645,10.0,36.280195,10.5,5.801559,6.0,5.0,6000.0
21.0,0.0,6.435218,3.389182,6.318798,KF747095155025,3.424328,13.515385,151.8,13.0,50.729645,10.0,36.280195,10.5,5.801559,6.0,5.0,6000.0


## 2 特征合并

In [7]:
def add_feature_grouped(subset):
    feature = ['loadingOrder', 'longitude', 'latitude', 'lastLongitude', 'lastLatitude',
       'anchorTime', 'manDis', 'maxSpeed', 'minSpeed', 'meanSpeed', 'medianSpeed',
       'maxDirdiff', 'minDirdiff', 'meanDirdiff', 'medianDirdiff', 'timeInterval'] 
    
    if len(subset) == 1:
        return subset[feature]
    
    ans = subset.head(1)
    # anchor time
    ans['anchorTime'] = subset['anchorTime'].sum()
    ans['timeInterval'] = subset['timeInterval'].sum()
    
    final = subset.tail(1)
    lastLongitude = final['longitude'].to_list()[0]
    lastLatitude = final['latitude'].to_list()[0]
    ans['lastLatitude'] = lastLatitude
    ans['lastLongitude'] = lastLongitude
    ans['manDis'] =  subset['manDis'].sum()
#     ans['manDis'] = haversine_array(ans['latitude'].to_list()[0], ans['longitude'].to_list()[0], lastLatitude, lastLongitude)

    ans['maxSpeed'] = subset['maxSpeed'].max()
    ans['minSpeed'] = subset['minSpeed'].min()
    ans['maxDirdiff'] = subset['maxDirdiff'].max()
    ans['minDirdiff'] = subset['minDirdiff'].min()

    ratio = subset['number'] / subset['number'].sum()
    ans['meanSpeed'] = np.sum(ans['meanSpeed']*ratio)
    ans['medianSpeed'] = np.sum(ans['medianSpeed']*ratio)
    ans['meanDirdiff'] = np.sum(ans['meanDirdiff']*ratio)
    ans['medianDirdiff'] = np.sum(ans['medianDirdiff']*ratio)
    
    return ans[feature]


def add_feature(data, show_progress=False):
    data_grouped = data.groupby('loadingOrder')

    if show_progress:
        feature = Parallel(n_jobs=NJOBS)(delayed(add_feature_grouped)(group) for name, group in tqdm(data_grouped))
    else:
        feature = Parallel(n_jobs=NJOBS)(delayed(add_feature_grouped)(group) for name, group in data_grouped)
    return pd.concat(feature)

In [8]:
train_features = add_feature(train_data, show_progress=True)

HBox(children=(IntProgress(value=0, max=21157), HTML(value='')))




In [9]:
train_features.sort_index()

Unnamed: 0,loadingOrder,longitude,latitude,lastLongitude,lastLatitude,anchorTime,manDis,maxSpeed,minSpeed,meanSpeed,medianSpeed,maxDirdiff,minDirdiff,meanDirdiff,medianDirdiff,timeInterval
0.0,ZQ464072113491,-106.724293,-25.831867,114.116107,22.337142,79119.0,15867.225300,189.000000,0.000000,35.494018,37.000000,148.067527,0.008196,5.058631,3.353012,1676766.0
2.0,RZ321715965702,131.908943,31.147950,-104.296300,19.059193,0.0,11822.441815,37.000000,1.000000,29.914302,30.000000,165.600000,0.011097,12.516673,12.511329,1454420.0
19.0,UZ994694469634,-127.468163,26.506663,113.877545,22.462415,305335.0,11489.527592,211.107228,0.000000,23.325172,30.067607,179.741933,0.120190,45.938738,22.252120,1939319.0
20.0,LX197978822803,3.424328,6.318798,3.389182,6.435218,0.0,13.515385,13.000000,6.000000,10.000000,10.500000,151.800000,5.801559,50.729645,36.280195,6000.0
21.0,KF747095155025,3.424328,6.318798,3.389182,6.435218,0.0,13.515385,13.000000,6.000000,10.000000,10.500000,151.800000,5.801559,50.729645,36.280195,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140474286.0,AZ195734902338,114.289438,22.572860,113.904436,19.690276,0.0,323.005719,37.000000,6.000000,28.457143,34.000000,168.000000,0.006475,19.611233,2.262230,37235.0
140491664.0,GS717036500967,114.287903,22.568342,114.288183,22.567982,38167.0,0.049285,4.607103,0.000000,0.389587,0.236010,178.088754,0.953937,73.167626,59.002302,38167.0
140498543.0,CK753945855073,113.683590,22.639473,113.791388,21.773520,23485.0,96.926955,35.000000,0.002702,14.574898,15.000000,176.400000,0.085502,34.089730,19.769862,36475.0
140502889.0,XK498516866928,113.891990,22.452473,114.069487,22.357117,35130.0,21.103926,25.000000,0.006357,4.689779,0.065243,164.888648,2.021135,132.017902,164.371979,35130.0


## 3 异常值、缺失值处理

In [10]:
train_features.replace(np.inf, np.nan, inplace=True)
train_features.replace(-np.inf, np.nan, inplace=True)
train_features.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
train_features.drop(train_features[train_features['meanSpeed']>60].index, inplace=True)
train_features.drop(train_features[train_features['maxSpeed']>300].index, inplace=True)

In [11]:
len(train_features)

18951

In [12]:
train_feature = train_features.iloc[:,1:-1]
train_label = train_features['timeInterval']

In [13]:
train_feature

Unnamed: 0,longitude,latitude,lastLongitude,lastLatitude,anchorTime,manDis,maxSpeed,minSpeed,meanSpeed,medianSpeed,maxDirdiff,minDirdiff,meanDirdiff,medianDirdiff
2924437.0,114.260392,22.571047,-106.201740,19.755487,631544.0,13883.520192,47.000000,0.0,2.249170,0.166410,179.804001,0.000000,12.493624,9.742803
73950370.0,113.879783,22.458383,-2.229783,36.429483,6723.0,14777.577750,42.172337,0.0,0.603108,0.586751,177.538325,0.003530,0.988525,0.393920
79594973.0,113.890020,22.445098,106.585000,5.120000,105447.0,7828.913121,209.167212,0.0,2.016023,2.025157,179.277044,0.000000,3.777685,2.561908
119572196.0,113.887967,22.442017,107.354800,6.303433,0.0,2677.746104,102.282908,1.0,7.791518,7.937365,145.000000,0.066482,8.255429,2.338277
1430349.0,-104.292333,19.065800,116.249000,22.160833,46133.0,14020.014880,80.266083,0.0,1.657640,1.641872,177.386146,0.000000,0.478086,0.104844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69459025.0,113.878350,22.450050,105.988827,4.454933,0.0,2700.919264,95.330617,0.0,7.386431,8.173828,155.253284,0.000000,14.261834,12.484594
62272958.0,113.882578,22.445442,106.917390,-6.092340,0.0,3262.556839,163.903271,1.0,23.321832,25.000000,134.714477,0.037050,42.560961,32.864231
121117698.0,113.688808,22.639068,59.183460,24.053997,69603.0,8449.767187,64.764272,0.0,6.500909,6.477623,179.016036,0.004989,14.979700,11.247106
85360472.0,113.870270,22.466087,61.885162,-17.303783,436297.0,10736.486173,124.268358,0.0,0.942190,0.920261,179.949215,0.001202,1.913462,1.700459


In [14]:
train_label

2924437.0      2235191.0
73950370.0     2759199.0
79594973.0     1193656.0
119572196.0     375857.0
1430349.0      2372053.0
                 ...    
69459025.0      260967.0
62272958.0      369451.0
121117698.0    1121606.0
85360472.0     2119828.0
90368515.0     1874769.0
Name: timeInterval, Length: 18951, dtype: float64

In [15]:
train_feature.to_csv("data/train_feature2.csv", index=False)
train_label.to_csv("data/train_label2.csv",index=False, header=True)