# 模型训练

实验平台：华为云 CPU 8核 32GiB

软件环境：Python 3.6

## 1 环境配置与数据准备

In [1]:
!pip install joblib

[33mYou are using pip version 9.0.1, however version 20.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/0b/9d/ddcb2f43aca194987f1a99e27edf41cf9bc39ea750c3371c2a62698c509a/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 60.1MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1
[33mYou are using pip version 9.0.1, however version 20.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import pandas as pd
import os
import joblib
import time

In [4]:
from sklearn.linear_model import LinearRegression, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb

In [5]:
source_file_path = os.environ['HOME'] + '/work/'
source_file_path

'/home/ma-user/work/'

In [9]:
# Fetch the training label and feature CSVs from the "etasll2020" OBS bucket
# into <source_file_path>/data/ on the local disk.
# NOTE(review): presumably the local data/ directory must be writable/existing
# before the download — confirm against the ModelArts SDK.
from modelarts.session import Session
session = Session()
session.download_data(bucket_path="etasll2020/train_label2.csv", path=source_file_path+"data/train_label2.csv")
session.download_data(bucket_path="etasll2020/train_feature2.csv", path=source_file_path+"data/train_feature2.csv")

Successfully download file etasll2020/train_label2.csv from OBS to local /home/ma-user/work/data/train_label2.csv
Successfully download file etasll2020/train_feature2.csv from OBS to local /home/ma-user/work/data/train_feature2.csv


In [10]:
# Read the CSVs from the directory they were downloaded to (source_file_path)
# instead of relying on the kernel's current working directory.
train_feature = pd.read_csv(source_file_path + "data/train_feature2.csv")
train_label = pd.read_csv(source_file_path + "data/train_label2.csv")

In [11]:
# Convert both frames to NumPy arrays; the label frame is flattened to 1-D
# so that it can be indexed with the fold indices produced by KFold.
X_train = np.array(train_feature)
y_train = np.array(train_label).reshape(-1)
X_train.shape, y_train.shape

((18951, 14), (18951,))

## 2 基本调用函数

In [12]:
# Save a model
def save_model(model, name):
    """Persist ``model`` with joblib under ``saved_model/<name>``.

    Args:
        model: Any object joblib can serialise (e.g. a fitted estimator).
        name: File name (may contain sub-directories) relative to
            ``saved_model/``.

    The target directory is created on demand; without this, the first call
    in a fresh working directory fails with ``FileNotFoundError``.
    """
    target = "saved_model/" + name
    os.makedirs(os.path.dirname(target), exist_ok=True)
    joblib.dump(model, target)


# Load a model
def load_model(name):
    """Return the object joblib has stored under ``saved_model/<name>``.

    NOTE: ``joblib.load`` unpickles the file, so only load files you trust.
    """
    return joblib.load("saved_model/" + name)

In [13]:
# Number of parallel workers passed to scikit-learn as ``n_jobs``.
NJOBS = 8
# Number of folds used by the KFold splitter in cross_validation().
n_folds = 5

def cross_validation(model, X, y, random_state=None):
    """Score ``model`` with shuffled K-fold cross-validation and return the MSE per fold.

    Args:
        model: Unfitted scikit-learn compatible regressor.
        X: Feature array.
        y: Target array.
        random_state: Optional seed for the fold shuffling. The default
            (``None``) keeps the previous behaviour of drawing new random
            folds on every call; pass an int to evaluate different models on
            identical folds and make the scores reproducible.

    Returns:
        Array with one mean-squared-error value per fold (``n_folds`` entries).

    Both ``X`` and ``y`` are divided by 3600 before scoring.
    NOTE(review): presumably this converts seconds to hours so the MSE is in
    hours^2 — confirm that scaling the features as well is intended.
    """
    kf = KFold(n_folds, shuffle=True, random_state=random_state)
    neg_mse = cross_val_score(
        model,
        X / 3600,
        y / 3600,
        scoring="neg_mean_squared_error",
        cv=kf,
        n_jobs=NJOBS,
    )
    # scikit-learn reports the negated MSE so that "greater is better"; flip the sign back.
    return -neg_mse

## 3 模型比较

### 3.1 线性模型
* LinearRegression
* Lasso
* BayesianRidge
* LassoLarsIC

In [14]:
model_lr = LinearRegression()
score = cross_validation(model_lr, X_train, y_train)
score, np.mean(score)

(array([10875.51272892, 10655.46791765, 10486.90981549,  9406.74269793,
         9655.61165091]),
 10216.04896218152)

In [15]:
model_lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0001, random_state=2))
score = cross_validation(model_lasso, X_train, y_train)
score, np.mean(score)

(array([ 9590.03600862, 10784.74066502, 10688.08679576,  9086.7455184 ,
        11003.50491418]),
 10230.62278039616)

In [16]:
model_llic = LassoLarsIC()
score = cross_validation(model_llic, X_train, y_train)
score, np.mean(score)

(array([ 8474.64181287, 11511.34416184, 10567.4388824 ,  9434.10700616,
        11234.41755199]),
 10244.389883050897)

In [17]:
model_br = BayesianRidge()
score = cross_validation(model_br, X_train, y_train)
score, np.mean(score)

(array([10824.91807049,  9218.73400394, 10219.57860856, 10573.92424905,
        10281.78039415]),
 10223.787065240456)

### 3.2 决策树
* DecisionTreeRegressor

In [18]:
model_dt = DecisionTreeRegressor()
score = cross_validation(model_dt, X_train, y_train)
score, np.mean(score)

(array([3089.42518323, 3916.35143461, 4027.61569865, 4344.89418769,
        4717.25911736]),
 4019.1091243087867)

### 3.3 K近邻
* KNeighborsRegressor

In [20]:
model_knn = KNeighborsRegressor(n_neighbors=3)
score = cross_validation(model_knn, X_train, y_train)
score, np.mean(score)

(array([5229.86407384, 6857.72068144, 5493.36737481, 5014.92799705,
        5923.56590775]),
 5703.889206979349)

### 3.4 支持向量机
* SVR

In [21]:
model_svm = SVR()
score = cross_validation(model_svm, X_train, y_train)
score, np.mean(score)

(array([29838.613272  , 31402.4725567 , 27963.36432543, 26199.7878413 ,
        29961.30748378]),
 29073.10909584241)

### 3.5 集成学习
* RandomForestRegressor
* BaggingRegressor
* LGBMRegressor
* XGBRegressor
* GradientBoostingRegressor

In [22]:
model_rf = RandomForestRegressor(n_jobs=NJOBS)
score = cross_validation(model_rf, X_train, y_train)
score, np.mean(score)

(array([2692.33021979, 3025.1277502 , 2460.15423098, 2056.23403889,
        3106.54330023]),
 2668.077908019478)

In [23]:
model_bag = BaggingRegressor()
score = cross_validation(model_bag, X_train, y_train)
score, np.mean(score)

(array([3073.33738456, 2042.39061659, 2619.28694261, 2699.23124331,
        3141.38738811]),
 2715.126715036007)

In [24]:
model_lgb = lgb.LGBMRegressor()
score = cross_validation(model_lgb, X_train, y_train)
score, np.mean(score)

(array([4222.85007106, 3020.2175158 , 3224.17547815, 3691.79541711,
        2881.682065  ]),
 3408.1441094216293)

In [25]:
model_xgb = xgb.XGBRegressor()
score = cross_validation(model_xgb, X_train, y_train)
score, np.mean(score)

(array([5840.52617242, 5172.23873176, 4833.72376626, 4265.41251206,
        4776.74601327]),
 4977.7294391538)

In [26]:
model_gbdt = GradientBoostingRegressor()
score = cross_validation(model_gbdt, X_train, y_train)
score, np.mean(score)

(array([4403.13353388, 4873.31268948, 4867.09890319, 5848.82607862,
        4367.45592384]),
 4871.965425800985)

In [27]:
# Hand-tuned XGBoost regressor. This rebinds ``model_xgb``, so code that runs
# after this cell uses the tuned instance rather than the default one.
# NOTE(review): ``silent`` and ``nthread`` look like legacy keyword names; newer
# xgboost releases appear to expect ``verbosity`` and ``n_jobs`` — confirm against
# the installed xgboost version before changing them.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
score = cross_validation(model_xgb, X_train, y_train)
score, np.mean(score)

(array([3424.34067648, 3921.46610537, 2593.36090976, 3289.04065377,
        2623.80420741]),
 3170.4025105578758)

In [28]:
model_gbdt = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state=5)
score = cross_validation(model_gbdt, X_train, y_train)
score, np.mean(score)

(array([4088.14041677, 3982.76516622, 3158.08436352, 3965.23156337,
        3473.26989533]),
 3733.4982810416977)

## 4 模型融合

### 4.1 Averaging

In [29]:
# reference: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

class Averaging(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its members.

    Args:
        models: Iterable of scikit-learn compatible regressors. ``fit`` trains
            clones of them and stores the clones in ``models_``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every member and fit each clone on the full ``(X, y)``."""
        self.models_ = list(map(clone, self.models))
        for member in self.models_:
            member.fit(X, y)
        return self

    def predict(self, X):
        """Return, for each sample, the mean of all fitted members' predictions."""
        # One column per member, one row per sample.
        per_member = np.column_stack([member.predict(X) for member in self.models_])
        return per_member.mean(axis=1)

In [31]:
model_averaging = Averaging((model_lgb, model_bag, model_rf, model_xgb))
score = cross_validation(model_averaging, X_train, y_train)
score, np.mean(score)

(array([2862.34707262, 2370.24371843, 2580.13991194, 1882.83686604,
        2580.10153755]),
 2455.1338213139848)

In [32]:
class Stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: a meta-model trained on out-of-fold base predictions.

    Args:
        base_models: Sequence of scikit-learn compatible regressors.
        meta_model: Regressor fitted on the base models' out-of-fold
            predictions (one input column per base model).
        n_folds: Number of K-fold splits used to build those predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of each base model, then fit the meta-model.

        ``X`` and ``y`` are indexed positionally with the fold index arrays,
        so they must support NumPy-style integer-array indexing.
        """
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        # Fixed seed: every base model sees exactly the same fold assignment.
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        oof = np.zeros((X.shape[0], len(self.base_models)))
        for col, prototype in enumerate(self.base_models):
            fold_models = self.base_models_[col]
            for fit_idx, held_idx in splitter.split(X, y):
                fold_model = clone(prototype)
                fold_models.append(fold_model)
                fold_model.fit(X[fit_idx], y[fit_idx])
                # Each sample is predicted only by the clone that did not train on it.
                oof[held_idx, col] = fold_model.predict(X[held_idx])
        self.meta_model_.fit(oof, y)
        return self

    def predict(self, X):
        """Average each base model's fold clones, then apply the meta-model."""
        columns = []
        for fold_models in self.base_models_:
            fold_preds = np.column_stack([m.predict(X) for m in fold_models])
            columns.append(fold_preds.mean(axis=1))
        return self.meta_model_.predict(np.column_stack(columns))

In [33]:
model_stacking = Stacking(base_models = (model_lgb, model_bag, model_rf, model_xgb), meta_model = model_lr)
score = cross_validation(model_stacking, X_train, y_train)
score, np.mean(score)

(array([2000.82823959, 2319.94728808, 2031.79929205, 3448.62416865,
        2398.37817696]),
 2439.9154330642255)

## 5 训练、预测、保存

In [35]:
# Load the test-set features and the raw test records used to build the submission.
# NOTE(review): no visible cell downloads these two files (only the train_* CSVs
# are fetched from OBS); they must already exist under data/ — confirm.
test_feature = pd.read_csv("data/test_feature.csv")
test_data = pd.read_csv("data/A_testData0531.csv")

In [43]:
import time

def get_timeStamp(timeString, form=0):
    """Parse a date-time string and return it as integer epoch seconds.

    Args:
        timeString: Text to parse.
        form: Selects the expected layout:
            0 -> ``'%Y-%m-%dT%H:%M:%S.%fZ'`` (e.g. ``2019-04-02T02:42:28.000Z``)
            1 -> ``'%Y/%m/%d  %H:%M:%S'``    (e.g. ``2019/04/02  02:42:28``)

    Returns:
        int: Seconds since the epoch as computed by ``time.mktime``.

    Raises:
        ValueError: If ``form`` is not 0 or 1, or if ``timeString`` does not
            match the selected layout.

    Note:
        ``time.mktime`` interprets the parsed fields as *local* time; the
        trailing ``Z`` of form 0 is matched literally and is not treated as a
        UTC marker.
    """
    layouts = {
        0: '%Y-%m-%dT%H:%M:%S.%fZ',
        1: '%Y/%m/%d  %H:%M:%S',
    }
    # Previously an unknown ``form`` fell through both branches and crashed
    # with UnboundLocalError; reject it explicitly instead.
    if form not in layouts:
        raise ValueError("unsupported form %r; expected 0 or 1" % (form,))
    timeArray = time.strptime(timeString, layouts[form])
    return int(time.mktime(timeArray))

def timestamp_to_str(timestamp=None, format='%Y/%m/%d  %H:%M:%S'):
    """Format epoch seconds as a local-time string.

    Args:
        timestamp: Seconds since the epoch. When ``None`` the current time is
            formatted instead.
        format: ``time.strftime`` format string.

    Returns:
        str: The formatted local time.

    The fallback branch used to call ``time.strptime(format)``, which tries to
    *parse* the format string as a date and always raises ``ValueError``; it
    now formats the current time as intended. The check is ``is None`` rather
    than truthiness so that timestamp ``0`` (the epoch itself) is formatted
    like any other value.
    """
    if timestamp is None:
        # strftime without a time tuple formats the current local time.
        return time.strftime(format)
    # Convert the epoch seconds to a local time tuple, then render it.
    return time.strftime(format, time.localtime(timestamp))
    
def model_test(model, test_feature, test_data):
    """Predict travel time for the test set and build the submission frame.

    Args:
        model: Fitted regressor exposing ``predict``.
        test_feature: DataFrame with a ``loadingOrder`` column; every column
            after the first is passed to the model as a feature.
        test_data: DataFrame of raw test records containing at least
            ``loadingOrder``, ``onboardDate``, ``direction``,
            ``TRANSPORT_TRACE`` and the output columns listed below.

    Returns:
        DataFrame with columns ``loadingOrder, timestamp, longitude, latitude,
        carrierName, vesselMMSI, onboardDate, ETA, creatDate``.

    Neither input frame is modified; the function works on copies.
    """
    _test_feature = test_feature.copy()
    _test_data = test_data.copy()

    # Skip the first column when building the model input.
    # NOTE(review): presumably column 0 is ``loadingOrder`` — confirm.
    test_feature_array = np.array(_test_feature.iloc[:, 1:])
    _test_feature['label'] = model.predict(test_feature_array)
    result = _test_feature[['loadingOrder', 'label']]

    # Attach the predicted duration to every raw record of the same order.
    _test_data = _test_data.merge(result, on='loadingOrder', how='left')
    # ETA = departure time (epoch seconds) + predicted duration, rendered as text.
    _test_data['ETA'] = _test_data['onboardDate'].apply(lambda x: get_timeStamp(x, form=1)) + _test_data['label']
    _test_data['ETA'] = _test_data['ETA'].apply(lambda x: timestamp_to_str(x))
    _test_data.drop(['direction', 'TRANSPORT_TRACE'], axis=1, inplace=True)
    # ``pd.datetime`` was deprecated in pandas 1.0 and later removed; use
    # ``pd.Timestamp.now()`` instead. The column name ``creatDate`` is part of
    # the emitted schema and is kept verbatim.
    _test_data['creatDate'] = pd.Timestamp.now().strftime('%Y/%m/%d  %H:%M:%S')
    # Arrange the columns in the output order.
    result = _test_data[['loadingOrder', 'timestamp', 'longitude', 'latitude', 'carrierName', 'vesselMMSI', 'onboardDate', 'ETA', 'creatDate']]
    return result

In [44]:
model_stacking.fit(X_train, y_train)
result = model_test(model_stacking, test_feature, test_data)

In [45]:
result.head()

Unnamed: 0,loadingOrder,timestamp,longitude,latitude,carrierName,vesselMMSI,onboardDate,ETA,creatDate
0,CF946210847851,2019-04-02T02:42:28.000Z,138.471062,40.278787,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/18 21:58:03,2020/06/18 23:03:13
1,CF946210847851,2019-04-02T02:59:28.000Z,138.552168,40.327785,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/18 21:58:03,2020/06/18 23:03:13
2,CF946210847851,2019-04-02T03:07:28.000Z,138.58825,40.352542,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/18 21:58:03,2020/06/18 23:03:13
3,CF946210847851,2019-04-02T03:43:28.000Z,138.751325,40.459447,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/18 21:58:03,2020/06/18 23:03:13
4,CF946210847851,2019-04-02T04:29:28.000Z,138.969782,40.581485,OIEQNT,R5480015614,2019/04/02 02:42:28,2019/04/18 21:58:03,2020/06/18 23:03:13


In [46]:
result.to_csv("stacked_averaged_models5.csv", index=False)

In [48]:
import moxing as mox

INFO:root:Using MoXing-v1.15.1-99273b13
INFO:root:Using OBS-Python-SDK-3.1.2


In [49]:
mox.file.copy_parallel('stacked_averaged_models5.csv', 's3://etasll2020/saved/stacked_averaged_models5.csv') 