## 新能源电动汽车动力电池充电能量预测（六）——训练模型及预测

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from datetime import date, datetime
from package import utils
import time

### 定义数据处理的类型：1.with_fill（使用填充数据）；2.without_fill（不使用填充数据）

In [2]:
# Data-processing variant; it is passed to utils.getType() to build the CSV
# file names read and written by this notebook.
# 1 = with_fill (use filled data), 2 = without_fill (do not use filled data).
# NOTE(review): this name shadows the builtin `type`. Other cells read it, so
# renaming it means updating every cell that uses it at the same time.
type = 2

### 导入训练数据集

In [3]:
# Load the training set for the selected data-processing variant.
dataset_train = pd.read_csv('./energy_predict_data/data_prehandle/5_data_split/train/train_%s.csv' % utils.getType(type))
# Column names of the training set, in file order.
header = dataset_train.columns.values.tolist()
# Show two decimals in the summary table. The fully qualified option key is
# used because the bare pattern 'precision' matches more than one option in
# newer pandas releases (e.g. 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)
# Summary statistics (rendered as the cell output).
dataset_train.describe()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,charge_duration,mileage,delta_mileage,charge_start_soc,charge_end_soc,charge_delta_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_delta_temp,charge_energy
count,4313.0,4310.0,4310.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0,4313.0
mean,11.59,20200000000000.0,20200000000000.0,6954.86,98767.18,103.29,34.41,74.86,40.45,407.46,438.63,-44.72,-22.91,25.29,13.01,12.28,26.45
std,6.45,4340000000.0,4340000000.0,8272.41,30063.91,66.28,19.59,28.84,22.86,93.46,89.08,48.91,35.96,10.0,10.44,9.4,21.3
min,1.0,20200000000000.0,20200000000000.0,600.0,45846.6,0.0,0.0,4.8,0.4,252.3,320.7,-202.0,-199.0,-2.0,-7.0,0.0,0.22
25%,5.0,20200000000000.0,20200000000000.0,2130.0,74482.8,62.5,19.0,38.8,21.2,338.7,374.5,-63.6,-21.7,17.0,4.0,7.0,12.81
50%,12.0,20200000000000.0,20200000000000.0,3790.0,90894.0,94.0,33.0,88.0,38.0,361.0,392.8,-24.1,-11.6,28.0,10.0,10.0,19.83
75%,17.0,20200000000000.0,20200000000000.0,5720.0,118633.0,135.9,46.0,100.0,62.0,534.7,555.8,-9.2,-5.0,34.0,24.0,13.0,31.01
max,21.0,20200000000000.0,20200000000000.0,59960.0,170134.0,490.0,96.0,100.0,99.0,574.6,604.7,-0.1,8.0,49.0,41.0,48.0,103.72


### 导入测试数据集

In [4]:
# Load the test set for the selected data-processing variant.
dataset_test = pd.read_csv('./energy_predict_data/data_prehandle/5_data_split/test/test_%s.csv' % utils.getType(type))
# Show two decimals in the summary table. The fully qualified option key is
# used because the bare pattern 'precision' matches more than one option in
# newer pandas releases (e.g. 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)
# Summary statistics (rendered as the cell output).
dataset_test.describe()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,charge_duration,mileage,delta_mileage,charge_start_soc,charge_end_soc,charge_delta_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_delta_temp,charge_energy
count,1445.0,1440.0,1440.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0,1445.0
mean,11.58,20200000000000.0,20200000000000.0,7068.07,119628.61,99.74,39.28,80.71,41.43,408.08,435.98,-44.01,-20.78,34.44,22.8,11.64,25.62
std,6.45,1230000000.0,1230000000.0,8008.72,29383.38,62.61,19.36,21.06,21.69,90.88,87.14,48.88,33.48,9.14,11.85,11.95,22.22
min,1.0,20200000000000.0,20200000000000.0,600.0,75285.2,0.0,2.0,11.0,0.0,277.8,322.3,-202.0,-198.0,7.0,-2.0,0.0,0.42
25%,5.0,20200000000000.0,20200000000000.0,2160.0,93605.0,63.2,23.0,72.0,24.0,341.9,374.6,-57.3,-16.9,30.0,18.0,6.0,11.73
50%,12.0,20200000000000.0,20200000000000.0,3810.0,114990.2,92.7,37.0,87.0,40.0,361.7,389.1,-24.0,-8.5,36.0,26.0,8.0,18.85
75%,17.0,20200000000000.0,20200000000000.0,7939.0,137625.0,130.3,54.0,99.0,60.0,533.8,548.9,-9.0,-4.9,40.0,31.0,10.0,28.68
max,21.0,20200000000000.0,20200000000000.0,44535.0,182536.0,441.4,93.0,100.0,95.0,571.0,601.4,-0.1,3.9,52.0,46.0,51.0,101.3


In [5]:
# Count the missing values in every training-set column.
dataset_train.isna().sum()

vehicle_id           0
charge_start_time    0
charge_end_time      0
charge_duration      0
mileage              0
delta_mileage        0
charge_start_soc     0
charge_end_soc       0
charge_delta_soc     0
charge_start_U       0
charge_end_U         0
charge_start_I       0
charge_end_I         0
charge_max_temp      0
charge_min_temp      0
charge_delta_temp    0
charge_energy        0
dtype: int64

### 特征选取

In [6]:
# Columns used as model inputs.
features = [
    'charge_duration',
    'delta_mileage',
    'charge_start_soc',
    'charge_delta_soc',
    'charge_end_soc',
    'charge_start_U',
    'charge_start_I',
    'charge_delta_temp',
]
# The target column is picked by position from the training-set header.
label = header[16]
# Echo both so the cell output shows the selection.
(features, label)

(['charge_duration',
  'delta_mileage',
  'charge_start_soc',
  'charge_delta_soc',
  'charge_end_soc',
  'charge_start_U',
  'charge_start_I',
  'charge_delta_temp'],
 'charge_energy')

### 构造特征集合和输出集合

In [7]:
# Training inputs and target.
X_train = dataset_train[features]
y_train = dataset_train[label]
# Test inputs and target, using the same columns.
X_test = dataset_test[features]
y_test = dataset_test[label]

### 创建训练后的预测值数据集

In [8]:
# Frame that collects one prediction column per model: it starts with the
# vehicle id of every test row and the data-processing variant in use.
data_pred = pd.DataFrame({'vehicle_id': dataset_test['vehicle_id'].tolist()})
data_pred['type'] = type

### 归一化（Min-Max 缩放）

In [9]:
from sklearn import preprocessing

# Fit the min-max scaler on the training features only.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train.astype(np.float64))
X_train = scaler.transform(X_train.astype(np.float64))

# Scale the test set with the parameters learned from the training set.
X_test = scaler.transform(X_test.astype(np.float64))

### 特征选择

In [10]:
# Variance-based feature filter from scikit-learn.
from sklearn.feature_selection import VarianceThreshold

# Minimum variance a feature must exceed to be kept. It is held in one place
# so the comment and the value passed to the selector cannot drift apart
# (the earlier comments said 0.08 while the code used 0.02).
variance_threshold = 0.02
sel = VarianceThreshold(threshold=variance_threshold)
# Keep only the features whose training-set variance is above the threshold.
# NOTE(review): X_sel is not referenced by the other visible cells, which
# train on X_train directly — confirm whether the filtered matrix is meant
# to be used.
X_sel = sel.fit_transform(X_train.astype(np.float64))

ValueError: No feature in X meets the variance threshold 0.20000

### 训练模型

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.svm import SVR
from xgboost import XGBRegressor

# Candidate regressors; each one is fitted on the training set and evaluated
# on the test set in the loop below.
estimator_list = [
    LinearRegression(),
    LinearSVR(),
    SVR(kernel='poly'),
    SVR(kernel='rbf'),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(min_samples_leaf=1, min_samples_split=3, n_estimators=96),
    AdaBoostRegressor(),
    BaggingRegressor(),
    GradientBoostingRegressor(),
    XGBRegressor(),
    MLPRegressor(solver='lbfgs', max_iter=1000)
]

# Maps an estimator class name to its prediction column in data_pred.
# SVR appears twice in estimator_list (poly and rbf kernels); the loop below
# derives a per-kernel column name for it so the two do not share one column.
model_dict = {
    'LinearRegression': 'lr',
    'LinearSVR': 'svr_lr',
    'SVR': 'svr_rbf',
    'DecisionTreeRegressor': 'dtr',
    'RandomForestRegressor': 'rtr',
    'ExtraTreesRegressor': 'etr',
    'AdaBoostRegressor': 'abr',
    'BaggingRegressor': 'br',
    'GradientBoostingRegressor': 'gbr',
    'XGBRegressor': 'xgbr',
    'MLPRegressor': 'mlpr'
}

# NOTE(review): cv_split (and the cross_validate import) are not used by the
# loop below, which does a single fit on the full training set.
cv_split = ShuffleSplit(n_splits=6, train_size=0.75, test_size=0.25, random_state=142)
df_columns = ['Name', 'Parameters', 'Train Accuracy Mean', 'Test Accuracy Mean', 'Comsumed Time']
df = pd.DataFrame(columns=df_columns)

for row_index, estimator in enumerate(estimator_list):
    model_name = estimator.__class__.__name__
    df.loc[row_index, 'Name'] = model_name
    df.loc[row_index, 'Parameters'] = str(estimator.get_params())

    # Time the fit only. time.clock() was removed in Python 3.8;
    # time.perf_counter() is the documented replacement for measuring
    # elapsed time.
    time_start = time.perf_counter()
    estimator.fit(X_train, y_train)
    time_end = time.perf_counter()

    # Predict on the test set and store the predictions. Both SVR variants
    # share one class name, so their column is built from the kernel
    # ('svr_poly', 'svr_rbf'); otherwise the rbf predictions would overwrite
    # the poly ones.
    y_pred = estimator.predict(X_test)
    pred_column = model_dict[model_name]
    if model_name == 'SVR':
        pred_column = 'svr_%s' % estimator.kernel
    data_pred[pred_column] = y_pred

    # Evaluate with the project helper; its result is indexed by 'ACC' below.
    print('evaluation of model-', estimator.__class__.__name__)
    scores = utils.evaluate(y_test, y_pred)

    # Plot true vs. predicted values with the project helper.
    utils.visualize(y_test, y_pred)

    # NOTE(review): the train column is the estimator's own score() while the
    # test column is utils.evaluate's 'ACC'; confirm they are the same metric
    # before comparing the two columns.
    df.loc[row_index, 'Train Accuracy Mean'] = estimator.score(X_train, y_train)
    df.loc[row_index, 'Test Accuracy Mean'] = scores['ACC']
    df.loc[row_index, 'Comsumed Time'] = time_end - time_start

# Rank the models by test score, best first, and display the table.
df = df.sort_values(by='Test Accuracy Mean', ascending=False)
df

### 保存预测结果

In [None]:
# Write the collected predictions to the result file of the selected variant.
pred_path = './energy_predict_data/predict_result/data_pred_%s.csv' % utils.getType(type)
data_pred.to_csv(pred_path, index=False)