In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from datetime import date, datetime

### 加载训练数据

In [2]:
data_root = '../energy_predict_data/train_data'
dataset = pd.read_csv(data_root + 'predict_data_e_train.csv')
count = np.max(dataset['vehicle_id'])  # max id
dataset.head()

FileNotFoundError: File b'../energy_predict_data/train_datapredict_data_e_train.csv' does not exist

### 数据类型

In [None]:
dataset.dtypes

### 数据描述

In [None]:
pd.set_option('precision', 2)
header = dataset.columns.values.tolist()
dataset.describe()

### 数据缺失情况

In [None]:
dataset.isnull().sum()

### 数据间相关性探索

In [None]:
# correlation
pd.set_option('precision', 2)
dataset.corr(method='pearson')

In [None]:
# 对不同车的数据进行相关性分析，会发现更多信息
for i in range(1, count+1):
    data = dataset[(dataset['vehicle_id']==i)]
    data = data[header[1:]]
    corr=data.corr()
    plt.figure(figsize=(10, 10))
    sns.heatmap(corr, vmax=0.8, linewidths=0.01,
                square=True,annot=True,cmap='YlGnBu',linecolor="white")
    plt.title('Correlation between features for vehicle # %d' % i);

### 增加新的特征集——充电时间间隔（charge_duration）与SOC变化值（charge_delta_soc）以及里程变化值（此次充电前该汽车行驶里程数）；

In [None]:
import time
# 将时间字符串转化为时间
def str2time(string):
    year = string[0:4]
    month = string[4:6]
    day = string[6:8]
    hour = string[8:10]
    minute = string[10:12]
    second = string[12:14]
    time_str = '%s-%s-%s %s:%s:%s' % (year, month, day, hour, minute, second)
    timestamp = time.mktime(time.strptime(time_str, '%Y-%m-%d %H:%M:%S'))  # 变为时间戳
    return int(timestamp)

# 将矩阵内的字符串转化为时间戳
def calVecTimeStamp(vector):
    timestamps = []
    for v in vector:
        timestamp = str2time(str(v))
        timestamps.append(timestamp)
    return timestamps

charge_end_time_array = np.array(calVecTimeStamp(dataset['charge_end_time']))
charge_start_time_array = np.array(calVecTimeStamp(dataset['charge_start_time']))
charge_start_soc_array = np.array(dataset['charge_start_soc'])
charge_end_soc_array = np.array(dataset['charge_end_soc'])
dataset['charge_duration'] = (charge_end_time_array - charge_start_time_array).tolist()
dataset['charge_delta_soc'] = (charge_end_soc_array - charge_start_soc_array).tolist()
header.insert(3, 'charge_duration')
header.insert(7, 'charge_delta_soc')
dataset = dataset[header]

In [None]:
# correlation
pd.set_option('precision', 2)
dataset.corr(method='pearson')

In [None]:
# 对不同车的数据进行相关性分析，会发现更多信息
import copy;
header_v1=copy.deepcopy(header)  # 深拷贝
#header_v1.remove('vehicle_id')
header_v1.insert(5, 'delta_mileage')

for i in range(1, count+1):
    data = dataset[(dataset['vehicle_id']==i)]
    data = data[header]
    mileage_list = data['mileage'].tolist()
    # 增加里程充电前汽车行驶里程数
    delta_mileage_list = [0] * len(mileage_list)
    for j in range(1, len(mileage_list)):
        delta_mileage = mileage_list[j] - mileage_list[j-1]
        delta_mileage_list[j] = delta_mileage
    data['delta_mileage'] = delta_mileage_list
    data = data[header_v1]
    corr=data.corr()
    plt.figure(figsize=(10, 10))
    sns.heatmap(corr, vmax=0.8, linewidths=0.01,
                square=True,annot=True,cmap='YlGnBu',linecolor="white")
    plt.title('Correlation between features for vehicle # %d' % i);
    
    # 保存data
    data.to_csv(data_root + "predict_data_e_train_%d.csv" % i, index=False)

> 1. 增加新的三列之后，发现charge_energy与charge_duration、charge_start_soc、charge_delta_soc、charge_start_U、charge_start_I具有较强的相关性。
    下面分别记录了不同车的与charge_energy相关性大于0.5的特征（按照由大到小排序，只考虑大小，不考虑正负）。

> 2. 对不同的车进行相关性分析，某些相关性会变强

    
    | No1    | No2    | No3    | No4    | No5    |
    | ------ | ------ | ------ | ------ | ------ |
    | charge_duration(0.88) | charge_delta_soc(1)   | charge_start_U(0.77)   | charge_start_soc(0.96) | charge_start_soc(0.86) |
    | charge_start_U(0.81)  | charge_end_soc(0.75)  | charge_delta_soc(0.75) | charge_duration(0.93) | charge_delta_soc(0.61) |
    | charge_delta_soc(0.76)| charge_end_U(0.74)    | charge_start_soc(0.63) | delta_mileage(0.76) | charge_start_U(0.57) |
    | charge_start_I(0.68)  | charge_duration(0.73) |                        | charge_delta_soc(0.71) |  |
    | charge_start_soc(0.65)| charge_start_U(0.58)  | | charge_start_I(0.66)| |
    |                       | charge_start_soc(0.51)| | charge_start_U(9.54) | |

### 数据可视化

#### 充电能量分布图

In [None]:
from scipy import stats
for i in range(1, count+1):
    NONE_VIN = (dataset["charge_energy"].notnull() & (dataset['vehicle_id']==i))
    fig = plt.figure()
    sns.distplot(dataset[NONE_VIN]['charge_energy']);
    fig = plt.figure()
    res = stats.probplot(dataset[NONE_VIN]['charge_energy'], plot=plt)

> 由上图可以发现，充电能量分布不均衡，并且0附近的值占比较大，分析数据可得这是由于充电时间过短造成的，而这将很大程度上会影响最终模型的表现，因此，对于建议**这类数据应视为一种“异常”，应该给予特殊处理**。

#### 充电能量低数据集

In [None]:
dataset[dataset['charge_energy'] < 1].head()

> 对这些充电能量较低的数据分析可得，充电能量低是由于充电时间较短造成的。

#### 不同车辆记录数直方图

In [None]:
sns.countplot(x='vehicle_id', data=dataset)

#### 对于每一列数据其值的分布情况直方图

In [None]:
# histograms
for i in range(1, count+1):
    dataset[dataset['vehicle_id']==i].hist(bins=20,figsize=(12,10),grid=False);

不同车辆的里程数与充电能量的关系

In [None]:
_, ax = plt.subplots(2, 3)
for id in range(0, count+1):
    row = int(id / 3)
    col = int(id % 3)
    dataset[dataset['vehicle_id']==id+1]['mileage'].plot(ax=ax[row][col], figsize=(9, 6))
plt.show()

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['mileage'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

不同车辆充电开始SOC与充电能量的关系

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_start_soc'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_end_soc'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_delta_soc'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

不同车辆充电开始、结束电压与充电能量的关系

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_start_U'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_end_U'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

不同车辆充电开始、结束电流与充电能量的关系

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_start_I'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

In [None]:
for id in range(0, 5):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_end_I'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

不同车辆充电充电过程中温度与充电能量的关系

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_max_temp'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

In [None]:
for id in range(0, count+1):
    plt.subplot(2,3,id+1)
    sns.regplot(dataset[dataset['vehicle_id']==id+1]['charge_min_temp'], dataset[dataset['vehicle_id']==id+1]['charge_energy'])
plt.show()

### 结论
* 每辆车的数据都有相应的分布，所以建立的模型要针对不同的电动汽车。
* 虽然每辆车的数据分布不同，但是充电能量的影响因素大体上来说是一致的。即充电能量与充电时间、充电开始能量、充电开始电压等因素息息相关。