In [80]:
import pandas as pd
import numpy as np
from pylab import rcParams
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [39]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
rcParams['figure.figsize']=5, 4 # default figure size as (width, height), in inches (not pixels)
# Use seaborn's white background with grid lines for every plot.
sns.set_style('whitegrid')

In [124]:
# Load the air-quality table from the Excel workbook (path is relative to the working directory).
data = pd.read_excel("AirQualityUCI.xlsx")

# Sanity check: dimensions first, then the column labels as a plain list.
print(data.shape)
print(data.columns.tolist())

(9357, 15)
['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']


In [125]:
# Preview the first five rows of the loaded table.
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


# 数据清洗

In [123]:
# Drop every row that holds the invalid-value marker in any measurement column,
# one column at a time, printing the remaining shape after each step so the
# cost of each column is visible.
#
# The result is built in a separate frame instead of overwriting `data`:
# `data` keeps the rows exactly as loaded, so this cell can be re-run safely
# and later cells that start from `data` do not see an already-filtered table.
INVALID_MARKER = -200  # value this notebook treats as an invalid reading

data_complete = data
# The first two columns (Date, Time) are not measurements, so start at index 2.
for col in data.columns[2:]:
    data_complete = data_complete[data_complete[col] != INVALID_MARKER]
    print("after removing %s:" % (col))
    print(data_complete.shape)

after removing CO(GT):
(7674, 15)
after removing PT08.S1(CO):
(7344, 15)
after removing NMHC(GT):
(863, 15)
after removing C6H6(GT):
(863, 15)
after removing PT08.S2(NMHC):
(863, 15)
after removing NOx(GT):
(827, 15)
after removing PT08.S3(NOx):
(827, 15)
after removing NO2(GT):
(827, 15)
after removing PT08.S4(NO2):
(827, 15)
after removing PT08.S5(O3):
(827, 15)
after removing T:
(827, 15)
after removing RH:
(827, 15)
after removing AH:
(827, 15)


### 问题出在NMHC(GT)这一列中有太多无效数据
### 解决方案：
1. 回归时不用这个自变量
2. 就用这827个数据回归


### 以下为不去除NMHC(GT)这一列中无效值的数据

In [128]:
# NOTE: `data` must still hold the unfiltered rows from the load cell when this
# cell runs. If an earlier cell has already filtered `data`, re-run the load
# cell first.
#
# Drop rows holding the invalid-value marker (-200) in every measurement column
# except NMHC(GT). That column is skipped by label (not by position) because it
# contains too many invalid readings to filter on without losing most rows.
SKIPPED_COLUMN = 'NMHC(GT)'

data1 = data
# The first two columns (Date, Time) are not measurements, so start at index 2.
for col in data1.columns[2:]:
    if col == SKIPPED_COLUMN:
        continue
    data1 = data1[data1[col] != -200]
    print("after removing %s:" % (col))
    print(data1.shape)

after removing CO(GT):
(6941, 15)
after removing PT08.S1(CO):
(6941, 15)
after removing C6H6(GT):
(6941, 15)
after removing PT08.S2(NMHC):
(6941, 15)
after removing NOx(GT):
(6941, 15)
after removing PT08.S3(NOx):
(6941, 15)
after removing NO2(GT):
(6941, 15)
after removing PT08.S4(NO2):
(6941, 15)
after removing PT08.S5(O3):
(6941, 15)
after removing T:
(6941, 15)
after removing RH:
(6941, 15)
after removing AH:
(6941, 15)


In [129]:
# Preview the first five rows of the filtered table.
data1.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


# CO观测值预测真实值

In [130]:
# Predictors: every measurement column except the target CO(GT) and NMHC(GT),
# which is left out because its invalid readings were not filtered from data1.
features = ['PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)','T', 'RH', 'AH']
X = data1[features]
# Target: the CO(GT) column.
y = data1['CO(GT)']

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the fitted coefficients and the reported scores are the same
# on every run instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [131]:
# Linear regression model with the library's default settings.
regr = LinearRegression()

# Fit on the training split. The call is kept as the cell's last expression
# so the fitted estimator is what the cell displays.
regr.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [132]:
# Make predictions on the testing set.
y_pred = regr.predict(X_test)

In [133]:
# Fitted coefficients: one value per entry of `features`, in the same order.
regr.coef_

array([ 0.00134937,  0.12109229, -0.00023792,  0.00142454,  0.00025662,
        0.00239522,  0.000527  , -0.00030643, -0.01978414, -0.0039601 ,
        0.00181759])

# 预测效果

In [134]:
# RMSE: square root of the mean squared error between the test targets and the predictions.
np.sqrt(mean_squared_error(y_test, y_pred))

0.4266789863172098

In [136]:
# R^2 (coefficient of determination) of the predictions on the test set.
r2_score(y_test, y_pred)

0.9084506105393074