In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from scipy import optimize 

from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

from pandas_profiling import ProfileReport
%matplotlib inline

### Load data frame

In [314]:
air_df = pd.read_excel("Air Quality Data Set/AirQualityUCI.xlsx")
air_df.head(5).T

Unnamed: 0,0,1,2,3,4
Datetime,2004-03-10 18:00:00,2004-03-10 19:00:00,2004-03-10 20:00:00,2004-03-10 21:00:00,2004-03-10 22:00:00
CO(GT),2.6,2,2.2,2.2,1.6
PT08.S1(CO),1360,1292.25,1402,1375.5,1272.25
NMHC(GT),150,112,88,80,51
C6H6(GT),11.8817,9.39716,8.99782,9.2288,6.51822
NOx(GT),166,103,131,172,131
PT08.S3(NOx),1056.25,1173.75,1140,1092,1205
NO2(GT),113,92,114,122,116
PT08.S4(NO2),1692,1558.75,1554.5,1583.75,1490
PT08.S5(O3),1267.5,972.25,1074,1203.25,1110


In [4]:
air_df.shape

(9357, 15)

In [5]:
air_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 15 columns):
Date             9357 non-null datetime64[ns]
Time             9357 non-null object
CO(GT)           9357 non-null float64
PT08.S1(CO)      9357 non-null float64
NMHC(GT)         9357 non-null int64
C6H6(GT)         9357 non-null float64
PT08.S2(NMHC)    9357 non-null float64
NOx(GT)          9357 non-null float64
PT08.S3(NOx)     9357 non-null float64
NO2(GT)          9357 non-null float64
PT08.S4(NO2)     9357 non-null float64
PT08.S5(O3)      9357 non-null float64
T                9357 non-null float64
RH               9357 non-null float64
AH               9357 non-null float64
dtypes: datetime64[ns](1), float64(12), int64(1), object(1)
memory usage: 1.1+ MB


### Drop NaN parametrs

In [6]:
air_df.drop("PT08.S2(NMHC)", axis = 1, inplace = True)
air_df.head().T

Unnamed: 0,0,1,2,3,4
Date,2004-03-10 00:00:00,2004-03-10 00:00:00,2004-03-10 00:00:00,2004-03-10 00:00:00,2004-03-10 00:00:00
Time,18:00:00,19:00:00,20:00:00,21:00:00,22:00:00
CO(GT),2.6,2,2.2,2.2,1.6
PT08.S1(CO),1360,1292.25,1402,1375.5,1272.25
NMHC(GT),150,112,88,80,51
C6H6(GT),11.8817,9.39716,8.99782,9.2288,6.51822
NOx(GT),166,103,131,172,131
PT08.S3(NOx),1056.25,1173.75,1140,1092,1205
NO2(GT),113,92,114,122,116
PT08.S4(NO2),1692,1558.75,1554.5,1583.75,1490


In [7]:
air_df.Time = air_df.Time.apply(lambda x: str(x))
air_df.Date = air_df.Date.apply(lambda x: str(x)[:10] + " ")
air_df.Date = pd.to_datetime(air_df.Date + air_df.Time)
air_df.drop("Time", axis = 1, inplace = True)
air_df.head().T

Unnamed: 0,0,1,2,3,4
Date,2004-03-10 18:00:00,2004-03-10 19:00:00,2004-03-10 20:00:00,2004-03-10 21:00:00,2004-03-10 22:00:00
CO(GT),2.6,2,2.2,2.2,1.6
PT08.S1(CO),1360,1292.25,1402,1375.5,1272.25
NMHC(GT),150,112,88,80,51
C6H6(GT),11.8817,9.39716,8.99782,9.2288,6.51822
NOx(GT),166,103,131,172,131
PT08.S3(NOx),1056.25,1173.75,1140,1092,1205
NO2(GT),113,92,114,122,116
PT08.S4(NO2),1692,1558.75,1554.5,1583.75,1490
PT08.S5(O3),1267.5,972.25,1074,1203.25,1110


In [11]:
air_df[air_df != -200].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 13 columns):
Date            9357 non-null datetime64[ns]
CO(GT)          7674 non-null float64
PT08.S1(CO)     8991 non-null float64
NMHC(GT)        914 non-null float64
C6H6(GT)        8991 non-null float64
NOx(GT)         7718 non-null float64
PT08.S3(NOx)    8991 non-null float64
NO2(GT)         7715 non-null float64
PT08.S4(NO2)    8991 non-null float64
PT08.S5(O3)     8991 non-null float64
T               8991 non-null float64
RH              8991 non-null float64
AH              8991 non-null float64
dtypes: datetime64[ns](1), float64(12)
memory usage: 950.4 KB


In [12]:
air_df.drop("NMHC(GT)", axis = 1, inplace = True)
air_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9357 entries, 0 to 9356
Data columns (total 12 columns):
Date            9357 non-null datetime64[ns]
CO(GT)          9357 non-null float64
PT08.S1(CO)     9357 non-null float64
C6H6(GT)        9357 non-null float64
NOx(GT)         9357 non-null float64
PT08.S3(NOx)    9357 non-null float64
NO2(GT)         9357 non-null float64
PT08.S4(NO2)    9357 non-null float64
PT08.S5(O3)     9357 non-null float64
T               9357 non-null float64
RH              9357 non-null float64
AH              9357 non-null float64
dtypes: datetime64[ns](1), float64(11)
memory usage: 877.3 KB


In [35]:
air_df = air_df[air_df["C6H6(GT)"] != -200]

In [36]:
air_df[air_df == -200].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8991 entries, 0 to 9356
Data columns (total 12 columns):
Date            0 non-null datetime64[ns]
CO(GT)          1647 non-null float64
PT08.S1(CO)     0 non-null float64
C6H6(GT)        0 non-null float64
NOx(GT)         1595 non-null float64
PT08.S3(NOx)    0 non-null float64
NO2(GT)         1598 non-null float64
PT08.S4(NO2)    0 non-null float64
PT08.S5(O3)     0 non-null float64
T               0 non-null float64
RH              0 non-null float64
AH              0 non-null float64
dtypes: datetime64[ns](1), float64(11)
memory usage: 913.1 KB


In [46]:
data_cogt_neg = ["NOx(GT)", "NO2(GT)", "C6H6(GT)"]
data_noxgt_neg = ["CO(GT)", "NO2(GT)", "C6H6(GT)"]
data_no2gt_neg = ["NOx(GT)", "CO(GT)", "C6H6(GT)"]

In [47]:
cogt_df = air_df.drop(data_cogt_neg, axis = 1)
noxgt_df = air_df.drop(data_noxgt_neg, axis = 1)
no2gt_df = air_df.drop(data_no2gt_neg, axis = 1)

In [48]:
cogt_df.info(), noxgt_df.info(), no2gt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8991 entries, 0 to 9356
Data columns (total 9 columns):
Date            8991 non-null datetime64[ns]
CO(GT)          8991 non-null float64
PT08.S1(CO)     8991 non-null float64
PT08.S3(NOx)    8991 non-null float64
PT08.S4(NO2)    8991 non-null float64
PT08.S5(O3)     8991 non-null float64
T               8991 non-null float64
RH              8991 non-null float64
AH              8991 non-null float64
dtypes: datetime64[ns](1), float64(8)
memory usage: 702.4 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8991 entries, 0 to 9356
Data columns (total 9 columns):
Date            8991 non-null datetime64[ns]
PT08.S1(CO)     8991 non-null float64
NOx(GT)         8991 non-null float64
PT08.S3(NOx)    8991 non-null float64
PT08.S4(NO2)    8991 non-null float64
PT08.S5(O3)     8991 non-null float64
T               8991 non-null float64
RH              8991 non-null float64
AH              8991 non-null float64
dtypes: datetime64[ns](1), fl

(None, None, None)

# Prepare for series which has a negative values(outliers)

#### First series CO(GT)

In [49]:
cogt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8991 entries, 0 to 9356
Data columns (total 9 columns):
Date            8991 non-null datetime64[ns]
CO(GT)          8991 non-null float64
PT08.S1(CO)     8991 non-null float64
PT08.S3(NOx)    8991 non-null float64
PT08.S4(NO2)    8991 non-null float64
PT08.S5(O3)     8991 non-null float64
T               8991 non-null float64
RH              8991 non-null float64
AH              8991 non-null float64
dtypes: datetime64[ns](1), float64(8)
memory usage: 702.4 KB


In [54]:
cogt_df.reset_index(inplace=True)

#### pre-process CO(GT) series

In [55]:
x_test = cogt_df[cogt_df["CO(GT)"] == -200]
pos = list(x_test.index.values)
cogt_df.drop(cogt_df.index[pos], axis = 0, inplace = True)

x_train = cogt_df.drop("Date", axis = 1)
x_test = x_test.drop(["Date", "CO(GT)"], axis = 1)

y_train = x_train.pop("CO(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

#### Build predict model for NaN values

In [270]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

model = Ridge(alpha = 0.00000001, random_state = 10, normalize = True, solver = 'lsqr')
# model = RandomForestRegressor(n_jobs = -1, n_estimators = 150, max_depth = 5, max_features = 'sqrt')
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [273]:
len(pred), pred[:4], type(pred)

(1647, array([0.75881164, 0.32358214, 5.47291111, 1.87684306]), numpy.ndarray)

#### Change NaN's to predicted values

In [None]:
ind = 0
for i in pos:
    air_df.loc[i, "CO(GT)"] = pred[ind]
    ind += 1

air_df.head(60)

#### Pre-processing NOx(GT) series

In [279]:
x_test = noxgt_df[noxgt_df["NOx(GT)"] == -200]
pos = list(x_test.index.values)
noxgt_df.drop(noxgt_df.index[pos], axis = 0, inplace = True)

x_train = noxgt_df.drop("Date", axis = 1)
x_test = x_test.drop(["Date", "NOx(GT)"], axis = 1)

y_train = x_train.pop("NOx(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

#### Build predict model NOx(GT) series

In [289]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

# model = Ridge(alpha = 0.00000001, random_state = 10, normalize = True, solver = 'lsqr')
model = RandomForestRegressor(n_jobs = -1, n_estimators = 50, max_depth = 10, max_features = 'sqrt')
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [290]:
pred[:4]

array([ 57.31454304,  50.66126424, 317.07711181, 165.08146484])

In [282]:
air_df.head(60)

Unnamed: 0,Date,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,2004-03-10 23:00:00,1.2,1197.0,4.741012,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
6,2004-03-11 00:00:00,1.2,1185.0,3.624399,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
7,2004-03-11 01:00:00,1.0,1136.25,3.326677,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238
8,2004-03-11 02:00:00,0.9,1094.0,2.339416,45.0,1579.0,60.0,1276.0,619.5,10.65,59.674999,0.764819
9,2004-03-11 03:00:00,0.6,1009.75,1.696658,-200.0,1705.0,-200.0,1234.75,501.25,10.25,60.200001,0.751657


#### Change NaN's to predicted values

In [291]:
ind = 0
for i in pos:
    air_df.loc[i, "NOx(GT)"] = pred[ind]
    ind += 1

air_df.head(60)

Unnamed: 0,Date,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,2004-03-10 23:00:00,1.2,1197.0,4.741012,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
6,2004-03-11 00:00:00,1.2,1185.0,3.624399,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
7,2004-03-11 01:00:00,1.0,1136.25,3.326677,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238
8,2004-03-11 02:00:00,0.9,1094.0,2.339416,45.0,1579.0,60.0,1276.0,619.5,10.65,59.674999,0.764819
9,2004-03-11 03:00:00,0.6,1009.75,1.696658,57.314543,1705.0,-200.0,1234.75,501.25,10.25,60.200001,0.751657


#### Pre-processing NO2(GT) series

In [292]:
no2gt_df.reset_index(inplace=True)

In [293]:
x_test = no2gt_df[no2gt_df["NO2(GT)"] == -200]
pos = list(x_test.index.values)
no2gt_df.drop(no2gt_df.index[pos], axis = 0, inplace = True)

x_train = no2gt_df.drop("Date", axis = 1)
x_test = x_test.drop(["Date", "NO2(GT)"], axis = 1)

y_train = x_train.pop("NO2(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

#### Build predict model NO2(GT) series

In [301]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

model = Ridge(alpha = 0.000000001, random_state = 10, normalize = True, solver = 'lsqr')
# model = RandomForestRegressor(n_jobs = -1, n_estimators = 50, max_depth = 10, max_features = 'sqrt')
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 10, random_state = 10)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators = 300, max_depth = 10, random_state = 10)

model.fit(x_train, y_train)

pred = model.predict(x_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_jobs = -1, n_estimators = 50, max_depth = 10, max_features = 'sqrt')

model.fit(x_train, y_train)

pred = model.predict(x_test)

In [295]:
pred[:4] # Ridge

array([ 43.47957959,  45.2816857 , 179.9103468 , 100.25394214])

In [297]:
pred[:4] # RF

array([ 73.0747825 ,  74.04123427, 154.75535046, 105.2414596 ])

In [299]:
pred[:4] # GB

array([ 74.30900731,  77.9914662 , 154.98513216, 100.49850113])

In [300]:
air_df.head(60)

Unnamed: 0,Date,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,2004-03-10 23:00:00,1.2,1197.0,4.741012,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
6,2004-03-11 00:00:00,1.2,1185.0,3.624399,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
7,2004-03-11 01:00:00,1.0,1136.25,3.326677,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238
8,2004-03-11 02:00:00,0.9,1094.0,2.339416,45.0,1579.0,60.0,1276.0,619.5,10.65,59.674999,0.764819
9,2004-03-11 03:00:00,0.6,1009.75,1.696658,57.314543,1705.0,-200.0,1234.75,501.25,10.25,60.200001,0.751657


#### Switching NaN's to predicted values

In [302]:
ind = 0
for i in pos:
    air_df.loc[i, "NO2(GT)"] = pred[ind]
    ind += 1

air_df.head(60)

Unnamed: 0,Date,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,2004-03-10 23:00:00,1.2,1197.0,4.741012,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
6,2004-03-11 00:00:00,1.2,1185.0,3.624399,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
7,2004-03-11 01:00:00,1.0,1136.25,3.326677,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238
8,2004-03-11 02:00:00,0.9,1094.0,2.339416,45.0,1579.0,60.0,1276.0,619.5,10.65,59.674999,0.764819
9,2004-03-11 03:00:00,0.6,1009.75,1.696658,57.314543,1705.0,43.479578,1234.75,501.25,10.25,60.200001,0.751657


## Last step, predict target series C6H6(GT)

In [306]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.cross_validation import train_test_split
air_df.columns

Index(['Date', 'CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'NOx(GT)', 'PT08.S3(NOx)',
       'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH'],
      dtype='object')

In [313]:
target = air_df.drop(["Date", "CO(GT)", "PT08.S1(CO)","NOx(GT)","PT08.S3(NOx)","NO2(GT)","PT08.S4(NO2)","PT08.S5(O3)","T","RH","AH"], axis = 1)
features = air_df.drop(["C6H6(GT)"], axis = 1)

feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size = 0.75, random_state=42)
# model = GradientBoostingRegressor(n_estimators = 200, loss = 'ls', max_depth = 10, learning_rate = 0.5, random_state = 10)
model = Ridge(alpha = 0.000000001, random_state = 10, normalize = True, solver = 'lsqr')
model.fit(feature_train, target_train)


TypeError: float() argument must be a string or a number, not 'Timestamp'