## Add libraries

In [771]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import optimize 

from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet
from catboost import CatBoostRegressor
import xgboost as xgb


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

from pandas_profiling import ProfileReport
%matplotlib inline

### Load train dataset

In [772]:
# air_df = pd.read_excel("Air Quality Data Set/AirQualityUCI.xlsx")
air_df = pd.read_csv("train.csv")
air_df.head(5).T

Unnamed: 0,0,1,2,3,4
Datetime,2004-03-10 18:00:00,2004-03-10 19:00:00,2004-03-10 20:00:00,2004-03-10 21:00:00,2004-03-10 22:00:00
CO(GT),2.6,2,2.2,2.2,1.6
PT08.S1(CO),1360,1292.25,1402,1375.5,1272.25
NMHC(GT),150,112,88,80,51
C6H6(GT),11.8817,9.39716,8.99782,9.2288,6.51822
NOx(GT),166,103,131,172,131
PT08.S3(NOx),1056.25,1173.75,1140,1092,1205
NO2(GT),113,92,114,122,116
PT08.S4(NO2),1692,1558.75,1554.5,1583.75,1490
PT08.S5(O3),1267.5,972.25,1074,1203.25,1110


In [773]:
air_df.shape

(6218, 13)

#### Remove rows where is elements has -200 ~ NaN

In [774]:
air_df = air_df[air_df["C6H6(GT)"] != -200]

#### Get info about series with -200 

In [775]:
air_df[air_df == -200].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6218 entries, 0 to 6217
Data columns (total 13 columns):
Datetime        0 non-null object
CO(GT)          1447 non-null float64
PT08.S1(CO)     0 non-null float64
NMHC(GT)        5331 non-null float64
C6H6(GT)        0 non-null float64
NOx(GT)         1406 non-null float64
PT08.S3(NOx)    0 non-null float64
NO2(GT)         1409 non-null float64
PT08.S4(NO2)    0 non-null float64
PT08.S5(O3)     0 non-null float64
T               0 non-null float64
RH              0 non-null float64
AH              0 non-null float64
dtypes: float64(12), object(1)
memory usage: 680.1+ KB


#### Remove highest frequency sets with -200

In [776]:
air_df.drop("NMHC(GT)", axis = 1, inplace = True)
air_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6218 entries, 0 to 6217
Data columns (total 12 columns):
Datetime        6218 non-null object
CO(GT)          6218 non-null float64
PT08.S1(CO)     6218 non-null float64
C6H6(GT)        6218 non-null float64
NOx(GT)         6218 non-null int64
PT08.S3(NOx)    6218 non-null float64
NO2(GT)         6218 non-null int64
PT08.S4(NO2)    6218 non-null float64
PT08.S5(O3)     6218 non-null float64
T               6218 non-null float64
RH              6218 non-null float64
AH              6218 non-null float64
dtypes: float64(9), int64(2), object(1)
memory usage: 631.5+ KB


## Preprocessing
#### Three sets with NaN values

In [777]:
data_cogt_neg = ["NOx(GT)", "NO2(GT)", "C6H6(GT)"]
data_noxgt_neg = ["CO(GT)", "NO2(GT)", "C6H6(GT)"]
data_no2gt_neg = ["NOx(GT)", "CO(GT)", "C6H6(GT)"]

In [778]:
cogt_df = air_df.drop(data_cogt_neg, axis = 1)
noxgt_df = air_df.drop(data_noxgt_neg, axis = 1)
no2gt_df = air_df.drop(data_no2gt_neg, axis = 1)

In [779]:
cogt_df.reset_index(inplace = True)
noxgt_df.reset_index(inplace = True)
no2gt_df.reset_index(inplace = True)

### Preprocessing CO(GT) series

In [780]:
x_test = cogt_df[cogt_df["CO(GT)"] == -200]
pos = list(x_test.index.values)
cogt_df.drop(cogt_df.index[pos], axis = 0, inplace = True)

x_train = cogt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "CO(GT)"], axis = 1)

y_train = x_train.pop("CO(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

In [781]:
x_test.head()

Unnamed: 0,index,PT08.S1(CO),PT08.S3(NOx),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
10,-2.104603,-0.271194,3.962927,-1.415355,-1.462411,-1.711584,0.655672,-1.58234
34,-2.089324,-1.143585,4.272779,-1.649089,-1.637446,-2.260971,0.977237,-1.953135
39,-2.086141,2.314476,-0.361625,1.779941,1.80252,-1.829063,0.402273,-1.857767
58,-2.074044,0.387946,1.141314,-0.61769,0.098612,-2.136581,1.283983,-1.675894
82,-2.058765,0.30313,1.133078,-0.469287,0.267216,-1.431708,0.692719,-1.235665


### Build predicting model

In [782]:
rng = np.random.RandomState(1)
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_estimators=500, max_features=5, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor(depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

### Switch NaN values to predicted

In [783]:
ind = 0
for i in pos:
    air_df.loc[i, "CO(GT)"] = pred[ind]
    ind += 1

air_df.head()

Unnamed: 0,Datetime,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166,1056.25,113,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103,1173.75,92,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131,1140.0,114,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172,1092.0,122,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131,1205.0,116,1490.0,1110.0,11.15,59.575001,0.788794


### Preprocessing NOx(GT) series

In [784]:
x_test = noxgt_df[noxgt_df["NOx(GT)"] == -200]
pos = list(x_test.index.values)
noxgt_df.drop(noxgt_df.index[pos], axis = 0, inplace = True)

x_train = noxgt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "NOx(GT)"], axis = 1)

y_train = x_train.pop("NOx(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

### Build predicting model

In [786]:
rng = np.random.RandomState(1)
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_jobs = -1, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor( depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

### Switch NaN(-200) values to predicted

In [787]:
ind = 0
for i in pos:
    air_df.loc[i, "NOx(GT)"] = pred[ind]
    ind += 1

air_df.head()

Unnamed: 0,Datetime,CO(GT),PT08.S1(CO),C6H6(GT),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10 18:00:00,2.6,1360.0,11.881723,166.0,1056.25,113,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10 19:00:00,2.0,1292.25,9.397165,103.0,1173.75,92,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10 20:00:00,2.2,1402.0,8.997817,131.0,1140.0,114,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10 21:00:00,2.2,1375.5,9.228796,172.0,1092.0,122,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10 22:00:00,1.6,1272.25,6.518224,131.0,1205.0,116,1490.0,1110.0,11.15,59.575001,0.788794


### Preprocessing NO2(GT) series

In [788]:
x_test = no2gt_df[no2gt_df["NO2(GT)"] == -200]
pos = list(x_test.index.values)
no2gt_df.drop(no2gt_df.index[pos], axis = 0, inplace = True)

x_train = no2gt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "NO2(GT)"], axis = 1)

y_train = x_train.pop("NO2(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

### Build predicting model

In [789]:
rng = np.random.RandomState(1)
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_jobs = -1, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 10, random_state = 10)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor(depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

### Switch NaN values to predicted

In [790]:
ind = 0
for i in pos:
    air_df.loc[i, "NO2(GT)"] = pred[ind]
    ind += 1


### Checkout dataset to -200 

In [791]:
air_df[air_df == -200].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6218 entries, 0 to 6217
Data columns (total 12 columns):
Datetime        0 non-null object
CO(GT)          0 non-null float64
PT08.S1(CO)     0 non-null float64
C6H6(GT)        0 non-null float64
NOx(GT)         0 non-null float64
PT08.S3(NOx)    0 non-null float64
NO2(GT)         0 non-null float64
PT08.S4(NO2)    0 non-null float64
PT08.S5(O3)     0 non-null float64
T               0 non-null float64
RH              0 non-null float64
AH              0 non-null float64
dtypes: float64(11), object(1)
memory usage: 791.5+ KB


### Separating dataset

In [792]:
X = air_df[["CO(GT)", "PT08.S1(CO)","NOx(GT)","PT08.S3(NOx)","NO2(GT)","PT08.S4(NO2)","PT08.S5(O3)","T","RH","AH"]].values
y = air_df[["C6H6(GT)"]].values

### Get informationg about models

In [793]:
tscv = TimeSeriesSplit(n_splits=20)
for train_index, test_index in tscv.split(X):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#     model = GradientBoostingRegressor(n_estimators = 300, max_depth = 3)
#     model = Ridge(alpha=0.000001, random_state=10)
#     model = CatBoostRegressor(depth = 7)
    model = ElasticNet(alpha = 0.41, random_state = 10)
#     model = RandomForestRegressor(n_jobs = -1)
#     model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
#     model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    
    print(np.sqrt(mean_squared_error(pred, y_test)))

1.9567726709193172
1.0317066296387565
1.5388812732881871
1.115024473860909
1.0582460432197596
2.5923392904149782
1.3530518427718374
1.2552494426687966
1.3560445765701414
1.5316503746840755
1.2986068389423437
0.9268477976755441
1.7267207645325584
2.2082366233962536
1.952563990836469
1.83441015608291
1.8190960601106854
2.2227327177476823
3.0317215839299196
3.581207677298431


### Train final model for predicting

In [794]:
rng = np.random.RandomState(1)
# final_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
# final_model = Ridge(alpha=0.000001, random_state=10)
# final_model = DecisionTreeRegressor(max_depth = 4, random_state=rng)
# final_model = LinearRegression(n_jobs = -1)
# final_model = RandomForestRegressor(DecisionTreeRegressor(max_depth=4), n_estimators = 300, n_jobs = -1, random_state=rng)
final_model = ElasticNet(alpha = 0.5, random_state = rng)
final_model.fit(X, y)

ElasticNet(alpha=0.5, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=<mtrand.RandomState object at 0x118d819d8>,
      selection='cyclic', tol=0.0001, warm_start=False)

### Load test dataset

In [795]:
air_df = pd.read_csv("test.csv")

## Repeat all previous steps 

In [796]:
air_df.drop("NMHC(GT)", axis = 1, inplace = True)

In [797]:
air_df.head()

Unnamed: 0,Datetime,CO(GT),PT08.S1(CO),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-12-01 00:00:00,2.2,1039.25,273.0,1197.75,113.0,1251.0,933.0,9.8,79.400002,0.962598
1,2004-12-01 01:00:00,1.3,885.5,157.0,1900.0,94.0,1106.0,765.5,10.125,75.275,0.93223
2,2004-12-01 02:00:00,1.2,900.25,117.0,1955.25,76.0,1126.75,748.75,9.15,80.049999,0.929878
3,2004-12-01 03:00:00,0.9,855.5,-200.0,2042.25,-200.0,1100.25,708.5,8.7,82.424999,0.929447
4,2004-12-01 04:00:00,-200.0,856.5,123.0,2061.0,68.0,1082.5,746.0,8.475,81.675001,0.907381


In [798]:
air_df.reset_index(inplace = True)
data_cogt_neg = ["NOx(GT)", "NO2(GT)"]#, "C6H6(GT)"]
data_noxgt_neg = ["CO(GT)", "NO2(GT)"]#, "C6H6(GT)"]
data_no2gt_neg = ["NOx(GT)", "CO(GT)"]#, "C6H6(GT)"]

In [799]:
cogt_df = air_df.drop(data_cogt_neg, axis = 1)
noxgt_df = air_df.drop(data_noxgt_neg, axis = 1)
no2gt_df = air_df.drop(data_no2gt_neg, axis = 1)

cogt_df.reset_index(inplace = True)
noxgt_df.reset_index(inplace = True)
no2gt_df.reset_index(inplace = True)

In [800]:
x_test = cogt_df[cogt_df["CO(GT)"] == -200]
pos = list(x_test.index.values)
cogt_df.drop(cogt_df.index[pos], axis = 0, inplace = True)

x_train = cogt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "CO(GT)"], axis = 1)

y_train = x_train.pop("CO(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

In [801]:
rng = np.random.RandomState(1)
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_jobs = -1, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor(depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [802]:
ind = 0
for i in pos:
    air_df.loc[i, "CO(GT)"] = pred[ind]
    ind += 1

In [803]:
x_test = no2gt_df[no2gt_df["NO2(GT)"] == -200]
pos = list(x_test.index.values)
no2gt_df.drop(no2gt_df.index[pos], axis = 0, inplace = True)

x_train = no2gt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "NO2(GT)"], axis = 1)

y_train = x_train.pop("NO2(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

In [804]:
rng = np.random.RandomState(1)
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_jobs = -1, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor(depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [805]:
ind = 0
for i in pos:
    air_df.loc[i, "NO2(GT)"] = pred[ind]
    ind += 1

In [806]:
x_test = noxgt_df[noxgt_df["NOx(GT)"] == -200]
pos = list(x_test.index.values)
noxgt_df.drop(noxgt_df.index[pos], axis = 0, inplace = True)

x_train = noxgt_df.drop("Datetime", axis = 1)
x_test = x_test.drop(["Datetime", "NOx(GT)"], axis = 1)

y_train = x_train.pop("NOx(GT)")

x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_test.mean()) / x_test.std()

In [807]:
rng = np.random.RandomState(1)
# x_test = air_df[["CO(GT)","PT08.S1(CO)", "NOx(GT)","PT08.S3(NOx)","NO2(GT)","PT08.S4(NO2)","PT08.S5(O3)","T","RH","AH"]]
# model = Ridge(alpha = 0.0000001, random_state = 10, solver = 'saga')
model = RandomForestRegressor(n_jobs = -1, random_state=rng)
# model = GradientBoostingRegressor(n_estimators = 300, max_depth = 7, random_state = 15)
# model = ElasticNet(alpha = 0.2, random_state = 10)
# model = CatBoostRegressor(depth = 7)
# model = xgb.XGBRegressor()
# model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),n_estimators=300, random_state=rng)
model.fit(x_train, y_train)

pred = model.predict(x_test)

In [808]:
ind = 0
for i in pos:
    air_df.loc[i, "NOx(GT)"] = pred[ind]
    ind += 1

In [809]:
air_df[air_df == -200].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2773 entries, 0 to 2772
Data columns (total 12 columns):
index           0 non-null float64
Datetime        0 non-null object
CO(GT)          0 non-null float64
PT08.S1(CO)     0 non-null float64
NOx(GT)         0 non-null float64
PT08.S3(NOx)    0 non-null float64
NO2(GT)         0 non-null float64
PT08.S4(NO2)    0 non-null float64
PT08.S5(O3)     0 non-null float64
T               0 non-null float64
RH              0 non-null float64
AH              0 non-null float64
dtypes: float64(11), object(1)
memory usage: 260.0+ KB


### Final predict for test dataset

In [810]:
x_test = air_df[["CO(GT)", "PT08.S1(CO)","NOx(GT)","PT08.S3(NOx)","NO2(GT)","PT08.S4(NO2)","PT08.S5(O3)","T","RH","AH"]].values
pred = final_model.predict(x_test)

### Make predicted dataset to submit 

In [811]:
y_pred_df = pd.DataFrame(pred,columns = ['target'])
y_pred_df.to_csv('ridge.csv', index = False)

### Checkout approximation

In [812]:
ans_df = pd.read_csv("ans.csv", sep=';')
ans_df = ans_df['ans']

In [813]:
ans_df

0        7.5
1        3.8
2        3.8
3        3.0
4        2.8
5        2.5
6        3.3
7        7.0
8       18.1
9       24.6
10      19.7
11      13.0
12      17.1
13      19.3
14      28.4
15      22.9
16      25.7
17      26.8
18      29.2
19      11.9
20      10.1
21       7.2
22       7.1
23       7.5
24       8.1
25       5.5
26       5.7
27       3.8
28       2.9
29       2.4
        ... 
2743     4.1
2744     3.4
2745     5.3
2746     5.3
2747     3.8
2748     4.2
2749     5.4
2750     6.3
2751     6.1
2752     4.6
2753    11.1
2754     7.9
2755     6.0
2756     5.8
2757     5.1
2758     3.5
2759     1.9
2760     1.5
2761     1.1
2762     0.8
2763     1.3
2764     4.4
2765    17.4
2766    22.4
2767    13.6
2768    13.5
2769    11.4
2770    12.4
2771     9.5
2772    11.9
Name: ans, Length: 2773, dtype: float64

In [814]:
indxs = ans_df[ans_df == -200].index.values
ans_df.drop(ans_df.index[indxs], axis = 0, inplace = True)
ans_df.reset_index()

Unnamed: 0,index,ans
0,0,7.5
1,1,3.8
2,2,3.8
3,3,3.0
4,4,2.8
5,5,2.5
6,6,3.3
7,7,7.0
8,8,18.1
9,9,24.6


In [815]:
predic_df = pd.read_csv("ridge.csv")
predic_df

Unnamed: 0,target
0,5.190749
1,0.672473
2,-0.110236
3,1.504898
4,-0.575630
5,-1.417497
6,-0.000128
7,5.214690
8,18.284801
9,25.131469


In [816]:
print("RMSE:", mean_squared_error(predic_df, ans_df))

RMSE: 4.988421209910939


RMSE: 4.359570467231344

In [613]:
fake = pd.DataFrame(ans_df, columns = ['target'])
fake.to_csv('ridge2.csv', index = False)