In [1]:
import pandas as pd
import numpy as np
import random
import sklearn.metrics
import math


In [9]:
data = pd.read_csv("..\\Data\\Preprocessed Data\\Preprocessed_data.csv")
data = data.drop(['Unnamed: 0','No'], axis = 1)

In [10]:
data.head()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,...,RAIN,wd,WSPM,station,PM25_AQI,PM10_AQI,SO2_AQI,NO2_AQI,CO_AQI,O3_AQI
0,2013,3,1,0,4.0,4.0,1.526718,3.723404,0.262009,38.5,...,0.0,329.976194,4.4,Aotizhongxin,16.666667,3.636364,2.120441,3.447597,2.911208,30.8
1,2013,3,1,1,8.0,8.0,1.526718,3.723404,0.262009,38.5,...,0.0,350.471596,4.7,Aotizhongxin,33.333333,7.272727,2.120441,3.447597,2.911208,30.8
2,2013,3,1,2,7.0,7.0,1.908397,5.319149,0.262009,36.5,...,0.0,329.976194,5.6,Aotizhongxin,29.166667,6.363636,2.650551,4.925138,2.911208,29.2
3,2013,3,1,3,6.0,6.0,4.198473,5.851064,0.262009,36.0,...,0.0,314.006221,3.1,Aotizhongxin,25.0,5.454545,5.831213,5.417652,2.911208,28.8
4,2013,3,1,4,3.0,3.0,4.580153,6.382979,0.262009,36.0,...,0.0,350.471596,2.0,Aotizhongxin,12.5,2.727273,6.361323,5.910165,2.911208,28.8


# Preparing Train and Test Set 

#### There are 3 models created in order to depict the importance of feature creation and influence of air quality on temperature. 

Feature1 : Uses Pressure, Rain, and Wind speed in order to predict the temperature.

Feature2 : Uses the 4 Pollutant concentration and 2 particle sizes concentration along with the previous 3 features to predict temperature.

Feature3 : Uses the created features (Air quality indices of 6 pollutants) along with the previous 3 features to predict the temperature.

In [11]:
temp_array = data['TEMP'].values
y = temp_array.reshape(-1,1) # Because only one attribute has to be predicted

In [12]:
feature1 = ['PRES','RAIN','WSPM', 'DEWP']

feature2 = ['PM2.5','PM10','SO2','NO2','O3','CO','PRES','RAIN','WSPM', 'DEWP']

feature3 = ['PM25_AQI','PM10_AQI','SO2_AQI','NO2_AQI','O3_AQI','CO_AQI','PRES','RAIN','WSPM','DEWP']

X1 = data.loc[:, feature1].values
X2 = data.loc[:, feature2].values
X3 = data.loc[:, feature3].values


In [15]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size = 0.2, random_state = 0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size = 0.2, random_state = 0)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, test_size = 0.2, random_state = 0)


## DecisionTree Regression

In [16]:
from sklearn.tree import DecisionTreeRegressor

dtree1 = DecisionTreeRegressor()
dtree2 = DecisionTreeRegressor()
dtree3 = DecisionTreeRegressor()

dtree1.fit(X1_train, y1_train)

dtree2.fit(X2_train, y2_train)

dtree3.fit(X3_train, y3_train)

print(" R^2 Score with feature1 <-- on test set: {}".format(dtree1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(dtree2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(dtree3.score(X3_test, y3_test)))


 R^2 Score with feature1 <-- on test set: 0.8797417842911273
 R^2 Score with feature2 <-- on test set: 0.89158842235077
 R^2 Score with feature3 <-- on test set: 0.8909930796788632


## Ridge Regression 

In [17]:
from sklearn.linear_model import Ridge

ridge1 = Ridge(alpha = 1.0, tol = 1, solver = 'svd').fit(X1_train, y1_train)
ridge2 = Ridge(alpha = 1.0, tol = 1, solver = 'svd').fit(X2_train, y2_train)
ridge3 = Ridge(alpha = 1.0, tol = 1, solver = 'svd').fit(X3_train, y3_train)


print(" R^2 Score with feature1 <-- on test set: {}".format(ridge1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(ridge2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(ridge3.score(X3_test, y3_test)))





 R^2 Score with feature1 <-- on test set: 0.8137456586577634
 R^2 Score with feature2 <-- on test set: 0.873592083039969
 R^2 Score with feature3 <-- on test set: 0.8750426117823807


## Lasso Regression


In [18]:
from sklearn.linear_model import Lasso

lasso1 = Lasso(alpha=1.0, precompute = False,selection = 'random',tol = 0.000001)
lasso1.fit(X1_train,y1_train)

lasso2 = Lasso(alpha=1.0, precompute = False,selection = 'random',tol = 0.000001)
lasso2.fit(X2_train,y2_train)


lasso3 = Lasso(alpha=1.0, precompute = False,selection = 'random',tol = 0.000001)
lasso3.fit(X3_train,y3_train)


print(" R^2 Score with feature1 <-- on test set: {}".format(lasso1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(lasso2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(lasso3.score(X3_test, y3_test)))


 R^2 Score with feature1 <-- on test set: 0.8055535666393012
 R^2 Score with feature2 <-- on test set: 0.8590314881516887
 R^2 Score with feature3 <-- on test set: 0.8650864940930765


## ElasticNet Regression

In [19]:
from sklearn.linear_model import ElasticNet

enet1 = ElasticNet().fit(X1_train,y1_train)
enet2 = ElasticNet().fit(X2_train,y2_train)
enet3 = ElasticNet().fit(X3_train,y3_train)


print(" R^2 Score with feature1 <-- on test set: {}".format(enet1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(enet2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(enet3.score(X3_test, y3_test)))



 R^2 Score with feature1 <-- on test set: 0.8035776722446638
 R^2 Score with feature2 <-- on test set: 0.8625700148276056
 R^2 Score with feature3 <-- on test set: 0.8691704577793922


## Forward Selection

In [21]:
data1 = data.drop(['day','year','month','hour','station','TEMP'], axis = 1)
#Dropping columns of whose corelation cannot be found out.

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [25]:
from statsmodels.api import OLS
import statsmodels.api as sm

def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target.astype(float), sm.add_constant(data[best_features+[new_column]]).astype(float)).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features
target = data['TEMP']
feature4 = forward_selection(data1,target) 

In [26]:
feature4

['SO2',
 'SO2_AQI',
 'PM10_AQI',
 'O3_AQI',
 'DEWP',
 'WSPM',
 'PM25_AQI',
 'O3',
 'NO2_AQI',
 'RAIN',
 'PRES',
 'CO_AQI',
 'CO',
 'PM10',
 'wd',
 'NO2']

In [27]:
X4 = data.loc[:, feature4].values
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y, test_size = 0.2, random_state = 0)


In [28]:
dtree4 = DecisionTreeRegressor()
dtree4.fit(X4_train, y4_train)
print(" R^2 Score with feature4 for DecisionTree Regressor <-- on test set: {}".format(dtree4.score(X4_test, y4_test)))

 R^2 Score with feature4 for DecisionTree Regressor <-- on test set: 0.8946235850323392


In [29]:
ridge4 = Ridge(alpha = 1.0, tol = 1, solver = 'svd')
ridge4.fit(X4_train, y4_train)
print(" R^2 Score with feature4 <-- on test set: {}".format(ridge4.score(X4_test, y4_test)))

 R^2 Score with feature4 <-- on test set: 0.8770318636053822


In [30]:
lasso4 = Lasso(alpha=1.0, precompute = False,selection = 'random',tol = 0.000001)
lasso4.fit(X4_train,y4_train)
print(" R^2 Score with feature4 for Lasso Regressor<-- on test set: {}".format(lasso4.score(X4_test, y4_test)))


 R^2 Score with feature4 for Lasso Regressor<-- on test set: 0.8655030148683419


In [31]:
enet4 = ElasticNet().fit(X4_train,y4_train)
print(" R^2 Score with feature3 for ElastiNet Regressor<-- on test set: {}".format(enet4.score(X4_test, y4_test)))

 R^2 Score with feature3 for ElastiNet Regressor<-- on test set: 0.8697057308922778


## Adjusted R-Squared and Root Mean Squared Error

### DecisionTree

In [32]:
R_Score_DT1 = dtree1.score(X1_test,y1_test)
Adj_R_Score_DT1 = 1 - (1-R_Score_DT1)*(len(y1_train)-1)/(len(y1_train)-X1_train.shape[1]-1)
print("Adjusted R^2 Score with feature1 <-- on test set: {}".format(Adj_R_Score_DT1))

predicted_values = dtree1.predict(X1_test)
mse = sklearn.metrics.mean_squared_error(y1_test, predicted_values)
print("Mean Squared Error with feature1 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature1 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature1 <-- on test set: 0.8797403552358648
Mean Squared Error with feature1 <-- on test set: 15.762025096432561 
Root Mean Squared Error with feature1 <-- on test set: 3.9701416972738595 


In [33]:
R_Score_DT2 = dtree2.score(X2_test,y2_test)
Adj_R_Score_DT2 = 1 - (1-R_Score_DT2)*(len(y2_train)-1)/(len(y2_train)-X2_train.shape[1]-1)
print("Adjusted R^2 Score with feature2 <-- on test set: {}".format(Adj_R_Score_DT2))

predicted_values = dtree2.predict(X2_test)
mse = sklearn.metrics.mean_squared_error(y2_test, predicted_values)
print("Mean Squared Error with feature2 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature2 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature2 <-- on test set: 0.8915852015958258
Mean Squared Error with feature2 <-- on test set: 14.209307842947931 
Root Mean Squared Error with feature2 <-- on test set: 3.7695235564919782 


In [34]:
R_Score_DT3 = dtree3.score(X3_test,y3_test)
Adj_R_Score_DT3 = 1 - (1-R_Score_DT3)*(len(y3_train)-1)/(len(y3_train)-X3_train.shape[1]-1)
print("Adjusted R^2 Score with feature3 <-- on test set: {}".format(Adj_R_Score_DT3))

predicted_values = dtree3.predict(X3_test)
mse = sklearn.metrics.mean_squared_error(y3_test, predicted_values)
print("Mean Squared Error with feature3 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature3 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature3 <-- on test set: 0.8909898412371285
Mean Squared Error with feature3 <-- on test set: 14.287338321616334 
Root Mean Squared Error with feature3 <-- on test set: 3.7798595637425914 


In [35]:
R_Score_DT4 = dtree4.score(X4_test,y4_test)
Adj_R_Score_DT4 = 1 - (1-R_Score_DT4)*(len(y4_train)-1)/(len(y4_train)-X4_train.shape[1]-1)
print("Adjusted R^2 Score with feature4 <-- on test set: {}".format(Adj_R_Score_DT4))

predicted_values = dtree4.predict(X4_test)
mse = sklearn.metrics.mean_squared_error(y4_test, predicted_values)
print("Mean Squared Error with feature4 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature4 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature4 <-- on test set: 0.8946185760077802
Mean Squared Error with feature4 <-- on test set: 13.811494603522672 
Root Mean Squared Error with feature4 <-- on test set: 3.7163819237966744 


### Ridge Regression

In [36]:
R_Score_RR1 = ridge1.score(X1_test,y1_test)
Adj_R_Score_RR1 = 1 - (1-R_Score_RR1)*(len(y1_train)-1)/(len(y1_train)-X1_train.shape[1]-1)
print("Adjusted R^2 Score with feature1 <-- on test set: {}".format(Adj_R_Score_RR1))

predicted_values = ridge1.predict(X1_test)
mse = sklearn.metrics.mean_squared_error(y1_test, predicted_values)
print("Mean Squared Error with feature1 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature1 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature1 <-- on test set: 0.8137434453557858
Mean Squared Error with feature1 <-- on test set: 24.412016969076387 
Root Mean Squared Error with feature1 <-- on test set: 4.94085184650141 


In [37]:
R_Score_RR2 = ridge2.score(X2_test,y2_test)
Adj_R_Score_RR2 = 1 - (1-R_Score_RR2)*(len(y2_train)-1)/(len(y2_train)-X2_train.shape[1]-1)
print("Adjusted R^2 Score with feature2 <-- on test set: {}".format(Adj_R_Score_RR2))

predicted_values = ridge2.predict(X2_test)
mse = sklearn.metrics.mean_squared_error(y2_test, predicted_values)
print("Mean Squared Error with feature2 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature2 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature2 <-- on test set: 0.8735883276391865
Mean Squared Error with feature2 <-- on test set: 16.56805522821977 
Root Mean Squared Error with feature2 <-- on test set: 4.07038760171802 


In [38]:
R_Score_RR3 = ridge3.score(X3_test,y3_test)
Adj_R_Score_RR3 = 1 - (1-R_Score_RR3)*(len(y3_train)-1)/(len(y3_train)-X3_train.shape[1]-1)
print("Adjusted R^2 Score with feature3 <-- on test set: {}".format(Adj_R_Score_RR3))

predicted_values = ridge3.predict(X3_test)
mse = sklearn.metrics.mean_squared_error(y3_test, predicted_values)
print("Mean Squared Error with feature3 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature3 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature3 <-- on test set: 0.8750388994747597
Mean Squared Error with feature3 <-- on test set: 16.377937070335758 
Root Mean Squared Error with feature3 <-- on test set: 4.046966403410801 


In [39]:
R_Score_RR4 = ridge4.score(X4_test,y4_test)
Adj_R_Score_RR4 = 1 - (1-R_Score_RR4)*(len(y4_train)-1)/(len(y4_train)-X4_train.shape[1]-1)
print("Adjusted R^2 Score with feature4 <-- on test set: {}".format(Adj_R_Score_RR4))

predicted_values = ridge4.predict(X4_test)
mse = sklearn.metrics.mean_squared_error(y4_test, predicted_values)
print("Mean Squared Error with feature4 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature4 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature4 <-- on test set: 0.8770260183655781
Mean Squared Error with feature4 <-- on test set: 16.117209460396992 
Root Mean Squared Error with feature4 <-- on test set: 4.014624448238838 


### Lasso Regression

In [40]:
R_Score_La1 = lasso1.score(X1_test,y1_test)
Adj_R_Score_La1 = 1 - (1-R_Score_La1)*(len(y1_train)-1)/(len(y1_train)-X1_train.shape[1]-1)
print("Adjusted R^2 Score with feature1 <-- on test set: {}".format(Adj_R_Score_La1))

predicted_values = lasso1.predict(X1_test)
mse = sklearn.metrics.mean_squared_error(y1_test, predicted_values)
print("Mean Squared Error with feature1 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature1 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature1 <-- on test set: 0.8055512559888628
Mean Squared Error with feature1 <-- on test set: 25.485739535357222 
Root Mean Squared Error with feature1 <-- on test set: 5.048340275313979 


In [41]:
R_Score_La2 = lasso2.score(X2_test,y2_test)
Adj_R_Score_La2 = 1 - (1-R_Score_La2)*(len(y2_train)-1)/(len(y2_train)-X2_train.shape[1]-1)
print("Adjusted R^2 Score with feature2 <-- on test set: {}".format(Adj_R_Score_La2))

predicted_values = lasso2.predict(X2_test)
mse = sklearn.metrics.mean_squared_error(y2_test, predicted_values)
print("Mean Squared Error with feature2 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature2 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature2 <-- on test set: 0.859027300176185
Mean Squared Error with feature2 <-- on test set: 18.47648585555968 
Root Mean Squared Error with feature2 <-- on test set: 4.29842830061869 


In [42]:
R_Score_La3 = lasso3.score(X3_test,y3_test)
Adj_R_Score_La3 = 1 - (1-R_Score_La3)*(len(y3_train)-1)/(len(y3_train)-X3_train.shape[1]-1)
print("Adjusted R^2 Score with feature3 <-- on test set: {}".format(Adj_R_Score_La3))

predicted_values = lasso3.predict(X3_test)
mse = sklearn.metrics.mean_squared_error(y3_test, predicted_values)
print("Mean Squared Error with feature3 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature3 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature3 <-- on test set: 0.8650824860032523
Mean Squared Error with feature3 <-- on test set: 17.68286726538995 
Root Mean Squared Error with feature3 <-- on test set: 4.205100149269926 


In [43]:
R_Score_La4 = lasso4.score(X4_test,y4_test)
Adj_R_Score_La4 = 1 - (1-R_Score_La4)*(len(y4_train)-1)/(len(y4_train)-X4_train.shape[1]-1)
print("Adjusted R^2 Score with feature4 <-- on test set: {}".format(Adj_R_Score_La4))

predicted_values = lasso4.predict(X4_test)
mse = sklearn.metrics.mean_squared_error(y4_test, predicted_values)
print("Mean Squared Error with feature4 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature4 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature4 <-- on test set: 0.8654966216094534
Mean Squared Error with feature4 <-- on test set: 17.62827464671339 
Root Mean Squared Error with feature4 <-- on test set: 4.198603892571123 


### ElastiNet Regression

In [44]:
R_Score_EN1 = enet1.score(X1_test,y1_test)
Adj_R_Score_EN1 = 1 - (1-R_Score_EN1)*(len(y1_train)-1)/(len(y1_train)-X1_train.shape[1]-1)
print("Adjusted R^2 Score with feature1 <-- on test set: {}".format(Adj_R_Score_EN1))

predicted_values = enet1.predict(X1_test)
mse = sklearn.metrics.mean_squared_error(y1_test, predicted_values)
print("Mean Squared Error with feature1 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature1 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature1 <-- on test set: 0.8035753381142305
Mean Squared Error with feature1 <-- on test set: 25.74471641151153 
Root Mean Squared Error with feature1 <-- on test set: 5.073925148394636 


In [45]:
R_Score_EN2 = enet2.score(X2_test,y2_test)
Adj_R_Score_EN2 = 1 - (1-R_Score_EN2)*(len(y2_train)-1)/(len(y2_train)-X2_train.shape[1]-1)
print("Adjusted R^2 Score with feature2 <-- on test set: {}".format(Adj_R_Score_EN2))

predicted_values = enet2.predict(X2_test)
mse = sklearn.metrics.mean_squared_error(y2_test, predicted_values)
print("Mean Squared Error with feature2 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature2 <-- on test set: {} ".format(rmse))


Adjusted R^2 Score with feature2 <-- on test set: 0.8625659319767347
Mean Squared Error with feature2 <-- on test set: 18.012697615052108 
Root Mean Squared Error with feature2 <-- on test set: 4.244136851593279 


In [46]:
R_Score_EN3 = enet3.score(X3_test,y3_test)
Adj_R_Score_EN3 = 1 - (1-R_Score_EN3)*(len(y3_train)-1)/(len(y3_train)-X3_train.shape[1]-1)
print("Adjusted R^2 Score with feature3 <-- on test set: {}".format(Adj_R_Score_EN3))

predicted_values = enet3.predict(X3_test)
mse = sklearn.metrics.mean_squared_error(y3_test, predicted_values)
print("Mean Squared Error with feature3 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature3 <-- on test set: {} ".format(rmse))


Adjusted R^2 Score with feature3 <-- on test set: 0.8691665710183645
Mean Squared Error with feature3 <-- on test set: 17.14758959028741 
Root Mean Squared Error with feature3 <-- on test set: 4.140964813939791 


In [47]:
R_Score_EN4 = enet4.score(X4_test,y4_test)
Adj_R_Score_EN4 = 1 - (1-R_Score_EN4)*(len(y4_train)-1)/(len(y4_train)-X4_train.shape[1]-1)
print("Adjusted R^2 Score with feature4 <-- on test set: {}".format(Adj_R_Score_EN4))

predicted_values = enet4.predict(X4_test)
mse = sklearn.metrics.mean_squared_error(y4_test, predicted_values)
print("Mean Squared Error with feature1 <-- on test set: {} ".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Squared Error with feature1 <-- on test set: {} ".format(rmse))

Adjusted R^2 Score with feature4 <-- on test set: 0.8696995374077674
Mean Squared Error with feature1 <-- on test set: 17.077432319210207 
Root Mean Squared Error with feature1 <-- on test set: 4.132485005321883 
