In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import preprocessing

In [2]:
# Load the preprocessed dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Data//Preprocessed Data//Preprocessed_data.csv")

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,...,RAIN,wd,WSPM,station,PM25_AQI,PM10_AQI,SO2_AQI,NO2_AQI,CO_AQI,O3_AQI
0,0,1,2013,3,1,0,4.0,4.0,1.526718,3.723404,...,0.0,329.976194,4.4,Aotizhongxin,16.666667,3.636364,2.120441,3.447597,2.911208,30.8
1,1,2,2013,3,1,1,8.0,8.0,1.526718,3.723404,...,0.0,350.471596,4.7,Aotizhongxin,33.333333,7.272727,2.120441,3.447597,2.911208,30.8
2,2,3,2013,3,1,2,7.0,7.0,1.908397,5.319149,...,0.0,329.976194,5.6,Aotizhongxin,29.166667,6.363636,2.650551,4.925138,2.911208,29.2
3,3,4,2013,3,1,3,6.0,6.0,4.198473,5.851064,...,0.0,314.006221,3.1,Aotizhongxin,25.0,5.454545,5.831213,5.417652,2.911208,28.8
4,4,5,2013,3,1,4,3.0,3.0,4.580153,6.382979,...,0.0,350.471596,2.0,Aotizhongxin,12.5,2.727273,6.361323,5.910165,2.911208,28.8


# Preparing Train and Test Set 

#### Three feature sets are defined, and a model is trained on each, in order to show the importance of feature creation and the influence of air quality on temperature.
Feature1 : Uses Pressure, Rain, and Wind speed in order to predict the temperature.

Feature2 : Uses the concentrations of the 4 gaseous pollutants (SO2, NO2, O3, CO) and the 2 particulate-matter sizes (PM2.5, PM10), along with the 3 features of Feature1, to predict the temperature.

Feature3 : Uses the created features (air quality indices of the 6 pollutants) along with the 3 features of Feature1 to predict the temperature.

In [4]:
# Regression target: the TEMP column, turned into a single-column 2-D array
# (one row per sample) because exactly one attribute is being predicted.
temp_array = data.loc[:, 'TEMP'].values
y = np.reshape(temp_array, (-1, 1))

In [12]:
# Baseline predictors (pressure, rain, wind speed); every feature set ends with these.
feature1 = ['PRES', 'RAIN', 'WSPM']

# Pollutant concentration columns followed by the baseline predictors.
feature2 = ['PM2.5', 'PM10', 'SO2', 'NO2', 'O3', 'CO'] + feature1

# Engineered AQI columns followed by the baseline predictors.
feature3 = ['PM25_AQI', 'PM10_AQI', 'SO2_AQI', 'NO2_AQI', 'O3_AQI', 'CO_AQI'] + feature1

# One design matrix per feature set, as plain NumPy arrays.
X1 = data[feature1].values
X2 = data[feature2].values
X3 = data[feature3].values


In [13]:
from sklearn.model_selection import train_test_split

# Split every feature set against the same target with identical arguments
# (20% held out, random_state fixed at 0).
(
    (X1_train, X1_test, y1_train, y1_test),
    (X2_train, X2_test, y2_train, y2_test),
    (X3_train, X3_test, y3_train, y3_test),
) = (
    train_test_split(features, y, test_size=0.2, random_state=0)
    for features in (X1, X2, X3)
)



## DecisionTree Regression

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Seed the trees explicitly. scikit-learn permutes the candidate features at
# every split (even with the default splitter), so without a fixed random_state
# ties between equally good splits are broken differently on each run and the
# scores below are not reproducible. The value 0 matches the train/test split.
dtree1 = DecisionTreeRegressor(random_state=0)
dtree2 = DecisionTreeRegressor(random_state=0)
dtree3 = DecisionTreeRegressor(random_state=0)

# Fit one tree per feature set.
dtree1.fit(X1_train, y1_train)

dtree2.fit(X2_train, y2_train)

dtree3.fit(X3_train, y3_train)

# Report the coefficient of determination on the held-out 20%.
print(" R^2 Score with feature1 <-- on test set: {}".format(dtree1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(dtree2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(dtree3.score(X3_test, y3_test)))


 R^2 Score with feature1<-- on test set: 0.7226228398476233
 R^2 Score with feature2 <-- on test set: 0.7255236235567473
 R^2 Score with feature3 <-- on test set: 0.7254002902689967


## Ridge Regression 

In [15]:
from sklearn.linear_model import Ridge

# Build the three estimators with identical hyper-parameters, then fit each
# one on its own feature set.
# NOTE(review): scikit-learn's Ridge docs state that tol has no impact with
# solver='svd'; the argument is kept as-is here - confirm whether it is needed.
ridge1 = Ridge(solver='svd', alpha=1.0, tol=1)
ridge2 = Ridge(solver='svd', alpha=1.0, tol=1)
ridge3 = Ridge(solver='svd', alpha=1.0, tol=1)

ridge1.fit(X1_train, y1_train)
ridge2.fit(X2_train, y2_train)
ridge3.fit(X3_train, y3_train)

# Report the coefficient of determination on the held-out test split.
print(" R^2 Score with feature1 <-- on test set: {}".format(ridge1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(ridge2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(ridge3.score(X3_test, y3_test)))


 R^2 Score with feature1 <-- on test set: 0.668813412604142
 R^2 Score with feature2 <-- on test set: 0.7502428511220475
 R^2 Score with feature3 <-- on test set: 0.7507785142635952


## Lasso Regression


In [16]:
from sklearn.linear_model import Lasso

# selection='random' updates a randomly chosen coefficient on each
# coordinate-descent step, so the estimator needs a fixed random_state for the
# fit (and the scores printed below) to be reproducible across kernel restarts.
# The value 0 matches the seed used for the train/test split.
lasso1 = Lasso(alpha=1.0, precompute=False, selection='random', tol=0.000001, random_state=0)
lasso1.fit(X1_train, y1_train)

lasso2 = Lasso(alpha=1.0, precompute=False, selection='random', tol=0.000001, random_state=0)
lasso2.fit(X2_train, y2_train)


lasso3 = Lasso(alpha=1.0, precompute=False, selection='random', tol=0.000001, random_state=0)
lasso3.fit(X3_train, y3_train)


# Report the coefficient of determination on the held-out test split.
print(" R^2 Score with feature1 <-- on test set: {}".format(lasso1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(lasso2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(lasso3.score(X3_test, y3_test)))


 R^2 Score with feature1 <-- on test set: 0.6632206984150215
 R^2 Score with feature2 <-- on test set: 0.7437820410345286
 R^2 Score with feature3 <-- on test set: 0.7502508124379266


## ElasticNet Regression

In [19]:
from sklearn.linear_model import ElasticNet

# Default-configured elastic nets, one per feature set: construct first, fit after.
enet1 = ElasticNet()
enet2 = ElasticNet()
enet3 = ElasticNet()

enet1.fit(X1_train, y1_train)
enet2.fit(X2_train, y2_train)
enet3.fit(X3_train, y3_train)

# Report the coefficient of determination on the held-out test split.
print(" R^2 Score with feature1 <-- on test set: {}".format(enet1.score(X1_test, y1_test)))

print(" R^2 Score with feature2 <-- on test set: {}".format(enet2.score(X2_test, y2_test)))

print(" R^2 Score with feature3 <-- on test set: {}".format(enet3.score(X3_test, y3_test)))

 R^2 Score with feature1 <-- on test set: 0.6660343309393048
 R^2 Score with feature2 <-- on test set: 0.7439282443449935
 R^2 Score with feature3 <-- on test set: 0.7504926807317898
