# Loading the data

In [18]:
import pandas as pd
import numpy as np

# Read the raw readings; the CSV is expected to sit next to the notebook.
readings_raw = pd.read_csv('mysore_data.csv')

# Parse the Date column once, then expose its parts as numeric columns so the
# models can consume them; the original Date column is dropped afterwards.
parsed_dates = pd.to_datetime(readings_raw.Date, format="%Y-%m-%d")

df = (
    readings_raw
    .assign(
        day=parsed_dates.dt.day,
        month=parsed_dates.dt.month,
        year=parsed_dates.dt.year,
    )
    .drop(columns=['Date'])
)

In [19]:
# Show the prepared frame as plain text (pandas elides the middle rows).
print(df)

     Temperature  Humidity  Gas    CO  NH3  PM 2.5 (ug/m3)  day  month  year
0             31        64   45  0.34   47               5   16      6  2022
1             31        63   48  0.59   28               5   16      6  2022
2             31        63   58  0.34   29               5   16      6  2022
3             31        63   48  0.68   31               7   16      6  2022
4             31        63   42  0.32   26               7   16      6  2022
..           ...       ...  ...   ...  ...             ...  ...    ...   ...
144           29        81   56  0.84   37               2   16      6  2022
145           29        81   58  0.68   43               2   16      6  2022
146           29        81   45  0.47   39               3   16      6  2022
147           29        81   60  0.59   27               3   16      6  2022
148           29        81   56  0.39   26               3   16      6  2022

[149 rows x 9 columns]


# Handling the missing values (outliers are handled in the next section)

In [20]:
def fill_mean(data):
    """Replace missing values in numeric columns with the column mean.

    Only numeric columns are considered: a mean is not defined for text or
    other non-numeric columns, so their missing values are left as they are
    instead of raising an error.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is never modified in place.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when no numeric column contains missing values,
        otherwise a new frame with those values replaced by the mean of
        their column.
    """
    numeric = data.select_dtypes(include='number')
    has_nulls = numeric.isna().any().to_numpy()
    # Map each affected column to its mean; fillna() with a dict fills
    # column-by-column and ignores columns that are not listed.
    means = {name: numeric[name].mean() for name in numeric.columns[has_nulls]}
    if not means:
        return data
    return data.fillna(means)

# Impute missing readings with the per-column mean before any modelling.
df = fill_mean(df)

# Handling outliers

In [21]:
import pandas as pd
import numpy as np

## handling outliers

def detect_outliers_iqr(data):
    """Return the values of ``data`` outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data``. The result is a list of
    the offending values in ascending order (empty when there are none).
    """
    ordered = sorted(data)
    first_quartile = np.percentile(ordered, 25)
    third_quartile = np.percentile(ordered, 75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - (1.5 * spread)
    high_fence = third_quartile + (1.5 * spread)
    return [
        value
        for value in ordered
        if (value < low_fence or value > high_fence)
    ]

def collect_outliers_iqr(data):
    """Run the IQR outlier detection on every column of ``data``.

    Returns a dict that maps each column name to the list of outlying values
    reported by ``detect_outliers_iqr`` for that column (possibly empty).
    """
    return {
        column: detect_outliers_iqr(data[column])
        for column in data.columns
    }


def floor_clapp_outliers(data, outliers):
    """Cap values lying outside the 1.5 * IQR fences of the flagged columns.

    For every column whose entry in ``outliers`` is non-empty, values above
    ``Q3 + 1.5 * IQR`` are replaced by that upper fence and values below
    ``Q1 - 1.5 * IQR`` by that lower fence. Columns with an empty entry are
    left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to cap. It is not modified; the capping is
        applied to a copy so re-running the calling cell stays safe.
    outliers : dict
        Maps column names to the collection of outliers detected in them;
        only the emptiness of each collection is inspected here.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the flagged columns capped.
    """
    capped = data.copy()
    for column, flagged in outliers.items():
        if len(flagged) == 0:
            continue
        # Compute each quartile once; both fences derive from the same pair.
        q1 = capped[column].quantile(0.25)
        q3 = capped[column].quantile(0.75)
        iqr = q3 - q1
        lower_bridge = q1 - (iqr * 1.5)
        upper_bridge = q3 + (iqr * 1.5)
        capped[column] = capped[column].clip(lower=lower_bridge, upper=upper_bridge)
    return capped

# Detect the outliers of every column, then cap the columns that have any.
outliers = collect_outliers_iqr(df)
df = floor_clapp_outliers(df, outliers)

In [22]:
# List the available columns before choosing the features and the target.
df.columns

Index(['Temperature', 'Humidity', 'Gas', 'CO', 'NH3', 'PM 2.5 (ug/m3)', 'day',
       'month', 'year'],
      dtype='object')

# Separating the independent variables (features) from the dependent variable (target)

In [23]:
# Predictors: the sensor readings plus the date parts.
feature_columns = [
    'Temperature',
    'Humidity',
    'Gas',
    'CO',
    'NH3',
    'day',
    'month',
    'year',
]
# Target: the PM 2.5 concentration the models learn to predict.
target_column = 'PM 2.5 (ug/m3)'

X = df.loc[:, feature_columns]
y = df.loc[:, target_column]

# Splitting the data into training and testing parts: 65% of the dataset is used to train the model and the remaining 35% is used to test its accuracy.

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
split_options = {'test_size': 0.35, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of each part as a quick sanity check on the split.
print('X_train : ', X_train.shape, ' X_test : ', X_test.shape)
print('y_train : ', y_train.shape, '    y_test : ', y_test.shape)

X_train :  (96, 8)  X_test :  (53, 8)
y_train :  (96,)     y_test :  (53,)


# Testing with different regression models

In [25]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import math
# Collects one entry per model: padded model label -> test-set R2 score in percent.
scores = {}

## 1. Using Linear Regression

## 2. Using Ridge Regression

In [27]:
%%time

from sklearn.linear_model import Ridge

ridge=Ridge()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)
ridge_regressor.fit(X_train,y_train)
y_pred = ridge_regressor.predict(X_test)

scores['RidgeRegression '] = round((r2_score(y_test,y_pred)*100),2)

print("Ridge Regression_________________________")
print()
print('R2 score            : ',r2_score(y_test,y_pred))
print('mean Absolute Error : ',math.sqrt(mean_squared_error(np.array(y_test),y_pred)))
print("Accuracy            :  {:.2f}".format(r2_score(y_test,y_pred)*100),"%")
print('-----------------------------------------')

Ridge Regression_________________________

R2 score            :  0.856970549137154
mean Absolute Error :  0.5854757608772747
Accuracy            :  85.70 %
-----------------------------------------
Wall time: 370 ms


## 4. Using Decision Tree Regression

In [28]:
%%time

from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
       
scores['DecisionTreeReg.'] = round((r2_score(y_test,y_pred)*100),2)

print("Decision Tree Regression_________________")
print()
print('R2 score            : ',r2_score(y_test,y_pred))
print('mean Absolute Error : ',math.sqrt(mean_squared_error(np.array(y_test),y_pred)))
print("Accuracy            :  {:.2f}".format(r2_score(y_test,y_pred)*100),"%")
print('-----------------------------------------')

Decision Tree Regression_________________

R2 score            :  0.8071526609524122
mean Absolute Error :  0.6798342013201245
Accuracy            :  80.72 %
-----------------------------------------
Wall time: 6.96 ms


## 5. Using Random Forest

In [29]:
%%time

from sklearn.ensemble import RandomForestRegressor

rfc = RandomForestRegressor(n_estimators = 150,random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
y_pred=y_pred.round(decimals=3)

scores['RandomForestReg.'] = round((r2_score(y_test,y_pred)*100),2)

print("Random Forest Regression_________________")
print()
print('R2 score            : ',r2_score(y_test,y_pred))
print('mean Absolute Error : ',math.sqrt(mean_squared_error(np.array(y_test),y_pred)))
print("Accuracy            :  {:.2f}".format(r2_score(y_test,y_pred)*100),"%")
print('-----------------------------------------')

Random Forest Regression_________________

R2 score            :  0.8540231596850861
mean Absolute Error :  0.5914774114891344
Accuracy            :  85.40 %
-----------------------------------------
Wall time: 285 ms


## 6. Using K Nearest Neighbour (KNN)

In [30]:
%%time

from sklearn import neighbors
from math import sqrt

params = {'n_neighbors':[2,3,4,5,6,7,8,9]}
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
N = model.best_params_
N = N['n_neighbors']
model = neighbors.KNeighborsRegressor(n_neighbors = N)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
        
scores['KNNregression   '] = round((r2_score(y_test,y_pred)*100),2)

print("K Nearest Neighbour Regression___________")
print()
print('R2 score            : ',r2_score(y_test,y_pred))
print('mean Absolute Error : ',math.sqrt(mean_squared_error(np.array(y_test),y_pred)))
print("Accuracy            :  {:.2f}".format(r2_score(y_test,y_pred)*100),"%")
print('-----------------------------------------')

K Nearest Neighbour Regression___________

R2 score            :  0.8348309020577929
mean Absolute Error :  0.6291592448737351
Accuracy            :  83.48 %
-----------------------------------------
Wall time: 218 ms


# All Scores in different models

In [31]:
# Print a numbered list of every model with its stored score, one blank line
# after each entry.
print('     Model               Accuracy')
print()
for position, (model_label, model_score) in enumerate(scores.items(), start=1):
    print(position,'. ',model_label,'->',model_score,'%')
    print()

     Model               Accuracy

1 .  LinearRegression -> 85.78 %

2 .  RidgeRegression  -> 85.7 %

3 .  DecisionTreeReg. -> 80.72 %

4 .  RandomForestReg. -> 85.4 %

5 .  KNNregression    -> 83.48 %



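### From the scores above, Random Forest Regression (R2 score of about 85.4 %) is chosen as best suited for this project's requirement; note that Linear Regression (85.78 %) and Ridge Regression (85.7 %) score marginally higher on this split.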

In [32]:
## scores dictionary (without outlier handling)
# NOTE(review): `scores` holds whatever the model cells of the current run
# stored; presumably the outlier-capping cell has to be skipped before
# re-running the models to reproduce these numbers - confirm.
scores

{'LinearRegression': 85.78,
 'RidgeRegression ': 85.7,
 'DecisionTreeReg.': 80.72,
 'RandomForestReg.': 85.4,
 'KNNregression   ': 83.48}

In [33]:
## scores dictionary (with outlier handling)
# NOTE(review): this displays the same `scores` dict as the previous cell, so
# both cells show identical values unless the models are re-run in between.
scores

{'LinearRegression': 85.78,
 'RidgeRegression ': 85.7,
 'DecisionTreeReg.': 80.72,
 'RandomForestReg.': 85.4,
 'KNNregression   ': 83.48}