# Loading the data

In [1]:
import pandas as pd
import numpy as np

# Read the raw measurements from the Excel workbook.
df = pd.read_excel('mysore_city_data.xlsx')

# Parse the Date column a single time and split it into its calendar parts.
parsed_dates = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df['day'] = parsed_dates.dt.day
df['month'] = parsed_dates.dt.month
df['year'] = parsed_dates.dt.year

# The original Date column is no longer needed once day/month/year exist.
df = df.drop(columns=['Date'])

In [2]:
# Print the loaded frame; pandas abbreviates the middle rows of a long frame with "...".
print(df)

      Temperature  Humidity        Gas        CO   NH3  PM 2.5 (ug/m3)  PM10  \
0            31.0      64.0  45.000000  0.340000  47.0        5.000000     3   
1            31.0      63.0  48.000000  0.590000  28.0        5.000000     3   
2            31.0      63.0  58.000000  0.340000  29.0        5.000000     3   
3            31.0      63.0  48.000000  0.680000  31.0        7.000000     4   
4            31.0      63.0  42.000000  0.320000  26.0        7.000000     4   
...           ...       ...        ...       ...   ...             ...   ...   
4529         30.0      64.0  38.000000  0.530000  18.0       13.000000    10   
4530         30.0      64.0  42.000000  0.290000  37.0       13.000000    10   
4531         30.0      64.0  53.000000  0.220000  26.0       13.000000    10   
4532         30.0      64.0  49.000000  0.280000  37.0       13.000000    10   
4533         30.0      64.0  46.306294  0.444436  37.0       22.536731    10   

      day  month  year  
0      16     

# Just handling the missing values and not outliers

In [3]:
def fill_mean(data):
    """Return a copy of ``data`` with nulls in numeric columns replaced by the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain missing values. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which every null of a numeric column has been replaced
        by that column's mean. Non-numeric columns have no meaningful mean
        and are returned untouched instead of raising.
    """
    # numeric_only=True keeps text/object columns out of the mean computation,
    # which would otherwise fail as soon as such a column contains a null.
    column_means = data.mean(numeric_only=True)
    # fillna with a Series fills each column with the value stored under its label.
    return data.fillna(column_means)

# Impute the missing values of the loaded frame with fill_mean.
df = fill_mean(df)

# Handling outliers

In [4]:
import pandas as pd
import numpy as np

## handling outliers

def detect_outliers_iqr(data):
    """Return the values of ``data`` lying outside the 1.5 * IQR fences.

    The input is sorted ascending before the quartiles are taken, so the
    returned list of outliers is in ascending order as well.
    """
    ordered = sorted(data)
    first_quartile = np.percentile(ordered, 25)
    third_quartile = np.percentile(ordered, 75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - (1.5 * spread)
    high_fence = third_quartile + (1.5 * spread)
    # A value is an outlier when it falls strictly below or strictly above the fences.
    return [value for value in ordered if value < low_fence or value > high_fence]

def collect_outliers_iqr(data):
    """Map every column name of ``data`` to the list of IQR outliers found in that column."""
    return {column: detect_outliers_iqr(data[column]) for column in data.columns}


def floor_clapp_outliers(data, outliers):
    """Cap the values of each flagged column at its 1.5 * IQR fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to treat. It is modified in place and also returned.
    outliers : dict
        Maps a column name to the collection of outliers detected in that
        column. Only whether the collection is empty is used; the fences
        are recomputed from ``data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with out-of-fence values replaced by the
        nearest fence in every column whose outlier collection is non-empty.
    """
    for column, found in outliers.items():
        if len(found) == 0:
            continue  # nothing was flagged for this column
        q1 = data[column].quantile(0.25)
        q3 = data[column].quantile(0.75)
        IQR = q3 - q1
        lower_bridge = q1 - (IQR * 1.5)
        upper_bridge = q3 + (IQR * 1.5)
        # clip builds a new column, so float fences can be applied to an
        # integer column without writing an incompatible dtype through .loc.
        data[column] = data[column].clip(lower=lower_bridge, upper=upper_bridge)
    return data

# Collect the IQR outliers of every column, then cap them at the IQR fences.
outliers = collect_outliers_iqr(df)
df = floor_clapp_outliers(df, outliers)

In [5]:
df.columns

Index(['Temperature', 'Humidity', 'Gas', 'CO', 'NH3', 'PM 2.5 (ug/m3)', 'PM10',
       'day', 'month', 'year'],
      dtype='object')

# Separating the independent variables (features) and the dependent variable (target)

In [6]:
# Column to predict, and the predictor columns used to estimate it.
TARGET_COLUMN = 'PM 2.5 (ug/m3)'
FEATURE_COLUMNS = [
    'Temperature', 'Humidity', 'Gas', 'CO', 'NH3', 'PM10',
    'day', 'month', 'year',
]

X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Splitting the data into training and testing parts, 70% of the dataset will be used for training the model, whereas 30% dataset will be used for testing the accuracy of the model.

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the shape of each split.
print('X_train : ',X_train.shape,' X_test : ',X_test.shape)
print('y_train : ',y_train.shape,'    y_test : ',y_test.shape)

X_train :  (3173, 9)  X_test :  (1361, 9)
y_train :  (3173,)     y_test :  (1361,)


# Testing with different regression models

In [8]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import math
# Maps a model label to its test-set R2 score expressed in percent (rounded to 2 decimals).
scores = {}

## 1. Using Linear Regression

In [9]:
%%time

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute each metric once so the stored score and the printed report cannot diverge.
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(np.array(y_test), y_pred))

# Stored as R2 in percent; the summary table at the end reads this value.
scores['LinearRegression'] = round((r2*100),2)

print("Linear Regression________________________")
print()
print('R2 score            : ',r2)
# The value is the square root of the mean squared error (RMSE), not the MAE,
# so it is labelled accordingly.
print('Root Mean Sq. Error : ',rmse)
print("Accuracy            :  {:.2f}".format(r2*100),"%")
print('-----------------------------------------')

Linear Regression________________________

R2 score            :  0.9369389882291742
mean Absolute Error :  2.39323229055664
Accuracy            :  93.69 %
-----------------------------------------
CPU times: user 28 ms, sys: 19.5 ms, total: 47.6 ms
Wall time: 533 ms


## 2. Using Ridge Regression

In [10]:
%%time

from sklearn.linear_model import Ridge

# Tune the Ridge regularisation strength with 5-fold cross-validation on the training split.
ridge=Ridge()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)
ridge_regressor.fit(X_train,y_train)
# Predicting through the fitted search object uses the best estimator it found.
y_pred = ridge_regressor.predict(X_test)

# Compute each metric once so the stored score and the printed report cannot diverge.
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(np.array(y_test), y_pred))

# The trailing space in the key is kept: it aligns the labels in the summary table.
scores['RidgeRegression '] = round((r2*100),2)

print("Ridge Regression_________________________")
print()
print('R2 score            : ',r2)
# The value is the square root of the mean squared error (RMSE), not the MAE,
# so it is labelled accordingly.
print('Root Mean Sq. Error : ',rmse)
print("Accuracy            :  {:.2f}".format(r2*100),"%")
print('-----------------------------------------')

  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,


Ridge Regression_________________________

R2 score            :  0.9369514578270932
mean Absolute Error :  2.3929956616241297
Accuracy            :  93.70 %
-----------------------------------------
CPU times: user 861 ms, sys: 130 ms, total: 991 ms
Wall time: 339 ms


## 4. Using Decision Tree Regression

In [11]:
%%time

from sklearn.tree import DecisionTreeRegressor

# A shallow tree: at most 4 levels, and min_samples_leaf given as a fraction (0.1).
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)

# Compute each metric once so the stored score and the printed report cannot diverge.
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(np.array(y_test), y_pred))

scores['DecisionTreeReg.'] = round((r2*100),2)

print("Decision Tree Regression_________________")
print()
print('R2 score            : ',r2)
# The value is the square root of the mean squared error (RMSE), not the MAE,
# so it is labelled accordingly.
print('Root Mean Sq. Error : ',rmse)
print("Accuracy            :  {:.2f}".format(r2*100),"%")
print('-----------------------------------------')

Decision Tree Regression_________________

R2 score            :  0.9122029879818838
mean Absolute Error :  2.8238671066675907
Accuracy            :  91.22 %
-----------------------------------------
CPU times: user 23.2 ms, sys: 17.2 ms, total: 40.4 ms
Wall time: 566 ms


## 5. Using Random Forest

In [12]:
%%time

from sklearn.ensemble import RandomForestRegressor

# Ensemble of 150 trees; random_state fixes the result across runs.
rfc = RandomForestRegressor(n_estimators = 150,random_state = 0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Predictions are rounded to 3 decimals before scoring (kept from the original analysis).
y_pred=y_pred.round(decimals=3)

# Compute each metric once so the stored score and the printed report cannot diverge.
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(np.array(y_test), y_pred))

scores['RandomForestReg.'] = round((r2*100),2)

print("Random Forest Regression_________________")
print()
print('R2 score            : ',r2)
# The value is the square root of the mean squared error (RMSE), not the MAE,
# so it is labelled accordingly.
print('Root Mean Sq. Error : ',rmse)
print("Accuracy            :  {:.2f}".format(r2*100),"%")
print('-----------------------------------------')

Random Forest Regression_________________

R2 score            :  0.9628046555834747
mean Absolute Error :  1.8380134239423043
Accuracy            :  96.28 %
-----------------------------------------
CPU times: user 671 ms, sys: 16.9 ms, total: 688 ms
Wall time: 750 ms


## 6. Using K Nearest Neighbour (KNN)

In [13]:
%%time

from sklearn import neighbors
from math import sqrt

# Choose the neighbour count by 5-fold cross-validation on the training split.
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
N = model.best_params_
N = N['n_neighbors']
# Refit a plain KNN regressor with the selected neighbour count on the full training split.
model = neighbors.KNeighborsRegressor(n_neighbors = N)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

# Compute each metric once so the stored score and the printed report cannot diverge.
r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(np.array(y_test), y_pred))

# The trailing spaces in the key are kept: they align the labels in the summary table.
scores['KNNregression   '] = round((r2*100),2)

print("K Nearest Neighbour Regression___________")
print()
print('R2 score            : ',r2)
# The value is the square root of the mean squared error (RMSE), not the MAE,
# so it is labelled accordingly.
print('Root Mean Sq. Error : ',rmse)
print("Accuracy            :  {:.2f}".format(r2*100),"%")
print('-----------------------------------------')

K Nearest Neighbour Regression___________

R2 score            :  0.9390452869281154
mean Absolute Error :  2.3529247191837652
Accuracy            :  93.90 %
-----------------------------------------
CPU times: user 365 ms, sys: 4.48 ms, total: 370 ms
Wall time: 369 ms


# All Scores in different models

In [14]:
# Numbered listing of every recorded model score, one blank line after each entry.
print('     Model               Accuracy')
print()
for position, (model_label, r2_percent) in enumerate(scores.items(), start=1):
    print(position,'. ',model_label,'->',r2_percent,'%')
    print()

     Model               Accuracy

1 .  LinearRegression -> 93.69 %

2 .  RidgeRegression  -> 93.7 %

3 .  DecisionTreeReg. -> 91.22 %

4 .  RandomForestReg. -> 96.28 %

5 .  KNNregression    -> 93.9 %



### From above, we can conclude that for this project's requirement, Random Forest Regression (accuracy of 96.28 %) is best suited.

In [138]:
## scores dictionary (without outlier handling)
# NOTE(review): the saved output below has no 'LinearRegression' entry and its values differ
# from the summary table printed earlier, so it appears to come from a different run.
# Re-run on a fresh kernel to refresh it.
scores

{'RidgeRegression ': 94.76,
 'DecisionTreeReg.': 91.93,
 'RandomForestReg.': 96.81,
 'KNNregression   ': 95.03}

In [139]:
## scores dictionary (with outlier handling)
# NOTE(review): the saved output below is identical to the one of the
# "without outlier handling" cell, so at least one of the two looks stale.
# TODO confirm by re-running both variants.
scores

{'RidgeRegression ': 94.76,
 'DecisionTreeReg.': 91.93,
 'RandomForestReg.': 96.81,
 'KNNregression   ': 95.03}