# Implementing Decision Tree

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score
from datetime import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [19]:
data = pd.read_csv("C:/Users/AK/Downloads/Air-Quality-index-Prediction/Air-Quality-index-Prediction-main/Data/final_data.csv")

data.head()

Unnamed: 0,PM2.5,NO2,CO,SO2,O3,AQI
0,83.13,28.71,6.93,49.52,59.76,209.0
1,79.84,28.68,13.85,48.49,97.07,328.0
2,94.52,32.66,24.39,67.39,111.33,514.0
3,135.99,42.08,43.48,75.23,102.7,782.0
4,178.33,35.31,54.56,55.04,107.38,914.0


In [20]:
#Splitting Data
X = data.iloc[:, :-1] #Independent features
y = data.iloc[:, -1] #Dependent feature

#Train Test Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [21]:
#Calling and Fitting model
model = DecisionTreeRegressor(criterion="squared_error")

model.fit(X_train,y_train)

In [22]:
print("Coefficient of determination R^2 <-- on train set: {}".format(model.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(model.score(X_test, y_test)))

Coefficient of determination R^2 <-- on train set: 1.0
Coefficient of determination R^2 <-- on test set: 0.7793052056383352


We can clearly see model is overfitting. Let's try Cross Validation

# Cross Validation

In [25]:
score=cross_val_score(model,X,y,cv=5)
score.mean()

0.6783346736328574

In [26]:
prediction=model.predict(X_test)
sns.distplot(y_test-prediction)

<Axes: xlabel='AQI', ylabel='Density'>

In [27]:
plt.scatter(y_test,prediction)

<matplotlib.collections.PathCollection at 0x2708e26de80>

Even after cross validation, we are getting low accuracy. Let's tune the model and see the result.

#  Hyper Parameter Tunning

In [30]:
params={
"splitter" : ["best", "random"] ,
"max_depth" : [1, 3, 5, 7, 9, 11, 12, 15],
"min_samples_leaf" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"min_weight_fraction_leaf":[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
"max_features" : ["auto", "log2", "sqrt", None],
"max_leaf_nodes" : [None, 10, 20, 30, 40, 50, 60, 70, 80, 90] 
}

In [31]:
# Fitting GridSearchCV
random_search=GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', n_jobs=-1,cv=10, verbose=3)

In [32]:
#Using time function to calculate total time taken for tunning
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [51]:
start_time = timer(None)
random_search.fit(X,y)
timer(start_time)

Fitting 10 folds for each of 57600 candidates, totalling 576000 fits

 Time taken: 0 hours 17 minutes and 5.82 seconds.


In [34]:
#Printing Best Parameters and Score
print(random_search.best_params_)
print(random_search.best_score_)

{'max_depth': 12, 'max_features': 'log2', 'max_leaf_nodes': 30, 'min_samples_leaf': 8, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}
-10091.735820312688


In [35]:
tuned_pred=random_search.predict(X_test)
sns.distplot(y_test-tuned_pred)

<Axes: xlabel='AQI', ylabel='Density'>

In [54]:
print('MAE:', metrics.mean_absolute_error(y_test, tuned_pred))
print('MSE:', metrics.mean_squared_error(y_test, tuned_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, tuned_pred)))

MAE: 52.083065951310964
MSE: 8806.06549743906
RMSE: 93.84063883754767


In [55]:
r2_score(y_test, prediction)

0.7793052056383352

# Insights
- Even after tunning, we are not getting high accuracy.
- MSE is too high which is making model dumb