In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score
import joblib

In [4]:
df=pd.read_csv('city_day.csv')
# Reading the dataset

In [None]:
df.head()
# Loading the dataset

In [None]:
df.shape

In [None]:
df.info()
# Checking the over all information on the dataset.

In [None]:
df.isnull().sum()
# There are a lot of missing values present in the dataset

In [None]:
df.describe()
# Checking the descriptive stats of the numeric values present in the data like mean, standard deviation, min values and max value present in the data

In [None]:
df.nunique()
# These are all the unique values present in the dataframe

In [None]:
df.columns
# These are all the columns present in the dataset.

In [None]:
sns.pairplot(data=df)

In [None]:
df['City'].value_counts()
# Viewing the count of values present in the state column
plt.figure(figsize=(15, 6))
plt.xticks(rotation=90)
df.City.hist()
plt.xlabel('City')
plt.ylabel('Frequencies')
plt.plot()
# The visualization shows us the count of states present in the dataset.


In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='PM2.5',data=df);
# This visualization shows the name of the state having higher PM2.5 levels in the air which is Patna

In [None]:
plt.rcParams['figure.figsize']=(30,10)

In [None]:
df[['PM2.5','City']].groupby(["City"]).mean().sort_values(by='PM2.5').plot.bar(color='purple')
plt.show()
# We can also use the groupby function to sort values in an ascending order based on the x-axis, y-axis and its keys
# Below we get a clear picture of the states in an increasing order based on their PM2.5 levels.

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='PM10',data=df);
# Delhi has a higher PM10 level compared to other states

In [None]:
df[['PM10','City']].groupby(["City"]).mean().sort_values(by='PM10').plot.bar(color='purple')
plt.show()
# We can also use the groupby function to sort values in an ascending order based on the x-axis, y-axis and its keys
# Below we get a clear picture of the states in an increasing order based on their PM10 levels.

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='SO2',data=df);
# Ahmedabad has higher so2 level compared to other states

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='NOx',data=df);
# Kochi has higher NOx level compared to other states

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='NH3',data=df);
# Chennai has higher NH3 level compared to other states

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='CO',data=df);
# Ahmedabad has higher CO level compared to other states

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='O3',data=df);
# Bhopal has higher O3 level compared to other states

In [None]:
plt.figure(figsize=(30, 10))
plt.xticks(rotation=90)
sns.barplot(x='City',y='AQI',data=df)

In [13]:
nullvalues = df.isnull().sum().sort_values(ascending=False)
# Checking all null values

In [None]:
nullvalues

In [None]:
df.isnull().sum()
# Now checking the null values

In [None]:
df

In [17]:
df.fillna(0, inplace=True)
# null values are replaced with zeros for the numerical data

In [None]:
df.isnull().sum()
# Now we have successfully imputed null values which were present in the dataset

In [None]:
df

In [19]:
df["PM10_24hr_avg"] = df.groupby("City")["PM10"].rolling(window = 24, min_periods = 16).mean().values
df["PM2.5_24hr_avg"] = df.groupby("City")["PM2.5"].rolling(window = 24, min_periods = 16).mean().values
df["SO2_24hr_avg"] = df.groupby("City")["SO2"].rolling(window = 24, min_periods = 16).mean().values
df["NOx_24hr_avg"] = df.groupby("City")["NOx"].rolling(window = 24, min_periods = 16).mean().values
df["NH3_24hr_avg"] = df.groupby("City")["NH3"].rolling(window = 24, min_periods = 16).mean().values
df["CO_8hr_max"] = df.groupby("City")["CO"].rolling(window = 8, min_periods = 1).max().values
df["O3_8hr_max"] = df.groupby("City")["O3"].rolling(window = 8, min_periods = 1).max().values

In [None]:
df.fillna(0, inplace=True)
df.isnull().sum()

In [None]:
df

In [None]:
def get_PM25_subindex(x):
    if x <= 30:
        return x * 50 / 30
    elif x <= 60:
        return 50 + (x - 30) * 50 / 30
    elif x <= 90:
        return 100 + (x - 60) * 100 / 30
    elif x <= 120:
        return 200 + (x - 90) * 100 / 30
    elif x <= 250:
        return 300 + (x - 120) * 100 / 130
    elif x > 250:
        return 400 + (x - 250) * 100 / 130
    else:
        return 0

df["PM2.5_SubIndex"] = df["PM2.5_24hr_avg"].apply(lambda x: get_PM25_subindex(x))
data= df[['PM2.5_24hr_avg','PM2.5_SubIndex']]
data.head()

# calculating the individual pollutant index for so2(sulphur dioxide)

In [None]:
data.tail()

In [None]:
## PM10 Sub-Index calculation
def get_PM10_subindex(x):
    if x <= 50:
        return x
    elif x <= 100:
        return x
    elif x <= 250:
        return 100 + (x - 100) * 100 / 150
    elif x <= 350:
        return 200 + (x - 250)
    elif x <= 430:
        return 300 + (x - 350) * 100 / 80
    elif x > 430:
        return 400 + (x - 430) * 100 / 80
    else:
        return 0

df["PM10_SubIndex"] = df["PM10_24hr_avg"].apply(lambda x: get_PM10_subindex(x))
data= df[['PM10_24hr_avg','PM10_SubIndex']]
data.head()
# calculating the individual pollutant index for PM10

In [None]:
data.tail()

In [None]:
## SO2 Sub-Index calculation
def get_SO2_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 380:
        return 100 + (x - 80) * 100 / 300
    elif x <= 800:
        return 200 + (x - 380) * 100 / 420
    elif x <= 1600:
        return 300 + (x - 800) * 100 / 800
    elif x > 1600:
        return 400 + (x - 1600) * 100 / 800
    else:
        return 0

df["SO2_SubIndex"] = df["SO2_24hr_avg"].apply(lambda x: get_SO2_subindex(x))
data= df[['SO2_24hr_avg','SO2_SubIndex']]
data.head()
# calculating the individual pollutant index for so2

In [None]:
data.tail()

In [None]:
## NOx Sub-Index calculation
def get_NOx_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 180:
        return 100 + (x - 80) * 100 / 100
    elif x <= 280:
        return 200 + (x - 180) * 100 / 100
    elif x <= 400:
        return 300 + (x - 280) * 100 / 120
    elif x > 400:
        return 400 + (x - 400) * 100 / 120
    else:
        return 0

df["NOx_SubIndex"] = df["NOx_24hr_avg"].apply(lambda x: get_NOx_subindex(x))
data= df[['NOx_24hr_avg','NOx_SubIndex']]
data.head()
# calculating the individual pollutant index for nox

In [None]:
data.tail()

In [None]:
## NH3 Sub-Index calculation
def get_NH3_subindex(x):
    if x <= 200:
        return x * 50 / 200
    elif x <= 400:
        return 50 + (x - 200) * 50 / 200
    elif x <= 800:
        return 100 + (x - 400) * 100 / 400
    elif x <= 1200:
        return 200 + (x - 800) * 100 / 400
    elif x <= 1800:
        return 300 + (x - 1200) * 100 / 600
    elif x > 1800:
        return 400 + (x - 1800) * 100 / 600
    else:
        return 0

df["NH3_SubIndex"] = df["NH3_24hr_avg"].apply(lambda x: get_NH3_subindex(x))
data= df[['NH3_24hr_avg','NH3_SubIndex']]
data.head()
# calculating the individual pollutant index for nh3

In [None]:
data.tail()

In [None]:
## CO Sub-Index calculation
def get_CO_subindex(x):
    if x <= 1:
        return x * 50 / 1
    elif x <= 2:
        return 50 + (x - 1) * 50 / 1
    elif x <= 10:
        return 100 + (x - 2) * 100 / 8
    elif x <= 17:
        return 200 + (x - 10) * 100 / 7
    elif x <= 34:
        return 300 + (x - 17) * 100 / 17
    elif x > 34:
        return 400 + (x - 34) * 100 / 17
    else:
        return 0

df["CO_SubIndex"] = df["CO_8hr_max"].apply(lambda x: get_CO_subindex(x))
data= df[['CO_8hr_max','CO_SubIndex']]
data.head()
# calculating the individual pollutant index for co

In [None]:
data.tail()

In [None]:
## O3 Sub-Index calculation
def get_O3_subindex(x):
    if x <= 50:
        return x * 50 / 50
    elif x <= 100:
        return 50 + (x - 50) * 50 / 50
    elif x <= 168:
        return 100 + (x - 100) * 100 / 68
    elif x <= 208:
        return 200 + (x - 168) * 100 / 40
    elif x <= 748:
        return 300 + (x - 208) * 100 / 539
    elif x > 748:
        return 400 + (x - 400) * 100 / 539
    else:
        return 0

df["O3_SubIndex"] = df["O3_8hr_max"].apply(lambda x: get_O3_subindex(x))
data= df[['O3_8hr_max','O3_SubIndex']]
data.head()
# calculating the individual pollutant index for o3

In [None]:
data.tail()

In [None]:
## AQI bucketing
def get_AQI_bucket(x):
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Satisfactory"
    elif x <= 200:
        return "Moderate"
    elif x <= 300:
        return "Poor"
    elif x <= 400:
        return "Very Poor"
    elif x > 400:
        return "Severe"
    else:
        return np.NaN

df["Checks"] = (df["PM2.5_SubIndex"] > 0).astype(int) + \
                (df["PM10_SubIndex"] > 0).astype(int) + \
                (df["SO2_SubIndex"] > 0).astype(int) + \
                (df["NOx_SubIndex"] > 0).astype(int) + \
                (df["NH3_SubIndex"] > 0).astype(int) + \
                (df["CO_SubIndex"] > 0).astype(int) + \
                (df["O3_SubIndex"] > 0).astype(int)

df["AQI_calculated"] = round(df[["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex", "NOx_SubIndex",
                                 "NH3_SubIndex", "CO_SubIndex", "O3_SubIndex"]].max(axis = 1))
df.loc[df["PM2.5_SubIndex"] + df["PM10_SubIndex"] <= 0, "AQI_calculated"] = np.NaN
df.loc[df.Checks < 3, "AQI_calculated"] = np.NaN

df["AQI_bucket_calculated"] = df["AQI_calculated"].apply(lambda x: get_AQI_bucket(x))
df[~df.AQI_calculated.isna()].head(13)
# Caluclating the Air Quality Index.

In [None]:
replacement_text = 'N/A'
df['AQI_bucket_calculated'].fillna(replacement_text, inplace=True)
df.isnull().sum()

In [None]:
df.fillna(0, inplace=True)
df.isnull().sum()

In [None]:
df[~df.AQI_calculated.isna()].AQI_bucket_calculated.value_counts()

In [None]:
X=df[["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex", "NOx_SubIndex","NH3_SubIndex", "CO_SubIndex", "O3_SubIndex"]]
Y=df['AQI_calculated']
X.tail()

In [None]:
Y.tail()
# the AQI column is the target column

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
X2 = df[["PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex", "NOx_SubIndex","NH3_SubIndex", "CO_SubIndex", "O3_SubIndex"]]
Y2 = df["AQI_calculated"]
# Splitting the data into independent and dependent columns for classification

In [None]:
df['AQI_bucket_calculated'].value_counts()

In [37]:
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, test_size=0.33)
# Splitting the data into training and testing data

In [38]:
#fit the model on train data
RF=RandomForestClassifier().fit(X_train2,Y_train2)
#predict on train
train_preds4 = RF.predict(X_train2)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train2, train_preds4))

#predict on test
test_preds4 = RF.predict(X_test2)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test2, test_preds4))
print('-'*50)

# Kappa Score
print('KappaScore is: ', metrics.cohen_kappa_score(Y_test2,test_preds4))

Model accuracy on train is:  1.0
Model accuracy on test is:  0.5834188384978453
--------------------------------------------------
KappaScore is:  0.5775912355195565


In [39]:
#fit the model on train data
DT2 = DecisionTreeClassifier().fit(X_train2,Y_train2)

#predict on train
train_preds3 = DT2.predict(X_train2)
#accuracy on train
print("Model accuracy on train is: ", accuracy_score(Y_train2, train_preds3))

#predict on test
test_preds3 = DT2.predict(X_test2)
#accuracy on test
print("Model accuracy on test is: ", accuracy_score(Y_test2, test_preds3))
print('-'*50)

# Kappa Score
print('KappaScore is: ', metrics.cohen_kappa_score(Y_test2,test_preds3))

Model accuracy on train is:  1.0
Model accuracy on test is:  0.8280320131335933
--------------------------------------------------
KappaScore is:  0.8256248881618611


In [40]:
DT2.predict([[7.4,47.7,78.182,100,60,35,45]])
# Predictions on random values



array([49.])

In [41]:
RF.predict([[7.4,47.7,78.182,100,60,35,45]])
# Predictions on random values



array([20.])

In [None]:
joblib.dump(DT2, 'decision_tree_model.pkl')