In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('jaipur_data.csv') 

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Preview the last three rows.
df.tail(3) 

In [None]:
# Histogram of every numeric column in the raw frame.
df.hist()
plt.show()

## Descriptive Statistics

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

## Data Visualization

In [None]:
# Mean so2 for each value of `type`, sorted ascending; at most 20 groups are plotted.
so2_by_type = df[['so2','type']].groupby(["type"]).mean()
so2_by_type.sort_values(by='so2').head(20).plot.bar(color='skyblue')
plt.show()

In [None]:
# Mean no2 for each value of `type`, sorted ascending; at most 20 groups are plotted.
no2_by_type = df[['no2','type']].groupby(["type"]).mean()
no2_by_type.sort_values(by='no2').head(20).plot.bar(color='skyblue')
plt.show()

In [None]:
# Mean no2 per `date`, ordered by the date key; only the first 20 dates are plotted.
no2_by_date = df[['no2','date']].groupby(["date"]).mean()
no2_by_date.sort_values(by='date').head(20).plot.bar(color='skyblue')
plt.show()

## EDA

In [None]:
# Drop columns that the rest of this notebook does not use.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already removed.
df = df.drop(
    columns=['stn_code', 'agency', 'sampling_date', 'location_monitoring_station'],
    errors='ignore',
)

In [None]:
# Count missing values per column and show the five columns with the most gaps.
missing_per_column = df.isnull().sum()
total = missing_per_column.sort_values(ascending=False)
total.head(5)

In [None]:
def impute(series):
    """Return `series` with its missing values replaced by the series mean.

    The mean is computed by pandas over the non-missing entries. If every
    entry is missing the mean is NaN, so the result still contains NaN.
    """
    fill_value = series.mean()
    return series.fillna(fill_value)
# Fill gaps in each pollutant column with the mean of that column within the
# same `state` group.
for col in ['rspm', 'so2', 'no2', 'spm', 'pm2_5']:
    df[col] = df.groupby('state')[col].transform(impute)

In [None]:
# Re-check the per-column missing counts after imputation.
df.isnull().sum()

In [None]:
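# Sub-index by linear interpolation between breakpoints:
#   I = \frac{I_{high} - I_{low}}{C_{high} - C_{low}} (C - C_{low}) + I_{low}
# i.e. I = (I_high - I_low) / (C_high - C_low) * (C - C_low) + I_low
# This equation is used to convert a pollutant concentration C into its AQI sub-index I.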

In [None]:
def cal_SOi(so2):
    """Convert an so2 value into its sub-index by piecewise-linear interpolation.

    Breakpoints are 40, 80, 380, 800 and 1600; values above 1600 continue with
    the slope of the last segment. A value that satisfies none of the
    comparisons (e.g. NaN) yields 0.
    """
    if so2 <= 40:
        return so2*(50/40)
    if so2 <= 80:
        return 50+(so2-40)*(50/40)
    if so2 <= 380:
        return 100+(so2-80)*(100/300)
    if so2 <= 800:
        return 200+(so2-380)*(100/420)
    if so2 <= 1600:
        return 300+(so2-800)*(100/800)
    if so2 > 1600:
        return 400+(so2-1600)*(100/800)
    # Only reached when every comparison is False (NaN input).
    return 0
# Add the so2 sub-index column.
df['SOi']=df['so2'].apply(cal_SOi)
df1= df[['so2','SOi']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def cal_Noi(no2):
    """Convert an no2 value into its sub-index by piecewise-linear interpolation.

    Breakpoints are 40, 80, 180, 280 and 400; anything that matches no earlier
    range (values above 400, and NaN) goes through the final formula, so NaN
    input produces NaN.
    """
    if no2 <= 40:
        return no2*50/40
    if no2 <= 80:
        return 50+(no2-40)*(50/40)
    if no2 <= 180:
        return 100+(no2-80)*(100/100)
    if no2 <= 280:
        return 200+(no2-180)*(100/100)
    if no2 <= 400:
        return 300+(no2-280)*(100/120)
    return 400+(no2-400)*(100/120)
# Add the no2 sub-index column.
df['Noi']=df['no2'].apply(cal_Noi)
df1= df[['no2','Noi']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def cal_RSPMi(rspm):
    """Convert an rspm value into its sub-index by piecewise-linear interpolation.

    Segments: up to 100 the index equals the value; then (100, 150] -> 101..200,
    (150, 350] -> 201..300, (350, 420] -> 301..400, and above 420 the index
    continues from 401 with the slope of the previous segment.

    Returns 0 when no comparison holds (NaN input).
    """
    rpi = 0
    if rspm <= 100:
        rpi = rspm
    # The upper-bound-only tests below also cover fractional values such as
    # 100.5 or 150.3, which previously matched no branch and returned 0.
    elif rspm <= 150:
        rpi = 101 + (rspm - 101) * ((200 - 101) / (150 - 101))
    elif rspm <= 350:
        # Earlier code stored this (and the two results below) in an unrelated
        # variable, so every rspm above 150 came back as 0.
        rpi = 201 + (rspm - 151) * ((300 - 201) / (350 - 151))
    elif rspm <= 420:
        rpi = 301 + (rspm - 351) * ((400 - 301) / (420 - 351))
    elif rspm > 420:
        rpi = 401 + (rspm - 420) * ((500 - 401) / (420 - 351))
    return rpi
# Add the rspm sub-index column.
df['RSPMi']=df['rspm'].apply(cal_RSPMi)
df1= df[['rspm','RSPMi']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def cal_SPMi(spm):
    """Convert an spm value into its sub-index by piecewise-linear interpolation.

    Breakpoints are 50, 100, 250, 350 and 430; anything that matches no earlier
    range (values above 430, and NaN) goes through the final formula, so NaN
    input produces NaN.
    """
    if spm <= 50:
        return spm*50/50
    if spm <= 100:
        return 50+(spm-50)*(50/50)
    if spm <= 250:
        return 100+(spm-100)*(100/150)
    if spm <= 350:
        return 200+(spm-250)*(100/100)
    if spm <= 430:
        return 300+(spm-350)*(100/80)
    return 400+(spm-430)*(100/430)
   
# Add the spm sub-index column.
df['SPMi']=df['spm'].apply(cal_SPMi)
df1= df[['spm','SPMi']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def cal_pmi(pm2_5):
    """Convert a pm2_5 value into its sub-index by piecewise-linear interpolation.

    Breakpoints are 50, 100, 250, 350 and 450. Anything that matches no earlier
    range (values above 450, and NaN) goes through the final formula, so NaN
    input produces NaN.
    """
    pmi = 0
    if pm2_5 <= 50:
        pmi = pm2_5 * (50 / 50)
    elif pm2_5 <= 100:
        pmi = 50 + (pm2_5 - 50) * (50 / 50)
    elif pm2_5 <= 250:
        pmi = 100 + (pm2_5 - 100) * (100 / 150)
    elif pm2_5 <= 350:
        pmi = 200 + (pm2_5 - 250) * (100 / 100)
    elif pm2_5 <= 450:
        pmi = 300 + (pm2_5 - 350) * (100 / 100)
    else:
        # Measure the excess from 450, where the previous segment ends at 400.
        # Using 430 here made the index jump from 400 to 425 right at 450.
        pmi = 400 + (pm2_5 - 450) * (100 / 80)
    return pmi
# Add the pm2_5 sub-index column.
df['PMi']=df['pm2_5'].apply(cal_pmi)
df1= df[['pm2_5','PMi']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def cal_aqi(si,ni,rspmi,spmi):
    """Return the overall AQI: the largest of the four sub-indices.

    Ties are handled correctly; the earlier strict ">" comparisons returned 0
    whenever two sub-indices shared the maximum. Missing (NaN) sub-indices are
    ignored, and 0 is returned if all four are missing.
    """
    # NaN is the only value that is not equal to itself, so this filter drops
    # missing sub-indices without needing an extra import.
    available = [value for value in (si, ni, rspmi, spmi) if value == value]
    return max(available) if available else 0

# Row-wise AQI from the SOi, Noi, RSPMi and SPMi columns (PMi is not passed in).
df['AQI']=df.apply(lambda x:cal_aqi(x['SOi'],x['Noi'],x['RSPMi'],x['SPMi']),axis=1)
df1= df[['state','SOi','Noi','RSPMi','SPMi','AQI']]
# Not the last expression of the cell, so this preview is not displayed.
df1.head()
def AQI_Range(x):
    """Map a numeric AQI value to a category label.

    Below 101 -> "Good"; 101 up to and including 201 -> "Moderate";
    above 201 -> "Poor". Returns None only when no comparison holds (NaN).

    The earlier version returned None for exactly 101 and for values above
    1000, which later turned into a spurious extra class.
    """
    if x < 101:
        return "Good"
    if x <= 201:
        return "Moderate"
    if x > 201:
        return "Poor"
    return None
# Label every row with its AQI category.
df['AQI_Range'] = df['AQI'] .apply(AQI_Range)
# Last expression of the cell, so this preview is what the cell displays.
df.head()


In [None]:
# Force the labels to plain strings. Any row for which AQI_Range returned None
# becomes the literal string 'None' here and is then treated as its own class.
df['AQI_Range']=df['AQI_Range'].astype(str)

## Label Encoding

In [None]:
# Encode the AQI_Range strings as integer class labels.
# LabelEncoder is already imported in the notebook's first cell.
s = LabelEncoder()
enc = s.fit_transform(df['AQI_Range'])
# Give the label frame df's own index so the column-wise concat pairs every
# label with its row even if df's index is not the default 0..n-1 range.
df2 = pd.DataFrame(data=enc, index=df.index)
df3 = pd.concat([df, df2], axis=1)
df3.rename(columns={0: 'AQI_LABEL'}, inplace=True)
# Only the last expression of a cell is displayed, so the former bare `df3`
# and `df3.head(3)` lines had no effect and were removed.
df3.tail(50)

In [None]:
# Histograms of all numeric columns, including the derived sub-indices.
df3.hist(layout=(5,3),figsize=(22,22))
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Copy of df3 without the pm2_5 column and its sub-index.
df5 = df3.drop(columns=["PMi", "pm2_5"])

In [None]:
# Pairwise scatter plots of the columns kept in df5.
scatter_matrix(df5,figsize=(9,9))
# plt.show must be called; a bare `plt.show` only echoes the function object.
plt.show()

## Applying Different Models

## Logistic Regression Model 1

In [None]:
# Features are the four sub-indices; the target is the encoded AQI category.
# (LogisticRegression and train_test_split are imported in the first cell.)
lr1_feature_columns = ["SOi","Noi","RSPMi","SPMi"]
X1 = df3[lr1_feature_columns]
Y1 = df3['AQI_LABEL']


In [None]:
# 70/30 train/test split with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3, random_state=10)

In [None]:
# Baseline logistic regression with default settings, fitted on the unscaled sub-indices.
model = LogisticRegression()
model.fit(x_train,y_train)

In [None]:
# Predicted class labels for the held-out rows.
pred=model.predict(x_test)

In [None]:
# Mean accuracy on the held-out rows.
model.score(x_test,y_test) 

In [None]:
# Confusion matrix for the baseline model (true labels vs. predictions).
# confusion_matrix is imported in the first cell.
con_mat = confusion_matrix(y_true=y_test, y_pred=pred)
print(con_mat)

In [None]:
# Per-class precision / recall / F1 for the baseline model.
# classification_report is imported in the first cell.
lr1_report = classification_report(y_test, pred)
print(lr1_report)

## Logistic Regression Model 2 with Balanced Classes Using SMOTE

In [None]:
# Same features and target as the baseline model (SMOTE is imported in the first cell).
lr2_feature_columns = ["SOi","Noi","RSPMi","SPMi"]
a = df3[lr2_feature_columns]
b = df3['AQI_LABEL']

In [None]:
# 70/30 train/test split with the same seed as the baseline model.
x_train0, x_test0, y_train0, y_test0 = train_test_split(a, b, test_size=0.3, random_state=10)

In [None]:
# Standardise the features; the scaler learns its statistics from the training
# rows only and is then applied to both splits.
# (StandardScaler is imported in the first cell.)
sc_X = StandardScaler().fit(x_train0)
x_train0 = sc_X.transform(x_train0)
x_test0 = sc_X.transform(x_test0)

In [None]:
# Oversample the minority classes in the training split only.
# (SMOTE is imported in the first cell.)
# random_state makes the synthetic samples reproducible between runs.
smt = SMOTE(random_state=10)
# fit_resample replaces fit_sample, which has been removed from imbalanced-learn.
x_train0, y_train0 = smt.fit_resample(x_train0, y_train0)

In [None]:
# Logistic regression fitted on the scaled, SMOTE-resampled training data.
model2 = LogisticRegression()
model2.fit(x_train0,y_train0)

In [None]:
# Predicted class labels for the held-out rows.
y_pred0=model2.predict(x_test0)
# Class counts of the resampled training labels. Placed last so the notebook
# actually displays them; as the first statement the result was discarded.
np.bincount(y_train0)

In [None]:
# Mean accuracy of the SMOTE-trained model on the held-out rows.
model2.score(x_test0,y_test0) 

In [None]:
# Confusion matrix for the SMOTE-trained model (true labels vs. predictions).
confusion_matrix(y_test0, y_pred0)

In [None]:
# Per-class precision / recall / F1 for the SMOTE-trained model.
# classification_report is imported in the first cell.
lr2_report = classification_report(y_test0, y_pred0)
print(lr2_report)

## KNeighborsClassifier

In [None]:
# Same four sub-index features and encoded target, this time for the k-NN model.
knn_feature_columns = ["SOi","Noi","RSPMi","SPMi"]
X2 = df3[knn_feature_columns]
Y2 = df3['AQI_LABEL']

In [None]:
# 70/30 train/test split with the same seed as the other models.
x_train1, x_test1, y_train1, y_test1 = train_test_split(X2, Y2, test_size=0.3, random_state=10)

In [None]:
## Standard Scaling

In [None]:
# Standardise the features; statistics come from the training rows only.
# (StandardScaler is imported in the first cell.)
sc_X = StandardScaler().fit(x_train1)
x_train2 = sc_X.transform(x_train1)
x_test2 = sc_X.transform(x_test1)

In [None]:
# k-NN classifier: 6 neighbours, each with an equal vote.
# (KNeighborsClassifier is imported in the first cell.)
model3 = KNeighborsClassifier(
    n_neighbors=6,
    weights='uniform',
)
model3.fit(x_train2, y_train1)

In [None]:
# Predicted class labels for the scaled held-out rows.
y_predict = model3.predict(x_test2)

In [None]:
# Fraction of held-out rows the k-NN model classified correctly.
accuracy_score(y_test1,y_predict)

In [None]:
# Confusion matrix for the k-NN model (true labels vs. predictions).
confusion_matrix(y_test1, y_predict)

In [None]:
# Per-class precision / recall / F1 for the k-NN model.
print(classification_report(y_test1,y_predict))

## Support Vector Classifier

In [None]:
# Same four sub-index features (column order differs from the earlier models)
# and the encoded target, for the support vector classifier.
svc_feature_columns = ["SPMi","SOi","Noi","RSPMi"]
X3 = df3[svc_feature_columns]
Y3 = df3['AQI_LABEL']

In [None]:
# 70/30 train/test split with the same seed as the other models.
x_train3, x_test3, y_train3, y_test3 = train_test_split(X3, Y3, test_size=0.3, random_state=10)

In [None]:
# Standardise the features; statistics come from the training rows only.
# (StandardScaler is imported in the first cell.)
sc_X = StandardScaler().fit(x_train3)
x_train4 = sc_X.transform(x_train3)
x_test4 = sc_X.transform(x_test3)

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear-kernel support vector classifier on the scaled features.
# (SVC is imported in the first cell.)
classifier = SVC(
    kernel='linear',
    random_state=30,
)
classifier.fit(x_train4, y_train3)

In [None]:
# Predicted class labels for the scaled held-out rows.
Y_Pred = classifier.predict(x_test4)

In [None]:
# Fraction of held-out rows the SVC classified correctly.
accuracy_score(y_test3,Y_Pred)

In [None]:
# Confusion matrix for the SVC (true labels vs. predictions).
cm = confusion_matrix(y_test3, Y_Pred)
cm

In [None]:
# Per-class precision / recall / F1 for the SVC.
print(classification_report(y_test3,Y_Pred))