In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
import joblib

In [2]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
air_quality = pd.read_csv('Data/air_quality_data.csv', parse_dates=True)
air_quality

Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01 01:00:00,,,1.00,40.01,36.37,,1.00,122.07,,0.00,0.00,0.00,,
1,Ahmedabad,2015-01-01 02:00:00,,,0.02,27.75,19.73,,0.02,85.90,,0.00,0.00,0.00,,
2,Ahmedabad,2015-01-01 03:00:00,,,0.08,19.32,11.08,,0.08,52.83,,0.00,0.00,0.00,,
3,Ahmedabad,2015-01-01 04:00:00,,,0.30,16.45,9.20,,0.30,39.53,153.58,0.00,0.00,0.00,,
4,Ahmedabad,2015-01-01 05:00:00,,,0.12,14.90,7.85,,0.12,32.63,,0.00,0.00,0.00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737401,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
737402,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
737403,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
737404,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [4]:
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737406 entries, 0 to 737405
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   City        737406 non-null  object 
 1   Datetime    737406 non-null  object 
 2   PM2.5       587720 non-null  float64
 3   PM10        429529 non-null  float64
 4   NO          617192 non-null  float64
 5   NO2         616699 non-null  float64
 6   NOx         609997 non-null  float64
 7   NH3         454536 non-null  float64
 8   CO          648830 non-null  float64
 9   SO2         603179 non-null  float64
 10  O3          604176 non-null  float64
 11  Benzene     568137 non-null  float64
 12  Toluene     508758 non-null  float64
 13  Xylene      263468 non-null  float64
 14  AQI         603645 non-null  float64
 15  AQI_Bucket  603645 non-null  object 
dtypes: float64(13), object(3)
memory usage: 90.0+ MB


In [5]:
air_quality.drop(columns=["City","Datetime","AQI_Bucket","NO","NOx","NH3","Benzene","Toluene","Xylene"],
                 inplace=True)
air_quality.dropna(inplace=True)
air_quality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 357515 entries, 38289 to 737405
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   PM2.5   357515 non-null  float64
 1   PM10    357515 non-null  float64
 2   NO2     357515 non-null  float64
 3   CO      357515 non-null  float64
 4   SO2     357515 non-null  float64
 5   O3      357515 non-null  float64
 6   AQI     357515 non-null  float64
dtypes: float64(7)
memory usage: 21.8 MB


In [6]:
air_quality.describe()

Unnamed: 0,PM2.5,PM10,NO2,CO,SO2,O3,AQI
count,357515.0,357515.0,357515.0,357515.0,357515.0,357515.0,357515.0
mean,58.501929,120.040644,29.897311,1.374107,12.478692,35.929979,142.821896
std,60.153426,102.553345,26.401232,4.106973,14.22762,28.773134,106.350151
min,0.01,0.01,0.01,0.0,0.01,0.01,8.0
25%,23.46,53.88,11.88,0.49,5.6,15.4,74.0
50%,41.61,92.8,22.93,0.78,9.11,28.1,108.0
75%,70.72,148.14,39.34,1.24,14.32,49.33,172.0
max,999.99,1000.0,499.51,392.16,199.96,497.62,2859.0


In [7]:
x = air_quality.drop(columns=["AQI"])
y = air_quality["PM2.5"]

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [9]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 286012 entries, 711282 to 649481
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   PM2.5   286012 non-null  float64
 1   PM10    286012 non-null  float64
 2   NO2     286012 non-null  float64
 3   CO      286012 non-null  float64
 4   SO2     286012 non-null  float64
 5   O3      286012 non-null  float64
dtypes: float64(6)
memory usage: 15.3 MB


In [10]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 286012 entries, 711282 to 649481
Series name: PM2.5
Non-Null Count   Dtype  
--------------   -----  
286012 non-null  float64
dtypes: float64(1)
memory usage: 4.4 MB


In [11]:
y_test.info()

<class 'pandas.core.series.Series'>
Index: 71503 entries, 293189 to 174222
Series name: PM2.5
Non-Null Count  Dtype  
--------------  -----  
71503 non-null  float64
dtypes: float64(1)
memory usage: 1.1 MB


In [12]:
x_test.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 71503 entries, 293189 to 174222
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PM2.5   71503 non-null  float64
 1   PM10    71503 non-null  float64
 2   NO2     71503 non-null  float64
 3   CO      71503 non-null  float64
 4   SO2     71503 non-null  float64
 5   O3      71503 non-null  float64
dtypes: float64(6)
memory usage: 3.8 MB


In [13]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [14]:
model.fit(x_train, y_train)


In [15]:
#Printing the model coefficients
print('Intercept: ',model.intercept_)
# pair the feature names with the coefficients
list(zip(x, model.coef_))

Intercept:  -2.4868995751603507e-13


[('PM2.5', np.float64(1.0000000000000038)),
 ('PM10', np.float64(2.3490135528338973e-16)),
 ('NO2', np.float64(1.0975112310471396e-16)),
 ('CO', np.float64(-1.5045401015528893e-17)),
 ('SO2', np.float64(-3.17083237260132e-17)),
 ('O3', np.float64(-2.865704638448669e-17))]

In [16]:
y_pred = model.predict(x_test)

In [17]:
#Actual value and the predicted value
model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
model_diff

Unnamed: 0,Actual value,Predicted value
293189,20.85,20.85
332517,218.57,218.57
270798,173.97,173.97
244476,11.67,11.67
370843,5.25,5.25
...,...,...
80057,100.31,100.31
53464,25.25,25.25
734443,30.64,30.64
416107,21.05,21.05


In [18]:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', r2)

Mean Absolute Error: 1.579552784933014e-13
Mean Square Error: 6.195406145144475e-26
Root Mean Square Error: 2.489057280406474e-13


In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
check = pd.DataFrame([138,246,70.9,1140,9.8,67.1])

In [25]:
# Predicting the data in 'check'
predicted_value = model.predict(check.T)
predicted_value

array([138.])

In [26]:
joblib.dump(model, 'air_quality_model.pkl')
# Load the model

['air_quality_model.pkl']

In [27]:
loaded_model = joblib.load('air_quality_model.pkl')
# Predicting the data in 'check'
predicted_value = loaded_model.predict(check.T)
predicted_value

array([138.])