In [75]:
# import the required modules

In [76]:
import pandas as pd,numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [77]:
# download the dataset from UCI website and load to pandas data frame

In [78]:
air=pd.read_excel('AirQualityUCI.xlsx')

In [79]:
air.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [80]:
# data preparation and cleaning 
# let check the dimension of the dataframe

In [81]:
air.shape

(9357, 15)

In [82]:
# NA values are represented by -200 . replace -200 with NAN

In [83]:
air=air.replace(-200,np.nan)

In [84]:
# let check the missing values in the dateframe

In [85]:
round(100*(air.isnull().sum()/len(air.index)),2)

Date              0.00
Time              0.00
CO(GT)           17.99
PT08.S1(CO)       3.91
NMHC(GT)         90.23
C6H6(GT)          3.91
PT08.S2(NMHC)     3.91
NOx(GT)          17.52
PT08.S3(NOx)      3.91
NO2(GT)          17.55
PT08.S4(NO2)      3.91
PT08.S5(O3)       3.91
T                 3.91
RH                3.91
AH                3.91
dtype: float64

In [86]:
# will drop the columns having more than 40% NA values

In [87]:
air=air.drop(air.loc[:,list(round(100*(air.isnull().sum()/len(air.index)),2)>40)].columns,1)

In [88]:
# let drop all the rows having NAN values

In [89]:
air=air.dropna()

In [90]:
air.shape

(6941, 14)

In [91]:
air.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6941 entries, 0 to 9356
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           6941 non-null   datetime64[ns]
 1   Time           6941 non-null   object        
 2   CO(GT)         6941 non-null   float64       
 3   PT08.S1(CO)    6941 non-null   float64       
 4   C6H6(GT)       6941 non-null   float64       
 5   PT08.S2(NMHC)  6941 non-null   float64       
 6   NOx(GT)        6941 non-null   float64       
 7   PT08.S3(NOx)   6941 non-null   float64       
 8   NO2(GT)        6941 non-null   float64       
 9   PT08.S4(NO2)   6941 non-null   float64       
 10  PT08.S5(O3)    6941 non-null   float64       
 11  T              6941 non-null   float64       
 12  RH             6941 non-null   float64       
 13  AH             6941 non-null   float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 813.4+ KB


In [92]:
# let drop Data and Time

In [93]:
air=air.drop(['Date','Time'],axis=1)

In [94]:
# split train dataset into X and Y

In [95]:
y=air.pop('RH')

In [96]:
x=air

In [97]:
# rescaling and split the data set into train and test
# create a scaling object 

In [98]:
scaler=StandardScaler()

In [99]:
# scale these variable using "fit_transfrom"

In [100]:
xstd=scaler.fit_transform(x)

In [101]:
# split the data into train and test with test size and 30% and train size as 70%

In [102]:
x_train, x_test, y_train, y_test=train_test_split(xstd,y,test_size=0.3,random_state=100)

In [103]:
print('Training data size:',x_train.shape)

Training data size: (4858, 11)


In [104]:
print('Test data size:',x_test.shape)

Test data size: (2083, 11)


In [105]:
# model buliding 
# predict using LinearRegression

In [106]:
lr = LinearRegression()

In [107]:
lrm = lr.fit(x_train,y_train)

In [108]:
y_pred_lr = lrm.predict(x_test)

In [109]:
print('RMSE of Linear Regression model:',np.sqrt(mean_squared_error(y_test,y_pred_lr)))

RMSE of Linear Regression model: 5.916300662074108


In [110]:
# predict using Random Forest Regressor

In [111]:
rf = RandomForestRegressor()

In [112]:
rfm = rf.fit(x_train,y_train)

In [113]:
y_pred_rf = rfm.predict(x_test)

In [114]:
print('RMSE of Random Forest model:',np.sqrt(mean_squared_error(y_test,y_pred_rf)))

RMSE of Random Forest model: 0.72172343345108


In [115]:
# predict using SV Regressor

In [116]:
svr = SVR()

In [117]:
svm = svr.fit(x_train,y_train)

In [118]:
y_pred_sv = svm.predict(x_test)

In [119]:
print('RMSE of SVM model:',np.sqrt(mean_squared_error(y_test,y_pred_sv)))

RMSE of SVM model: 3.4898461468289943


In [None]:
# RMSE is the way of measure of efficiency of the predictive models.