In [1]:
#https://www.kaggle.com/datasets/mujtabamatin/air-quality-and-pollution-assessment

In [2]:
import pandas as pd

In [3]:
df=pd.read_csv("updated_pollution_dataset.csv")

In [4]:
df.shape

(5000, 10)

In [5]:
df.columns

Index(['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density', 'Air Quality'],
      dtype='object')

In [6]:
df.isnull().sum()

Temperature                      0
Humidity                         0
PM2.5                            0
PM10                             0
NO2                              0
SO2                              0
CO                               0
Proximity_to_Industrial_Areas    0
Population_Density               0
Air Quality                      0
dtype: int64

In [7]:
from scipy.stats import f_oneway

In [8]:
df_cat=df.copy()
c_list=[]
nc_list=[]
for c1 in list(df_cat.columns):
    if c1 == 'Air Quality':
        continue
    else:
        groups = [df_cat[c1][df_cat['Air Quality'] == category] for category in df_cat['Air Quality'].unique()]
        f_stat, p_value = f_oneway(*groups)
        print(f"column : {c1}, F-statistic: {f_stat}, P-value: {p_value}")
        if p_value < 0.05:
            c_list.append(c1)
        else:
            nc_list.append(c1)

column : Temperature, F-statistic: 2191.9374022972033, P-value: 0.0
column : Humidity, F-statistic: 1071.2805847369366, P-value: 0.0
column : PM2.5, F-statistic: 354.8001717963518, P-value: 6.991035853584263e-209
column : PM10, F-statistic: 745.4262411714283, P-value: 0.0
column : NO2, F-statistic: 2676.2159089566176, P-value: 0.0
column : SO2, F-statistic: 2018.304415432383, P-value: 0.0
column : CO, F-statistic: 8292.518513696, P-value: 0.0
column : Proximity_to_Industrial_Areas, F-statistic: 3714.945062564165, P-value: 0.0
column : Population_Density, F-statistic: 1189.932964847901, P-value: 0.0


In [9]:
df_num = df.select_dtypes(include=['float64', 'int64'])
df_cat = df.select_dtypes(include=['object'])

In [13]:
df_num.columns

Index(['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density'],
      dtype='object')

In [14]:
df_cat.columns

Index(['Air Quality'], dtype='object')

In [12]:
def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)  # First quartile (25th percentile)
        Q3 = df[col].quantile(0.75)  # Third quartile (75th percentile)
        IQR = Q3 - Q1               # Interquartile range
        lower_bound = Q1 - 1.5 * IQR  # Lower bound
        upper_bound = Q3 + 1.5 * IQR  # Upper bound

        # Remove outliers
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    
    return df

In [15]:

columns_to_check = ['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density'] 

df_no_outliers = remove_outliers_iqr(df_num, columns_to_check)

print("Original DataFrame:")
print(df_num.shape)
print("\nDataFrame after Outlier Treatment:")
print(df_no_outliers.shape)

Original DataFrame:
(5000, 9)

DataFrame after Outlier Treatment:
(4158, 9)


In [17]:
# Encode categorical variables
encoded_df = pd.get_dummies(df, columns=['Air Quality'])

# Check the shape of the encoded dataframe
encoded_df.shape

(5000, 13)

In [18]:
df_no_outliers.shape

(4158, 9)

# min max scale

In [19]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [20]:
columns_to_scale = ['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density']

In [21]:
minmax_scaler = MinMaxScaler()
df_minmax_scaled = df_no_outliers.copy()
df_minmax_scaled[columns_to_scale] = minmax_scaler.fit_transform(df_no_outliers[columns_to_scale])

In [22]:
df_minmax_scaled.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density
0,0.483776,0.304749,0.091873,0.283255,0.292621,0.52381,0.540404,0.224852,0.194651
1,0.439528,0.522427,0.040636,0.194053,0.59542,0.540816,0.5,0.207101,0.628529
2,0.286136,0.510554,0.471731,0.532081,0.43257,0.639456,0.494949,0.159763,0.640416
3,0.40413,0.040897,0.107774,0.101721,0.155216,0.391156,0.252525,0.508876,0.539376
4,0.386431,0.457784,0.121908,0.253521,0.368957,0.401361,0.181818,0.60355,0.170877


it transforms the data so that it has a mean of 0 and a std of 1

In [23]:
standard_scaler = StandardScaler()
df_standard_scaled = df_no_outliers.copy()
df_standard_scaled[columns_to_scale] = standard_scaler.fit_transform(df_no_outliers[columns_to_scale])

In [24]:
df_standard_scaled.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density
0,0.208528,-0.572424,-0.684376,-0.302789,-0.747481,0.155763,0.85237,-0.79046,-1.079606
1,-0.057517,0.557506,-0.924151,-0.708591,0.856706,0.255309,0.663646,-0.878203,1.008238
2,-0.979808,0.495873,1.093263,0.829185,-0.00605,0.832674,0.640056,-1.112184,1.06544
3,-0.270354,-1.942035,-0.609963,-1.128632,-1.475431,-0.620693,-0.492284,0.613427,0.579229
4,-0.376772,0.221951,-0.543819,-0.438056,-0.343064,-0.560965,-0.82255,1.081389,-1.194008


In [25]:
final_df = pd.concat([df_standard_scaled, encoded_df], axis=1)

In [26]:
final_df.shape

(5000, 22)

In [27]:
final_df.columns

Index(['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density', 'Temperature',
       'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
       'Proximity_to_Industrial_Areas', 'Population_Density',
       'Air Quality_Good', 'Air Quality_Hazardous', 'Air Quality_Moderate',
       'Air Quality_Poor'],
      dtype='object')