In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/global-air-quality-dataset/global_air_quality_data_10000.csv


# Predictive Model to Analyze the Impact of Meteorological Factors on Air Quality

**Dataset loading and Exploration**

* loading of data
* check no of rows and columns
* check types of data
* Statistical Anaylsis

In [2]:
# Load the dataset
data = pd.read_csv('/kaggle/input/global-air-quality-dataset/global_air_quality_data_10000.csv')
data.head(4)

Unnamed: 0,City,Country,Date,PM2.5,PM10,NO2,SO2,CO,O3,Temperature,Humidity,Wind Speed
0,Bangkok,Thailand,2023-03-19,86.57,25.19,99.88,30.63,4.46,36.29,17.67,59.35,13.76
1,Istanbul,Turkey,2023-02-16,50.63,97.39,48.14,8.71,3.4,144.16,3.46,67.51,6.36
2,Rio de Janeiro,Brazil,2023-11-13,130.21,57.22,98.51,9.92,0.12,179.31,25.29,29.3,12.87
3,Mumbai,India,2023-03-16,119.7,130.52,10.96,33.03,7.74,38.65,23.15,99.97,7.71


In [3]:
# checking row and coloumns
data.shape

(10000, 12)

In [4]:
# total no of samples
data.size

120000

In [5]:
# data types and complete details
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   City         10000 non-null  object 
 1   Country      10000 non-null  object 
 2   Date         10000 non-null  object 
 3   PM2.5        10000 non-null  float64
 4   PM10         10000 non-null  float64
 5   NO2          10000 non-null  float64
 6   SO2          10000 non-null  float64
 7   CO           10000 non-null  float64
 8   O3           10000 non-null  float64
 9   Temperature  10000 non-null  float64
 10  Humidity     10000 non-null  float64
 11  Wind Speed   10000 non-null  float64
dtypes: float64(9), object(3)
memory usage: 937.6+ KB


**Handle Missing Values:**

* Identify missing values in the dataset.

In [6]:
# Check for missing values
missing_values = data.isnull().sum()
missing_values

City           0
Country        0
Date           0
PM2.5          0
PM10           0
NO2            0
SO2            0
CO             0
O3             0
Temperature    0
Humidity       0
Wind Speed     0
dtype: int64

**Handle Outliers:**

In [7]:
from scipy.stats import zscore

# Calculate Z-scores of the numerical columns
z_scores = data[['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']].apply(zscore)

# Remove rows with Z-scores greater than 3 or less than -3
data = data[(z_scores < 3).all(axis=1)]


**Data Normalization:**

In [8]:
from sklearn.preprocessing import StandardScaler

# Select numeri columns to normalize
columns_to_normalize = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']

# Apply StandardScaler
scaler = StandardScaler()
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])


**Convert the 'Date' column to datetime format and extract features**

In [9]:
# Convert 'Date' column to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Extract additional features from date
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['DayOfWeek'] = data['Date'].dt.dayofweek

In [10]:
data.head(2)

Unnamed: 0,City,Country,Date,PM2.5,PM10,NO2,SO2,CO,O3,Temperature,Humidity,Wind Speed,Year,Month,Day,DayOfWeek
0,Bangkok,Thailand,2023-03-19,0.217565,-1.439315,1.745347,0.375112,-0.206131,-1.266221,0.191985,0.164406,0.626447,2023,3,19,6
1,Istanbul,Turkey,2023-02-16,-0.639665,-0.12801,-0.148564,-1.180547,-0.577737,0.692254,-0.791878,0.478482,-0.687393,2023,2,16,3


**Feature Engineering**

* Create lagged features for pollutants and meteorological data

In [11]:
# Create lagged features for pollutants and meteorological data
lag_features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']
for feature in lag_features:
    for lag in range(1, 4):  # Create 3 lagged features
        data[f'{feature}_lag{lag}'] = data[feature].shift(lag)

# Drop rows with NaN values created by shifting
data.dropna(inplace=True)


**Aggregate data at different time scales (daily, weekly, monthly).**

In [12]:
# Separate numeric columns
numeric_columns = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'O3', 'Temperature', 'Humidity', 'Wind Speed']
non_numeric_columns = ['City', 'Country']

# Resample and aggregate numeric columns
daily_data = data.set_index('Date')[numeric_columns].resample('D').mean()
weekly_data = data.set_index('Date')[numeric_columns].resample('W').mean()
monthly_data = data.set_index('Date')[numeric_columns].resample('M').mean()

# Reset index to bring 'Date' back as a column
daily_data.reset_index(inplace=True)
weekly_data.reset_index(inplace=True)
monthly_data.reset_index(inplace=True)

# Merge non-numeric columns back to the aggregated data if required
# Note: This step assumes that non-numeric data is constant within each resampling period, which may not be the case.
# You might need to use a different approach if non-numeric data varies.

daily_data = data[['Date']].merge(daily_data, on='Date', how='left')
weekly_data = data[['Date']].merge(weekly_data, on='Date', how='left')
monthly_data = data[['Date']].merge(monthly_data, on='Date', how='left')

# Display the results
print("Daily Data:")
print(daily_data.head())
print("\nWeekly Data:")
print(weekly_data.head())
print("\nMonthly Data:")
print(monthly_data.head())

Daily Data:
        Date     PM2.5      PM10       NO2       SO2        CO        O3  \
0 2023-03-16 -0.303002  0.156178  0.099159 -0.133505  0.086862  0.007106   
1 2023-04-04  0.039780 -0.260048  0.054895  0.326084  0.094339  0.242457   
2 2023-01-05  0.162421 -0.136776 -0.195066  0.058559  0.236534 -0.220400   
3 2023-09-17 -0.222109  0.132705  0.192023 -0.002512  0.079639 -0.112833   
4 2023-11-23  0.076746  0.334524  0.109735 -0.242943  0.009242 -0.195466   

   Temperature  Humidity  Wind Speed  
0    -0.149838  0.351490   -0.094282  
1    -0.200389 -0.041947    0.428039  
2     0.176007 -0.007170   -0.207678  
3    -0.164818  0.198044   -0.183593  
4     0.209445  0.078490   -0.009322  

Weekly Data:
        Date     PM2.5      PM10       NO2      SO2        CO        O3  \
0 2023-03-16       NaN       NaN       NaN      NaN       NaN       NaN   
1 2023-04-04       NaN       NaN       NaN      NaN       NaN       NaN   
2 2023-01-05       NaN       NaN       NaN      NaN       

  monthly_data = data.set_index('Date')[numeric_columns].resample('M').mean()


**Model Train and Prediction**

* LinearRegression
* DecisionTreeRegressor

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Define features and target variable
features = data.drop(columns=['Date', 'City', 'Country', 'PM2.5'])
target = data['PM2.5']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Decision Tree
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)


* RandomForestRegressor
* GradientBoostingRegressor

In [14]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Gradient Boosting
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)


**Evaluate the model performance on the test set using metrics like RMSE or MAE**

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predictions
linear_preds = linear_model.predict(X_test)
tree_preds = tree_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)

# Evaluate Linear Regression
linear_rmse = mean_squared_error(y_test, linear_preds, squared=False)
linear_mae = mean_absolute_error(y_test, linear_preds)

# Evaluate Decision Tree
tree_rmse = mean_squared_error(y_test, tree_preds, squared=False)
tree_mae = mean_absolute_error(y_test, tree_preds)

# Evaluate Random Forest
rf_rmse = mean_squared_error(y_test, rf_preds, squared=False)
rf_mae = mean_absolute_error(y_test, rf_preds)

# Evaluate Gradient Boosting
gb_rmse = mean_squared_error(y_test, gb_preds, squared=False)
gb_mae = mean_absolute_error(y_test, gb_preds)

# Print evaluation results
print(f'Linear Regression RMSE: {linear_rmse}, MAE: {linear_mae}')
print(f'Decision Tree RMSE: {tree_rmse}, MAE: {tree_mae}')
print(f'Random Forest RMSE: {rf_rmse}, MAE: {rf_mae}')
print(f'Gradient Boosting RMSE: {gb_rmse}, MAE: {gb_mae}')


Linear Regression RMSE: 1.0085833896981793, MAE: 0.8780362620318172
Decision Tree RMSE: 1.3927844763268755, MAE: 1.1366511437084377
Random Forest RMSE: 1.0123765698897134, MAE: 0.8799864303281281
Gradient Boosting RMSE: 1.0111187667010708, MAE: 0.8809445205833916


**Summary**

Using PM2.5 as the target variable focuses the model on predicting levels of a pollutant that is critical for public health and regulatory compliance. By training a model to predict PM2.5 concentrations based on other air quality and meteorological factors, we can gain valuable insights and take proactive measures to improve air quality.