<a href="https://www.kaggle.com/code/atifmasih/ozone-concentration-levels-prediction?scriptVersionId=192646729" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ozone-concentration-levels-in-castnet-dataset/air quality  ozone concentrations.csv


# Problem Statement:

**Our goal is to develop a predictive model that can accurately forecast ozone concentration levels (O3) at various sites based on historical data. The model should be able to capture the complex relationships between ozone levels, time, and environmental factors.**

*Import Libraries*

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*use_inf_as_na.*")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

*Load Dataset*

In [3]:
# Load data
data = pd.read_csv('/kaggle/input/ozone-concentration-levels-in-castnet-dataset/air quality  ozone concentrations.csv')
data.head(5)

Unnamed: 0,Site ID,Ozone,Units,QA Code,Update_Date,Ozone F,Selected Date_Time
0,ABT147,27,PPB,3,12/31/2023 1:20:26 AM,-,12/31/2023 12:00:00 AM
1,ALC188,25,PPB,3,12/31/2023 2:20:25 AM,-,12/31/2023 12:00:00 AM
2,ANA115,25,PPB,3,12/31/2023 1:20:30 AM,-,12/31/2023 12:00:00 AM
3,ARE128,21,PPB,3,12/31/2023 1:20:31 AM,-,12/31/2023 12:00:00 AM
4,ASH135,-,-,X,12/30/2023 10:00:03 PM,-,12/31/2023 12:00:00 AM


*Explore Dataset*

In [4]:
data.shape

(211043, 7)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211043 entries, 0 to 211042
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Site ID             211043 non-null  object
 1   Ozone               211043 non-null  object
 2   Units               211043 non-null  object
 3   QA Code             211043 non-null  object
 4   Update_Date         211043 non-null  object
 5   Ozone F             211043 non-null  object
 6   Selected Date_Time  211043 non-null  object
dtypes: object(7)
memory usage: 11.3+ MB


*Convert columns to appropriate data types*

In [6]:
#Convert date related columns to datatime
data['Update_Date'] = pd.to_datetime(data['Update_Date'], errors='coerce')
data['Selected Date_Time'] = pd.to_datetime(data['Selected Date_Time'],format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [7]:
data.head(4)

Unnamed: 0,Site ID,Ozone,Units,QA Code,Update_Date,Ozone F,Selected Date_Time
0,ABT147,27,PPB,3,2023-12-31 01:20:26,-,NaT
1,ALC188,25,PPB,3,2023-12-31 02:20:25,-,NaT
2,ANA115,25,PPB,3,2023-12-31 01:20:30,-,NaT
3,ARE128,21,PPB,3,2023-12-31 01:20:31,-,NaT


In [8]:
# Convert numeric columns (assume columns with numerical data need conversion)
numeric_columns = data.columns.drop(['Site ID','Units', 'Update_Date', 'Selected Date_Time']).tolist()

for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Check data types after conversion
print(data.dtypes)

Site ID                       object
Ozone                        float64
Units                         object
QA Code                      float64
Update_Date           datetime64[ns]
Ozone F                      float64
Selected Date_Time    datetime64[ns]
dtype: object


In [9]:
data.describe()

Unnamed: 0,Ozone,QA Code,Update_Date,Ozone F,Selected Date_Time
count,183986.0,208863.0,211043,0.0,0
mean,30.204603,2.98489,2023-10-05 07:18:46.343285504,,NaT
min,0.0,1.0,2023-06-30 22:00:06,,NaT
25%,21.0,3.0,2023-08-18 23:42:15,,NaT
50%,30.0,3.0,2023-10-05 11:00:41,,NaT
75%,39.0,3.0,2023-11-19 05:20:29,,NaT
max,103.0,3.0,2024-07-19 14:15:57,,NaT
std,13.349808,0.173184,,,


**Data PreProcessing**

In [10]:
# Check again for missing values
print(data.isnull().sum())

Site ID                    0
Ozone                  27057
Units                      0
QA Code                 2180
Update_Date                0
Ozone F               211043
Selected Date_Time    211043
dtype: int64


In [11]:
data = data.drop(columns = ["Selected Date_Time","Update_Date","Ozone F"], axis = 'columns')

In [12]:
#data = data.drop(columns = ["Ozone F"], axis = 'columns')

In [13]:
data.head(2)

Unnamed: 0,Site ID,Ozone,Units,QA Code
0,ABT147,27.0,PPB,3.0
1,ALC188,25.0,PPB,3.0


In [14]:
# Impute missing values using mean
data = data.dropna()
data.shape

(183986, 4)

In [15]:
data.head(5)

Unnamed: 0,Site ID,Ozone,Units,QA Code
0,ABT147,27.0,PPB,3.0
1,ALC188,25.0,PPB,3.0
2,ANA115,25.0,PPB,3.0
3,ARE128,21.0,PPB,3.0
5,BFT142,24.0,PPB,3.0


*Handling Categorical Values*

In [16]:
data = pd.get_dummies(data, drop_first = True)

*Normlizing the values*

In [17]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Scale the features
df_scaled = scaler.fit_transform(data)

*Create train and test*

In [18]:
X = data.drop("Ozone", axis = 'columns')
y = data["Ozone"]

In [19]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

*Models definition*

In [20]:
# Define machine learning algorithms
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor(n_estimators=100)
nn_model = MLPRegressor(hidden_layer_sizes=(100,))

*Models training*

In [21]:
# Train models
lr_model.fit(X_train, y_train)

In [22]:
dt_model.fit(X_train, y_train)

In [23]:
rf_model.fit(X_train, y_train)


In [24]:
nn_model.fit(X_train, y_train)


*Models evaluations*

In [25]:
# Evaluate models
print("Linear Regression:", lr_model.score(X_test, y_test))
print("Decision Tree:", dt_model.score(X_test, y_test))
print("Random Forest:", rf_model.score(X_test, y_test))
print("Neural Network:", nn_model.score(X_test, y_test))


Linear Regression: 0.16817576993293182
Decision Tree: 0.16817576993293193
Random Forest: 0.16816406315992005
Neural Network: 0.16794093814378375


*Models Predictions*

In [26]:
# Make predictions
y_pred_lr = lr_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_nn = nn_model.predict(X_test)


*Models Prediction's Comparision*

In [27]:
# Compare predictions
print("Mean Absolute Error (MAE):")
print("Linear Regression:", np.mean(np.abs(y_pred_lr - y_test)))
print("Decision Tree:", np.mean(np.abs(y_pred_dt - y_test)))
print("Random Forest:", np.mean(np.abs(y_pred_rf - y_test)))
print("Neural Network:", np.mean(np.abs(y_pred_nn - y_test)))

Mean Absolute Error (MAE):
Linear Regression: 9.748286836062293
Decision Tree: 9.748286836062293
Random Forest: 9.748591445656771
Neural Network: 9.74704339160495
