IMPORT DATASET

In [17]:
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import classification_report, accuracy_score

import matplotlib.pyplot as plt

import seaborn as sns

In [18]:
# Read air_data.csv (path is relative to the notebook's working directory).
data = pd.read_csv("air_data.csv")
# Bare last expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2010-01-01,DKI1 (Bunderan HI),60.0,,4.0,73.0,27.0,14.0,73.0,CO,SEDANG
1,2010-01-02,DKI1 (Bunderan HI),32.0,,2.0,16.0,33.0,9.0,33.0,O3,BAIK
2,2010-01-03,DKI1 (Bunderan HI),27.0,,2.0,19.0,20.0,9.0,27.0,PM10,BAIK
3,2010-01-04,DKI1 (Bunderan HI),22.0,,2.0,16.0,15.0,6.0,22.0,PM10,BAIK
4,2010-01-05,DKI1 (Bunderan HI),25.0,,2.0,17.0,15.0,8.0,25.0,PM10,BAIK
...,...,...,...,...,...,...,...,...,...,...,...
4620,2023-11-26,DKI1 (Bunderan HI),55.0,75.0,43.0,15.0,15.0,25.0,75.0,PM25,SEDANG
4621,2023-11-27,DKI3 (Jagakarsa),54.0,77.0,56.0,13.0,27.0,16.0,77.0,PM25,SEDANG
4622,2023-11-28,DKI1 (Bunderan HI),62.0,96.0,45.0,15.0,29.0,34.0,96.0,PM25,SEDANG
4623,2023-11-29,DKI4 (Lubang Buaya),71.0,105.0,30.0,19.0,22.0,14.0,105.0,PM25,TIDAK SEHAT


In [19]:
# Turn the text label into an ordinal code, ordered from best to worst air
# quality. Labels missing from this mapping would become NaN.
data['categori_encoded'] = data['categori'].map({
    'BAIK': 0,                # good
    'SEDANG': 1,              # moderate
    'TIDAK SEHAT': 2,         # unhealthy
    'SANGAT TIDAK SEHAT': 3,  # very unhealthy
    'BERBAHAYA': 4,           # hazardous
})

# Keep only the remaining pollutant readings and the encoded label.
# NOTE: this cell rebinds `data` and removes 'categori', so it cannot be
# re-run without first re-running the cell that loads the CSV.
data = data.drop(
    ["tanggal", "stasiun", "max", "critical", "pm25", "categori"],
    axis=1,
)
data

Unnamed: 0,pm10,so2,co,o3,no2,categori_encoded
0,60.0,4.0,73.0,27.0,14.0,1
1,32.0,2.0,16.0,33.0,9.0,0
2,27.0,2.0,19.0,20.0,9.0,0
3,22.0,2.0,16.0,15.0,6.0,0
4,25.0,2.0,17.0,15.0,8.0,0
...,...,...,...,...,...,...
4620,55.0,43.0,15.0,15.0,25.0,1
4621,54.0,56.0,13.0,27.0,16.0,1
4622,62.0,45.0,15.0,29.0,34.0,1
4623,71.0,30.0,19.0,22.0,14.0,2


In [20]:
# Print dtypes and non-null counts to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   pm10              4466 non-null   float64
 1   so2               4607 non-null   float64
 2   co                4618 non-null   float64
 3   o3                4621 non-null   float64
 4   no2               4618 non-null   float64
 5   categori_encoded  4625 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 216.9 KB


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,pm10,so2,co,o3,no2,categori_encoded
count,4466.0,4607.0,4618.0,4621.0,4618.0,4625.0
mean,65.086879,28.183851,30.263534,86.58667,18.120832,1.366486
std,19.24315,13.822107,13.640344,50.295879,8.451605,0.66483
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,54.0,18.0,21.0,51.0,13.0,1.0
50%,64.0,27.0,29.0,78.0,17.0,1.0
75%,75.0,34.0,37.0,110.0,21.0,2.0
max,179.0,126.0,134.0,314.0,134.0,4.0


In [22]:
from scipy.stats import chi2
from scipy.spatial import distance

def mahalanobis_impute(data):
    """Fill missing values in ``data`` with the mean of their column, in place.

    Despite its name, this function performs plain mean imputation. An
    earlier version also inverted the covariance matrix and computed a
    Mahalanobis distance per incomplete row, but that distance was never used
    (and was NaN, since the row itself contained NaN), while the matrix
    inversion could fail on a singular covariance matrix. That dead
    computation has been removed; the imputed values are unchanged.

    The name and the in-place behaviour are kept so existing callers keep
    working.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns all support ``mean()``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``. Every missing entry is replaced by the
        mean of the observed values in its column. A column with no observed
        values has a NaN mean and is therefore left unchanged.
    """
    # Compute all means up front, before anything is filled, so each fill
    # value reflects observed data only (pandas skips NaN by default).
    column_means = data.mean()

    for column in data.columns:
        missing = data[column].isnull()
        if missing.any():
            # One masked assignment per column instead of one per row.
            data.loc[missing, column] = column_means[column]

    return data


# Fill missing values with each column's mean. mahalanobis_impute modifies
# `data` in place and returns it, so `imputed_data` and `data` refer to the
# same object.

imputed_data = mahalanobis_impute(data)


# Show the frame after imputation

print(imputed_data)

      pm10   so2    co    o3   no2  categori_encoded
0     60.0   4.0  73.0  27.0  14.0                 1
1     32.0   2.0  16.0  33.0   9.0                 0
2     27.0   2.0  19.0  20.0   9.0                 0
3     22.0   2.0  16.0  15.0   6.0                 0
4     25.0   2.0  17.0  15.0   8.0                 0
...    ...   ...   ...   ...   ...               ...
4620  55.0  43.0  15.0  15.0  25.0                 1
4621  54.0  56.0  13.0  27.0  16.0                 1
4622  62.0  45.0  15.0  29.0  34.0                 1
4623  71.0  30.0  19.0  22.0  14.0                 2
4624  38.0  43.0  12.0  34.0  34.0                 1

[4625 rows x 6 columns]


In [23]:
# Target: the encoded category. Features: every other column.

y = data['categori_encoded']

X = data.drop(columns=['categori_encoded'])

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [25]:
# Fit a Random Forest on the training split, then score the held-out split.
# fit() returns the estimator itself, so construction and fitting are chained.

rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)

rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)


In [26]:
# Fit Gradient Boosting on the same training split and score the same
# held-out split, so the two reports are directly comparable.

gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

gb_predictions = gb_model.predict(X_test)

gb_report = classification_report(y_true=y_test, y_pred=gb_predictions)

In [27]:
# Show per-class precision / recall / F1 for the Random Forest model

print("Random Forest Classification Report:", rf_report, sep="\n")

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        91
           1       0.97      0.97      0.97       762
           2       0.96      0.96      0.96       463
           3       1.00      0.97      0.99        72

    accuracy                           0.97      1388
   macro avg       0.98      0.97      0.97      1388
weighted avg       0.97      0.97      0.97      1388



In [28]:
# Show per-class precision / recall / F1 for the Gradient Boosting model

print("Gradient Boosting Classification Report:", gb_report, sep="\n")

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97        91
           1       0.98      0.97      0.97       762
           2       0.95      0.97      0.96       463
           3       1.00      0.97      0.99        72

    accuracy                           0.97      1388
   macro avg       0.97      0.97      0.97      1388
weighted avg       0.97      0.97      0.97      1388

