## Import Library and Load data

In [1]:
#Modifikasi
import warnings
import zipfile
import numpy as np
import pandas as pd
from pathlib import Path
pd.set_option('display.max_columns', 100)

#Visualisasi
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly_express as px
import plotly.subplots as sp
from statsmodels.graphics.tsaplots import plot_acf
from matplotlib import rcParams

#Perhitungan
import itertools
from statsmodels.formula.api import ols
import statsmodels.api as sm
from geopy.geocoders import Nominatim
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose


#Imputasi
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


# Modeling
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV, GroupKFold,KFold, TimeSeriesSplit   
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc,roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

#Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

## Data Understanding (Train and Test)

In [3]:
train.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,,0.230527,0.559117,2.4e-05,0.000117,,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,,-0.074006,0.869309,2.4e-05,0.000127,,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,,0.02447,0.67416,2.4e-05,8.6e-05,,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,,-0.010442,0.920054,2.4e-05,0.000124,,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,,-0.176178,0.747464,2.4e-05,0.000116,,14438.79037,58.0


In [4]:
train.info(),train.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86584 entries, 0 to 86583
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID_Zindi            86584 non-null  object 
 1   Date                86584 non-null  object 
 2   ID                  86584 non-null  object 
 3   LAT                 86584 non-null  float64
 4   LON                 86584 non-null  float64
 5   Precipitation       86584 non-null  float64
 6   LST                 46798 non-null  float64
 7   AAI                 73709 non-null  float64
 8   CloudFraction       73709 non-null  float64
 9   NO2_strat           73709 non-null  float64
 10  NO2_total           73709 non-null  float64
 11  NO2_trop            51111 non-null  float64
 12  TropopausePressure  73709 non-null  float64
 13  GT_NO2              82051 non-null  float64
dtypes: float64(11), object(3)
memory usage: 9.2+ MB


(None, (86584, 14))

In [5]:
train.isna().sum()

ID_Zindi                  0
Date                      0
ID                        0
LAT                       0
LON                       0
Precipitation             0
LST                   39786
AAI                   12875
CloudFraction         12875
NO2_strat             12875
NO2_total             12875
NO2_trop              35473
TropopausePressure    12875
GT_NO2                 4533
dtype: int64

## Data Cleaning

In [6]:
train['Date'] = pd.to_datetime(train['Date'], dayfirst=True, errors='coerce')

Date = train.copy()
Date['Date'] = pd.to_datetime(Date['Date'])


# 3. Menetapkan kolom Date sebagai index
train.set_index('Date', inplace=True)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [7]:
test['Date'] = pd.to_datetime(test['Date'], dayfirst=True, errors='coerce')

Date_test = test.copy()
Date_test['Date'] = pd.to_datetime(Date['Date'])


# 3. Menetapkan kolom Date sebagai index
test.set_index('Date', inplace=True)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



1. Precipitation
    - Description: Amount of precipitation (rainfall or equivalent) measured at the location.
    - Type: Continuous (float64).
    - Units: Likely millimeters (mm) or equivalent.
    - Purpose: A critical feature for environmental and climate modeling.

2. LST (Land Surface Temperature)
    - Description: Measured temperature of the Earth's surface at the location.
    - Type: Continuous (float64).
    - Units: Likely degrees Celsius or Kelvin.
    - Purpose: Used in studies of heat distribution, climate, or agriculture.

3. AAI (Absorbing Aerosol Index)
    - Description: Indicates the presence of aerosols (e.g., dust, smoke) in the atmosphere.
    - Type: Continuous (float64).
    - Range: Positive values indicate absorbing aerosols; negative or near-zero values may indicate clear conditions.
    - Purpose: Relevant for air quality analysis and environmental studies.

4. CloudFraction
    - Description: Fractional coverage of clouds in the observed area.
    - Type: Continuous (float64).
    - Range: 0 (clear) to 1 (completely overcast).
    - Purpose: Used in weather and climate modeling.

5. NO2_strat (Stratospheric NO₂)
    - Description: Concentration of nitrogen dioxide in the stratosphere.
    - Type: Continuous (float64).
    - Units: Likely micrograms per cubic meter (µg/m³).
    - Purpose: Provides insights into atmospheric chemistry and pollution sources.

6. NO2_total (Total NO₂)
    - Description: Total nitrogen dioxide concentration in the atmosphere.
    - Type: Continuous (float64).
    - Purpose: Useful for assessing air pollution levels.

7. NO2_trop (Tropospheric NO₂)
    - Description: Concentration of nitrogen dioxide in the troposphere.
    - Type: Continuous (float64).
    - Purpose: Indicates near-surface pollution and human activity impact.

8. TropopausePressure
    - Description: Pressure at the tropopause (the boundary between the troposphere and stratosphere).
    - Type: Continuous (float64).
    - Units: Likely hectopascals (hPa).
    - Purpose: Provides insights into atmospheric dynamics and weather.

9. GT_NO2 (Ground Truth NO₂)
    - Description: Ground-level concentration of nitrogen dioxide.
    - Type: Continuous (float64).
    - Purpose: Used as a benchmark or target value for validation and modeling.


## Imputasi

In [8]:
# df = train.copy()
# df.drop(columns=['ID_Zindi','ID'],inplace=True)
# test.drop(columns=['ID_Zindi','ID'],inplace=True)

# def impute_missing_values(df, cols_to_impute, drop_cols=['LAT', 'LON'], n_estimators=100, random_state=42):
#     for col in cols_to_impute:
#         if df[col].isna().sum() > 0:  # Cek apakah ada nilai NaN pada kolom
#             non_missing_data = df[df[col].notna()]  # Data tanpa nilai NaN untuk training
#             X_train = non_missing_data.drop(columns=[col] + drop_cols)  # Fitur training tanpa kolom target
#             y_train = non_missing_data[col]  # Target untuk training
            
#             # Inisiasi Random Forest Regressor dan training
#             rf_imputer = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
#             rf_imputer.fit(X_train, y_train)
            
#             # Melakukan prediksi untuk mengisi nilai NaN
#             X_pred = df[df[col].isna()].drop(columns=[col] + drop_cols)
#             df.loc[df[col].isna(), col] = rf_imputer.predict(X_pred)
    
#     return df


# cols_to_impute_rf = ['AAI', 'CloudFraction','LST', 'NO2_trop', 'NO2_strat', 'NO2_total', 'TropopausePressure']
# df = impute_missing_values(df, cols_to_impute_rf)
# test = impute_missing_values(test, cols_to_impute_rf)

# # Imputasi untuk kolom dengan missing data sedikit (Mean Imputation)
# cols_to_impute_mean = ['GT_NO2']
# mean_imputer = SimpleImputer(strategy='mean')
# df[cols_to_impute_mean] = mean_imputer.fit_transform(df[cols_to_impute_mean])

# # Time series imputation using Iterative Imputer (Multiple Imputation)
# time_series_cols = ['Precipitation']
# time_series_imputer = IterativeImputer(random_state=42)
# df[time_series_cols] = time_series_imputer.fit_transform(df[time_series_cols])
# test[time_series_cols] = time_series_imputer.fit_transform(test[time_series_cols])

# Feature Engineering

In [9]:
# df.to_csv('train_imputed_with_rf_regressor.csv', index=False)
# test.to_csv('test_imputed_with_rf_regressor.csv', index=False)

data = pd.read_csv('train_imputed_with_rf_regressor.csv')
dtest = pd.read_csv('test_imputed_with_rf_regressor.csv')

data.drop(['LAT','LON'],axis=1,inplace=True)
dtest.drop(['LAT','LON'],axis=1,inplace=True)

In [10]:
kmeans = KMeans(n_clusters=3)
data['Kmeans'] = kmeans.fit_predict(data[['NO2_strat', 'NO2_total', 'NO2_trop']])
dtest['Kmeans'] = kmeans.fit_predict(dtest[['NO2_strat', 'NO2_total', 'NO2_trop']])

In [11]:
scaler = StandardScaler()
data['TropopausePressure'] = scaler.fit_transform(data[['TropopausePressure']])
dtest['TropopausePressure'] = scaler.fit_transform(dtest[['TropopausePressure']])

In [12]:
from sklearn.preprocessing import PolynomialFeatures

# Misalnya, data berisi kolom 'Precipitation', 'LST', 'AAI'
poly = PolynomialFeatures(degree=2)

# Menerapkan transformasi polinomial pada kolom-kolom yang ditentukan
poly_features = poly.fit_transform(data[['Precipitation', 'LST', 'AAI']])
poly_features_test = poly.fit_transform(dtest[['Precipitation', 'LST', 'AAI']])

# Membuat DataFrame baru dengan nama kolom yang sesuai
poly_feature_columns = poly.get_feature_names_out(['Precipitation', 'LST', 'AAI'])

# Menggabungkan hasilnya dengan data asli (jika perlu)
poly_data = pd.DataFrame(poly_features, columns=poly_feature_columns)
poly_dtest = pd.DataFrame(poly_features_test, columns=poly_feature_columns)

poly_data.drop(['1','Precipitation','LST','AAI',],axis=1,inplace=True)
poly_dtest.drop(['1','Precipitation','LST','AAI',],axis=1,inplace=True)

In [13]:
result = pd.concat([data, poly_data], axis=1)
result_test = pd.concat([dtest, poly_dtest], axis=1)

In [None]:
result['GT_NO2'] = pd.to_numeric(result['GT_NO2'], errors='coerce')  # Pastikan target numerik


# Model Creation

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np

In [38]:
# Pisahkan data menjadi fitur dan target
X = result.drop(columns=['GT_NO2'])
y = result['GT_NO2']

# Pisahkan data menjadi train dan validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Inisialisasi model yang akan digunakan
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Menyimpan hasil evaluasi
results = {}

# Loop melalui setiap model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results[name] = rmse

# Tampilkan hasil
print("\nModel Evaluation Results (RMSE):")
for name, rmse in results.items():
    print(f"{name}: {rmse:.4f}")

Training Linear Regression...
Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...

Model Evaluation Results (RMSE):
Linear Regression: 12.3161
Decision Tree: 13.4085
Random Forest: 9.4391
Gradient Boosting: 11.0532
