## Import Library and Load data

In [1]:
#Modifikasi
import warnings
import zipfile
import numpy as np
import pandas as pd
from pathlib import Path
pd.set_option('display.max_columns', 100)

#Perhitungan
from sklearn.preprocessing import PolynomialFeatures

import matplotlib.pyplot as plt

#Imputasi
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


# Modeling
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV, GroupKFold,KFold, TimeSeriesSplit   
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc,roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, RepeatVector, TimeDistributed

#Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Load the raw training and test tables from disk.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('raw_dataset/Train.csv', 'raw_dataset/Test.csv')
)

## Data Cleaning

In [3]:
# Parse the day-first date strings; values that cannot be parsed become NaT.
train['Date'] = pd.to_datetime(train['Date'], errors='coerce', dayfirst=True)

# Keep a copy of the training set that still has Date as a regular column.
Date = train.copy()
Date['Date'] = pd.to_datetime(Date['Date'])

# Use the Date column as the index of the training frame.
train = train.set_index('Date')

  train['Date'] = pd.to_datetime(train['Date'], dayfirst=True, errors='coerce')


In [4]:
# Parse the day-first date strings; values that cannot be parsed become NaT.
test['Date'] = pd.to_datetime(test['Date'], dayfirst=True, errors='coerce')

# Keep a copy of the test set that still has Date as a regular column.
# The dates must come from the test copy itself, not from the training copy
# `Date`; otherwise the training dates are index-aligned onto the test rows.
Date_test = test.copy()
Date_test['Date'] = pd.to_datetime(Date_test['Date'])

# Use the Date column as the index of the test frame.
test.set_index('Date', inplace=True)

  test['Date'] = pd.to_datetime(test['Date'], dayfirst=True, errors='coerce')


## Imputasi

In [5]:
# df = train.copy()
# df.drop(columns=['ID_Zindi','ID'],inplace=True)
# test.drop(columns=['ID_Zindi','ID'],inplace=True)

# def impute_missing_values(df, cols_to_impute, drop_cols=['LAT', 'LON'], n_estimators=100, random_state=42):
#     for col in cols_to_impute:
#         if df[col].isna().sum() > 0:  # Cek apakah ada nilai NaN pada kolom
#             non_missing_data = df[df[col].notna()]  # Data tanpa nilai NaN untuk training
#             X_train = non_missing_data.drop(columns=[col] + drop_cols)  # Fitur training tanpa kolom target
#             y_train = non_missing_data[col]  # Target untuk training
            
#             # Inisiasi Random Forest Regressor dan training
#             rf_imputer = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
#             rf_imputer.fit(X_train, y_train)
            
#             # Melakukan prediksi untuk mengisi nilai NaN
#             X_pred = df[df[col].isna()].drop(columns=[col] + drop_cols)
#             df.loc[df[col].isna(), col] = rf_imputer.predict(X_pred)
    
#     return df


# cols_to_impute_rf = ['AAI', 'CloudFraction','LST', 'NO2_trop', 'NO2_strat', 'NO2_total', 'TropopausePressure']
# df = impute_missing_values(df, cols_to_impute_rf)
# test = impute_missing_values(test, cols_to_impute_rf)

# # Imputasi untuk kolom dengan missing data sedikit (Mean Imputation)
# cols_to_impute_mean = ['GT_NO2']
# mean_imputer = SimpleImputer(strategy='mean')
# df[cols_to_impute_mean] = mean_imputer.fit_transform(df[cols_to_impute_mean])

# # Time series imputation using Iterative Imputer (Multiple Imputation)
# time_series_cols = ['Precipitation']
# time_series_imputer = IterativeImputer(random_state=42)
# df[time_series_cols] = time_series_imputer.fit_transform(df[time_series_cols])
# test[time_series_cols] = time_series_imputer.fit_transform(test[time_series_cols])

# Feature Engineering

In [6]:
# df.to_csv('train_imputed_with_rf_regressor.csv', index=False)
# test.to_csv('test_imputed_with_rf_regressor.csv', index=False)

# Reload the imputed train/test tables and discard the coordinate columns.
coordinate_cols = ['LAT', 'LON']
data = pd.read_csv('final_dataset/train_imputed_with_rf_regressor.csv').drop(columns=coordinate_cols)
dtest = pd.read_csv('final_dataset/test_imputed_with_rf_regressor.csv').drop(columns=coordinate_cols)

In [7]:
# Calendar features taken from the datetime index of `train`.
# The values are assigned by position, so `data` is assumed to have the same
# number and order of rows as `train` -- TODO confirm.
data = data.assign(month=train.index.month, year=train.index.year)

In [8]:
# Cyclical (sine/cosine) encoding of the month number.
month_angle = 2 * np.pi * data['month'] / 12
data['month_Sin'] = np.sin(month_angle)
data['month_Cos'] = np.cos(month_angle)

# Elapsed months, where January of the earliest year in the data counts as 1.
min_year = data['year'].min()
data['Elapsed_months'] = data['year'].sub(min_year).mul(12).add(data['month'])

# Quarter (1-4) and semester (1-2) derived from the month number.
month_zero_based = data['month'] - 1
data['Quarter'] = (month_zero_based // 3 + 1).astype(int)
data['Semester'] = (month_zero_based // 6 + 1).astype(int)

# Year-month identifier in YYYYMM form.
data['yearmonth'] = data['year'].mul(100).add(data['month']).astype(int)

In [9]:
# Cluster the three NO2 columns into 3 groups and store the cluster label as a
# feature. A fixed random_state keeps the centroid initialisation, and with it
# the integer label given to each cluster, the same on every run.
kmeans = KMeans(n_clusters=3, random_state=42)
data['Kmeans'] = kmeans.fit_predict(data[['NO2_strat', 'NO2_total', 'NO2_trop']])

In [10]:
# Standardise TropopausePressure using statistics fitted on this same column.
pressure_frame = data[['TropopausePressure']]
scaler = StandardScaler()
scaler.fit(pressure_frame)
data['TropopausePressure'] = scaler.transform(pressure_frame)

In [11]:
# Degree-2 polynomial expansion of the Precipitation, LST and AAI columns.
poly = PolynomialFeatures(degree=2)

# Fit the transformer on the training frame only; the test frame reuses the
# fitted transformer instead of being fitted a second time.
poly_features = poly.fit_transform(data[['Precipitation', 'LST', 'AAI']])
poly_features_test = poly.transform(dtest[['Precipitation', 'LST', 'AAI']])

# Column names generated for the expanded features.
poly_feature_columns = poly.get_feature_names_out(['Precipitation', 'LST', 'AAI'])

# Wrap the expanded arrays in DataFrames with matching column names.
poly_data = pd.DataFrame(poly_features, columns=poly_feature_columns)
poly_dtest = pd.DataFrame(poly_features_test, columns=poly_feature_columns)

# Drop the bias column ('1') and the degree-1 terms; the three input columns
# are already present in the source frames, so only squares and interactions
# are kept.
poly_data.drop(['1', 'Precipitation', 'LST', 'AAI'], axis=1, inplace=True)
poly_dtest.drop(['1', 'Precipitation', 'LST', 'AAI'], axis=1, inplace=True)

In [12]:
# Append the polynomial columns to the right of the train and test frames
# (rows are matched on the index).
result = pd.concat((data, poly_data), axis='columns')
result_test = pd.concat((dtest, poly_dtest), axis='columns')

In [13]:
# Make sure the target is numeric; values that cannot be converted become NaN.
result = result.assign(GT_NO2=pd.to_numeric(result['GT_NO2'], errors='coerce'))

In [14]:
# Move the target column to the last position.
# NOTE(review): this reorders `data`, not `result`, so the polynomial columns
# added to `result` are not part of the reordered frame -- confirm intended.
target_col = 'GT_NO2'
cols = list(data.columns[data.columns != target_col]) + [target_col]
data = data.loc[:, cols]

# Model Creation

In [20]:
def create_sequences(data, seq_length, output_length=1):
    """Build sliding-window (input, target) pairs from an ordered sequence.

    Window ``i`` uses ``data[i:i + seq_length]`` as input and the following
    ``output_length`` items as target.

    Args:
        data: Sliceable sequence with a length (e.g. a NumPy array); the
            first axis is the step axis.
        seq_length: Number of consecutive steps in each input window.
        output_length: Number of steps after each input window to use as
            the target.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_windows, seq_length, ...)`` and ``y`` has shape
        ``(n_windows, output_length, ...)``, where
        ``n_windows = len(data) - seq_length - output_length + 1``.
        When the input is too short for a single window, both arrays are
        empty but keep their window dimensions.
    """
    n_windows = len(data) - seq_length - output_length + 1

    if n_windows <= 0:
        # np.array([]) would collapse to shape (0,); keep the trailing
        # dimensions so downstream shape handling does not special-case this.
        trailing_shape = np.asarray(data).shape[1:]
        empty_X = np.empty((0, max(seq_length, 0)) + trailing_shape)
        empty_y = np.empty((0, max(output_length, 0)) + trailing_shape)
        return empty_X, empty_y

    X = np.array([data[start:start + seq_length] for start in range(n_windows)])
    y = np.array([
        data[start + seq_length:start + seq_length + output_length]
        for start in range(n_windows)
    ])
    return X, y

In [21]:
from sklearn.ensemble import HistGradientBoostingRegressor


In [24]:
from sklearn.metrics import root_mean_squared_error

seq_length = 7
output_length = 1  # predict one step ahead
X, y = create_sequences(data['GT_NO2'].values, seq_length, output_length)

# Split into train and test parts without shuffling, so the order of the
# windows is preserved and the last 20% form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Histogram-based gradient boosting regressor.
model = HistGradientBoostingRegressor(loss="squared_error", max_iter=100)
model.fit(X_train, y_train.ravel())  # flatten the (n, 1) target to 1-D for fit

# Predict on the held-out windows.
y_pred = model.predict(X_test)

# Evaluate. root_mean_squared_error returns the RMSE, so it is labelled as
# such rather than as MSE.
rmse = root_mean_squared_error(y_test.ravel(), y_pred)
mse = rmse  # old name, kept in case later cells still read `mse`
print(f'RMSE: {rmse}')

MSE: 10.104330172445012
