# Hyperparameter Tuning

In [18]:
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load Dataset

In [20]:
df = pd.read_csv('../data/processed_data_with_anomaly.csv')

## Definisi Fitur dan Target

In [21]:
features = ['Lag_1_Status', 'Count_Telat_7D', 'Count_Alpa_30D', 'Streak_Telat', 'Avg_Arrival_Time_7D', 'DayOfWeek']
le = LabelEncoder()

X = df[features]
# Encode 'Lag_1_Status' in the features
X['Lag_1_Status'] = le.fit_transform(X['Lag_1_Status'].astype(str))

y = le.fit_transform(df['note'].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Lag_1_Status'] = le.fit_transform(X['Lag_1_Status'].astype(str))


## Split Data

In [22]:
# 80% train-test split

split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"Data Latih: {X_train.shape[0]} baris")
print(f"Data Uji: {X_test.shape[0]} baris")

Data Latih: 101904 baris
Data Uji: 25476 baris


In [23]:
# Define hyperparameter grids for each model
param_grids = {
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': [25, 50, 75, 100], 
            'criterion': ['gini', 'entropy'], 
            'min_samples_split': [2, 4, 8, 10, 15, 20], 
            'min_samples_leaf': [1, 2, 4, 6, 8, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 300], 
            'max_depth': [3, 4, 7, 9], 
            'min_samples_split': [2, 5, 8], 
            'max_features' : [2, 3, 4]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        'params': {
            'n_estimators': [50, 100], 
            'learning_rate': [0.01, 0.1], 
            'max_depth': [3, 10, 18], 
            'gamma' : [3, 5, 9],
            'reg_alpha' :[40, 100, 180], 
            'reg_lambda' :[0, 1],
            'colsample_bytree' :[0.5, 0.7, 1],
            'min_child_weight' :[2, 6, 10],
            'n_estimators' :[180], 
            'seed' :[0]
        },
    },
    'Gaussian NB': {
        'model': Pipeline([
            ('scaler', StandardScaler()), 
            ('clf', GaussianNB())
        ]),
        'params': {
            'var_smoothing': [1e-9, 1e-7, 1.0, 1.2, 1.4]
        }
    },
    'SVM': {
        'model': Pipeline([
            ('scaler', StandardScaler()), 
            ('clf', SVC(random_state=42))
        ]),
        'params': {
            'C': [1, 3, 5, 7], 
            'kernel': ['linear', 'rbf'], 
            'gamma': ['scale', 'auto']
        }
    }
}

In [24]:
best_params = {}

# TimeSeriesSplit untuk Cross Validation (agar tidak bocor waktu)
tscv = TimeSeriesSplit(n_splits=3)

In [25]:
for name, config in param_grids.items():
    print(f"Tuning {name}...")
    grid = GridSearchCV(config['model'], config['params'], cv=tscv, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_params[name] = grid.best_estimator_
    print(f"  Best Params: {grid.best_params_}")
    print(f"  Best CV Score: {grid.best_score_:.4f}")

Tuning Decision Tree...


KeyboardInterrupt: 