# ❗ Install Requirements

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


# 📂 Prepared Dataset

In [None]:
import pandas as pd

# Render every float with five decimal places when frames are displayed
pd.options.display.float_format = '{:.5f}'.format

# Read the raw table from the CSV file in the working directory
df = pd.read_csv('star_classification.csv')

# 🔧 Data Preprocessing

In [None]:
# Feature selection: keep the predictor columns plus the target label.
X_features = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']
features = X_features + ['class']

# .copy() makes the subset an independent frame, so the column assignment
# performed in the preprocessing cell does not raise SettingWithCopyWarning.
df = df[features].copy()

In [None]:
# Sanity check: count fully duplicated rows and missing cells in the selected columns
n_duplicates = df.duplicated().sum()
n_nulls = df.isnull().sum().sum()
print(f"Duplicate count: {n_duplicates}")
print(f"Null count: {n_nulls}")

Duplicate count: 0
Null count: 1


In [None]:
# Convert 'class' to integer labels
map_class = {
    'GALAXY': 0,
    'QSO': 1,
    'STAR': 2
}
df['class'] = df['class'].map(map_class)

# -9999 marks a missing value. Mask the sentinel before taking the median so
# the placeholder itself cannot drag the fill value down, then substitute the
# per-column median for every sentinel cell.
MISSING_SENTINEL = -9999
column_medians = df.mask(df == MISSING_SENTINEL).median()
df = df.replace(MISSING_SENTINEL, column_medians.to_dict())

In [None]:
# Preview the first rows of the frame after label mapping and sentinel replacement
df.head()

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,135.68911,32.49463,23.87882,22.2753,20.39501,19.16573,18.79371,0.63479,0
1,144.8261,31.27418,24.77759,22.83188,22.58444,21.16812,21.61427,0.77914,0
2,142.18879,35.58244,25.26307,22.66389,20.60976,19.34857,18.94827,0.64419,0
3,338.74104,-0.40283,22.13682,23.77656,21.61162,20.50454,19.2501,0.93235,0
4,345.28259,21.18387,19.43718,17.58028,16.49747,15.97711,15.54461,0.11612,0


In [None]:
# Per-column minimum and maximum of the preprocessed data.
# NOTE(review): presumably these bounds serve as the valid input range for
# each field — confirm against how min_max is used downstream.
min_max = df.agg(['min', 'max'])
min_max

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
min,0.00553,-18.78533,10.99623,10.4982,9.82207,9.4699,9.61233,-0.00997,0
max,359.99981,83.00052,32.78139,31.60224,29.57186,32.14147,29.38374,7.01124,2


# 🛠 Hyperparameter Tuning

In [None]:
# Separate the predictors from the target label
X = df.drop(columns='class')
y = df['class']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation. stratify=y keeps the class
# proportions identical in both parts, matching the stratified folds used
# during hyperparameter tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## LGBM

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import lightgbm as lgb
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

def optimize_lgb(X_train, y_train, n_trials=50, output_dir="models", X_test=None, y_test=None):
    """Tune, evaluate and persist a LightGBM multiclass classifier.

    An Optuna study (TPE sampler, seed 42) maximises the mean accuracy of a
    5-fold stratified cross-validation on the training data. The best
    configuration is then fitted on the training data and scored on the
    hold-out set; a confusion-matrix image and a classification report are
    written to ``output_dir``. Finally the configuration is refitted on
    training + hold-out data and that model is pickled.

    Parameters
    ----------
    X_train, y_train
        Training features and labels. They are indexed with ``.iloc`` and
        concatenated with pandas, so pandas objects are required.
    n_trials : int
        Number of Optuna trials.
    output_dir : str
        Directory that receives the plot, the report and the pickled model
        (created if missing).
    X_test, y_test : optional
        Hold-out features and labels. When either is omitted the
        notebook-level ``X_test`` / ``y_test`` variables are used, which keeps
        older calls that relied on those globals working.

    Returns
    -------
    tuple
        ``(model, best_params)``: the classifier refitted on training +
        hold-out data, and the tuned hyperparameters reported by Optuna.
    """
    if X_test is None or y_test is None:
        # Legacy call style: the hold-out split was read from notebook globals.
        X_test, y_test = globals()["X_test"], globals()["y_test"]

    os.makedirs(output_dir, exist_ok=True)

    # Settings that are not tuned. They are shared by the CV models and the
    # final models so that both are configured identically.
    base_params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_class': len(np.unique(y_train)),
        'verbosity': -1,
    }

    def objective(trial):
        """Return the mean 5-fold CV accuracy for one sampled configuration."""
        params = {
            **base_params,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 31, 150),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
            'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        }

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = []

        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = lgb.LGBMClassifier(**params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric='multi_logloss',
                # Stop adding trees once the fold's log-loss stalls for 30 rounds
                callbacks=[lgb.early_stopping(30, verbose=False)],
            )

            preds = model.predict(X_val)
            scores.append(accuracy_score(y_val, preds))

        return np.mean(scores)

    # Optuna tuning
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    best_params = {**base_params, **study.best_params}

    # Fit on the training split only, so the hold-out score is unbiased
    eval_model = lgb.LGBMClassifier(**best_params)
    eval_model.fit(X_train, y_train)
    y_pred = eval_model.predict(X_test)

    # Save confusion matrix (computed on the hold-out data)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix (LightGBM)")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "lgb_confusion_matrix.png"))
    plt.close()

    # Save classification report
    report = classification_report(y_test, y_pred)
    with open(os.path.join(output_dir, "lgb_classification_report.txt"), "w") as f:
        f.write(report)

    # Final model: refit on every labelled row (training + hold-out)
    best_model = lgb.LGBMClassifier(**best_params)
    best_model.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

    # Save model
    model_path = os.path.join(output_dir, "lgb_model.pkl")
    joblib.dump(best_model, model_path)

    return best_model, study.best_params


In [None]:
# Tune LightGBM with 20 Optuna trials; keep the fitted model and its best hyperparameters
model_lgb, lgb_params = optimize_lgb(
    X_train,
    y_train,
    n_trials=20,
)

[I 2025-06-04 08:02:11,657] A new study created in memory with name: no-name-f382fe22-3b6d-492e-a67d-7b372cda5801
[I 2025-06-04 08:02:27,528] Trial 0 finished with value: 0.9778547173580133 and parameters: {'learning_rate': 0.030710573677773714, 'num_leaves': 145, 'max_depth': 12, 'min_child_samples': 62, 'lambda_l1': 0.7800932022121826, 'lambda_l2': 0.7799726016810132}. Best is trial 0 with value: 0.9778547173580133.
[I 2025-06-04 08:02:40,262] Trial 1 finished with value: 0.9757240828284173 and parameters: {'learning_rate': 0.011900590783184251, 'num_leaves': 134, 'max_depth': 10, 'min_child_samples': 72, 'lambda_l1': 0.10292247147901223, 'lambda_l2': 4.8495492608099715}. Best is trial 0 with value: 0.9778547173580133.
[I 2025-06-04 08:02:52,842] Trial 2 finished with value: 0.9776416539050535 and parameters: {'learning_rate': 0.12106896936002161, 'num_leaves': 56, 'max_depth': 5, 'min_child_samples': 22, 'lambda_l1': 1.5212112147976886, 'lambda_l2': 2.6237821581611893}. Best is tria

## XGBoost

In [None]:
import xgboost as xgb

def optimize_xgb(X_train, y_train, X_test, y_test, n_trials=50, output_dir="models"):
    """Tune, evaluate and persist an XGBoost multiclass classifier.

    An Optuna study (TPE sampler, seed 42) maximises the mean accuracy of a
    5-fold stratified cross-validation on the training data. The best
    configuration is fitted on the training data, scored on the hold-out set,
    and a confusion-matrix image, a classification report and the pickled
    model are written to ``output_dir``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; indexed with ``.iloc``, so pandas
        objects are required.
    X_test, y_test
        Hold-out features and labels used for the final evaluation.
    n_trials : int
        Number of Optuna trials.
    output_dir : str
        Directory that receives the plot, the report and the pickled model
        (created if missing).

    Returns
    -------
    tuple
        ``(model, best_params)``: the classifier fitted on the training data,
        and the tuned hyperparameters reported by Optuna.
    """
    os.makedirs(output_dir, exist_ok=True)
    num_class = len(np.unique(y_train))

    # Settings that are not tuned; shared by the CV models and the final model.
    base_params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': num_class,
        'booster': 'gbtree',
        'verbosity': 0,
    }

    def objective(trial):
        """Return the mean 5-fold CV accuracy for one sampled configuration."""
        params = {
            **base_params,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0.0, 5.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        }

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = []

        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            # early_stopping_rounds is an estimator argument: passing it to
            # fit() was deprecated in XGBoost 1.6 and removed in 2.0.
            model = xgb.XGBClassifier(**params, early_stopping_rounds=30)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            preds = model.predict(X_val)
            scores.append(accuracy_score(y_val, preds))

        return np.mean(scores)

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    # No early stopping here: the final fit has no validation set to monitor.
    best_model = xgb.XGBClassifier(**{**base_params, **study.best_params})
    best_model.fit(X_train, y_train)

    # Predict & Evaluate on Test
    y_pred = best_model.predict(X_test)

    # Save confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix (XGBoost)")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "xgb_confusion_matrix.png"))
    plt.close()

    # Save classification report
    report = classification_report(y_test, y_pred)
    with open(os.path.join(output_dir, "xgb_classification_report.txt"), "w") as f:
        f.write(report)

    # Save model
    model_path = os.path.join(output_dir, "xgb_model.pkl")
    joblib.dump(best_model, model_path)

    return best_model, study.best_params


In [None]:
# Tune XGBoost. This cell must call optimize_xgb: calling optimize_lgb here
# re-tunes LightGBM, stores LightGBM parameters under the XGBoost names and
# overwrites the saved LightGBM artefacts.
model_xgb, xgb_params = optimize_xgb(X_train, y_train, X_test, y_test, n_trials=10)

[I 2025-06-04 08:32:30,693] A new study created in memory with name: no-name-bc803671-29fb-4f57-99bc-8a8adebde0fc
[I 2025-06-04 08:32:49,838] Trial 0 finished with value: 0.9778547173580133 and parameters: {'learning_rate': 0.030710573677773714, 'num_leaves': 145, 'max_depth': 12, 'min_child_samples': 62, 'lambda_l1': 0.7800932022121826, 'lambda_l2': 0.7799726016810132}. Best is trial 0 with value: 0.9778547173580133.
[I 2025-06-04 08:33:02,318] Trial 1 finished with value: 0.9757240828284173 and parameters: {'learning_rate': 0.011900590783184251, 'num_leaves': 134, 'max_depth': 10, 'min_child_samples': 72, 'lambda_l1': 0.10292247147901223, 'lambda_l2': 4.8495492608099715}. Best is trial 0 with value: 0.9778547173580133.
[I 2025-06-04 08:33:15,116] Trial 2 finished with value: 0.9776416539050535 and parameters: {'learning_rate': 0.12106896936002161, 'num_leaves': 56, 'max_depth': 5, 'min_child_samples': 22, 'lambda_l1': 1.5212112147976886, 'lambda_l2': 2.6237821581611893}. Best is tria

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def optimize_rf(X_train, y_train, X_test, y_test, n_trials=50, output_dir="models", random_state=42):
    """Tune, evaluate and persist a random-forest classifier.

    An Optuna study (TPE sampler) maximises the mean accuracy of a 5-fold
    stratified cross-validation on the training data. The best configuration
    is fitted on the training data, scored on the hold-out set, and a
    confusion-matrix image, a classification report and the pickled model are
    written to ``output_dir``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; indexed with ``.iloc``, so pandas
        objects are required.
    X_test, y_test
        Hold-out features and labels used for the final evaluation.
    n_trials : int
        Number of Optuna trials.
    output_dir : str
        Directory that receives the plot, the report and the pickled model
        (created if missing).
    random_state : int
        Seed shared by the sampler, the CV splitter and every forest, so that
        repeated runs give the same study and the same final model.

    Returns
    -------
    tuple
        ``(model, best_params)``: the classifier fitted on the training data,
        and the tuned hyperparameters reported by Optuna.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Not tuned: seed the forests (otherwise the objective is noisy from run to
    # run) and build trees on all cores, which does not change the result.
    fixed_params = {'random_state': random_state, 'n_jobs': -1}

    def objective(trial):
        """Return the mean 5-fold CV accuracy for one sampled configuration."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 5, 30),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        }

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        scores = []

        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = RandomForestClassifier(**params, **fixed_params)
            model.fit(X_tr, y_tr)

            preds = model.predict(X_val)
            scores.append(accuracy_score(y_val, preds))

        return np.mean(scores)

    sampler = TPESampler(seed=random_state)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    best_model = RandomForestClassifier(**study.best_params, **fixed_params)
    best_model.fit(X_train, y_train)

    # Predict & Evaluate on Test
    y_pred = best_model.predict(X_test)

    # Save confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix (Random Forest)")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "rf_confusion_matrix.png"))
    plt.close()

    # Save classification report
    report = classification_report(y_test, y_pred)
    with open(os.path.join(output_dir, "rf_classification_report.txt"), "w") as f:
        f.write(report)

    # Save model
    model_path = os.path.join(output_dir, "rf_model.pkl")
    joblib.dump(best_model, model_path)

    return best_model, study.best_params


In [None]:
# Tune the random forest with 10 Optuna trials and evaluate it on the hold-out split
rf_model, rf_params = optimize_rf(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    n_trials=10,
)

[I 2025-06-04 08:34:54,385] A new study created in memory with name: no-name-aed92232-29d7-474d-a7d1-15c83c019c60
[I 2025-06-04 08:39:32,310] Trial 0 finished with value: 0.9764032225847259 and parameters: {'n_estimators': 250, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9764032225847259.
[I 2025-06-04 08:42:44,598] Trial 1 finished with value: 0.9779079832212532 and parameters: {'n_estimators': 123, 'max_depth': 27, 'min_samples_split': 7, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 1 with value: 0.9779079832212532.
[I 2025-06-04 08:51:27,397] Trial 2 finished with value: 0.9776549703708636 and parameters: {'n_estimators': 433, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 0.9779079832212532.
[I 2025-06-04 08:57:30,743] Trial 3 finished with value: 0.9781609960716426 and parameters: {'n_estimators': 273, 'max_depth': 12, 'm

In [None]:
import json

# Gather the tuned hyperparameters of all three models under one mapping
best_params_dict = dict(
    lgb_params=lgb_params,
    xgb_params=xgb_params,
    rf_params=rf_params,
)

# Serialise with a four-space indent and write the text out in one go
serialized_params = json.dumps(best_params_dict, indent=4)
with open("best_params.json", "w") as out_file:
    out_file.write(serialized_params)

print("Best parameters saved to best_params.json")


Best parameters saved to best_params.json
