In [19]:
# Ensemble: XGBoost + CatBoost + LightGBM

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
import joblib

# Tree-based models
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.utils.class_weight import compute_class_weight

# 1) Load & clean data
#
# Everything that *learns* statistics from the data (median imputation,
# standardisation) is fitted on the training rows only, so no information
# from the held-out test rows leaks into preprocessing.

# comment="#" makes pandas skip the '#'-prefixed lines in the CSV.
df = pd.read_csv("kepler_koi.csv", comment="#")

features = [
    'koi_period', 'koi_prad', 'koi_sma', 'koi_incl',
    'koi_teq', 'koi_slogg', 'koi_srad', 'koi_smass', 'koi_steff'
]

df_clean = df[features + ['koi_disposition']].copy()

# Label encoding (a pure relabelling of the target, so it is safe to do
# before the split).
le = LabelEncoder()
y = le.fit_transform(df_clean['koi_disposition'])

# 2) Train-test split
#
# Split row *positions* first; the feature matrices are built afterwards
# from statistics of the training rows. The split is stratified on y with a
# fixed seed so it is reproducible.
row_positions = np.arange(len(df_clean))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, stratify=y, random_state=42
)

# Fill NaNs with medians computed on the training rows only, then apply
# those same medians to every row (train and test alike).
train_medians = df_clean.iloc[train_idx][features].median()
for col in features:
    df_clean[col] = df_clean[col].fillna(train_medians[col])

# Feature engineering (row-wise ratios, so nothing is learned across rows).
# The 1e-6 added to each denominator guards against division by zero.
df_clean['density_star'] = df_clean['koi_smass'] / (df_clean['koi_srad']**3 + 1e-6)
df_clean['prad_ratio']   = df_clean['koi_prad'] / (df_clean['koi_srad'] + 1e-6)
df_clean['period_ratio'] = df_clean['koi_period'] / (df_clean['koi_sma'] + 1e-6)
df_clean['teq_scaled']   = df_clean['koi_teq'] / (df_clean['koi_steff'] + 1e-6)

# Extra features
extra_features = ['density_star', 'prad_ratio', 'period_ratio', 'teq_scaled']

all_features = features + extra_features

# Scaling: fit mean/std on the training rows, then reuse them for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(df_clean.iloc[train_idx][all_features])
X_test = scaler.transform(df_clean.iloc[test_idx][all_features])

# Full scaled matrix, kept so any later cell that still expects `X` keeps working.
X = scaler.transform(df_clean[all_features])

# 3) Compute class weights
#
# 'balanced' gives rarer classes a larger weight, computed from the
# training labels only.

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Key each weight by its actual class label (not by its position in the
# array) and store plain Python numbers so the dict prints and serialises
# cleanly.
class_weights_dict = {int(label): float(w) for label, w in zip(classes, class_weights)}
print("Class weights:", class_weights_dict)

# 4) Define individual models
#
# Three gradient-boosted tree classifiers that are later combined into one
# voting ensemble. All use a fixed seed of 42 for reproducibility.

# NOTE: unlike the CatBoost and LightGBM models below, this XGBoost model is
# given no class weighting.
# The deprecated `use_label_encoder` argument is intentionally not passed:
# the labels are already integer-encoded, and newer XGBoost releases only
# warn about the unused parameter.
xgb_model = xgb.XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=42,
    eval_metric='mlogloss'
)

# CatBoost receives the explicit per-class weights computed from y_train.
cat_model = CatBoostClassifier(
    iterations=500, depth=6, learning_rate=0.05,
    loss_function='MultiClass', class_weights=class_weights_dict,
    eval_metric='TotalF1', random_seed=42, verbose=0
)

# LightGBM derives its own balanced class weights from the labels it is fitted on.
lgb_model = lgb.LGBMClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    class_weight='balanced', random_state=42
)

# 5) Ensemble with soft voting
#
# Soft voting combines the predicted class probabilities of the three base
# models; n_jobs=-1 lets scikit-learn fit them in parallel.

base_estimators = [
    ('xgb', xgb_model),
    ('cat', cat_model),
    ('lgb', lgb_model),
]
ensemble = VotingClassifier(estimators=base_estimators, voting='soft', n_jobs=-1)

# 6) Train ensemble

ensemble.fit(X_train, y_train)

# 7) Predict & evaluate
#
# Score the held-out split; the report uses the original string labels
# stored on the label encoder.

y_pred = ensemble.predict(X_test)
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
conf_mat = confusion_matrix(y_test, y_pred)

print("\n=== Test Set Classification Report ===")
print(report_text)
print("\nConfusion Matrix:")
print(conf_mat)

# 8) Save ensemble & preprocessor
#
# Persist the fitted ensemble together with the scaler and label encoder it
# was trained with, each to its own joblib file.

for artifact, path in (
    (ensemble, "ensemble_pipeline.joblib"),
    (scaler, "scaler.joblib"),
    (le, "label_encoder.joblib"),
):
    joblib.dump(artifact, path)
print("✅ Saved ensemble pipeline for deployment")



Class weights: {0: np.float64(1.611076016003369), 1: np.float64(1.1608253679259597), 2: np.float64(0.6588306208559374)}

=== Test Set Classification Report ===
                precision    recall  f1-score   support

     CANDIDATE       0.48      0.52      0.50       396
     CONFIRMED       0.72      0.80      0.76       549
FALSE POSITIVE       0.86      0.78      0.82       968

      accuracy                           0.73      1913
     macro avg       0.69      0.70      0.69      1913
  weighted avg       0.74      0.73      0.73      1913


Confusion Matrix:
[[204  98  94]
 [ 80 440  29]
 [139  75 754]]
✅ Saved ensemble pipeline for deployment


