In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import joblib
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

print('Imports loaded')

Imports loaded


In [12]:
# Load the dataset
# Read the TOI export, then discard four columns that are not used later in this notebook.
print('Loading dataframe...')
df = (
    pd.read_csv(
        'data/TOI_2025.10.04_05.29.58.csv',
        comment='#',            # anything after '#' on a line is not parsed
        on_bad_lines='skip',    # malformed rows are skipped instead of raising
    )
    .drop(columns=['toi', 'tid', 'toi_created', 'rowupdate'])
)
print('Dataframe loaded successfully!\n')

print('Dataframe shape:', df.shape, '\n')

Loading dataframe...
Dataframe loaded successfully!

Dataframe shape: (7703, 19) 



In [13]:
# --- Data Cleaning ---
# Keep only complete rows: any row with at least one missing value (in any column) is removed.
# Rebinding `df` instead of mutating it in place yields the same frame contents.
df = df.dropna()
print('Dataframe shape after dropping missing values:', df.shape, '\n')


Dataframe shape after dropping missing values: (6589, 19) 



In [14]:
# --- Feature Selection ---
# Use every numeric column as a feature, except the transit mid-time column
# ('pl_tranmid'), which is a timestamp and is left out when present.
numeric_cols = [
    column_name
    for column_name in df.select_dtypes(include=[np.number]).columns
    if column_name != 'pl_tranmid'
]



In [15]:
# Prepare X and y
# X: independent copy of the selected numeric feature columns.
# y: independent copy of the 'tfopwg_disp' column, used as the classification target.
X = df.loc[:, numeric_cols].copy()
y = df.loc[:, 'tfopwg_disp'].copy()

print('Features used (numerical):', numeric_cols)
print('Missing values per feature:')
# Per-column count of missing entries in the feature matrix.
display(X.isna().sum())

Features used (numerical): ['ra', 'dec', 'st_pmra', 'st_pmdec', 'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_dist', 'st_teff', 'st_logg', 'st_rad']
Missing values per feature:


ra             0
dec            0
st_pmra        0
st_pmdec       0
pl_orbper      0
pl_trandurh    0
pl_trandep     0
pl_rade        0
pl_insol       0
pl_eqt         0
st_tmag        0
st_dist        0
st_teff        0
st_logg        0
st_rad         0
dtype: int64

In [None]:
# --- One-vs-Rest LGBM Classifiers (preprocess per-class, GridSearch, then refit with early stopping)
# For every label in `y`, train a binary LGBM classifier (label vs. rest):
#   1. hold out a test split that is used ONLY for the final classification report,
#   2. tune hyperparameters with GridSearchCV on the training split,
#   3. refit the best configuration with early stopping, monitored on a validation
#      split carved out of the training data (never on the test split, otherwise the
#      reported test metrics would be optimistically biased),
#   4. store the fitted preprocessing + classifier as a Pipeline in `classifiers`.

EARLY_STOPPING_ROUNDS = 50   # stop once the validation metric has not improved for this many rounds
MAX_BOOST_ROUNDS = 5000      # upper bound on boosting rounds; early stopping picks the actual count
HOLDOUT_FRACTION = 0.2       # share of rows held out at each split (test, then validation)


def _split_stratified_if_possible(features, target, test_size=HOLDOUT_FRACTION, random_state=42):
    """Split into two parts, stratifying on `target` when scikit-learn allows it.

    Stratification raises ValueError when a class has too few members; in that
    case fall back to a plain random split with the same seed.

    Returns the 4-tuple produced by `train_test_split`.
    """
    try:
        return train_test_split(
            features, target, test_size=test_size, random_state=random_state, stratify=target
        )
    except ValueError:
        return train_test_split(features, target, test_size=test_size, random_state=random_state)


def _fit_with_early_stopping(model, X_fit, y_fit, X_val, y_val, rounds=EARLY_STOPPING_ROUNDS):
    """Fit an LGBMClassifier with early stopping monitored on (X_val, y_val).

    The callbacks API is tried first because it is the only form accepted by
    LightGBM >= 4.0. If `fit` rejects it with a TypeError, the legacy
    `early_stopping_rounds=` / `verbose=` keywords are used instead. Any other
    error is a genuine training failure and is allowed to propagate.

    Returns the fitted model (the same object that was passed in).
    """
    eval_set = [(X_val, y_val)]
    try:
        model.fit(
            X_fit, y_fit,
            eval_set=eval_set,
            callbacks=[lgb.early_stopping(rounds), lgb.log_evaluation(0)],
        )
    except TypeError:
        model.fit(X_fit, y_fit, eval_set=eval_set, early_stopping_rounds=rounds, verbose=False)
    return model


# Create and train a binary LGBM classifier for each exoplanet category
classifiers = {}
for category in y.unique():
    print(f"Training a binary classifier for category: {category}")

    # Binary target: 1 for the current category, 0 for everything else
    y_binary = (y == category).astype(int)

    # Held-out test split: only used for the evaluation at the end of the iteration
    X_train, X_test, y_train, y_test = _split_stratified_if_possible(X, y_binary)

    # Preprocessing pipeline (fit only on train)
    preproc = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    X_train_t = preproc.fit_transform(X_train)
    X_test_t = preproc.transform(X_test)

    # Search over LGBM hyperparams using GridSearchCV (on preprocessed arrays)
    search_pipe = Pipeline([('lgbm', lgb.LGBMClassifier(random_state=42, verbose=-1))])
    param_grid = {
        'lgbm__n_estimators': [100, 200, 500],
        'lgbm__learning_rate': [0.01, 0.05, 0.1],
        'lgbm__num_leaves': [31, 50, 70]
    }
    grid = GridSearchCV(search_pipe, param_grid, cv=3, n_jobs=-1, scoring='f1')
    grid.fit(X_train_t, y_train)

    print(f"Best parameters for {category}: {grid.best_params_}")
    print(f"Best CV score for {category}: {grid.best_score_}\n")

    # Strip the 'lgbm__' step prefix to recover plain LGBMClassifier keyword arguments.
    lgbm_params = {k.split('__', 1)[1]: v for k, v in grid.best_params_.items() if k.startswith('lgbm__')}
    clf = lgb.LGBMClassifier(random_state=42, verbose=-1, **lgbm_params)
    # The tuned n_estimators is replaced by a generous ceiling; early stopping decides
    # how many rounds are actually kept.
    clf.set_params(n_estimators=MAX_BOOST_ROUNDS)

    # Validation split for early stopping, taken from the (already preprocessed) training rows.
    X_fit_t, X_val_t, y_fit, y_val = _split_stratified_if_possible(X_train_t, y_train)
    _fit_with_early_stopping(clf, X_fit_t, y_fit, X_val_t, y_val)

    # The native-Booster fallback was removed: it produced an object without
    # get_params()/fit(), which the VotingClassifier cell cannot clone. The flag is
    # kept (always False) in case another cell still reads it.
    fitted_with_native_booster = False

    # Evaluate on the untouched test split
    y_pred = clf.predict(X_test_t)
    print(classification_report(y_test, y_pred))

    # Store pipeline (fitted preprocessing + fitted classifier)
    classifiers[category] = Pipeline([('preproc', preproc), ('lgbm', clf)])


Training a binary classifier for category: FP




KeyboardInterrupt: 

In [37]:
# --- Meta-Classifier (Voting Classifier) ---
# Build a soft-voting ensemble from the per-category pipelines in `classifiers`.
#
# NOTE(review): scikit-learn's VotingClassifier.fit() clones every estimator and refits
# the clones on the data passed to fit(). The binary (one-vs-rest) fits from the training
# cell are therefore NOT reused here: only each pipeline's configuration (preprocessing
# steps + tuned LGBM hyperparameters) is carried over, and every member is retrained as a
# multi-class model on y_train_multi. Those refits run without early stopping, so each
# member trains for its full n_estimators.

# Guard against stale / partial notebook state: if the training cell was interrupted,
# `classifiers` may be missing categories and the ensemble would silently be built
# from whatever happens to be in the dict.
missing_categories = set(y.unique()) - set(classifiers)
if missing_categories:
    raise RuntimeError(
        "No trained pipeline found for categories: "
        f"{sorted(map(str, missing_categories))}. "
        "Re-run the One-vs-Rest training cell to completion first."
    )

# voting='soft' averages the members' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[(str(label), pipeline) for label, pipeline in classifiers.items()],
    voting='soft'
)

# Train the voting classifier on the original multi-class labels (stratified 80/20 split).
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
voting_clf.fit(X_train_multi, y_train_multi)

0,1,2
,estimators,"[('FP', ...), ('PC', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.1
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001
