In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import joblib
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb

print('Imports loaded')

Imports loaded


In [3]:
# Load the dataset
print('Loading dataframe...')
df = pd.read_csv('data/TOI_2025.10.04_05.29.58.csv', comment='#', on_bad_lines='skip')
df = df.drop(columns=['toi', 'tid', 'toi_created', 'rowupdate'])
print('Dataframe loaded successfully!\n')

print('Dataframe shape:', df.shape, '\n')

Loading dataframe...
Dataframe loaded successfully!

Dataframe shape: (7703, 19) 

Dataframe loaded successfully!

Dataframe shape: (7703, 19) 



In [4]:
# --- Data Cleaning ---
# Drop rows with missing values
df.dropna(inplace=True)
print('Dataframe shape after dropping missing values:', df.shape, '\n')


Dataframe shape after dropping missing values: (6589, 19) 



In [5]:
# --- Feature Selection ---
# Select numerical features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Remove transit time column (if present) as it's a timestamp
if 'pl_tranmid' in numeric_cols:
    numeric_cols.remove('pl_tranmid')



In [6]:
# Prepare X and y
X = df[numeric_cols].copy()
y = df['tfopwg_disp'].copy()

print('Features used (numerical):', numeric_cols)
print('Missing values per feature:')
display(X.isnull().sum())

Features used (numerical): ['ra', 'dec', 'st_pmra', 'st_pmdec', 'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_dist', 'st_teff', 'st_logg', 'st_rad']
Missing values per feature:


ra             0
dec            0
st_pmra        0
st_pmdec       0
pl_orbper      0
pl_trandurh    0
pl_trandep     0
pl_rade        0
pl_insol       0
pl_eqt         0
st_tmag        0
st_dist        0
st_teff        0
st_logg        0
st_rad         0
dtype: int64

In [7]:
# --- One-vs-Rest LGBM Classifiers (preprocess per-class, GridSearch, then robust refit with early stopping)
# Create and train a binary LGBM classifier for each exoplanet category
classifiers = {}
for category in y.unique():
    print(f"Training a binary classifier for category: {category}")

    # Create a binary target for the current category
    y_binary = (y == category).astype(int)

    # Split the data (try stratify; if not possible, fallback)
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify=y_binary)
    except ValueError:
        X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

    # Preprocessing pipeline (fit only on train)
    preproc = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    X_train_t = preproc.fit_transform(X_train)
    X_test_t = preproc.transform(X_test)

    # Search over LGBM hyperparams using GridSearchCV (on preprocessed arrays)
    search_pipe = Pipeline([('lgbm', lgb.LGBMClassifier(random_state=42, verbose=-1))])
    param_grid = {
        'lgbm__n_estimators': [100, 200, 500, 1000, 2000, 5000, 10000],
        'lgbm__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0, 2.0],
        'lgbm__num_leaves': [31, 50, 70, 100, 150, 200, 300],
    }
    grid = GridSearchCV(search_pipe, param_grid, cv=3, n_jobs=-1, scoring='f1')
    grid.fit(X_train_t, y_train)

    print(f"Best parameters for {category}: {grid.best_params_}")
    print(f"Best CV score for {category}: {grid.best_score_}\n")

    # Extract lgbm params and create classifier
    lgbm_params = {k.split('__',1)[1]: v for k, v in grid.best_params_.items() if k.startswith('lgbm__')}
    clf = lgb.LGBMClassifier(random_state=42, verbose=-1, **lgbm_params)
    clf.set_params(n_estimators=5000)

    # Fit on preprocessed arrays with early stopping (robust: try multiple APIs)
    fitted_with_native_booster = False
    try:
        # Preferred: scikit-learn wrapper accepting early_stopping_rounds
        clf.fit(X_train_t, y_train, eval_set=[(X_test_t, y_test)], early_stopping_rounds=50, verbose=False)
    except TypeError:
        # Try callbacks API supported by newer lightgbm
        try:
            clf.fit(X_train_t, y_train, eval_set=[(X_test_t, y_test)], callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
        except Exception:
            # Fallback: native lgb.train and wrap the Booster
            dtrain = lgb.Dataset(X_train_t, label=y_train)
            dvalid = lgb.Dataset(X_test_t, label=y_test, reference=dtrain)

            # Build a lightweight native params dict from clf.get_params()
            params = clf.get_params()
            native_params = {
                'learning_rate': params.get('learning_rate', 0.1),
                'num_leaves': params.get('num_leaves', 31),
                'max_depth': params.get('max_depth', -1),
                'min_data_in_leaf': params.get('min_child_samples', 20),
                'subsample': params.get('subsample', 1.0),
                'colsample_bytree': params.get('colsample_bytree', 1.0),
                'reg_alpha': params.get('reg_alpha', 0.0),
                'reg_lambda': params.get('reg_lambda', 0.0),
                'objective': 'binary',
                'verbosity': -1,
            }

            # Train native booster with early stopping
            gbm = lgb.train(native_params, dtrain, num_boost_round=5000, valid_sets=[dvalid], early_stopping_rounds=50, verbose_eval=False)

            # wrapper to expose predict / predict_proba like a sklearn estimator
            class _NativeWrapper:
                def __init__(self, booster):
                    self.booster_ = booster
                def predict(self, X):
                    probs = self.booster_.predict(X, num_iteration=self.booster_.best_iteration)
                    return (probs > 0.5).astype(int)
                def predict_proba(self, X):
                    probs = self.booster_.predict(X, num_iteration=self.booster_.best_iteration)
                    return np.vstack([1 - probs, probs]).T

            clf = _NativeWrapper(gbm)
            fitted_with_native_booster = True

    # Evaluate
    y_pred = clf.predict(X_test_t)
    print(classification_report(y_test, y_pred))

    # Store pipeline (preprocessing + trained classifier or wrapper)
    classifiers[category] = Pipeline([('preproc', preproc), ('lgbm', clf)])


Training a binary classifier for category: FP
Best parameters for FP: {'lgbm__learning_rate': 0.5, 'lgbm__n_estimators': 10000, 'lgbm__num_leaves': 100}
Best CV score for FP: 0.4130454096856206

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.349131
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1130
           1       0.54      0.26      0.35       188

    accuracy                           0.86      1318
   macro avg       0.71      0.61      0.64      1318
weighted avg       0.84      0.86      0.84      1318

Training a binary classifier for category: PC
Best parameters for FP: {'lgbm__learning_rate': 0.5, 'lgbm__n_estimators': 10000, 'lgbm__num_leaves': 100}
Best CV score for FP: 0.4130454096856206

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.349131
             



Best parameters for PC: {'lgbm__learning_rate': 0.05, 'lgbm__n_estimators': 200, 'lgbm__num_leaves': 200}
Best CV score for PC: 0.7978804864690726

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.504096
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       517
           1       0.77      0.85      0.81       801

    accuracy                           0.76      1318
   macro avg       0.75      0.73      0.74      1318
weighted avg       0.76      0.76      0.75      1318

Training a binary classifier for category: KP
Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.504096
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       517
           1       0.77      0.85      0.81       801

    accuracy                           0.76      1318
   macro avg       0.75      0.73      0.74      1



Best parameters for KP: {'lgbm__learning_rate': 0.05, 'lgbm__n_estimators': 10000, 'lgbm__num_leaves': 31}
Best CV score for KP: 0.4434508180685551

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[126]	valid_0's binary_logloss: 0.166497
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      1206
           1       0.74      0.35      0.47       112

    accuracy                           0.93      1318
   macro avg       0.84      0.67      0.72      1318
weighted avg       0.92      0.93      0.92      1318

Training a binary classifier for category: APC




Best parameters for APC: {'lgbm__learning_rate': 0.3, 'lgbm__n_estimators': 100, 'lgbm__num_leaves': 50}
Best CV score for APC: 0.1458341811591591

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.195572
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1247
           1       0.10      0.03      0.04        71

    accuracy                           0.93      1318
   macro avg       0.52      0.51      0.50      1318
weighted avg       0.90      0.93      0.92      1318

Training a binary classifier for category: FA




Best parameters for FA: {'lgbm__learning_rate': 0.5, 'lgbm__n_estimators': 100, 'lgbm__num_leaves': 70}
Best CV score for FA: 0.041379310344827586

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.179505
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1305
           1       0.05      0.08      0.06        13

    accuracy                           0.97      1318
   macro avg       0.52      0.53      0.52      1318
weighted avg       0.98      0.97      0.98      1318

Training a binary classifier for category: CP
Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.179505
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1305
           1       0.05      0.08      0.06        13

    accuracy                           0.97      1318
   macro avg       0.52      0.53      0.52      131



Best parameters for CP: {'lgbm__learning_rate': 1.0, 'lgbm__n_estimators': 5000, 'lgbm__num_leaves': 70}
Best CV score for CP: 0.44395174017400535

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.407329
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1185
           1       0.37      0.46      0.41       133

    accuracy                           0.87      1318
   macro avg       0.65      0.69      0.67      1318
weighted avg       0.88      0.87      0.87      1318





In [10]:
# --- Meta-Classifier (Voting Classifier) ---
# Combine the individual classifiers using a VotingClassifier
# The voting='soft' option uses the predicted probabilities to make the final decision
voting_clf = VotingClassifier(
    estimators=[(name, clf) for name, clf in classifiers.items()],
    voting='soft'
)

# We need to train the voting classifier on the original multi-class data
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
voting_clf.fit(X_train_multi, y_train_multi)

0,1,2
,estimators,"[('FP', ...), ('PC', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,100
,max_depth,-1
,learning_rate,0.5
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,200
,max_depth,-1
,learning_rate,0.05
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.3
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,0.5
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,1.0
,n_estimators,5000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


Error in callback <function _enable_matplotlib_integration.<locals>.configure_once at 0x000001636ED10220> (for post_run_cell), with arguments args (<ExecutionResult object at 1636ea57850, execution_count=10 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 1636ea57770, raw_cell="# --- Meta-Classifier (Voting Classifier) ---
# Co.." transformed_cell="# --- Meta-Classifier (Voting Classifier) ---
# Co.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/pedro/OneDrive/Documentos/a/exoplanet_classifier/main.ipynb#X20sZmlsZQ%3D%3D> result=VotingClassifier(estimators=[('FP',
                              Pipeline(steps=[('preproc',
                                               Pipeline(steps=[('imputer',
                                                                SimpleImputer(strategy='median')),
                                                               ('scaler',
                                                     

AttributeError: module 'matplotlib' has no attribute 'backends'

In [9]:
# --- MATRIZ DE CONFUSÃO DO VOTING CLASSIFIER ---
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
import matplotlib.pyplot as plt

print("=" * 60)
print("🎯 MATRIZ DE CONFUSÃO - VOTING CLASSIFIER")
print("=" * 60)

# Fazer predições com o voting classifier
y_pred_voting = voting_clf.predict(X_test_multi)

# Criar matriz de confusão
cm_voting = confusion_matrix(y_test_multi, y_pred_voting)
disp_voting = ConfusionMatrixDisplay(confusion_matrix=cm_voting, 
                                   display_labels=voting_clf.classes_)

# Plotar matriz de confusão
plt.figure(figsize=(12, 10))
disp_voting.plot(cmap='Blues', values_format='d')
plt.title('MATRIZ DE CONFUSÃO - VOTING CLASSIFIER\n(Classificação Multi-classe)', 
          fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right', fontsize=11)
plt.yticks(rotation=0, fontsize=11)
plt.xlabel('Categoria Predita', fontsize=12)
plt.ylabel('Categoria Real', fontsize=12)
plt.tight_layout()
plt.show()

# Relatório de classificação
print("\n📊 RELATÓRIO DE CLASSIFICAÇÃO - VOTING CLASSIFIER:")
print(classification_report(y_test_multi, y_pred_voting))

# Estatísticas detalhadas
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test_multi, y_pred_voting)
precision = precision_score(y_test_multi, y_pred_voting, average='weighted')
recall = recall_score(y_test_multi, y_pred_voting, average='weighted')
f1 = f1_score(y_test_multi, y_pred_voting, average='weighted')

print(f"\n📈 MÉTRICAS GERAIS:")
print(f"Acurácia: {accuracy:.4f}")
print(f"Precisão (média ponderada): {precision:.4f}")
print(f"Recall (média ponderada): {recall:.4f}")
print(f"F1-Score (média ponderada): {f1:.4f}")

# Análise por categoria
print(f"\n📋 ANÁLISE POR CATEGORIA:")
for i, category in enumerate(voting_clf.classes_):
    # Índices onde a categoria real é 'category'
    real_indices = (y_test_multi == category)
    pred_indices = (y_pred_voting == category)
    
    # Contar verdadeiros positivos, falsos positivos, falsos negativos
    tp = sum((y_test_multi == category) & (y_pred_voting == category))
    fp = sum((y_test_multi != category) & (y_pred_voting == category))
    fn = sum((y_test_multi == category) & (y_pred_voting != category))
    tn = sum((y_test_multi != category) & (y_pred_voting != category))
    
    # Calcular métricas
    precision_cat = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall_cat = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_cat = 2 * (precision_cat * recall_cat) / (precision_cat + recall_cat) if (precision_cat + recall_cat) > 0 else 0
    
    print(f"  {category}:")
    print(f"    VP: {tp} | FP: {fp} | FN: {fn} | VN: {tn}")
    print(f"    Precisão: {precision_cat:.3f} | Recall: {recall_cat:.3f} | F1: {f1_cat:.3f}")
    print()

print("✅ MATRIZ DE CONFUSÃO GERADA COM SUCESSO!")

🎯 MATRIZ DE CONFUSÃO - VOTING CLASSIFIER




AttributeError: module 'matplotlib' has no attribute 'backends'

Error in callback <function _enable_matplotlib_integration.<locals>.configure_once at 0x000001636ED10220> (for post_run_cell), with arguments args (<ExecutionResult object at 1636ef932f0, execution_count=9 error_before_exec=None error_in_exec=module 'matplotlib' has no attribute 'backends' info=<ExecutionInfo object at 1636ef915e0, raw_cell="# --- MATRIZ DE CONFUSÃO DO VOTING CLASSIFIER ---
.." transformed_cell="# --- MATRIZ DE CONFUSÃO DO VOTING CLASSIFIER ---
.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/pedro/OneDrive/Documentos/a/exoplanet_classifier/main.ipynb#X21sZmlsZQ%3D%3D> result=None>,),kwargs {}:


AttributeError: module 'matplotlib' has no attribute 'backends'