1. Import Libraries

In [132]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler

2. Prepare Data

In [133]:
# load data
kepler_df = pd.read_csv("data/kepler.csv", comment="#")
tess_df = pd.read_csv("data/tess.csv", comment="#")

In [134]:
kepler_df.head()

Unnamed: 0,loc_rowid,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,koi_kepmag
0,1,CANDIDATE,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,15.347
1,2,CANDIDATE,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,15.347
2,3,CANDIDATE,0,0,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,15.436
3,4,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,15.597
4,5,CANDIDATE,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,15.509


In [135]:
tess_df.head()

Unnamed: 0,loc_rowid,tfopwg_disp,pl_orbper,pl_trandurh,pl_trandep,pl_rade,pl_insol,pl_eqt,st_tmag,st_teff,st_logg,st_rad
0,1,FP,2.171348,2.01722,656.886099,5.818163,22601.948581,3127.204052,9.604,10249.0,4.19,2.16986
1,2,PC,1.931646,3.166,1286.0,11.2154,44464.5,4045.0,9.42344,7070.0,4.03,2.01
2,3,FP,1.867557,1.408,1500.0,23.7529,2860.61,2037.0,9.299501,8924.0,,5.73
3,4,FP,2.74323,3.167,383.41,,1177.36,1631.0,9.3003,5388.5,4.15,
4,5,FP,3.573014,3.37,755.0,11.3113,54679.3,4260.0,9.1355,9219.0,4.14,2.15


In [136]:
kepler_df.shape

(9564, 19)

In [137]:
tess_df.shape

(7703, 12)

rename columns for consistency

In [138]:
kepler_df = kepler_df.rename(columns={
    "koi_pdisposition": "target",
    "koi_period": "orbital_period",
    "koi_duration": "transit_duration",
    "koi_depth": "transit_depth",
    "koi_prad": "planet_radius",
    "koi_insol": "insolation_flux",
    "koi_teq": "equilibrium_temp",
    "koi_steff": "stellar_teff",
    "koi_slogg": "stellar_logg",
    "koi_srad": "stellar_radius",
    "koi_kepmag": "stellar_mag",
    "koi_fpflag_nt": "fpflag_nt",
    "koi_fpflag_ss": "fpflag_ss",
    "koi_fpflag_co": "fpflag_co",
    "koi_fpflag_ec": "fpflag_ec"
})
tess_df = tess_df.rename(columns={
    "tfopwg_disp": "target",
    "pl_orbper": "orbital_period",
    "pl_trandurh": "transit_duration",
    "pl_trandep": "transit_depth",
    "pl_rade": "planet_radius",
    "pl_insol": "insolation_flux",
    "pl_eqt": "equilibrium_temp",
    "st_teff": "stellar_teff",
    "st_logg": "stellar_logg",
    "st_rad": "stellar_radius",
    "st_tmag": "stellar_mag"
})

2. Keep only relevant columns

In [139]:
kepler_cols = [
    "target", "orbital_period", "transit_duration", "transit_depth",
    "planet_radius", "insolation_flux", "equilibrium_temp",
    "stellar_teff", "stellar_logg", "stellar_radius", "stellar_mag",
    "fpflag_nt", "fpflag_ss", "fpflag_co", "fpflag_ec"
]

In [140]:
tess_cols = [
    "target", "orbital_period", "transit_duration", "transit_depth",
    "planet_radius", "insolation_flux", "equilibrium_temp",
    "stellar_teff", "stellar_logg", "stellar_radius", "stellar_mag"
]

In [141]:
kepler_df = kepler_df[kepler_cols]
tess_df = tess_df[tess_cols]

In [142]:
kepler_df.head()

Unnamed: 0,target,orbital_period,transit_duration,transit_depth,planet_radius,insolation_flux,equilibrium_temp,stellar_teff,stellar_logg,stellar_radius,stellar_mag,fpflag_nt,fpflag_ss,fpflag_co,fpflag_ec
0,CANDIDATE,9.488036,2.9575,615.8,2.26,93.59,793.0,5455.0,4.467,0.927,15.347,0,0,0,0
1,CANDIDATE,54.418383,4.507,874.8,2.83,9.11,443.0,5455.0,4.467,0.927,15.347,0,0,0,0
2,CANDIDATE,19.89914,1.7822,10829.0,14.6,39.3,638.0,5853.0,4.544,0.868,15.436,0,0,0,0
3,FALSE POSITIVE,1.736952,2.40641,8079.2,33.46,891.96,1395.0,5805.0,4.564,0.791,15.597,0,1,0,0
4,CANDIDATE,2.525592,1.6545,603.3,2.75,926.16,1406.0,6031.0,4.438,1.046,15.509,0,0,0,0


In [143]:
tess_df.head()

Unnamed: 0,target,orbital_period,transit_duration,transit_depth,planet_radius,insolation_flux,equilibrium_temp,stellar_teff,stellar_logg,stellar_radius,stellar_mag
0,FP,2.171348,2.01722,656.886099,5.818163,22601.948581,3127.204052,10249.0,4.19,2.16986,9.604
1,PC,1.931646,3.166,1286.0,11.2154,44464.5,4045.0,7070.0,4.03,2.01,9.42344
2,FP,1.867557,1.408,1500.0,23.7529,2860.61,2037.0,8924.0,,5.73,9.299501
3,FP,2.74323,3.167,383.41,,1177.36,1631.0,5388.5,4.15,,9.3003
4,FP,3.573014,3.37,755.0,11.3113,54679.3,4260.0,9219.0,4.14,2.15,9.1355


map target values to common labels

In [144]:
kepler_df["target"].unique()

array(['CANDIDATE', 'FALSE POSITIVE'], dtype=object)

In [145]:
tess_df["target"].unique()

array(['FP', 'PC', 'KP', 'APC', 'FA', 'CP'], dtype=object)

In [146]:
kepler_map = {
    "CANDIDATE": "CANDIDATE",
    "FALSE POSITIVE": "FALSE POSITIVE"
}

tess_map = {
    "PC": "CANDIDATE",
    "APC": "CANDIDATE",
    "CP": "CONFIRMED",
    "KP": "CONFIRMED",
    "FP": "FALSE POSITIVE",
    "FA": "FALSE POSITIVE"
}

In [147]:
kepler_df["target"] = kepler_df["target"].map(kepler_map)
tess_df["target"] = tess_df["target"].map(tess_map)

In [148]:
kepler_df["target"].unique()

array(['CANDIDATE', 'FALSE POSITIVE'], dtype=object)

In [149]:
tess_df["target"].unique()

array(['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED'], dtype=object)

In [150]:
# add missing fpflag columns to tess data
for col in ["fpflag_nt", "fpflag_ss", "fpflag_co", "fpflag_ec"]:
    tess_df[col] = 0

combine kepler and tess data

In [151]:
combined_df = pd.concat([kepler_df, tess_df], ignore_index=True)

In [152]:
combined_df.shape

(17267, 15)

In [153]:
combined_df.head()

Unnamed: 0,target,orbital_period,transit_duration,transit_depth,planet_radius,insolation_flux,equilibrium_temp,stellar_teff,stellar_logg,stellar_radius,stellar_mag,fpflag_nt,fpflag_ss,fpflag_co,fpflag_ec
0,CANDIDATE,9.488036,2.9575,615.8,2.26,93.59,793.0,5455.0,4.467,0.927,15.347,0,0,0,0
1,CANDIDATE,54.418383,4.507,874.8,2.83,9.11,443.0,5455.0,4.467,0.927,15.347,0,0,0,0
2,CANDIDATE,19.89914,1.7822,10829.0,14.6,39.3,638.0,5853.0,4.544,0.868,15.436,0,0,0,0
3,FALSE POSITIVE,1.736952,2.40641,8079.2,33.46,891.96,1395.0,5805.0,4.564,0.791,15.597,0,1,0,0
4,CANDIDATE,2.525592,1.6545,603.3,2.75,926.16,1406.0,6031.0,4.438,1.046,15.509,0,0,0,0


filling missing data

In [154]:
combined_df.isna().sum()

target                 0
orbital_period       107
transit_duration       0
transit_depth        363
planet_radius        869
insolation_flux      497
equilibrium_temp     674
stellar_teff         524
stellar_logg        1219
stellar_radius       870
stellar_mag            1
fpflag_nt              0
fpflag_ss              0
fpflag_co              0
fpflag_ec              0
dtype: int64

In [155]:
# remove stellar_logg due to high missing values
combined_df = combined_df.drop(columns=["stellar_logg"])

In [156]:
# fill missing numeric columns with median values
num_cols = ["orbital_period", "transit_depth", "transit_duration", "planet_radius","insolation_flux", "equilibrium_temp", 
            "stellar_teff", "stellar_radius", "stellar_mag"]

for col in num_cols:
    combined_df[col] = combined_df[col].fillna(combined_df[col].median())

In [157]:
combined_df.isna().sum()

target              0
orbital_period      0
transit_duration    0
transit_depth       0
planet_radius       0
insolation_flux     0
equilibrium_temp    0
stellar_teff        0
stellar_radius      0
stellar_mag         0
fpflag_nt           0
fpflag_ss           0
fpflag_co           0
fpflag_ec           0
dtype: int64

Split features & labels

In [158]:
x = combined_df.drop(columns=['target'])  # features only
y = combined_df['target']   # target only

3. Normalization

In [159]:
scaler = RobustScaler()
# Only scale numeric features
combined_df[num_cols] = scaler.fit_transform(combined_df[num_cols])

4. Train-test split with stratification

In [160]:
x_train, x_test, y_train, y_test = train_test_split( 
    x, y, 
    test_size=0.2, 
    random_state=1, 
    stratify=y)

5.  Apply SMOTE to training set

In [161]:
# Create SMOTE object
smote = SMOTE(random_state=42)

In [162]:
# Apply SMOTE to training data only
X_train_res, y_train_res = smote.fit_resample(x_train, y_train)

In [163]:
print("Original training set shape:", x_train.shape, y_train.value_counts())
print("Resampled training set shape:", X_train_res.shape, y_train_res.value_counts())

Original training set shape: (13813, 13) target
CANDIDATE         7886
FALSE POSITIVE    4913
CONFIRMED         1014
Name: count, dtype: int64
Resampled training set shape: (23658, 13) target
CONFIRMED         7886
FALSE POSITIVE    7886
CANDIDATE         7886
Name: count, dtype: int64


6.  Train LightGBM model

In [164]:
# Create model
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    class_weight='balanced',
    random_state=1
)

In [165]:
# Train on SMOTE data
lgb_model.fit(X_train_res, y_train_res)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2304
[LightGBM] [Info] Number of data points in the train set: 23658, number of used features: 13
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


7. Apply and Train on Custom Threshold(0.3)

In [166]:
# Get probabilities for each class
y_pred_proba = lgb_model.predict_proba(x_test)

In [167]:
# Find index of CONFIRMED class
conf_index = list(lgb_model.classes_).index('CONFIRMED')

In [168]:
# Apply custom threshold
threshold = 0.3
y_pred_custom = []

In [169]:
for probs in y_pred_proba:
    if probs[conf_index] >= threshold:
        y_pred_custom.append('CONFIRMED')
    else:
        other_classes = [c for c in range(len(probs)) if c != conf_index]
        max_idx = other_classes[np.argmax(probs[other_classes])]
        y_pred_custom.append(lgb_model.classes_[max_idx])

8. Metrics

In [170]:
y_pred = lgb_model.predict(x_test)

In [171]:
# metrics on default threshold
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Classification report (per class)
print("Classification report (default threshold 0.5):")
print(classification_report(y_test, y_pred))

# Macro recall
macro_recall = recall_score(y_test, y_pred, average='macro')
print("Macro recall:", macro_recall)

Accuracy: 0.8251302837290099
Classification report (default threshold 0.5):
                precision    recall  f1-score   support

     CANDIDATE       0.92      0.80      0.85      1972
     CONFIRMED       0.37      0.84      0.51       253
FALSE POSITIVE       0.92      0.87      0.89      1229

      accuracy                           0.83      3454
     macro avg       0.73      0.83      0.75      3454
  weighted avg       0.88      0.83      0.84      3454

Macro recall: 0.834158918156478


In [172]:
# metrics on custom threshold(0.3)
acc_custom = accuracy_score(y_test, y_pred_custom)
print("Accuracy (custom threshold):", acc_custom)

print("Classification report (custom threshold):")
print(classification_report(y_test, y_pred_custom))

macro_recall_custom = recall_score(y_test, y_pred_custom, average='macro')
print("Macro recall (custom threshold):", macro_recall_custom)

Accuracy (custom threshold): 0.7822814128546612
Classification report (custom threshold):
                precision    recall  f1-score   support

     CANDIDATE       0.92      0.71      0.81      1972
     CONFIRMED       0.30      0.91      0.45       253
FALSE POSITIVE       0.92      0.86      0.89      1229

      accuracy                           0.78      3454
     macro avg       0.71      0.83      0.72      3454
  weighted avg       0.88      0.78      0.81      3454

Macro recall (custom threshold): 0.8295082632556658


In [173]:
recall_confirmed = recall_score(y_test, y_pred_custom, average=None, labels=['CONFIRMED'])[0]
print("Recall for CONFIRMED:", recall_confirmed)

Recall for CONFIRMED: 0.9090909090909091


9. Save Model

In [174]:
import joblib

# Save model
joblib.dump(lgb_model, "lgb_model.pkl")
# Save threshold
joblib.dump(0.25, "threshold.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']