In [None]:
!pip install scikit-optimize

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score
import matplotlib.pyplot as plt

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real

# Kepler Data

In [None]:
df = pd.read_csv('kepler.csv')
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
df.info(verbose=True)

In [None]:
# Render the table loaded from kepler.csv using the notebook's rich display.
display(df)

In [None]:
# Number of distinct kepid values vs. distinct "kepid;kepoi_name" combinations.
n_kepid = df['kepid'].nunique()
kepid_koi_key = df['kepid'].astype(str) + ';' + df['kepoi_name'].astype(str)
n_kepid, kepid_koi_key.nunique()

In [None]:
# Histogram of how many kepoi_name entries each kepid has.
kois_per_kepid = df.groupby(['kepid'])['kepoi_name'].count()
kois_per_kepid.hist()

In [None]:
# Class balance of the two disposition columns, shown side by side as a tuple.
disposition_counts = df['koi_disposition'].value_counts()
pdisposition_counts = df['koi_pdisposition'].value_counts()
(disposition_counts, pdisposition_counts)

In [None]:
# Frequency of each koi_vet_stat value.
vet_stat = df['koi_vet_stat']
vet_stat.value_counts()

In [None]:
# koi_score histograms, one per koi_pdisposition group, drawn with a legend.
score_by_pdisposition = df.groupby(['koi_pdisposition'])['koi_score']
score_by_pdisposition.hist(legend=True)

In [None]:
# Flag column -> human-readable description printed in the report below.
reason_cols = {
    'koi_fpflag_nt': 'Not Transit',
    'koi_fpflag_ss': 'Stellar Eclipse - binary star system',
    'koi_fpflag_co': 'Centroid Offset - comes from a nearby star',
    'koi_fpflag_ec': 'Ephemeris Match / Contamination - contamination or electronic crosstalk',
}

# Rows whose koi_pdisposition is FALSE POSITIVE; selected once and reused per flag.
false_positives = df[df['koi_pdisposition'] == 'FALSE POSITIVE']

# For every flag: value counts over all rows, then over the FALSE POSITIVE rows only.
for flag_col, description in reason_cols.items():
  print("***************************")
  print(flag_col, '-', description)
  print("Of Total:")
  print(df[flag_col].value_counts())
  print()
  print("Of FP:")
  print(false_positives[flag_col].value_counts())
  print()


In [None]:
# Column groups for the Kepler table.
# id_cols and other_target_cols are only defined here; nothing in the visible
# notebook reads them afterwards.
id_cols = ['kepid', 'kepoi_name']
other_target_cols = ['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec']

# Columns treated as numeric model inputs. The order of this list is the column
# order of the feature matrix built from df[num_cols + cat_cols] further down.
num_cols = [ 'koi_period', 'koi_eccen', 'koi_longp', 'koi_impact', 'koi_duration', 'koi_ingress', 'koi_depth', 'koi_ror',
            'koi_srho', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff4',
            'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_model_snr', 'koi_count',
             'koi_num_transits', 'koi_tce_plnt_num', 'koi_bin_oedp_sig', 'koi_model_dof',
             'koi_model_chisq', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sage',
             'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig',
             'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec',
             'koi_dicco_msky', 'koi_dikco_mra', 'koi_dikco_mdec', 'koi_dikco_msky']

# Candidate categorical columns; a later cell keeps only those whose number of
# distinct values is between 2 and 9 before they are one-hot encoded.
cat_cols = ['koi_fittype', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname', 'koi_quarters', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_sparprov']

In [None]:
# Print the number of distinct (non-null) values of every candidate categorical column.
for col, n_unique in df[cat_cols].nunique().items():
  print(f"{col}: ", n_unique)

In [None]:
# Keep only categorical columns with more than 1 and fewer than 10 distinct values.
cardinality = {col: df[col].nunique() for col in cat_cols}
cat_cols = [col for col in cat_cols if 1 < cardinality[col] < 10]
cat_cols

In [None]:
# Names listed in num_cols that are not columns of df (an empty set means none are missing).
missing_num_cols = set(num_cols).difference(df.columns)
missing_num_cols

In [None]:
target_col = 'koi_disposition'

X = df[num_cols+cat_cols]
y = df[target_col]

# Hold out 30 % of the rows, then split that hold-out again (test_size=0.33)
# into a validation part and a final test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

# Median imputation of the numeric columns, using statistics of the training
# split only. The result is assigned back: calling fillna(inplace=True) on a
# column selection such as X_train[num_cols] only fills a temporary copy and
# leaves the frame itself untouched. Columns that are not in `train_medians`
# (the categorical ones) are not filled.
train_medians = X_train[num_cols].median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# One-hot encode the categorical columns; the encoder is fitted on the training
# split and categories unseen during fitting are ignored at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat = encoder.fit_transform(X_train[cat_cols])
X_val_cat = encoder.transform(X_val[cat_cols])
X_test_cat = encoder.transform(X_test[cat_cols])

# Wrap the encoded arrays in DataFrames that share the row index of their split
# so they can be concatenated column-wise with the numeric columns.
X_train_cat = pd.DataFrame(X_train_cat, columns=encoder.get_feature_names_out(cat_cols), index=X_train.index)
X_val_cat = pd.DataFrame(X_val_cat, columns=encoder.get_feature_names_out(cat_cols), index=X_val.index)
X_test_cat = pd.DataFrame(X_test_cat, columns=encoder.get_feature_names_out(cat_cols), index=X_test.index)

X_train_ohe = pd.concat([X_train[num_cols], X_train_cat], axis=1)
X_val_ohe = pd.concat([X_val[num_cols], X_val_cat], axis=1)
X_test_ohe = pd.concat([X_test[num_cols], X_test_cat], axis=1)

# Standardise all features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_ohe)
X_val_scaled = scaler.transform(X_val_ohe)
X_test_scaled = scaler.transform(X_test_ohe)

# search_space = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [None, 5, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'bootstrap': [True, False]
# }

# random_search = RandomizedSearchCV(
#     RandomForestClassifier(),
#     param_distributions=search_space,
#     n_iter=50,
#     cv=3,
#     verbose=3,
#     random_state=42,
#     n_jobs=-1
# )


# search_space_skopt = {
#     'n_estimators': Integer(100, 500),
#     'max_depth': Integer(5, 30),
#     'min_samples_split': Integer(2, 10),
#     'min_samples_leaf': Integer(1, 4),
#     'max_features': Categorical(['sqrt', 'log2', None]),
#     'bootstrap': Categorical([True, False])
# }

# bayes_search = BayesSearchCV(
#     estimator=RandomForestClassifier(random_state=42),
#     search_spaces=search_space_skopt,
#     n_iter=50,
#     cv=3,
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

# best_model = bayes_search.fit(X_train_scaled, y_train)

# y_pred = best_model.predict(X_val_scaled)

# cm = confusion_matrix(y_val, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()

# print('Best Parameters:', bayes_search.best_params_)
# #{'bootstrap': False, 'max_depth': 23, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 466}
# print(classification_report(y_val, y_pred))


In [None]:
# y_pred = best_model.predict(X_test_scaled)

# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()

# print('Best Parameters:', best_model.best_params_)
# print(classification_report(y_test, y_pred))

In [None]:
# search_space_skopt = {
#     'n_estimators': Integer(100, 500),
#     'max_depth': Integer(5, 30),
#     'num_leaves': Integer(20, 150),
#     'min_child_samples': Integer(5, 30),
#     'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
#     'subsample': Real(0.6, 1.0),
#     'colsample_bytree': Real(0.6, 1.0)
# }

# bayes_search = BayesSearchCV(
#     estimator=LGBMClassifier(random_state=42),
#     search_spaces=search_space_skopt,
#     n_iter=50,
#     cv=3,
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

# Fixed LightGBM hyper-parameters. The keys match the commented-out search space
# above; presumably these are the values that search selected (not verifiable here).
params = {
    'colsample_bytree': 0.6041861734100984,
    'learning_rate': 0.06154814859853837,
    'max_depth': 13,
    'min_child_samples': 5,
    'n_estimators': 163,
    'num_leaves': 144,
    'subsample': 0.9784159300992212,
}

model = LGBMClassifier(random_state=42, **params)
model.fit(X_train_scaled, y_train)

# Validation-split predictions, confusion matrix and per-class report.
y_pred = model.predict(X_val_scaled)

cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print(classification_report(y_val, y_pred))

In [None]:
import joblib, pickle

In [None]:
# Persist the Kepler preprocessing objects and the fitted model.
medians = X_train[num_cols].median()

joblib.dump(scaler, 'kepler_scaler.pkl')
joblib.dump(medians, 'kepler_medians.pkl')

# The model is written with pickle rather than joblib.
with open('kepler_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

joblib.dump(encoder, 'kepler_encoder.pkl')


In [None]:


# Evaluate the fitted model on the held-out test split.
y_pred_test_proba = model.predict_proba(X_test_scaled)
y_pred_test = model.predict(X_test_scaled)

cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print(classification_report(y_test, y_pred_test))

# One-vs-rest ROC AUC with macro and with weighted averaging.
auc_ovr_macro, auc_ovr_weighted = (
    roc_auc_score(y_test, y_pred_test_proba, multi_class="ovr", average=averaging)
    for averaging in ("macro", "weighted")
)
print(auc_ovr_macro, auc_ovr_weighted)

In [None]:
ohe_cols = encoder.get_feature_names_out(cat_cols)

# Feature order of the training matrix: numeric columns first, then the one-hot
# columns. The names are derived once and checked against the frame the model
# inputs were built from, so a stale num_cols/cat_cols (e.g. after running a
# later section of the notebook) fails loudly instead of mislabelling the bars.
feature_names = list(num_cols) + list(ohe_cols)
assert feature_names == X_train_ohe.columns.tolist(), "feature names out of sync with X_train_ohe"

feat_importances = pd.Series(model.feature_importances_, index=feature_names)
feat_importances = feat_importances.sort_values()

# Horizontal bar chart, least important feature at the bottom.
plt.figure(figsize=(20,20))
feat_importances.plot(kind="barh")
plt.xlabel("Feature Importance")
plt.title("Model Feature Importances")
plt.show()

# TESS Data

In [None]:
df = pd.read_csv('tess.csv')
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
df.info(verbose=True)

In [None]:
# Number of distinct toi values and of distinct tid values.
n_toi = df['toi'].nunique()
n_tid = df['tid'].nunique()
n_toi, n_tid

In [None]:
# Histogram of how many toi entries each tid has.
tois_per_tid = df.groupby(['tid'])['toi'].count()
tois_per_tid.hist()

In [None]:
"""
APC=ambiguous planetary candidate
CP=confirmed planet
FA=false alarm
FP=false positive
KP=known planet
PC=planetary candidate
"""

# Collapse the TFOPWG disposition codes into the three labels used as the target.
disposition_map = {
    'APC': 'CANDIDATE',
    'CP': 'CONFIRMED',
    'FA' : 'FALSE POSITIVE',
    'FP' : 'FALSE POSITIVE',
    'KP': 'CONFIRMED',
    'PC': 'CANDIDATE',
}
# Map every final label to itself as well, so that re-running this cell leaves
# already converted values unchanged instead of turning the whole column into NaN.
# Values that are in neither set still become NaN, as with the plain code mapping.
disposition_map.update({label: label for label in set(disposition_map.values())})

df['tfopwg_disp'] = df['tfopwg_disp'].map(disposition_map)

df['tfopwg_disp'].value_counts()

In [None]:
# First rows of the string and numeric coordinate columns, side by side.
coordinate_cols = ['rastr', 'ra', 'decstr', 'dec']
df[coordinate_cols].head()

In [None]:
# First rows of the st_pmra / st_pmdec columns.
pm_cols = ['st_pmra', 'st_pmdec']
df[pm_cols].head()

In [None]:
# List the columns stored with object dtype.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
  print(name)

In [None]:
# Column names that contain neither 'err1' nor 'err2' and do not end in 'err' or 'lim'.
cols_req = [
    name for name in df.columns
    if 'err1' not in name and 'err2' not in name and not name.endswith(('err', 'lim'))
]

print(cols_req)

In [None]:
# id_cols is only defined here; nothing in the visible notebook reads it afterwards.
id_cols = ['tid', 'toi']

# Numeric columns used as the TESS model inputs. The order of this list is the
# column order of the feature matrix built from df[num_cols] in the next cell.
num_cols = ['ra', 'dec', 'st_pmra', 'st_pmdec', 'pl_tranmid', 'pl_orbper', 'pl_trandurh', 'pl_trandep',
            'pl_rade', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_dist', 'st_teff', 'st_logg', 'st_rad']

In [None]:
target_col = 'tfopwg_disp'

X = df[num_cols]
y = df[target_col]

# Hold out 30 % of the rows, then split that hold-out again (test_size=0.33)
# into a validation part and a final test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

# Median imputation using statistics of the training split only. The result is
# assigned back: calling fillna(inplace=True) on a column selection such as
# X_train[num_cols] only fills a temporary copy and leaves the frame untouched.
train_medians = X_train[num_cols].median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardise all features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


# Hyper-parameter ranges explored for the random forest.
search_space_skopt = dict(
    n_estimators=Integer(100, 500),
    max_depth=Integer(5, 30),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 4),
    max_features=Categorical(['sqrt', 'log2', None]),
    bootstrap=Categorical([True, False]),
)

# 50 search iterations, each scored with 3-fold cross-validation on the training split.
rf_estimator = RandomForestClassifier(random_state=42)
bayes_search = BayesSearchCV(
    estimator=rf_estimator,
    search_spaces=search_space_skopt,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)

best_model = bayes_search.fit(X_train_scaled, y_train)

# Validation-split predictions, confusion matrix and per-class report.
y_pred = best_model.predict(X_val_scaled)

cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Parameters recorded from an earlier run:
# params = {'bootstrap': False, 'max_depth': 24, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}

print('Best Parameters:', bayes_search.best_params_)
print(classification_report(y_val, y_pred))


In [None]:
# Score the fitted search object on the held-out test split.
y_pred = best_model.predict(X_test_scaled)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print('Best Parameters:', best_model.best_params_)
print(classification_report(y_test, y_pred))

In [None]:
# Hyper-parameter ranges explored for the LightGBM classifier.
search_space_skopt = dict(
    n_estimators=Integer(100, 500),
    max_depth=Integer(5, 30),
    num_leaves=Integer(20, 150),
    min_child_samples=Integer(5, 30),
    learning_rate=Real(0.01, 0.3, prior='log-uniform'),
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.6, 1.0),
)

# 50 search iterations, each scored with 3-fold cross-validation on the training split.
lgbm_estimator = LGBMClassifier(random_state=42)
bayes_search = BayesSearchCV(
    estimator=lgbm_estimator,
    search_spaces=search_space_skopt,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)

best_model = bayes_search.fit(X_train_scaled, y_train)

# Validation-split predictions, confusion matrix and per-class report.
y_pred = best_model.predict(X_val_scaled)

cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Parameters recorded from an earlier run:
# params = {'colsample_bytree': 0.7192647300201618, 'learning_rate': 0.06362774918766591, 'max_depth': 30, 'min_child_samples': 12, 'n_estimators': 500, 'num_leaves': 52, 'subsample': 1.0}
print('Best Parameters:', bayes_search.best_params_)
print(classification_report(y_val, y_pred))

In [None]:
# Score the fitted search object on the held-out test split.
y_pred = best_model.predict(X_test_scaled)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print('Best Parameters:', best_model.best_params_)
print(classification_report(y_test, y_pred))

# K2 Data

In [None]:
df = pd.read_csv('k2.csv')
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
df.info(verbose=True)

In [None]:
# Histogram of the number of distinct pl_name values per hostname.
planets_per_host = df.groupby(['hostname'])['pl_name'].nunique()
planets_per_host.hist()

In [None]:
# Frequency of each discoverymethod value.
discovery_method = df['discoverymethod']
discovery_method.value_counts()

In [None]:
# Frequency of each disposition value (the column later used as the target).
disposition = df['disposition']
disposition.value_counts()

In [None]:
# Object-dtype columns among df.columns[38:282]; printed and collected so they
# can be excluded from the numeric feature list.
cat_cols_to_remove = [name for name in df.columns[38:282] if df[name].dtype == 'object']
for name in cat_cols_to_remove:
  print(name)

In [None]:
# Frequency of each soltype value.
soltype = df['soltype']
soltype.value_counts()

In [None]:
# Names among df.columns[38:282] that contain neither 'err1' nor 'err2' and do
# not end in 'err' or 'lim'.
cols_req = [
    name for name in df.columns[38:282]
    if 'err1' not in name and 'err2' not in name and not name.endswith(('err', 'lim'))
]

print(cols_req)

In [None]:
id_cols = ['pl_name', 'hostname']

# Candidate feature columns, in a fixed order.
candidate_num_cols = [
    'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_radj', 'pl_masse', 'pl_massj', 'pl_msinie', 'pl_msinij',
    'pl_cmasse', 'pl_cmassj', 'pl_bmasse', 'pl_bmassj', 'pl_bmassprov', 'pl_dens', 'pl_orbeccen', 'pl_insol',
    'pl_eqt', 'pl_orbincl', 'pl_tranmid', 'pl_tsystemref', 'ttv_flag', 'pl_imppar', 'pl_trandep', 'pl_trandur',
    'pl_ratdor', 'pl_ratror', 'pl_occdep', 'pl_orbtper', 'pl_orblper', 'pl_rvamp', 'pl_projobliq', 'pl_trueobliq',
    'st_refname', 'st_spectype', 'st_teff', 'st_rad', 'st_mass', 'st_met', 'st_metratio', 'st_lum', 'st_logg', 'st_age',
    'st_dens', 'st_vsin', 'st_rotp', 'st_radv', 'sy_refname', 'rastr', 'ra', 'decstr', 'dec', 'glat', 'glon', 'elat', 'elon',
    'sy_pm', 'sy_pmra', 'sy_pmdec', 'sy_dist', 'sy_plx', 'sy_bmag', 'sy_vmag', 'sy_jmag', 'sy_hmag', 'sy_kmag', 'sy_umag',
    'sy_gmag', 'sy_rmag', 'sy_imag', 'sy_zmag', 'sy_w1mag', 'sy_w2mag', 'sy_w3mag', 'sy_w4mag', 'sy_gaiamag', 'sy_icmag', 'sy_tmag', 'sy_kepmag',
]

# Drop the object-dtype columns collected earlier. Filtering the ordered list
# (instead of converting a set difference to a list) keeps the column order
# identical in every kernel session; set iteration order for strings changes
# between Python processes, which would silently reorder the features behind
# the saved scaler and model.
excluded_cols = set(cat_cols_to_remove)
num_cols = [col for col in candidate_num_cols if col not in excluded_cols]
len(num_cols)

In [None]:
# Hard-coded list of K2 column names. Apart from the len() call below, nothing in
# the visible notebook reads `list_`.
# NOTE(review): looks like a pasted dump of an earlier num_cols value — confirm
# whether it is still needed.
list_ = ['st_dens','pl_cmasse','sy_kepmag','st_radv','pl_orbsmax','pl_dens','pl_massj','pl_insol','pl_bmasse','ra','pl_trandep','st_logg','sy_bmag','st_age','pl_occdep',
 'pl_orbeccen','sy_jmag','sy_kmag','elat','dec','sy_w1mag','st_rad','pl_rvamp','pl_bmassj','pl_orblper','pl_tranmid','sy_gmag','elon','sy_imag','st_rotp','pl_msinij',
 'pl_orbtper','sy_pm','st_teff','pl_orbper','sy_plx','sy_umag','pl_cmassj','pl_eqt','sy_gaiamag','st_mass','pl_masse','sy_rmag','sy_dist','sy_zmag','pl_orbincl',
 'sy_pmdec','st_met','glat','sy_w4mag','pl_imppar','ttv_flag','pl_projobliq','st_lum','sy_pmra','pl_trueobliq','pl_ratror','sy_icmag','pl_rade','pl_trandur',
 'sy_hmag','glon','pl_radj','st_vsin','sy_w2mag','sy_vmag','pl_msinie','sy_tmag','pl_ratdor','sy_w3mag']

len(list_)

In [None]:
# Exclude rows whose disposition is REFUTED.
not_refuted = df['disposition'] != 'REFUTED'
df = df[not_refuted]

In [None]:
target_col = 'disposition'

X = df[num_cols]
y = df[target_col]

# Hold out 30 % of the rows, then split that hold-out again (test_size=0.33)
# into a validation part and a final test part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.33, random_state=42)

# Median imputation using statistics of the training split only. The result is
# assigned back: calling fillna(inplace=True) on a column selection such as
# X_train[num_cols] only fills a temporary copy and leaves the frame untouched.
train_medians = X_train[num_cols].median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardise all features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


# search_space_skopt = {
#     'n_estimators': Integer(100, 500),
#     'max_depth': Integer(5, 30),
#     'min_samples_split': Integer(2, 10),
#     'min_samples_leaf': Integer(1, 4),
#     'max_features': Categorical(['sqrt', 'log2', None]),
#     'bootstrap': Categorical([True, False])
# }

# bayes_search = BayesSearchCV(
#     estimator=RandomForestClassifier(random_state=42),
#     search_spaces=search_space_skopt,
#     n_iter=50,
#     cv=3,
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

# best_model = bayes_search.fit(X_train_scaled, y_train)

# y_pred = best_model.predict(X_val_scaled)

# cm = confusion_matrix(y_val, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()

# # params = {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}

# print('Best Parameters:', bayes_search.best_params_)
# print(classification_report(y_val, y_pred))


In [None]:
# params = {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
# best_model = RandomForestClassifier(**params)
# best_model.fit(X_train_scaled, y_train)

# y_pred = best_model.predict(X_test_scaled)

# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()

# print(classification_report(y_test, y_pred))

In [None]:
# search_space_skopt = {
#     'n_estimators': Integer(100, 500),
#     'max_depth': Integer(5, 30),
#     'num_leaves': Integer(20, 150),
#     'min_child_samples': Integer(5, 30),
#     'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
#     'subsample': Real(0.6, 1.0),
#     'colsample_bytree': Real(0.6, 1.0)
# }

# bayes_search = BayesSearchCV(
#     estimator=LGBMClassifier(random_state=42),
#     search_spaces=search_space_skopt,
#     n_iter=50,
#     cv=3,
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

params = {}

model = LGBMClassifier(**params, random_state=42)

best_model.fit(X_train_scaled, y_train)

y_pred = best_model.predict(X_val_scaled)

cm = confusion_matrix(y_val, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print('Best Parameters:', bayes_search.best_params_)
print(classification_report(y_val, y_pred))

In [None]:
y_pred = best_model.predict(X_test_scaled)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

print('Best Parameters:', best_model.best_params_)
print(classification_report(y_test, y_pred))

In [None]:
# Persist the K2 scaler, the training medians and the model.
medians = X_train[num_cols].median()

joblib.dump(scaler, 'k2_scaler.pkl')
joblib.dump(medians, 'k2_medians.pkl')

# The model is written with pickle rather than joblib.
with open('k2_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)