In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score

In [16]:
koi = pd.read_csv(
    "./data/koi.csv",
    comment='#',
    engine='python'
)

In [17]:
features_of_interest = [
    "koi_score", "koi_fpflag_nt", "koi_max_mult_ev", "koi_dicco_msky", "koi_fpflag_co", "koi_fwm_stat_sig",
    "koi_fpflag_ss", "koi_dikco_msky", "koi_model_snr", "koi_prad", "koi_smet_err2", "koi_ror",
    "koi_fwm_sdec_err", "koi_duration_err1", "koi_dor", "koi_duration_err2", "koi_prad_err2", "koi_fwm_sra_err",
    "koi_time0bk_err1", "koi_dicco_msky_err", "koi_time0_err1", "koi_steff_err2", "koi_fwm_srao_err", "koi_count",
    "koi_prad_err1", "koi_dikco_mra_err", "koi_time0bk_err2", "koi_fwm_sdeco_err", "koi_dikco_msky_err", "koi_steff_err1",
    "koi_max_sngle_ev", "koi_fpflag_ec", "koi_dikco_mdec_err", "koi_srho_err2", "koi_incl", "koi_dicco_mdec_err",
    "koi_insol_err1", "koi_dor_err1", "koi_sma", "koi_dor_err2", "koi_insol_err2", "koi_num_transits",
    "koi_period", "koi_ror_err2", "koi_dicco_mdec", "koi_bin_oedp_sig", "koi_teq", "koi_fwm_pdeco_err",
    "koi_period_err1", "koi_dikco_mdec", "koi_period_err2", "koi_time0_err2", "koi_duration", "koi_srho_err1",
    "koi_ror_err1", "koi_srho", "koi_depth", "koi_impact", "koi_dikco_mra", "koi_insol",
    "koi_dicco_mra", "koi_fwm_prao_err", "koi_fwm_sdeco", "koi_srad_err1", "koi_dicco_mra_err", "koi_fwm_sdec",
    "dec", "koi_srad_err2", "koi_time0bk", "koi_depth_err2", "koi_slogg_err2", "koi_fwm_srao",
    "koi_impact_err1", "ra", "koi_impact_err2", "koi_smass_err1", "koi_time0", "koi_smet",
    "koi_zmag", "koi_smet_err1", "koi_gmag", "koi_smass_err2", "koi_fwm_prao", "koi_imag",
    "koi_srad", "koi_depth_err1", "koi_kmag", "koi_slogg", "koi_jmag", "koi_fwm_sra",
    "koi_fwm_pdeco", "koi_hmag", "koi_steff", "koi_rmag", "koi_kepmag", "koi_ldm_coeff1",
    "koi_smass", "koi_ldm_coeff2", "koi_slogg_err1", "koi_tce_plnt_num"
]
# len(features_of_interest)

In [18]:
X = koi[features_of_interest]

In [19]:
X.columns[X.isna().any()]

Index(['koi_score', 'koi_max_mult_ev', 'koi_dicco_msky', 'koi_fwm_stat_sig',
       'koi_dikco_msky', 'koi_model_snr', 'koi_prad', 'koi_smet_err2',
       'koi_ror', 'koi_fwm_sdec_err', 'koi_duration_err1', 'koi_dor',
       'koi_duration_err2', 'koi_prad_err2', 'koi_fwm_sra_err',
       'koi_time0bk_err1', 'koi_dicco_msky_err', 'koi_time0_err1',
       'koi_steff_err2', 'koi_fwm_srao_err', 'koi_prad_err1',
       'koi_dikco_mra_err', 'koi_time0bk_err2', 'koi_fwm_sdeco_err',
       'koi_dikco_msky_err', 'koi_steff_err1', 'koi_max_sngle_ev',
       'koi_dikco_mdec_err', 'koi_srho_err2', 'koi_incl', 'koi_dicco_mdec_err',
       'koi_insol_err1', 'koi_dor_err1', 'koi_sma', 'koi_dor_err2',
       'koi_insol_err2', 'koi_num_transits', 'koi_ror_err2', 'koi_dicco_mdec',
       'koi_bin_oedp_sig', 'koi_teq', 'koi_fwm_pdeco_err', 'koi_period_err1',
       'koi_dikco_mdec', 'koi_period_err2', 'koi_time0_err2', 'koi_srho_err1',
       'koi_ror_err1', 'koi_srho', 'koi_depth', 'koi_impact', 'koi_di

In [20]:
X.isna().sum()

koi_score           1510
koi_fpflag_nt          0
koi_max_mult_ev     1142
koi_dicco_msky       599
koi_fpflag_co          0
                    ... 
koi_ldm_coeff1       363
koi_smass            363
koi_ldm_coeff2       363
koi_slogg_err1       468
koi_tce_plnt_num     346
Length: 100, dtype: int64

In [21]:

# reduced_koi = koi[features_of_interest]


non_numerical_features = koi.select_dtypes(include="object").columns.tolist()
non_numerical_features.extend(["rowid", "kepid"])
print(non_numerical_features)

X = koi.drop(non_numerical_features, axis=1)
y = koi["koi_disposition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
RF = RandomForestClassifier()

cv_results = cross_validate(RF, X_train, y_train, cv=5, n_jobs=-1)
cv_results
RF.fit(X_train, y_train)
feature_importances = {feature: importance for feature, importance in zip(X_train.columns, RF.feature_importances_)}
feature_importances = dict(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))
print("Features ordered by importance, according to default Random Forest\n")
for feature, importance in feature_importances.items(): print(f"{feature}: {importance}")
adaboost = AdaBoostClassifier(n_estimators=1000, random_state=0)
adaboost.fit(X_train, y_train)

y_pred_ada = adaboost.predict(X_test)
accuracy_score(y_test, y_pred_ada)


['kepoi_name', 'kepler_name', 'koi_disposition', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_disp_prov', 'koi_comment', 'koi_fittype', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname', 'koi_quarters', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_sparprov', 'rowid', 'kepid']
Features ordered by importance, according to default Random Forest

koi_score: 0.13148595439531013
koi_fpflag_nt: 0.06282343697770446
koi_fpflag_ss: 0.04351376145007602
koi_dikco_msky: 0.040604977644858334
koi_max_mult_ev: 0.038673030424251895
koi_fpflag_co: 0.03134768060635171
koi_dicco_msky: 0.02412728166516135
koi_model_snr: 0.02233835621742397
koi_duration_err2: 0.022038264243285654
koi_smet_err2: 0.02014420485416498
koi_fwm_stat_sig: 0.019699871664222653
koi_prad: 0.01944882523658717
koi_prad_err2: 0.018331184058610898
koi_ror: 0.01646967081875212
koi_duration_err1: 0.014510925901181375
koi_steff_err1: 0.01397235644551856
koi_fwm_srao_err: 0.013713172396654965
koi_dor: 0.01

ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
gbr = GradientBoostingClassifier(
    n_estimators=100,    # número de árboles
    learning_rate=0.1,   # factor de aprendizaje (shrinkage)
    max_depth=8,         # profundidad de cada árbol
    random_state=42
)
gbr.fit(X_train, y_train)
print(accuracy_score(y_test,gbr.predict(X_test)))
