In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('nasa_Kepler Objects of Interest (KOI).csv')

print(df.shape)
df.head()

(9565, 142)


Unnamed: 0.1,Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,2,3,10811496,K00753.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,3,4,10848459,K00754.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [6]:
df.isna().sum().sort_values(ascending=False)

koi_sma_err2        9565
koi_sma_err1        9565
koi_ingress_err2    9564
koi_eccen_err2      9564
koi_incl_err1       9564
                    ... 
koi_vet_stat           0
koi_disposition        0
kepoi_name             0
kepid                  0
Unnamed: 0             0
Length: 142, dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Columns: 142 entries, Unnamed: 0 to koi_dikco_msky_err
dtypes: float64(115), int64(1), object(26)
memory usage: 10.4+ MB


In [161]:
# 1. Identification & Metadata
identification_metadata = [
    "kepid", "kepoi_name", "kepler_name", "koi_disposition", "koi_pdisposition",
    "koi_disp_prov", "koi_parm_prov", "koi_sparprov",
    "koi_vet_stat", "koi_vet_date", "koi_comment",
    "koi_tce_plnt_num", "koi_tce_delivname", "koi_quarters"
]

# 2. False Positive Flags
false_positive_flags = [
    "koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec"
]

# 3. Orbital Properties
orbital_properties = [
    "koi_period", "koi_period_err1", "koi_period_err2",
    "koi_time0bk", "koi_time0bk_err1", "koi_time0bk_err2",
    "koi_time0", "koi_time0_err1", "koi_time0_err2",
    "koi_eccen", "koi_eccen_err1", "koi_eccen_err2",
    "koi_longp", "koi_longp_err1", "koi_longp_err2",
    "koi_incl", "koi_incl_err1", "koi_incl_err2",
    "koi_sma", "koi_sma_err1", "koi_sma_err2",
    "koi_num_transits"
]

# 4. Transit Geometry and Light Curve Features
transit_geometry = [
    "koi_duration", "koi_duration_err1", "koi_duration_err2",
    "koi_ingress", "koi_ingress_err1", "koi_ingress_err2",
    "koi_depth", "koi_depth_err1", "koi_depth_err2",
    "koi_impact", "koi_impact_err1", "koi_impact_err2",
    "koi_dor", "koi_dor_err1", "koi_dor_err2",
    "koi_ror", "koi_ror_err1", "koi_ror_err2"
]

# 5. Planetary Physical Properties
planetary_properties = [
    "koi_prad", "koi_prad_err1", "koi_prad_err2",
    "koi_teq", "koi_teq_err1", "koi_teq_err2",
    "koi_insol", "koi_insol_err1", "koi_insol_err2"
]

# 6. Stellar Properties
stellar_properties = [
    "koi_steff", "koi_steff_err1", "koi_steff_err2",
    "koi_slogg", "koi_slogg_err1", "koi_slogg_err2",
    "koi_smet", "koi_smet_err1", "koi_smet_err2",
    "koi_srad", "koi_srad_err1", "koi_srad_err2",
    "koi_smass", "koi_smass_err1", "koi_smass_err2",
    "koi_sage", "koi_sage_err1", "koi_sage_err2",
    "koi_srho", "koi_srho_err1", "koi_srho_err2"
]

# 7. Signal & Model Quality
signal_quality = [
    "koi_model_snr", "koi_max_sngle_ev", "koi_max_mult_ev",
    "koi_model_dof", "koi_model_chisq", "koi_trans_mod",
    "koi_fittype", "koi_count", "koi_bin_oedp_sig"
]

# 8. Astrometry & Centroid Data
astrometry_centroid = [
    "ra", "dec", "koi_fwm_stat_sig",
    "koi_fwm_sra", "koi_fwm_sra_err", "koi_fwm_sdec", "koi_fwm_sdec_err",
    "koi_fwm_srao", "koi_fwm_srao_err", "koi_fwm_sdeco", "koi_fwm_sdeco_err",
    "koi_fwm_prao", "koi_fwm_prao_err", "koi_fwm_pdeco", "koi_fwm_pdeco_err",
    "koi_dicco_mra", "koi_dicco_mra_err", "koi_dicco_mdec", "koi_dicco_mdec_err",
    "koi_dicco_msky", "koi_dicco_msky_err",
    "koi_dikco_mra", "koi_dikco_mra_err", "koi_dikco_mdec", "koi_dikco_mdec_err",
    "koi_dikco_msky", "koi_dikco_msky_err"
]

# 9. Photometric Magnitudes
photometric_magnitudes = [
    "koi_kepmag", "koi_gmag", "koi_rmag", "koi_imag", "koi_zmag",
    "koi_jmag", "koi_hmag", "koi_kmag"
]

# 10. Links & Reports
links_reports = [
    "koi_datalink_dvr", "koi_datalink_dvs"
]
dfcolumncategories=[identification_metadata, false_positive_flags,
                     orbital_properties, transit_geometry,planetary_properties
                     ,stellar_properties, signal_quality, astrometry_centroid,
                     photometric_magnitudes, links_reports]
x=0
for cat in dfcolumncategories:

    print( len(cat))
    x=x+len(cat)
print(x)

14
4
22
18
9
21
9
27
8
2
134


In [162]:
len(df.columns)

142

In [163]:
t_cal=[]
for cat in dfcolumncategories:
    t_cal=t_cal+cat
len(t_cal)

134

In [165]:
# Example lists
reference_list = t_cal 
list_to_check = df.columns.tolist()
# Check if all elements are in the reference list
all_present = all(item in reference_list for item in list_to_check)

if all_present:
    print("All elements are present.")
else:
    missing = [item for item in list_to_check if item not in reference_list]
    print("Missing elements:", missing)


Missing elements: ['Unnamed: 0', 'rowid', 'koi_score', 'koi_limbdark_mod', 'koi_ldm_coeff4', 'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1']


In [159]:
# Get all columns from DataFrame
all_df_columns = df.columns.tolist()

# Check each category list against DataFrame columns
for i, category in enumerate(dfcolumncategories):
    missing_cols = [col for col in category if col not in all_df_columns]
    
    if missing_cols:
        print(f"\nCategory {i+1} has missing columns:")
        print(missing_cols)
    else:
        print(f"\nCategory {i+1} - All columns present in DataFrame")


Category 1 - All columns present in DataFrame

Category 2 - All columns present in DataFrame

Category 3 - All columns present in DataFrame

Category 4 - All columns present in DataFrame

Category 5 - All columns present in DataFrame

Category 6 - All columns present in DataFrame

Category 7 - All columns present in DataFrame

Category 8 - All columns present in DataFrame

Category 9 - All columns present in DataFrame

Category 10 - All columns present in DataFrame


In [160]:
all_df_columns = set(df.columns)
all_missing = []

for category in dfcolumncategories:
    all_missing.extend([col for col in category if col not in all_df_columns])

print("Columns absent in DataFrame:", all_missing)


Columns absent in DataFrame: []


In [26]:
column_categories = {
    "Identification & Metadata": identification_metadata,
    "False Positive Flags": false_positive_flags,
    "Orbital Properties": orbital_properties,
    "Transit Geometry and Light Curve Features": transit_geometry,
    "Planetary Physical Properties": planetary_properties,
    "Stellar Properties": stellar_properties,
    "Signal & Model Quality": signal_quality,
    "Astrometry & Centroid Data": astrometry_centroid,
    "Photometric Magnitudes": photometric_magnitudes,
    "Links & Reports": links_reports
}


In [None]:
for i, (category, columns) in enumerate(column_categories.items(), 1):
    print(f"{i}. {category} ({len(columns)} columns)")
    print(columns)
    print()

In [None]:
for cat in column_categories.keys():
    print(cat+'\n')
    print(df[column_categories[cat]].isna().sum().sort_values(ascending=False))
    print("\n")

Identification & Metadata

kepler_name          6817
koi_comment          1210
koi_quarters         1143
koi_sparprov          364
koi_tce_plnt_num      348
koi_tce_delivname     347
koi_pdisposition        1
koi_disp_prov           1
koi_parm_prov           1
kepid                   0
kepoi_name              0
koi_disposition         0
koi_vet_stat            0
koi_vet_date            0
dtype: int64


False Positive Flags

koi_fpflag_nt    1
koi_fpflag_ss    1
koi_fpflag_co    1
koi_fpflag_ec    1
dtype: int64


Orbital Properties

koi_sma_err2        9565
koi_sma_err1        9565
koi_eccen_err2      9564
koi_eccen_err1      9564
koi_incl_err2       9564
koi_incl_err1       9564
koi_longp_err2      9564
koi_longp_err1      9564
koi_longp           9564
koi_num_transits    1144
koi_time0_err2       456
koi_time0_err1       456
koi_time0bk_err2     456
koi_time0bk_err1     456
koi_period_err1      455
koi_period_err2      455
koi_incl             365
koi_eccen            364
koi_sma    

In [32]:
categories=[x for x in  column_categories.keys()]
categories

['Identification & Metadata',
 'False Positive Flags',
 'Orbital Properties',
 'Transit Geometry and Light Curve Features',
 'Planetary Physical Properties',
 'Stellar Properties',
 'Signal & Model Quality',
 'Astrometry & Centroid Data',
 'Photometric Magnitudes',
 'Links & Reports']

In [None]:
#1.'Identification & Metadata',
df_1 =df[column_categories['Identification & Metadata']].copy()

In [35]:
df_1

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_disp_prov,koi_parm_prov,koi_sparprov,koi_vet_stat,koi_vet_date,koi_comment,koi_tce_plnt_num,koi_tce_delivname,koi_quarters
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,NO_COMMENT,1.0,q1_q17_dr25_tce,11111111111111111000000000000000
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,NO_COMMENT,2.0,q1_q17_dr25_tce,11111111111111111000000000000000
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,DEEP_V_SHAPED,1.0,q1_q17_dr25_tce,11111101110111011000000000000000
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,1.0,q1_q17_dr25_tce,11111110111011101000000000000000
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,NO_COMMENT,1.0,q1_q17_dr25_tce,01111111111111111000000000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9560,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,MOD_SEC_DV---PLANET_OCCULT_DV---MOD_SEC_ALT---...,1.0,q1_q17_dr25_tce,11111101110111011000000000000000
9561,10128825,K07986.01,,CANDIDATE,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,NO_COMMENT,1.0,q1_q17_dr25_tce,11111111111111111000000000000000
9562,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,CENT_RESOLVED_OFFSET,1.0,q1_q17_dr25_tce,11111111111111111000000000000000
9563,10155286,K07988.01,,CANDIDATE,CANDIDATE,q1_q17_dr25_sup_koi,q1_q17_dr25_koi,q1_q17_dr25_stellar,Done,2018-08-16,ALL_TRANS_CHASES---CENT_SATURATED,1.0,q1_q17_dr25_tce,11111101110111011000000000000000


In [34]:
df_1.describe()

Unnamed: 0,koi_tce_plnt_num
count,9217.0
mean,1.24368
std,0.664604
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,8.0


In [38]:
df_1.isnull().sum().sort_values(ascending=False)

kepler_name          6817
koi_comment          1210
koi_quarters         1143
koi_sparprov          364
koi_tce_plnt_num      348
koi_tce_delivname     347
koi_pdisposition        1
koi_disp_prov           1
koi_parm_prov           1
kepid                   0
kepoi_name              0
koi_disposition         0
koi_vet_stat            0
koi_vet_date            0
dtype: int64

In [51]:
def decode_quarters(mask):
    """
    Decode a binary mask (string like '1110...' or int) into Kepler quarters 1-17.
    If decoding fails the original input is returned unchanged.
    """
    try:
        # normalize to a binary string
        if isinstance(mask, int):
            bits = bin(mask)[2:]
        elif isinstance(mask, str):
            s = mask.strip()
            if s.lower().startswith('0b'):
                s = s[2:]
            if not s or any(c not in '01' for c in s):
                return mask
            bits = s
        else:
            return mask

        # ensure at least 17 bits (left-pad with zeros if shorter)
        if len(bits) < 17:
            bits = bits.zfill(17)

        # reverse so index 0 -> Q1 and take first 17 bits
        rev = bits[::-1]
        return [i + 1 for i, b in enumerate(rev[:17]) if b == '1']

    except Exception:
        return mask
decoded= [decode_quarters(x) for x in q]

In [57]:
drop_cols_1 = [
    "koi_disp_prov",
    "koi_parm_prov",
    "koi_sparprov",
    "koi_vet_stat",
    "koi_vet_date",
    "koi_comment",
    "koi_tce_delivname",
    "koi_quarters",
    'kepler_name'
]
df_1.drop(columns=drop_cols_1, inplace=True)
df_1.head()

Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_tce_plnt_num
0,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0
1,10797460,K00752.02,CONFIRMED,CANDIDATE,2.0
2,10811496,K00753.01,CANDIDATE,CANDIDATE,1.0
3,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,1.0
4,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0


In [71]:
df_1.isnull().sum().sort_values(ascending=False)

kepid               0
kepoi_name          0
koi_disposition     0
koi_pdisposition    0
koi_tce_plnt_num    0
dtype: int64

In [70]:
df_1['koi_tce_plnt_num'].fillna(1.0, inplace=True)
df_1['koi_pdisposition'].fillna('FALSE POSITIVE', inplace=True)

In [179]:
df_1.head()

Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_tce_plnt_num
0,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0
1,10797460,K00752.02,CONFIRMED,CANDIDATE,2.0
2,10811496,K00753.01,CANDIDATE,CANDIDATE,1.0
3,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,1.0
4,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0


In [72]:
#2. False Positive 
df_2 =df[column_categories['False Positive Flags']].copy()

In [112]:
df_4[df_4.isnull().any(axis=1)]

Unnamed: 0,koi_duration,koi_duration_err1,koi_duration_err2,koi_ingress,koi_ingress_err1,koi_ingress_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_ror,koi_ror_err1,koi_ror_err2
0,2.95750,0.08190,-0.08190,,,,615.8,19.5,-19.5,0.146,0.318,-0.146,24.810,2.600,-2.600,0.022344,0.000832,-0.000528
1,4.50700,0.11600,-0.11600,,,,874.8,35.5,-35.5,0.586,0.059,-0.443,77.900,28.400,-28.400,0.027954,0.009078,-0.001347
2,1.78220,0.03410,-0.03410,,,,10829.0,171.0,-171.0,0.969,5.126,-0.077,53.500,25.700,-25.700,0.154046,5.034292,-0.042179
3,2.40641,0.00537,-0.00537,,,,8079.2,12.8,-12.8,1.276,0.115,-0.092,3.278,0.136,-0.136,0.387394,0.109232,-0.084950
4,1.65450,0.04200,-0.04200,,,,603.3,16.9,-16.9,0.701,0.235,-0.478,8.750,4.000,-4.000,0.024064,0.003751,-0.001522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9560,3.22210,0.01740,-0.01740,,,,1579.2,4.6,-4.6,1.252,0.051,-0.049,1.339,0.139,-0.139,0.297633,0.044899,-0.052116
9561,3.11400,0.22900,-0.22900,,,,48.5,5.4,-5.4,0.043,0.423,-0.043,4.331,0.625,-0.625,0.006379,0.000476,-0.000367
9562,0.86500,0.16200,-0.16200,,,,103.6,14.7,-14.7,0.147,0.309,-0.147,6.040,2.600,-2.600,0.009444,0.001982,-0.000520
9563,3.19900,0.22900,-0.22900,,,,639.1,52.7,-52.7,0.214,0.255,-0.214,796.000,161.000,-161.000,0.022590,0.001411,-0.001220


In [84]:
for col in df_2.columns:
    if df_2[col].dtype != 'object':  # categorical
        mode_value = df_2[col].quantile(0.75)
        df_2[col].fillna(mode_value, inplace=True)

In [85]:
df_2.isnull().sum().sort_values(ascending=False)

koi_fpflag_nt    0
koi_fpflag_ss    0
koi_fpflag_co    0
koi_fpflag_ec    0
dtype: int64

In [None]:
#for col in df.columns:
#   if df[col].dtype == 'object':  # categorical
#        mode_value = df[col].mode().iloc[0]
#        df[col].fillna(mode_value, inplace=True)
#   else:  # numerical
#        median_value = df[col].median()
#        df[col].fillna(median_value, inplace=True)


In [92]:
# 3. Orbital Properties

df_3= df[column_categories['Orbital Properties']].copy()

In [99]:
df_3.isnull().sum().sort_values(ascending=False)

koi_period          0
koi_period_err1     0
koi_period_err2     0
koi_time0bk         0
koi_time0bk_err1    0
koi_time0bk_err2    0
koi_time0           0
koi_time0_err1      0
koi_time0_err2      0
koi_eccen           0
koi_incl            0
koi_sma             0
koi_num_transits    0
dtype: int64

In [90]:
print(orbital_properties)

['koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_time0', 'koi_time0_err1', 'koi_time0_err2', 'koi_eccen', 'koi_eccen_err1', 'koi_eccen_err2', 'koi_longp', 'koi_longp_err1', 'koi_longp_err2', 'koi_incl', 'koi_incl_err1', 'koi_incl_err2', 'koi_sma', 'koi_sma_err1', 'koi_sma_err2', 'koi_num_transits']


In [94]:
drop_cols_3=['koi_longp', 'koi_longp_err1', 'koi_longp_err2','koi_incl_err1', 'koi_incl_err2', 'koi_sma_err1', 'koi_sma_err2', 'koi_eccen_err1', 'koi_eccen_err2']
df_3.drop(columns=drop_cols_3, inplace=True)
df_3.isnull().sum().sort_values(ascending=False)

koi_num_transits    1144
koi_time0bk_err1     456
koi_time0bk_err2     456
koi_time0_err1       456
koi_time0_err2       456
koi_period_err1      455
koi_period_err2      455
koi_incl             365
koi_eccen            364
koi_sma              364
koi_time0              2
koi_period             1
koi_time0bk            1
dtype: int64

In [97]:
df_3.head()

Unnamed: 0,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_time0,koi_time0_err1,koi_time0_err2,koi_eccen,koi_incl,koi_sma,koi_num_transits
0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,2455003.539,0.00216,-0.00216,0.0,89.66,0.0853,142.0
1,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,2454995.514,0.00352,-0.00352,0.0,89.57,0.2734,25.0
2,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,2455008.85,0.000581,-0.000581,0.0,88.96,0.1419,56.0
3,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,2455003.308,0.000115,-0.000115,0.0,67.09,0.0267,621.0
4,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,2455004.596,0.00113,-0.00113,0.0,85.41,0.0374,515.0


In [98]:
df_3['koi_eccen'].fillna(0, inplace=True)

for col in ['koi_time0bk_err1','koi_time0bk_err2','koi_time0_err1',
            'koi_time0_err2','koi_period_err1','koi_period_err2']:
    df_3[col].fillna(df_3[col].mean(), inplace=True)

for col in ['koi_incl','koi_sma','koi_time0','koi_period','koi_time0bk','koi_num_transits']:
    df_3[col].fillna(df_3[col].median(), inplace=True)


In [102]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   koi_period        9565 non-null   float64
 1   koi_period_err1   9565 non-null   float64
 2   koi_period_err2   9565 non-null   float64
 3   koi_time0bk       9565 non-null   float64
 4   koi_time0bk_err1  9565 non-null   float64
 5   koi_time0bk_err2  9565 non-null   float64
 6   koi_time0         9565 non-null   float64
 7   koi_time0_err1    9565 non-null   float64
 8   koi_time0_err2    9565 non-null   float64
 9   koi_eccen         9565 non-null   float64
 10  koi_incl          9565 non-null   float64
 11  koi_sma           9565 non-null   float64
 12  koi_num_transits  9565 non-null   float64
dtypes: float64(13)
memory usage: 971.6 KB


In [103]:
#4. "Transit Geometry and Light Curve Features"
df_4 =df[column_categories['Transit Geometry and Light Curve Features']].copy()
df_4.head()

Unnamed: 0,koi_duration,koi_duration_err1,koi_duration_err2,koi_ingress,koi_ingress_err1,koi_ingress_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_ror,koi_ror_err1,koi_ror_err2
0,2.9575,0.0819,-0.0819,,,,615.8,19.5,-19.5,0.146,0.318,-0.146,24.81,2.6,-2.6,0.022344,0.000832,-0.000528
1,4.507,0.116,-0.116,,,,874.8,35.5,-35.5,0.586,0.059,-0.443,77.9,28.4,-28.4,0.027954,0.009078,-0.001347
2,1.7822,0.0341,-0.0341,,,,10829.0,171.0,-171.0,0.969,5.126,-0.077,53.5,25.7,-25.7,0.154046,5.034292,-0.042179
3,2.40641,0.00537,-0.00537,,,,8079.2,12.8,-12.8,1.276,0.115,-0.092,3.278,0.136,-0.136,0.387394,0.109232,-0.08495
4,1.6545,0.042,-0.042,,,,603.3,16.9,-16.9,0.701,0.235,-0.478,8.75,4.0,-4.0,0.024064,0.003751,-0.001522


In [118]:
df_4.isnull().sum().sort_values(ascending=False)

koi_impact      365
koi_ror         365
koi_depth       364
koi_dor         364
koi_duration      1
dtype: int64

In [111]:
df_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_duration       9564 non-null   float64
 1   koi_duration_err1  9110 non-null   float64
 2   koi_duration_err2  9110 non-null   float64
 3   koi_ingress        1 non-null      float64
 4   koi_ingress_err1   1 non-null      float64
 5   koi_ingress_err2   1 non-null      float64
 6   koi_depth          9201 non-null   float64
 7   koi_depth_err1     9110 non-null   float64
 8   koi_depth_err2     9110 non-null   float64
 9   koi_impact         9200 non-null   float64
 10  koi_impact_err1    9109 non-null   float64
 11  koi_impact_err2    9109 non-null   float64
 12  koi_dor            9201 non-null   float64
 13  koi_dor_err1       9110 non-null   float64
 14  koi_dor_err2       9109 non-null   float64
 15  koi_ror            9200 non-null   float64
 16  koi_ror_err1       9201 

In [110]:
# Convert all object columns to float where possible

df_4['koi_dor_err2']= pd.to_numeric(df_4['koi_dor_err2'], errors='coerce')
df_4['koi_ror']= pd.to_numeric(df_4['koi_ror'], errors='coerce')

In [117]:
drop_cols_4=["koi_ingress", "koi_ingress_err1", "koi_ingress_err2", 'koi_duration_err1', 'koi_duration_err2',
               'koi_depth_err1', 'koi_depth_err2', 'koi_ror_err1',
                'koi_ror_err2', 'koi_impact_err1', 'koi_impact_err2', 'koi_dor_err1','koi_dor_err2']
df_4.drop(columns=drop_cols_4, inplace=True)

In [122]:
impute_cols = ["koi_impact", "koi_ror", "koi_depth", "koi_dor", "koi_duration"]

for col in impute_cols:
    df_4[col].fillna(df_4[col].median(), inplace=True)
df_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   koi_duration  9565 non-null   float64
 1   koi_depth     9565 non-null   float64
 2   koi_impact    9565 non-null   float64
 3   koi_dor       9565 non-null   float64
 4   koi_ror       9565 non-null   float64
dtypes: float64(5)
memory usage: 373.8 KB


In [116]:
# Drop all columns ending with '_err1' or '_err2'
columnserr12= [c for c in df.columns if c.endswith(('_err1', '_err2'))]
print(columnserr12)
len(columnserr12)

['koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_time0_err1', 'koi_time0_err2', 'koi_eccen_err1', 'koi_eccen_err2', 'koi_longp_err1', 'koi_longp_err2', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration_err1', 'koi_duration_err2', 'koi_ingress_err1', 'koi_ingress_err2', 'koi_depth_err1', 'koi_depth_err2', 'koi_ror_err1', 'koi_ror_err2', 'koi_srho_err1', 'koi_srho_err2', 'koi_prad_err1', 'koi_prad_err2', 'koi_sma_err1', 'koi_sma_err2', 'koi_incl_err1', 'koi_incl_err2', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol_err1', 'koi_insol_err2', 'koi_dor_err1', 'koi_dor_err2', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_smet_err1', 'koi_smet_err2', 'koi_srad_err1', 'koi_srad_err2', 'koi_smass_err1', 'koi_smass_err2', 'koi_sage_err1', 'koi_sage_err2']


46

In [123]:
#5. 'Planetary Physical Properties'
df_5 =df[column_categories['Planetary Physical Properties']].copy()
df_5.head()

Unnamed: 0,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2
0,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65
1,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62
2,14.6,3.92,-1.31,638.0,,,39.3,31.04,-10.49
3,33.46,8.5,-2.83,1395.0,,,891.96,668.95,-230.35
4,2.75,0.88,-0.35,1406.0,,,926.16,874.33,-314.24


In [135]:
df_5.isnull().sum().sort_values(ascending=False)

koi_prad     0
koi_teq      0
koi_insol    0
dtype: int64

In [125]:
drop_cols_5=['koi_prad_err1', 'koi_prad_err2', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol_err1', 'koi_insol_err2']
df_5.drop(columns=drop_cols_5, inplace=True)
df_5.describe()

Unnamed: 0,koi_prad,koi_teq
count,9201.0,9201.0
mean,102.899535,1085.241213
std,3077.63899,856.420677
min,0.08,8.3977
25%,1.4,539.0
50%,2.39,878.0
75%,14.94,1379.0
max,200346.0,14667.0


In [128]:
df_5['koi_insol']= pd.to_numeric(df_5['koi_insol'], errors='coerce')

In [130]:
df_5.describe()

Unnamed: 0,koi_prad,koi_teq,koi_insol
count,9201.0,9201.0,9242.0
mean,102.899535,1085.241213,7746.493
std,3077.63899,856.420677,159213.3
min,0.08,8.3977,0.0
25%,1.4,539.0,20.15
50%,2.39,878.0,141.45
75%,14.94,1379.0,870.35
max,200346.0,14667.0,10947550.0


In [134]:
for col in [ "koi_teq", "koi_insol"]:
    df_5[col].fillna(df_5[col].median(), inplace=True)
df_5["koi_prad"].fillna(df_5["koi_prad"].quantile(0.75), inplace=True)

In [136]:
# 6. 'Stellar Properties'
df_6= df[column_categories['Stellar Properties']].copy()
df_6.head()

Unnamed: 0,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,...,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sage,koi_sage_err1,koi_sage_err2,koi_srho,koi_srho_err1,koi_srho_err2
0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,...,-0.061,0.919,0.052,-0.046,,,,3.20796,0.33173,-1.09986
1,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,...,-0.061,0.919,0.052,-0.046,,,,3.02368,2.20489,-2.49638
2,5853.0,158.0,-176.0,4.544,0.044,-0.176,-0.18,0.3,-0.3,0.868,...,-0.078,0.961,0.11,-0.121,,,,7.29555,35.03293,-2.75453
3,5805.0,157.0,-174.0,4.564,0.053,-0.168,-0.52,0.3,-0.3,0.791,...,-0.067,0.836,0.093,-0.077,,,,0.2208,0.00917,-0.01837
4,6031.0,169.0,-211.0,4.438,0.07,-0.21,0.07,0.25,-0.3,1.046,...,-0.133,1.095,0.151,-0.136,,,,1.98635,2.71141,-1.74541


In [140]:
df_6.describe()

Unnamed: 0,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,koi_srad_err1,koi_srad_err2,koi_smass_err1,koi_smass_err2,koi_sage,koi_sage_err1,koi_sage_err2,koi_srho,koi_srho_err1,koi_srho_err2
count,9201.0,9096.0,9081.0,9201.0,9096.0,9096.0,9178.0,9177.0,9177.0,9200.0,9095.0,9095.0,9096.0,9096.0,1.0,1.0,1.0,9243.0,9243.0,9242.0
mean,5706.190734,144.626995,-162.256486,4.309777,0.120739,-0.143162,-0.124319,0.228706,-0.252022,1.728795,0.362328,-0.394845,0.154325,-0.133396,11.338,11.736,11.275,9.16416,18.064684,-5.489125
std,799.074998,47.071503,72.760865,0.434008,0.132836,0.085475,0.282342,0.077156,0.085471,6.127513,0.930915,2.168329,3.007171,0.548565,,,,53.808007,76.80077,32.3384
min,-0.06,0.0,-1762.0,0.047,0.0,-1.207,-2.5,0.0,-0.75,0.109,0.0,-116.137,0.0,-2.432,11.338,11.736,11.275,-0.51,0.0,-696.08923
25%,5309.0,106.0,-198.0,4.218,0.042,-0.196,-0.26,0.15,-0.3,0.829,0.129,-0.25,0.072,-0.141,11.338,11.736,11.275,0.22915,0.05423,-1.130418
50%,5767.0,157.0,-160.0,4.438,0.07,-0.128,-0.1,0.25,-0.3,1.0,0.251,-0.111,0.106,-0.0975,11.338,11.736,11.275,0.95531,0.43654,-0.22376
75%,6112.0,174.0,-114.0,4.543,0.149,-0.088,0.07,0.3,-0.15,1.345,0.364,-0.069,0.151,-0.061,11.338,11.736,11.275,2.897175,2.48341,-0.025682
max,15896.0,676.0,0.0,5.364,1.472,0.0,0.971,0.5,0.0,229.908,33.091,0.0,286.80847,49.316399,11.338,11.736,11.275,980.85419,835.24225,0.0


In [142]:
df_6.isnull().sum().sort_values(ascending=False)

koi_steff         0
koi_steff_err1    0
koi_srho_err1     0
koi_srho          0
koi_smass_err2    0
koi_smass_err1    0
koi_smass         0
koi_srad_err2     0
koi_srad_err1     0
koi_srad          0
koi_smet_err2     0
koi_smet_err1     0
koi_smet          0
koi_slogg_err2    0
koi_slogg_err1    0
koi_slogg         0
koi_steff_err2    0
koi_srho_err2     0
dtype: int64

In [141]:
drop_cols_6 =[ 'koi_sage_err1', 'koi_sage_err2', 'koi_sage']
df_6.drop(columns=drop_cols_6, inplace=True)
df_6['koi_smass']= pd.to_numeric(df_6['koi_smass'], errors='coerce')
for col in df_6.columns:
    df_6[col].fillna(df_6[col].median(), inplace=True)
df_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   koi_steff       9565 non-null   float64
 1   koi_steff_err1  9565 non-null   float64
 2   koi_steff_err2  9565 non-null   float64
 3   koi_slogg       9565 non-null   float64
 4   koi_slogg_err1  9565 non-null   float64
 5   koi_slogg_err2  9565 non-null   float64
 6   koi_smet        9565 non-null   float64
 7   koi_smet_err1   9565 non-null   float64
 8   koi_smet_err2   9565 non-null   float64
 9   koi_srad        9565 non-null   float64
 10  koi_srad_err1   9565 non-null   float64
 11  koi_srad_err2   9565 non-null   float64
 12  koi_smass       9565 non-null   float64
 13  koi_smass_err1  9565 non-null   float64
 14  koi_smass_err2  9565 non-null   float64
 15  koi_srho        9565 non-null   float64
 16  koi_srho_err1   9565 non-null   float64
 17  koi_srho_err2   9565 non-null   f

In [143]:
# 7. "Signal & Model Quality"
df_7 =df[column_categories['Signal & Model Quality']].copy()
df_7.head()

Unnamed: 0,koi_model_snr,koi_max_sngle_ev,koi_max_mult_ev,koi_model_dof,koi_model_chisq,koi_trans_mod,koi_fittype,koi_count,koi_bin_oedp_sig
0,35.8,5.135849,28.47082,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,2,0.6864
1,25.8,7.027669,20.109507,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,2,0.0023
2,76.3,37.159767,187.4491,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,1,0.6624
3,505.6,39.06655,541.8951,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,1,0.0
4,40.9,4.749945,33.1919,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,1,0.309


In [150]:
df_7.describe(include='all')

Unnamed: 0,koi_model_snr,koi_max_sngle_ev,koi_max_mult_ev,koi_model_dof,koi_model_chisq,koi_trans_mod,koi_fittype,koi_count,koi_bin_oedp_sig
count,9201.0,8422.0,8422.0,1.0,1.0,9201,9563,9564.0,8054.0
unique,,8421.0,,,,2,4,15.0,
top,,4.23504,,,,Mandel and Agol (2002 ApJ 580 171),LS+MCMC,1.0,
freq,,2.0,,,,9200,7896,4780.0,
mean,259.427209,,1.319296e+27,-78.0,4.457,,,,1.132103
std,794.693227,,1.210737e+29,,,,,,64.848444
min,0.0,,7.105086,-78.0,4.457,,,,-1.0
25%,12.0,,10.73303,-78.0,4.457,,,,0.13465
50%,23.0,,19.25441,-78.0,4.457,,,,0.48695
75%,78.0,,71.998,-78.0,4.457,,,,0.81055


In [152]:
df_7.isnull().sum().sort_values(ascending=False)

koi_model_snr       0
koi_max_sngle_ev    0
koi_max_mult_ev     0
koi_trans_mod       0
koi_fittype         0
koi_count           0
koi_bin_oedp_sig    0
dtype: int64

In [153]:
df_7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   koi_model_snr     9565 non-null   float64
 1   koi_max_sngle_ev  9565 non-null   float64
 2   koi_max_mult_ev   9565 non-null   float64
 3   koi_trans_mod     9565 non-null   object 
 4   koi_fittype       9565 non-null   object 
 5   koi_count         9565 non-null   float64
 6   koi_bin_oedp_sig  9565 non-null   float64
dtypes: float64(5), object(2)
memory usage: 523.2+ KB


In [151]:
drop_cols_7=['koi_model_dof','koi_model_chisq']
df_7['koi_max_sngle_ev'] = pd.to_numeric(df_7['koi_max_sngle_ev'], errors='coerce')
df_7['koi_count'] = pd.to_numeric(df_7['koi_count'], errors='coerce')
df_7.drop(columns=drop_cols_7, inplace=True)
for col in df_7.columns:
    if df_7[col].dtype != 'object':  # numerical
        df_7[col].fillna(df_7[col].median(), inplace=True)
    else:  # categorical
        df_7[col].fillna(df_7[col].mode().iloc[0], inplace=True)

In [154]:
# 8.  'Astrometry & Centroid Data'
df_8 =df[column_categories['Astrometry & Centroid Data']].copy()
df_8.head()

Unnamed: 0,ra,dec,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sra_err,koi_fwm_sdec,koi_fwm_sdec_err,koi_fwm_srao,koi_fwm_srao_err,koi_fwm_sdeco,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,291.93423,48.141651,0.002,19.462294,1.4e-05,48.14191,0.00013,0.43,0.51,0.94,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,291.93423,48.141651,0.003,19.462265,2e-05,48.14199,0.00019,-0.63,0.72,1.23,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,297.00482,48.134129,0.278,19.800321,1.9e-06,48.13412,2e-05,-0.021,0.069,-0.038,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,285.53461,48.28521,0.0,19.035638,8.6e-07,48.28521,7e-06,-0.111,0.031,0.002,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,288.75488,48.2262,0.733,19.250326,9.7e-06,48.22626,0.0001,-0.01,0.35,0.23,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [157]:
df_8.isnull().sum().sort_values(ascending=False)

koi_fwm_stat_sig      1077
koi_fwm_prao           831
koi_fwm_prao_err       831
koi_fwm_pdeco          818
koi_fwm_pdeco_err      818
koi_dicco_mra_err      600
koi_dicco_msky_err     600
koi_dicco_msky         600
koi_dicco_mdec_err     600
koi_dicco_mdec         600
koi_dicco_mra          600
koi_dikco_msky         572
koi_dikco_mdec_err     572
koi_dikco_mdec         572
koi_dikco_mra_err      572
koi_dikco_mra          572
koi_dikco_msky_err     572
koi_fwm_sdec_err       507
koi_fwm_sdec           507
koi_fwm_sra_err        507
koi_fwm_sra            507
koi_fwm_sdeco_err      456
koi_fwm_sdeco          456
koi_fwm_srao_err       456
koi_fwm_srao           456
dec                      1
ra                       1
dtype: int64

In [158]:
df_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ra                  9564 non-null   float64
 1   dec                 9564 non-null   float64
 2   koi_fwm_stat_sig    8488 non-null   float64
 3   koi_fwm_sra         9058 non-null   float64
 4   koi_fwm_sra_err     9058 non-null   float64
 5   koi_fwm_sdec        9058 non-null   float64
 6   koi_fwm_sdec_err    9058 non-null   float64
 7   koi_fwm_srao        9109 non-null   float64
 8   koi_fwm_srao_err    9109 non-null   float64
 9   koi_fwm_sdeco       9109 non-null   float64
 10  koi_fwm_sdeco_err   9109 non-null   float64
 11  koi_fwm_prao        8734 non-null   float64
 12  koi_fwm_prao_err    8734 non-null   float64
 13  koi_fwm_pdeco       8747 non-null   float64
 14  koi_fwm_pdeco_err   8747 non-null   float64
 15  koi_dicco_mra       8965 non-null   float64
 16  koi_di

In [166]:
drop_cols_8=[]
for col in df_8.columns:
    df_8[col].fillna(df_8[col].median(), inplace=True)

df_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ra                  9565 non-null   float64
 1   dec                 9565 non-null   float64
 2   koi_fwm_stat_sig    9565 non-null   float64
 3   koi_fwm_sra         9565 non-null   float64
 4   koi_fwm_sra_err     9565 non-null   float64
 5   koi_fwm_sdec        9565 non-null   float64
 6   koi_fwm_sdec_err    9565 non-null   float64
 7   koi_fwm_srao        9565 non-null   float64
 8   koi_fwm_srao_err    9565 non-null   float64
 9   koi_fwm_sdeco       9565 non-null   float64
 10  koi_fwm_sdeco_err   9565 non-null   float64
 11  koi_fwm_prao        9565 non-null   float64
 12  koi_fwm_prao_err    9565 non-null   float64
 13  koi_fwm_pdeco       9565 non-null   float64
 14  koi_fwm_pdeco_err   9565 non-null   float64
 15  koi_dicco_mra       9565 non-null   float64
 16  koi_di

In [167]:
# 9. 'Photometric Magnitudes'
df_9 =df[column_categories['Photometric Magnitudes']].copy()
df_9.head()

Unnamed: 0,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag
0,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648
1,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648
2,15.436,15.943,15.39,15.22,15.166,14.254,13.9,13.826
3,15.597,16.1,15.554,15.382,15.266,14.326,13.911,13.809
4,15.509,16.015,15.468,15.292,15.241,14.366,14.064,13.952


In [168]:
df_9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   koi_kepmag  9563 non-null   float64
 1   koi_gmag    9523 non-null   float64
 2   koi_rmag    9555 non-null   float64
 3   koi_imag    9410 non-null   float64
 4   koi_zmag    8951 non-null   float64
 5   koi_jmag    9539 non-null   float64
 6   koi_hmag    9539 non-null   float64
 7   koi_kmag    9539 non-null   float64
dtypes: float64(8)
memory usage: 597.9 KB


In [169]:
df_9.isnull().sum().sort_values(ascending=False)

koi_zmag      614
koi_imag      155
koi_gmag       42
koi_jmag       26
koi_hmag       26
koi_kmag       26
koi_rmag       10
koi_kepmag      2
dtype: int64

In [171]:
for col in df_9.columns:
    df_9[col].fillna(df_9[col].median(), inplace=True)

In [172]:
df_9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   koi_kepmag  9565 non-null   float64
 1   koi_gmag    9565 non-null   float64
 2   koi_rmag    9565 non-null   float64
 3   koi_imag    9565 non-null   float64
 4   koi_zmag    9565 non-null   float64
 5   koi_jmag    9565 non-null   float64
 6   koi_hmag    9565 non-null   float64
 7   koi_kmag    9565 non-null   float64
dtypes: float64(8)
memory usage: 597.9 KB


In [173]:
# 10 .'Links & Reports'
df_10 =df[column_categories['Links & Reports']].copy()
df_10.head()

Unnamed: 0,koi_datalink_dvr,koi_datalink_dvs
0,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...
1,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...
2,010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...
3,010/010848/010848459/dv/kplr010848459-20160209...,010/010848/010848459/dv/kplr010848459-001-2016...
4,010/010854/010854555/dv/kplr010854555-20160209...,010/010854/010854555/dv/kplr010854555-001-2016...


In [174]:
df_clean = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10], axis=1)
df_clean

Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_tce_plnt_num,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,...,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_datalink_dvr,koi_datalink_dvs
0,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0,0.0,0.0,0.0,0.0,9.488036,...,15.347,15.890,15.270,15.114,15.006,14.082,13.751,13.648,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...
1,10797460,K00752.02,CONFIRMED,CANDIDATE,2.0,0.0,0.0,0.0,0.0,54.418383,...,15.347,15.890,15.270,15.114,15.006,14.082,13.751,13.648,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...
2,10811496,K00753.01,CANDIDATE,CANDIDATE,1.0,0.0,0.0,0.0,0.0,19.899140,...,15.436,15.943,15.390,15.220,15.166,14.254,13.900,13.826,010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...
3,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,1.0,0.0,1.0,0.0,0.0,1.736952,...,15.597,16.100,15.554,15.382,15.266,14.326,13.911,13.809,010/010848/010848459/dv/kplr010848459-20160209...,010/010848/010848459/dv/kplr010848459-001-2016...
4,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0,0.0,0.0,0.0,0.0,2.525592,...,15.509,16.015,15.468,15.292,15.241,14.366,14.064,13.952,010/010854/010854555/dv/kplr010854555-20160209...,010/010854/010854555/dv/kplr010854555-001-2016...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9560,10090151,K07985.01,FALSE POSITIVE,FALSE POSITIVE,1.0,0.0,1.0,1.0,0.0,0.527699,...,14.082,14.082,13.501,13.299,13.177,12.200,11.814,11.756,010/010090/010090151/dv/kplr010090151-20160209...,010/010090/010090151/dv/kplr010090151-001-2016...
9561,10128825,K07986.01,CANDIDATE,CANDIDATE,1.0,0.0,0.0,0.0,0.0,1.739849,...,14.757,15.189,14.687,14.571,14.484,13.641,13.339,13.261,010/010128/010128825/dv/kplr010128825-20160209...,010/010128/010128825/dv/kplr010128825-001-2016...
9562,10147276,K07987.01,FALSE POSITIVE,FALSE POSITIVE,1.0,0.0,0.0,1.0,0.0,0.681402,...,15.385,15.853,15.347,15.185,15.158,14.220,13.913,13.844,010/010147/010147276/dv/kplr010147276-20160209...,010/010147/010147276/dv/kplr010147276-001-2016...
9563,10155286,K07988.01,CANDIDATE,CANDIDATE,1.0,0.0,0.0,0.0,0.0,333.486169,...,10.998,11.733,10.880,10.682,10.578,9.501,9.027,8.921,010/010155/010155286/dv/kplr010155286-20160209...,010/010155/010155286/dv/kplr010155286-001-2016...


In [175]:
df_clean.drop(column_categories['Links & Reports'], axis=1, inplace=True)

In [176]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 90 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   kepid               9565 non-null   object 
 1   kepoi_name          9565 non-null   object 
 2   koi_disposition     9565 non-null   object 
 3   koi_pdisposition    9565 non-null   object 
 4   koi_tce_plnt_num    9565 non-null   float64
 5   koi_fpflag_nt       9565 non-null   float64
 6   koi_fpflag_ss       9565 non-null   float64
 7   koi_fpflag_co       9565 non-null   float64
 8   koi_fpflag_ec       9565 non-null   float64
 9   koi_period          9565 non-null   float64
 10  koi_period_err1     9565 non-null   float64
 11  koi_period_err2     9565 non-null   float64
 12  koi_time0bk         9565 non-null   float64
 13  koi_time0bk_err1    9565 non-null   float64
 14  koi_time0bk_err2    9565 non-null   float64
 15  koi_time0           9565 non-null   float64
 16  koi_ti

In [206]:
categorical_cols = df_clean.select_dtypes(include=['object', 'category']).columns.tolist()

In [211]:
categorical_cols

['koi_disposition', 'koi_pdisposition', 'koi_fittype']

In [188]:
df_clean['koi_disposition'].unique()

array(['CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE'], dtype=object)

In [187]:
df_clean.loc[df_clean['koi_disposition'] == '0', 'koi_disposition'] = 'CANDIDATE'

In [None]:
df_clean.drop(['kepid', 'kepoi_name',], inplace=True, axis=1)

In [207]:
for col in categorical_cols:

    print("\n")
    print(df_clean[col].nunique())
    print(df_clean[col].value_counts())




3
koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1980
Name: count, dtype: int64


2
koi_pdisposition
FALSE POSITIVE    4848
CANDIDATE         4717
Name: count, dtype: int64


4
koi_fittype
LS+MCMC    7898
MCMC       1206
none        369
LS           92
Name: count, dtype: int64


In [200]:
df_clean.loc[df_clean['koi_pdisposition'] == 'q1_q17_dr25_sup_koi', 'koi_pdisposition'] = 'CANDIDATE'

In [203]:
df['koi_trans_mod'].value_counts()    

koi_trans_mod
Mandel and Agol (2002 ApJ 580 171)    9200
78.00                                    1
Name: count, dtype: int64

In [205]:
df_clean.drop("koi_trans_mod",axis=1, inplace=True)

In [213]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df_clean[['koi_pdisposition', 'koi_fittype']])
df_encode = pd.get_dummies(df_clean, columns=['koi_pdisposition', 'koi_fittype'], drop_first=True)
df_final=df_clean.drop(['koi_pdisposition','koi_fittype'], axis=1)
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(['koi_pdisposition', 'koi_fittype']))
df_final=pd.concat([df_final,one_hot_df ], axis=1)
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9565 entries, 0 to 9564
Data columns (total 91 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   koi_disposition                  9565 non-null   object 
 1   koi_tce_plnt_num                 9565 non-null   float64
 2   koi_fpflag_nt                    9565 non-null   float64
 3   koi_fpflag_ss                    9565 non-null   float64
 4   koi_fpflag_co                    9565 non-null   float64
 5   koi_fpflag_ec                    9565 non-null   float64
 6   koi_period                       9565 non-null   float64
 7   koi_period_err1                  9565 non-null   float64
 8   koi_period_err2                  9565 non-null   float64
 9   koi_time0bk                      9565 non-null   float64
 10  koi_time0bk_err1                 9565 non-null   float64
 11  koi_time0bk_err2                 9565 non-null   float64
 12  koi_time0           

In [214]:
x=df_final.drop('koi_disposition', axis=1)
y=df_final['koi_disposition']

In [215]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [216]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [217]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)


In [218]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.9492943021432305
                precision    recall  f1-score   support

     CANDIDATE       0.92      0.84      0.88       405
     CONFIRMED       0.88      0.94      0.91       536
FALSE POSITIVE       1.00      1.00      1.00       972

      accuracy                           0.95      1913
     macro avg       0.93      0.93      0.93      1913
  weighted avg       0.95      0.95      0.95      1913



In [None]:
dt= pd.read_csv('src\data\nasa_Kepler Objects of Interest (KOI).csv')

FileNotFoundError: [Errno 2] No such file or directory: 'nasa_Kepler Objects of Interest (KOI).csv'

In [225]:
import pandas as pd

# Read the NASA KOI dataset
koi_df = pd.read_csv('src/data/nasa_Kepler Objects of Interest (KOI).csv')


# Display basic information about the datasets
print("NASA KOI Dataset Shape:", koi_df.shape)
# Display the first few rows of each dataset
print("\nNASA KOI Dataset Preview:")
print(koi_df.head())



NASA KOI Dataset Shape: (9565, 142)

NASA KOI Dataset Preview:
   Unnamed: 0 rowid     kepid kepoi_name   kepler_name koi_disposition  \
0           0     1  10797460  K00752.01  Kepler-227 b       CONFIRMED   
1           1     2  10797460  K00752.02  Kepler-227 c       CONFIRMED   
2           2     3  10811496  K00753.01           NaN       CANDIDATE   
3           3     4  10848459  K00754.01           NaN  FALSE POSITIVE   
4           4     5  10854555  K00755.01  Kepler-664 b       CONFIRMED   

  koi_vet_stat koi_vet_date koi_pdisposition koi_score  ...  koi_dicco_mdec  \
0         Done   2018-08-16        CANDIDATE    1.0000  ...           0.200   
1         Done   2018-08-16        CANDIDATE    0.9690  ...           0.000   
2         Done   2018-08-16        CANDIDATE    0.0000  ...          -0.034   
3         Done   2018-08-16   FALSE POSITIVE    0.0000  ...           0.147   
4         Done   2018-08-16        CANDIDATE    1.0000  ...          -0.090   

   koi_dicco_mdec

In [231]:
dfa= df[t_cal].copy()

In [232]:
# Calculate the number of rows
n_rows = len(dfa)

# Calculate the threshold (50% of rows)
threshold = n_rows * 0.5

# Get columns with missing values more than 50%
cols_to_drop = [col for col in df.columns if df[col].isnull().sum() > threshold]

# Drop these columns
dfa.drop(columns=cols_to_drop, inplace=True)

# Print the dropped columns and remaining shape
print("Dropped columns:", cols_to_drop)
print("New shape:", dfa.shape)

Dropped columns: ['kepler_name', 'koi_eccen_err1', 'koi_eccen_err2', 'koi_longp', 'koi_longp_err1', 'koi_longp_err2', 'koi_ingress', 'koi_ingress_err1', 'koi_ingress_err2', 'koi_sma_err1', 'koi_sma_err2', 'koi_incl_err1', 'koi_incl_err2', 'koi_teq_err1', 'koi_teq_err2', 'koi_model_dof', 'koi_model_chisq', 'koi_sage', 'koi_sage_err1', 'koi_sage_err2']
New shape: (9565, 114)


In [233]:
dfa.isnull().sum().sort_values(ascending=False)

koi_bin_oedp_sig    1511
koi_comment         1210
koi_num_transits    1144
koi_max_sngle_ev    1143
koi_quarters        1143
                    ... 
koi_vet_stat           0
koi_impact             0
kepoi_name             0
koi_disposition        0
kepid                  0
Length: 114, dtype: int64

In [234]:
good_features = df_clean.columns.tolist()

In [235]:
len(good_features)

87

In [236]:
dfa= df[good_features].copy()
dfa.isnull().sum().sort_values(ascending=False)

koi_bin_oedp_sig    1511
koi_num_transits    1144
koi_max_mult_ev     1143
koi_max_sngle_ev    1143
koi_fwm_stat_sig    1077
                    ... 
koi_duration           1
koi_pdisposition       1
koi_count              1
koi_impact             0
koi_disposition        0
Length: 87, dtype: int64

In [None]:
# Find columns with dtype 'object' but containing only numeric values
miss_encoded_cols = []
for col in dfa.select_dtypes(include=['object']).columns:
    # Try converting to numeric, count non-convertible values
    non_numeric = pd.to_numeric(dfa[col], errors='coerce').isna() & dfa[col].notna()
    # If all non-null values are convertible, it's likely mis-encoded
    if dfa[col].notna().sum() > 0 and non_numeric.sum() == 0:
        miss_encoded_cols.append(col)

print("Potential mis-encoded numeric columns:", miss_encoded_cols)

Potential mis-encoded numeric columns: []


In [240]:
dfa[dfa.select_dtypes(include=['object']).columns].describe(include='all')

Unnamed: 0,koi_disposition,koi_pdisposition,koi_ror,koi_insol,koi_smass,koi_max_sngle_ev,koi_fittype,koi_count
count,9565,9564,9201.0,9243.0,9201.0,8422.0,9563,9564
unique,4,3,8828.0,8213.0,2264.0,8421.0,4,15
top,FALSE POSITIVE,FALSE POSITIVE,0.009013,0.45,1.0,4.23504,LS+MCMC,1
freq,4839,4847,4.0,9.0,95.0,2.0,7896,4780


In [241]:
numweical_mis=['koi_max_sngle_ev','koi_ror','koi_insol','koi_smass']
for col in numweical_mis:
    dfa[col]= pd.to_numeric(dfa[col], errors='coerce')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler


numerical_cols = dfa.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = dfa.drop('koi_disposition', axis=1).select_dtypes(include=['object', 'category']).columns.tolist()
num_pipeline= Pipeline(
                steps=[
                ("imputer",SimpleImputer(strategy="median")),
                ("scaler",StandardScaler())

                ]
            )
cat_pipeline= Pipeline(
                steps=[
                ("imputer",SimpleImputer(strategy="most_frequent")),
                ("onehot",OneHotEncoder(handle_unknown="ignore"))
                ]
            )


In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Custom transformer to replace rare categories
class RareCategoryImputer(BaseEstimator, TransformerMixin):
    def __init__(self, min_count=5):
        self.min_count = min_count
        self.most_frequent_ = {}
    
    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            counts = X[col].value_counts()
            # Replace values appearing less than min_count
            rare_mask = counts < self.min_count
            rare_values = counts[rare_mask].index
            # Determine most frequent value
            self.most_frequent_[col] = X[col].mode()[0]
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            counts = X[col].value_counts()
            rare_values = counts[counts < self.min_count].index
            X[col] = X[col].replace(rare_values, self.most_frequent_[col])
        return X

# Define numerical and categorical columns
numerical_cols = dfa.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = dfa.drop('koi_disposition', axis=1).select_dtypes(include=['object', 'category']).columns.tolist()

# Numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Categorical pipeline with rare category handling
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("rare", RareCategoryImputer(min_count=5)),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Combine into ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_cols),
        ("cat", cat_pipeline, categorical_cols)
    ]
)


train_df,test_df=train_test_split(dfa,test_size=0.2,random_state=42)
target_column_name = 'koi_disposition'
input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df=train_df[target_column_name]

input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df=test_df[target_column_name]

input_feature_train_arr=preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessor.transform(input_feature_test_df)

train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']

In [244]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

class RareCategoryImputer(BaseEstimator, TransformerMixin):
    def __init__(self, min_count=5):
        self.min_count = min_count
        self.most_frequent_ = {}
        self.rare_values_ = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            counts = X[col].value_counts()
            self.rare_values_[col] = counts[counts < self.min_count].index.tolist()
            self.most_frequent_[col] = X[col].mode()[0]
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            X[col] = X[col].replace(self.rare_values_[col], self.most_frequent_[col])
            X[col] = X[col].astype(str)  # <-- ensure all values are strings
        return X


numerical_cols = dfa.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = dfa.drop('koi_disposition', axis=1).select_dtypes(include=['object', 'category']).columns.tolist()

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("rare", RareCategoryImputer(min_count=5)),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, numerical_cols),
    ("cat", cat_pipeline, categorical_cols)
])

train_df, test_df = train_test_split(dfa, test_size=0.2, random_state=42)

X_train = train_df.drop(columns=['koi_disposition'])
y_train = train_df['koi_disposition']
X_test = test_df.drop(columns=['koi_disposition'])
y_test = test_df['koi_disposition']

X_train_arr = preprocessor.fit_transform(X_train)
X_test_arr = preprocessor.transform(X_test)

train_arr = np.c_[X_train_arr, np.array(y_train)]
test_arr = np.c_[X_test_arr, np.array(y_test)]


In [245]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming your preprocessor and train/test arrays are ready:
# train_arr and test_arr
# Last column is the target

# Split features and target
X_train = train_arr[:, :-1]
y_train = train_arr[:, -1]

X_test = test_arr[:, :-1]
y_test = test_arr[:, -1]

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,       # Number of trees
    max_depth=None,         # Let trees expand until all leaves are pure
    random_state=42,
    n_jobs=-1               # Use all CPU cores
)

# Fit the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9472

Classification Report:
                precision    recall  f1-score   support

     CANDIDATE       0.92      0.83      0.87       405
     CONFIRMED       0.88      0.94      0.91       536
FALSE POSITIVE       1.00      1.00      1.00       972

      accuracy                           0.95      1913
     macro avg       0.93      0.92      0.93      1913
  weighted avg       0.95      0.95      0.95      1913

Confusion Matrix:
[[337  68   0]
 [ 31 504   1]
 [  0   1 971]]


In [246]:
print(numerical_cols)

['koi_tce_plnt_num', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_time0', 'koi_time0_err1', 'koi_time0_err2', 'koi_eccen', 'koi_incl', 'koi_sma', 'koi_num_transits', 'koi_duration', 'koi_depth', 'koi_impact', 'koi_dor', 'koi_ror', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_smet', 'koi_smet_err1', 'koi_smet_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'koi_smass', 'koi_smass_err1', 'koi_smass_err2', 'koi_srho', 'koi_srho_err1', 'koi_srho_err2', 'koi_model_snr', 'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_bin_oedp_sig', 'ra', 'dec', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sra_err', 'koi_fwm_sdec', 'koi_fwm_sdec_err', 'koi_fwm_srao', 'koi_fwm_srao_err', 'koi_fwm_sdeco', 'koi_fwm_sdeco_err', 'koi_fwm_prao', 'koi_fwm_prao_err', 'koi_fwm_pdeco', 'koi_fw

In [247]:
print(categorical_cols)

['koi_pdisposition', 'koi_fittype', 'koi_count']
