In [48]:
##import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import os

In [49]:
## Load the Kepler cumulative KOI table into a pandas DataFrame.
# Assumes the notebook runs from <repo>/src/kepler, so the data folder sits two
# levels up at <repo>/data. The path is built with os.path instead of a string
# replace on 'src\kepler': that literal contains the invalid escape '\k', only
# matches Windows separators, and required shadowing the built-in `dir`.
dir_path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'data')),
    '',  # trailing separator keeps `dir_path + filename` style usage working
)
cumulative_csv = os.path.join(dir_path, 'cumulative_2025.10.03_13.30.13.csv')

# read_csv opens and closes the file itself when given a path.
cumulative_df = pd.read_csv(cumulative_csv)
print(cumulative_df.head(1))
    


      kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  ...  \
0        1.0              0              0              0              0  ...   

   koi_steff_err2  koi_slogg  koi_slogg_err1  koi_slogg_err2  koi_srad  \
0           -81.0      4.467           0.064          -0.096     0.927   

   koi_srad_err1  koi_srad_err2         ra        dec  koi_kepmag  
0          0.105         -0.061  291.93423  48.141651      15.347  

[1 rows x 49 columns]


In [50]:
## Drop the columns that should not be used as features.
columns_to_drop = [
    'kepid',             # ID column
    'kepoi_name',        # KOI name column
    'kepler_name',       # official Kepler name column
    'koi_pdisposition',  # predicted disposition column
    'koi_tce_delivname', # delivery file name column
    'ra',                # right ascension (sky coordinate)
    'dec',               # declination (sky coordinate)
    'koi_tce_plnt_num',  # number of planets per TCE
]

# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
# NOTE: a misspelled name in the list above is silently skipped as well.
cumulative_df = cumulative_df.drop(columns=columns_to_drop, errors='ignore')



In [51]:
## Keep only columns in which at least 75% of the rows are non-null
## (equivalently: drop every column with more than 25% missing values).
MIN_NON_NULL_FRACTION = 0.75
threshold = int(np.ceil(MIN_NON_NULL_FRACTION * len(cumulative_df)))  # minimum non-null count per column
cumulative_df = cumulative_df.dropna(axis='columns', thresh=threshold)

In [52]:
## Report, for every column, the number of nulls and their share of all rows.
total_rows = len(cumulative_df)
null_counts = cumulative_df.isnull().sum()  # one count per column, in column order
for col, null_count in null_counts.items():
    null_percentage = (null_count / total_rows) * 100
    print(f"Column '{col}' has {null_count} null values ({null_percentage:.2f}%).")

Column 'koi_disposition' has 0 null values (0.00%).
Column 'koi_score' has 1510 null values (15.79%).
Column 'koi_fpflag_nt' has 0 null values (0.00%).
Column 'koi_fpflag_ss' has 0 null values (0.00%).
Column 'koi_fpflag_co' has 0 null values (0.00%).
Column 'koi_fpflag_ec' has 0 null values (0.00%).
Column 'koi_period' has 0 null values (0.00%).
Column 'koi_period_err1' has 454 null values (4.75%).
Column 'koi_period_err2' has 454 null values (4.75%).
Column 'koi_time0bk' has 0 null values (0.00%).
Column 'koi_time0bk_err1' has 454 null values (4.75%).
Column 'koi_time0bk_err2' has 454 null values (4.75%).
Column 'koi_impact' has 363 null values (3.80%).
Column 'koi_impact_err1' has 454 null values (4.75%).
Column 'koi_impact_err2' has 454 null values (4.75%).
Column 'koi_duration' has 0 null values (0.00%).
Column 'koi_duration_err1' has 454 null values (4.75%).
Column 'koi_duration_err2' has 454 null values (4.75%).
Column 'koi_depth' has 363 null values (3.80%).
Column 'koi_depth_e

In [53]:

# Text (object-dtype) columns are one-hot encoded; numeric columns pass through.
categorical_columns = cumulative_df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical columns. NaNs are replaced by the literal
# 'missing' first so that they become their own category instead of NaN input.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_data = cumulative_df[categorical_columns].fillna('missing')
one_hot_encoded = encoder.fit_transform(cat_data)

# Wrap the encoded array in a DataFrame that shares the original row index,
# so the concat below aligns row-for-row.
feature_names = encoder.get_feature_names_out(categorical_columns)
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=feature_names,
    index=cumulative_df.index,
)

# 'number' selects every numeric dtype; listing only int64/float64 would
# silently drop columns stored as e.g. int32 or float32.
numeric_columns = cumulative_df.select_dtypes(include='number').columns

# Final frame: numeric columns first, then the one-hot encoded columns.
df_encoded = pd.concat([cumulative_df[numeric_columns], one_hot_df], axis=1)

print("\nShape of encoded DataFrame:", df_encoded.shape)
print("\nColumns in encoded DataFrame:", df_encoded.columns.tolist())


Shape of encoded DataFrame: (9564, 41)

Columns in encoded DataFrame: ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'koi_kepmag', 'koi_disposition_CANDIDATE', 'koi_disposition_CONFIRMED', 'koi_disposition_FALSE POSITIVE']


In [54]:
# The three one-hot disposition columns form the target; all remaining columns are features.
target_columns = [
    'koi_disposition_CANDIDATE',
    'koi_disposition_CONFIRMED',
    'koi_disposition_FALSE POSITIVE',
]
X = df_encoded.drop(columns=target_columns)
y = df_encoded[target_columns]
print("\nShape of feature matrix X:", X.shape)
print("\nShape of target matrix y:", y.shape)


Shape of feature matrix X: (9564, 38)

Shape of target matrix y: (9564, 3)


In [55]:
## Remove features whose absolute correlation with an earlier feature exceeds 0.7.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once;
# everything on or below the diagonal becomes NaN and never passes the check.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# A column is dropped when any feature listed before it correlates above 0.7.
is_highly_correlated = (upper_tri > 0.7).any(axis=0)
to_drop = upper_tri.columns[is_highly_correlated].tolist()

X = X.drop(columns=to_drop)
print("\nDropped columns due to high correlation:", to_drop)



Dropped columns due to high correlation: ['koi_period_err2', 'koi_time0bk_err2', 'koi_impact_err2', 'koi_duration_err2', 'koi_depth_err2', 'koi_prad_err1', 'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2', 'koi_steff_err2', 'koi_srad_err1', 'koi_srad_err2']


In [56]:
## Recombine the selected features with the one-hot targets and save the result.
dataframe = pd.concat([X, y], axis=1)

# Assumes the notebook runs from <repo>/src/kepler, so output goes to <repo>/data.
# os.path replaces the string replace on 'src\kepler', which contained the
# invalid escape '\k', only matched Windows separators, and shadowed `dir`.
dir_path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'data')),
    '',  # trailing separator keeps `dir_path + filename` style usage working
)
dataframe.to_csv(os.path.join(dir_path, 'cumulative_final_2025.10.03.csv'), index=False)