In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier



In [None]:
# 1. LOAD DATA
# Read the CSV written by the previous cleaning stage into the working frame `df`.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='exoplanetai_datacleaned5_standardized_filtered.csv')

In [24]:
# 2. FIX HIDDEN NULLS
# Re-parse the four error columns below as numbers. Any entry that cannot be
# parsed (the original note cites stray 'bound method' strings) becomes NaN.
cols_to_fix = ['pl_orbpererr1', 'pl_orbpererr2', 'pl_orbsmaxerr1', 'pl_orbsmaxerr2']
for col in cols_to_fix:
    if col in df.columns:  # tolerate inputs where a column has been dropped upstream
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Zero-fill missing values in every numeric column.
# numeric_cols is computed AFTER the coercion above so the repaired columns are included.
# select_dtypes already guarantees numeric dtypes, so a second pd.to_numeric pass
# over these columns would be a no-op and is omitted.
# NOTE(review): 0 is used as the placeholder for "missing" in every numeric column,
# including physical quantities and their error bars -- confirm this is intended
# for the scores derived from them later in the notebook.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# The multi-column assignment above can leave the frame internally fragmented,
# which pandas reports as a PerformanceWarning when new columns are inserted
# later. copy() returns a consolidated frame with identical contents.
df = df.copy()

In [25]:
# 2. FEATURE ENGINEERING: Incorporating the "Essence" of 224 Columns
def calculate_confidence(val, err1, err2):
    """Turn a measurement and its two error bars into a confidence score.

    The summed absolute errors are divided by twice the absolute value
    (plus 1e-5 so a zero value does not divide by zero) to give a relative
    error, and the score is ``1 / (1 + relative_error)``. Zero errors give
    exactly 1.0; the score shrinks towards 0 as the errors grow relative to
    the value.

    Args:
        val: Measured value(s).
        err1: First error bar; only its magnitude is used.
        err2: Second error bar; only its magnitude is used.

    Returns:
        The confidence score, computed element-wise for array-like or
        Series inputs.
    """
    total_abs_err = np.abs(err1) + np.abs(err2)
    scale = 2 * np.abs(val) + 1e-5
    relative_error = total_abs_err / scale
    return 1 / (1 + relative_error)

def _gaussian_score(values, center, width):
    """Gaussian-shaped closeness score: 1.0 at `center`, decaying with distance scaled by `width`."""
    return np.exp(-0.5 * ((values - center)**2 / width**2))


# A. Habitability Score Index (HSI) - Earth Similarity
# Peaks when equilibrium temperature is 288 (width 60) and radius is 1.0 (width 0.5).
h_temp = _gaussian_score(df['pl_eqt'], 288.0, 60.0)
h_rad = _gaussian_score(df['pl_rade'], 1.0, 0.5)
# The score is scaled down by how uncertain the radius and temperature measurements are.
rade_conf = calculate_confidence(df['pl_rade'], df['pl_radeerr1'], df['pl_radeerr2'])
eqt_conf = calculate_confidence(df['pl_eqt'], df['pl_eqterr1'], df['pl_eqterr2'])
h_conf = rade_conf * eqt_conf
df['HSI'] = h_temp * h_rad * h_conf

# B. Stellar Compatibility Index (SCI) - Host Suitability
# Peaks when the stellar effective temperature is 5778 (width 1200).
s_temp = _gaussian_score(df['st_teff'], 5778.0, 1200.0)
# Observation counts are summed and min-max scaled; the 1e-5 offset keeps the
# division defined when every row has the same total.
s_stability = (df['st_nspec'] + df['st_nphot'] + df['st_nrvc'])
stability_lo = s_stability.min()
stability_hi = s_stability.max()
s_stability_norm = (s_stability - stability_lo) / (stability_hi - stability_lo + 1e-5)
# The temperature score is weighted between 0.7x (fewest observations) and ~1.0x (most).
df['SCI'] = s_temp * (0.7 + 0.3 * s_stability_norm)

# C. Orbital Stability Factor (OSF) - Dynamic Consistency
# (1 - eccentricity), scaled by the confidence in the eccentricity measurement.
ecc_conf = calculate_confidence(df['pl_orbeccen'], df['pl_orbeccenerr1'], df['pl_orbeccenerr2'])
df['OSF'] = (1 - df['pl_orbeccen']) * ecc_conf

  df['HSI'] = h_temp * h_rad * h_conf
  df['SCI'] = s_temp * (0.7 + 0.3 * s_stability_norm)
  df['OSF'] = (1 - df['pl_orbeccen']) * ecc_conf


In [26]:
# 3. TARGET VARIABLE: is_habitable
# Binary label: 1 when the radius is below 1.6 (treated as rocky) AND the
# HSI score is above 0.4; 0 otherwise.
is_rocky = df['pl_rade'] < 1.6
is_earth_like = df['HSI'] > 0.4
df['is_habitable'] = (is_rocky & is_earth_like).astype(int)

  df['is_habitable'] = ((df['pl_rade'] < 1.6) & (df['HSI'] > 0.4)).astype(int)


In [42]:
# 4. RANDOM FOREST FEATURE SELECTION
# Rank the raw numeric columns by random-forest importance for predicting
# is_habitable. The label and the three engineered scores are excluded from
# the inputs.
# NOTE(review): is_habitable is computed from pl_rade and HSI, and HSI from
# pl_rade / pl_eqt and their error columns, all of which remain in X_raw --
# the ranking may largely recover those inputs; confirm this is intended.
engineered_cols = ['is_habitable', 'HSI', 'SCI', 'OSF']
X_raw = df.select_dtypes(include=[np.number]).drop(columns=engineered_cols)
y = df['is_habitable']

# Fixed random_state keeps the ranking reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_raw, y)

importances = pd.DataFrame({'feature': X_raw.columns, 'importance': rf.feature_importances_})
# Drop any feature whose name contains 'err' or 'lim', then sort the rest
# from most to least important.
is_err_or_lim = importances['feature'].str.contains('err|lim')
top_gems = importances.loc[~is_err_or_lim].sort_values(by='importance', ascending=False)

In [51]:
# 5. FINAL SELECTION: 36 Essential + RF Gems
# Hand-picked columns that are always kept; their order here is the column
# order of the exported frame.
essential_36 = [
    'pl_name', 'hostname',
    # pl_* columns
    'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_bmasse', 'pl_dens', 'pl_orbeccen',
    'pl_orbincl', 'pl_insol', 'pl_eqt', 'pl_trandep', 'pl_trandur', 'pl_ratdor',
    'pl_ratror', 'pl_rvamp',
    # st_* columns
    'st_teff', 'st_rad', 'st_mass', 'st_met', 'st_lum', 'st_logg', 'st_age',
    'st_dens', 'st_spectype',
    # sy_* columns
    'sy_dist', 'sy_plx', 'sy_snum', 'sy_pnum', 'sy_vmag', 'sy_gaiamag',
    'discoverymethod',
    # engineered scores and the label
    'HSI', 'SCI', 'OSF', 'is_habitable',
]

# Append the 35 highest-ranked RF features. dict.fromkeys keeps the first
# occurrence of each name, so features already in essential_36 are not
# repeated and the original order is preserved.
rf_candidates = top_gems['feature'].head(35).to_list()
final_cols = list(dict.fromkeys(essential_36 + rf_candidates))
len(final_cols)

47

In [55]:
# 6. ENCODING & EXPORT
# NOTE(review): despite the heading, no encoding is applied in this cell; the
# selected columns are written out as they are.
# .copy() detaches the selection from `df` so later edits to df_final cannot
# touch the working frame.
df_final = df[final_cols].copy()

# Write the selected columns to disk without the row index.
df_final.to_csv('preprocessed.csv', index=False)
print(f"Preprocessed successfully with {len(df_final.columns)} columns.")
# A bare `df_final.shape` in the middle of a cell is evaluated and discarded
# (only the cell's last expression is rendered), so the shape is printed.
print(f"Shape: {df_final.shape}")
# Last expression of the cell: class balance of the target, rendered as output.
df_final['is_habitable'].value_counts()

Preprocessed successfully with 47 columns.


is_habitable
0    5418
1      26
Name: count, dtype: int64