In [50]:
import numpy as np
import pandas as pd

In [51]:
# Read the input CSV (median-imputed, per its file name) into `df`, then show
# the total number of missing cells across the whole frame as a sanity check.
df = pd.read_csv('exoplanetai_datacleaned4_final_withMedian.csv')
df.isna().sum().sum()

np.int64(0)

In [52]:
# 2. Categorize All Applicable Columns
# ---------------------------------------------------------
# Strictly positive (> 0): measurements for which zero or a negative value is
# treated as physically impossible. Entries are grouped by column-name prefix;
# the order is significant only for the dtype listing displayed below.
strictly_positive = [
    # pl_* planet columns
    'pl_orbper', 'pl_orbsmax', 'pl_rade', 'pl_masse', 'pl_msinie',
    'pl_cmasse', 'pl_bmasse', 'pl_dens', 'pl_insol', 'pl_eqt',
    'pl_rvamp',
    # st_* stellar columns
    'st_teff', 'st_rad', 'st_mass', 'st_age', 'st_dens',
    # sy_* system columns
    'sy_dist', 'sy_plx',
    # remaining pl_* columns (transit-related, judging by their names)
    'pl_trandur', 'pl_trandep', 'pl_ratdor', 'pl_ratror',
]

# Confirm every one of these columns is present and inspect its dtype.
df.loc[:, strictly_positive].dtypes



pl_orbper     float64
pl_orbsmax    float64
pl_rade       float64
pl_masse      float64
pl_msinie     float64
pl_cmasse     float64
pl_bmasse     float64
pl_dens       float64
pl_insol      float64
pl_eqt        float64
pl_rvamp      float64
st_teff       float64
st_rad        float64
st_mass       float64
st_age        float64
st_dens       float64
sy_dist       float64
sy_plx        float64
pl_trandur    float64
pl_trandep    float64
pl_ratdor     float64
pl_ratror     float64
dtype: object

In [53]:
# Non-negative (>= 0): columns where zero is acceptable but a negative value
# is not. The first entry is a float column; the others are integer columns
# (presumably counts, going by their names).
non_negative = [
    'pl_imppar',
    # sy_* columns
    'sy_snum', 'sy_pnum', 'sy_mnum',
    # pl_nnotes / st_n* columns
    'pl_nnotes', 'st_nphot', 'st_nrvc', 'st_nspec',
    # pl_n*spec columns
    'pl_nespec', 'pl_ntranspec', 'pl_ndispec',
]

# Confirm every one of these columns is present and inspect its dtype.
df.loc[:, non_negative].dtypes



pl_imppar       float64
sy_snum           int64
sy_pnum           int64
sy_mnum           int64
pl_nnotes         int64
st_nphot          int64
st_nrvc           int64
st_nspec          int64
pl_nespec         int64
pl_ntranspec      int64
pl_ndispec        int64
dtype: object

In [54]:
# Uncertainties (NASA standard: err1 >= 0, err2 <= 0)
# Columns are selected purely by name: any column whose name contains 'err1'
# goes into err1_cols, any containing 'err2' into err2_cols.
err1_cols = [c for c in df.columns if 'err1' in c]
err2_cols = [c for c in df.columns if 'err2' in c]

# Tally the dtypes of all selected uncertainty columns in one view.
df[err1_cols + err2_cols].dtypes.value_counts()




float64    92
Name: count, dtype: int64

In [55]:
# 3. Data Type Conversion (Ensuring Numeric Standards)
# Union of every column referenced by the range checks: the two categorized
# lists, both uncertainty lists, and the two orbital-geometry columns.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list is
# identical on every run (a set would reorder string keys between sessions).
all_numeric_cols = list(dict.fromkeys(
    strictly_positive + non_negative + err1_cols + err2_cols + ['pl_orbeccen', 'pl_orbincl']
))

# Display the dtype tally inline instead of copying it to the system clipboard:
# the clipboard call raises where no clipboard backend is installed (e.g. a
# headless server) and leaves nothing visible in the notebook.
df[all_numeric_cols].dtypes.value_counts()


In [56]:
# Keep only the names that actually exist in df, so the conversion below
# cannot fail on a column that is missing from this dataset.
all_numeric_cols = [name for name in all_numeric_cols if name in df.columns]

# Coerce each selected column to a numeric dtype; any value that cannot be
# parsed becomes NaN (errors='coerce') rather than raising.
df[all_numeric_cols] = df[all_numeric_cols].apply(pd.to_numeric, errors='coerce')

# Total missing cells after coercion; a non-zero value means coercion
# introduced NaNs.
df.isna().sum().sum()

np.int64(0)

In [57]:
# 4. Apply Physical Sanitization Filters (Removing Impossible Values)
# ---------------------------------------------------------
# Remember how many rows we start with, and create a row mask aligned to
# df's index in which every row is initially kept (True).
initial_rows = df.shape[0]
mask = pd.Series(data=True, index=df.index)

# Sanity check: every row should be True at this point.
mask.value_counts()

True    5954
Name: count, dtype: int64

In [58]:
# Range checks, vectorized per group: a row stays in the mask only if every
# column of the group satisfies the bound. Columns missing from df are skipped.
positive_present = [name for name in strictly_positive if name in df.columns]
nonneg_present = [name for name in non_negative if name in df.columns]
mask &= df[positive_present].gt(0).all(axis=1)
mask &= df[nonneg_present].ge(0).all(axis=1)

# Orbital bound check: eccentricity in [0, 1) for bound orbits,
# inclination in [0, 180].
if 'pl_orbeccen' in df.columns:
    eccentricity = df['pl_orbeccen']
    mask &= eccentricity.ge(0) & eccentricity.lt(1)
if 'pl_orbincl' in df.columns:
    inclination = df['pl_orbincl']
    mask &= inclination.ge(0) & inclination.le(180)

# Error sign check: every err1 column must be >= 0 and every err2 column <= 0.
mask &= df[err1_cols].ge(0).all(axis=1)
mask &= df[err2_cols].le(0).all(axis=1)

# Materialize the surviving rows as an independent copy so later edits to
# df_filtered do not touch df.
df_filtered = df.loc[mask].copy()


In [59]:
# 5. Statistical Outlier Capping (IQR Method)
# ---------------------------------------------------------
# For each listed feature present in df_filtered, values beyond 1.5 * IQR
# outside the quartiles are clipped to the fence (rows are kept, not dropped).
features_to_cap = ['pl_rade', 'pl_bmasse', 'pl_orbsmax', 'pl_eqt', 'st_teff', 'st_rad', 'st_mass', 'st_lum']
for col in features_to_cap:
    if col not in df_filtered.columns:
        continue  # feature absent from this frame; nothing to cap
    values = df_filtered[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    df_filtered[col] = values.clip(lower=lower_fence, upper=upper_fence)

# Final save: write the filtered, capped frame without the index column.
df_filtered.to_csv('exoplanetai_datacleaned5_standardized_filtered.csv', index=False)

In [None]:
# Display (rows, columns) of the filtered frame to see how many rows survived.
df_filtered.shape


(5444, 224)