In [10]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the raw export; lines starting with "#" are treated as comments and
# skipped by the parser (the file carries NASA comment lines at the top).
raw_csv_path = "../data/raw/PS_2026.02.18_10.44.05.csv"
read_options = {"sep": ",", "comment": "#", "engine": "python"}
df = pd.read_csv(raw_csv_path, **read_options)

# Structural overview: dimensions first, then the full column index.
print("Shape:", df.shape)
print("\nColumn Names:\n")
print(df.columns)

# Preview of the first rows, rendered as the cell output.
df.head()


Shape: (748, 92)

Column Names:

Index(['pl_name', 'hostname', 'default_flag', 'sy_snum', 'sy_pnum',
       'discoverymethod', 'disc_year', 'disc_facility', 'soltype',
       'pl_controv_flag', 'pl_refname', 'pl_orbper', 'pl_orbpererr1',
       'pl_orbpererr2', 'pl_orbperlim', 'pl_orbsmax', 'pl_orbsmaxerr1',
       'pl_orbsmaxerr2', 'pl_orbsmaxlim', 'pl_rade', 'pl_radeerr1',
       'pl_radeerr2', 'pl_radelim', 'pl_radj', 'pl_radjerr1', 'pl_radjerr2',
       'pl_radjlim', 'pl_bmasse', 'pl_bmasseerr1', 'pl_bmasseerr2',
       'pl_bmasselim', 'pl_bmassj', 'pl_bmassjerr1', 'pl_bmassjerr2',
       'pl_bmassjlim', 'pl_bmassprov', 'pl_orbeccen', 'pl_orbeccenerr1',
       'pl_orbeccenerr2', 'pl_orbeccenlim', 'pl_insol', 'pl_insolerr1',
       'pl_insolerr2', 'pl_insollim', 'pl_eqt', 'pl_eqterr1', 'pl_eqterr2',
       'pl_eqtlim', 'ttv_flag', 'st_refname', 'st_spectype', 'st_teff',
       'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_rad', 'st_raderr1',
       'st_raderr2', 'st_radlim', 'st_m

Unnamed: 0,pl_name,hostname,default_flag,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,soltype,pl_controv_flag,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,AU Mic b,AU Mic,1,1,4,Transit,2020.0,Transiting Exoplanet Survey Satellite (TESS),Published Confirmed,0,...,-0.1,4.529,0.02,-0.02,7.84038,0.000731,-0.000731,2024-09-28,2023-12,2024-09-28
1,AU Mic c,AU Mic,1,1,4,Transit,2021.0,Transiting Exoplanet Survey Satellite (TESS),Published Confirmed,0,...,-0.1,4.529,0.02,-0.02,7.84038,0.000731,-0.000731,2024-09-28,2023-12,2024-09-28
2,BD+05 4868 A b,BD+05 4868 A,1,2,1,Transit,2025.0,Transiting Exoplanet Survey Satellite (TESS),Published Confirmed,0,...,-0.01,7.448,0.026,-0.026,9.84152,0.000476,-0.000476,2025-03-21,2025-01,2025-03-21
3,BD-14 3065 b,BD-14 3065 A,1,3,1,Transit,2024.0,Transiting Exoplanet Survey Satellite (TESS),Published Confirmed,0,...,-0.013,9.932,0.026,-0.026,10.91,0.001617,-0.001617,2024-06-25,2024-03,2024-06-25
4,DS Tuc A b,DS Tuc A,1,2,1,Transit,2019.0,Transiting Exoplanet Survey Satellite (TESS),Published Confirmed,0,...,-0.03,6.676,0.034,-0.034,8.31926,0.001025,-0.001025,2019-07-03,2019-07,2019-07-11


In [11]:
# Columns kept for the habitability analysis: the planet identifier, the
# planetary parameters (pl_*) and the host-star parameters (st_*).
selected_columns = [
    "pl_name",
    "pl_rade",
    "pl_bmasse",
    "pl_orbper",
    "pl_orbsmax",
    "pl_eqt",
    "pl_insol",
    "pl_orbeccen",
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_spectype"
]

# Take an explicit copy: later cells assign columns on `df_selected`, and
# doing that on a slice of `df` can raise SettingWithCopyWarning and leaves
# it ambiguous whether the raw frame is affected.
df_selected = df[selected_columns].copy()

print("New Shape:", df_selected.shape)
df_selected.head()


New Shape: (748, 13)


Unnamed: 0,pl_name,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_eqt,pl_insol,pl_orbeccen,st_teff,st_rad,st_mass,st_met,st_spectype
0,AU Mic b,3.95677,,8.46308,0.0649,600.0,21.2,0.00577,3678.0,0.744,0.51,0.23,
1,AU Mic c,2.522021,,18.85969,0.1108,459.0,7.3,0.00338,3678.0,0.744,0.51,0.23,
2,BD+05 4868 A b,,6.2,1.271869,0.0208,1820.0,,,4596.0,0.69,0.7,-0.05,K V
3,BD-14 3065 b,21.59,3932.0,4.288973,0.0656,2001.0,,0.066,6935.0,2.35,1.41,-0.34,
4,DS Tuc A b,5.7,,8.138268,,850.0,,0.0,5428.0,0.964,1.01,0.0,G6 V


In [12]:
# Number of null entries in each selected column.
missing_counts = df_selected.isna().sum()

print("Missing Values Per Column:\n")
print(missing_counts)

# The same tally expressed as a percentage of the total row count.
total_rows = len(df_selected)
missing_percentage = df_selected.isna().sum().div(total_rows).mul(100)
print("\nMissing Percentage:\n")
print(missing_percentage)


Missing Values Per Column:

pl_name          0
pl_rade         14
pl_bmasse      184
pl_orbper        4
pl_orbsmax      94
pl_eqt         120
pl_insol       380
pl_orbeccen    171
st_teff         19
st_rad           6
st_mass         10
st_met         112
st_spectype    435
dtype: int64

Missing Percentage:

pl_name         0.000000
pl_rade         1.871658
pl_bmasse      24.598930
pl_orbper       0.534759
pl_orbsmax     12.566845
pl_eqt         16.042781
pl_insol       50.802139
pl_orbeccen    22.860963
st_teff         2.540107
st_rad          0.802139
st_mass         1.336898
st_met         14.973262
st_spectype    58.155080
dtype: float64


In [13]:
# Drop the two columns with the highest share of missing values
# (both above 50% in the missing-value report).
# errors="ignore" keeps this cell idempotent: re-running it after the
# columns are already gone is a no-op instead of a KeyError.
df_selected = df_selected.drop(
    columns=["pl_insol", "st_spectype"],
    errors="ignore",
)

print("Shape after dropping high-missing columns:", df_selected.shape)
df_selected.head()


Shape after dropping high-missing columns: (748, 11)


Unnamed: 0,pl_name,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_eqt,pl_orbeccen,st_teff,st_rad,st_mass,st_met
0,AU Mic b,3.95677,,8.46308,0.0649,600.0,0.00577,3678.0,0.744,0.51,0.23
1,AU Mic c,2.522021,,18.85969,0.1108,459.0,0.00338,3678.0,0.744,0.51,0.23
2,BD+05 4868 A b,,6.2,1.271869,0.0208,1820.0,,4596.0,0.69,0.7,-0.05
3,BD-14 3065 b,21.59,3932.0,4.288973,0.0656,2001.0,0.066,6935.0,2.35,1.41,-0.34
4,DS Tuc A b,5.7,,8.138268,,850.0,0.0,5428.0,0.964,1.01,0.0


In [14]:
# Fill missing numeric values with the median of their own column.
# Numeric columns are picked by dtype family instead of testing
# `dtype != "object"`, so text columns stored with a non-object string dtype
# are never handed to .median() (which would fail on them).
numeric_columns = df_selected.select_dtypes(include=np.number).columns
column_medians = df_selected[numeric_columns].median()

# fillna with a Series fills each column using the value at its own label;
# columns absent from `column_medians` (e.g. pl_name) are left untouched.
df_selected = df_selected.fillna(column_medians)

# Verify no missing values remain
print("Remaining Missing Values:\n")
print(df_selected.isnull().sum())


Remaining Missing Values:

pl_name        0
pl_rade        0
pl_bmasse      0
pl_orbper      0
pl_orbsmax     0
pl_eqt         0
pl_orbeccen    0
st_teff        0
st_rad         0
st_mass        0
st_met         0
dtype: int64


In [15]:
# Keep only rows in which every quantity listed below is strictly positive.
# Rows with a zero, negative, or missing value in any of them are dropped.
strictly_positive_columns = [
    "pl_rade",
    "pl_bmasse",
    "pl_orbper",
    "pl_orbsmax",
    "pl_eqt",
    "st_teff",
    "st_rad",
    "st_mass",
]

all_positive = (df_selected[strictly_positive_columns] > 0).all(axis=1)
df_selected = df_selected[all_positive]

print("Shape after removing impossible values:", df_selected.shape)


Shape after removing impossible values: (748, 11)


In [16]:
# Outlier removal using 1.5 * IQR fences, applied one numeric column at a time.
# NOTE: the quartiles of each column are computed on the frame as already
# filtered by the preceding columns, so the surviving rows depend on the
# order of `numeric_cols`.
numeric_cols = df_selected.select_dtypes(include=np.number).columns

for col in numeric_cols:
    values = df_selected[col]
    q_low = values.quantile(0.25)
    q_high = values.quantile(0.75)
    whisker = 1.5 * (q_high - q_low)

    # Both fences are inclusive.
    within_fences = (values >= q_low - whisker) & (values <= q_high + whisker)
    df_selected = df_selected[within_fences]

print("Shape after IQR outlier removal:", df_selected.shape)


Shape after IQR outlier removal: (449, 11)


In [20]:
# Temperature score: 1.0 at an equilibrium temperature of 288 K, decreasing
# linearly with the absolute deviation from 288 K (scaled by 288) and clipped
# to [0, 1]. NOTE(review): 288 K is presumably Earth's mean surface
# temperature; the code implements a single peak, not a flat "ideal range".
temp_deviation = (df_selected["pl_eqt"] - 288).abs()
df_selected["temp_score"] = (1 - temp_deviation / 288).clip(0, 1)

# Radius score: 1.0 at 1 Earth radius; the deviation is scaled by the largest
# radius currently in the frame, so the score depends on the data present.
radius_deviation = (df_selected["pl_rade"] - 1).abs()
df_selected["radius_score"] = (
    1 - radius_deviation / df_selected["pl_rade"].max()
).clip(0, 1)

# Mass score: same construction as the radius score, centred on 1 Earth mass.
mass_deviation = (df_selected["pl_bmasse"] - 1).abs()
df_selected["mass_score"] = (
    1 - mass_deviation / df_selected["pl_bmasse"].max()
).clip(0, 1)

# Final habitability score: unweighted mean of the three component scores.
component_total = (
    df_selected["temp_score"]
    + df_selected["radius_score"]
    + df_selected["mass_score"]
)
df_selected["habitability_score"] = component_total / 3


In [21]:
# Stellar temperature score: 1.0 at 5778 K (presumably the Sun's effective
# temperature -- confirm), with the deviation scaled by the hottest star in
# the frame and the result clipped to [0, 1].
teff_deviation = (df_selected["st_teff"] - 5778).abs()
df_selected["stellar_temp_score"] = (
    1 - teff_deviation / df_selected["st_teff"].max()
).clip(0, 1)

# Stellar mass score: 1.0 at a stellar mass of 1, with the deviation scaled
# by the largest stellar mass in the frame and the result clipped to [0, 1].
stellar_mass_deviation = (df_selected["st_mass"] - 1).abs()
df_selected["stellar_mass_score"] = (
    1 - stellar_mass_deviation / df_selected["st_mass"].max()
).clip(0, 1)

# Stellar compatibility index: unweighted mean of the two stellar scores.
stellar_total = (
    df_selected["stellar_temp_score"] + df_selected["stellar_mass_score"]
)
df_selected["stellar_compatibility_index"] = stellar_total / 2


In [22]:
# Report the observed minimum and maximum of the two composite scores.
habitability = df_selected["habitability_score"]
print("Habitability Score Range:", habitability.min(), "to", habitability.max())

stellar_index = df_selected["stellar_compatibility_index"]
print(
    "Stellar Compatibility Index Range:",
    stellar_index.min(),
    "to",
    stellar_index.max(),
)

# Orbital stability factor: one minus the orbital eccentricity, clipped to
# [0, 1]. The column is (re)created here so the range below can be reported.
df_selected["orbital_stability_factor"] = (
    1 - df_selected["pl_orbeccen"]
).clip(0, 1)

stability = df_selected["orbital_stability_factor"]
print("Orbital Stability Range:", stability.min(), "to", stability.max())


Habitability Score Range: 0.04523955574268471 to 0.9412443499645554
Stellar Compatibility Index Range: 0.5060545701377607 to 0.9982904175847003
Orbital Stability Range: 0.8260000000000001 to 1.0


In [23]:
# Smallest and largest habitability score currently in the frame.
habitability = df_selected["habitability_score"]
print("Habitability Score Range:", habitability.min(), "to", habitability.max())


Habitability Score Range: 0.04523955574268471 to 0.9412443499645554


In [24]:
# Smallest and largest stellar compatibility index currently in the frame.
stellar_index = df_selected["stellar_compatibility_index"]
print(
    "Stellar Compatibility Index Range:",
    stellar_index.min(),
    "to",
    stellar_index.max(),
)


Stellar Compatibility Index Range: 0.5060545701377607 to 0.9982904175847003


In [25]:
# Orbital stability factor: one minus the orbital eccentricity, clipped to
# the [0, 1] interval.
df_selected["orbital_stability_factor"] = (
    1 - df_selected["pl_orbeccen"]
).clip(0, 1)

stability = df_selected["orbital_stability_factor"]
print("Orbital Stability Range:", stability.min(), "to", stability.max())


Orbital Stability Range: 0.8260000000000001 to 1.0


In [26]:
# Remove the per-component scores now that the composite columns
# (habitability_score, stellar_compatibility_index) have been derived.
# errors="ignore" keeps this cell idempotent: re-running it after the
# columns are already gone is a no-op instead of a KeyError.
intermediate_score_columns = [
    "temp_score",
    "radius_score",
    "mass_score",
    "stellar_temp_score",
    "stellar_mass_score",
]
df_selected = df_selected.drop(
    columns=intermediate_score_columns,
    errors="ignore",
)

df_selected.head()


Unnamed: 0,pl_name,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_eqt,pl_orbeccen,st_teff,st_rad,st_mass,st_met,habitability_score,stellar_compatibility_index,orbital_stability_factor
0,AU Mic b,3.95677,32.295,8.46308,0.0649,600.0,0.00577,3678.0,0.744,0.51,0.23,0.588449,0.691086,0.99423
1,AU Mic c,2.522021,32.295,18.85969,0.1108,459.0,0.00338,3678.0,0.744,0.51,0.23,0.747971,0.691086,0.99662
2,BD+05 4868 A b,3.2816,6.2,1.271869,0.0208,1820.0,0.046,4596.0,0.69,0.7,-0.05,0.623591,0.818429,0.954
4,DS Tuc A b,5.7,32.295,8.138268,0.06,850.0,0.0,5428.0,0.964,1.01,0.0,0.559161,0.971309,1.0
5,GJ 1252 b,1.193,1.32,0.518233,0.06,867.5,0.0,3458.0,0.391,0.381,0.1,0.663132,0.634021,1.0


In [27]:
# Write the processed frame to disk without the index column.
# The output directory is created first so the export also succeeds on a
# fresh checkout where ../data/processed/ does not exist yet.
output_path = Path("../data/processed/preprocessed.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_selected.to_csv(output_path, index=False)

print("Preprocessed dataset saved successfully!")


Preprocessed dataset saved successfully!
