In [1]:
# Import essential libraries for data handling and visualization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load NASA Exoplanet dataset
# comment="#" ignores metadata lines starting with #

df = pd.read_csv(r"C:/Users/india/OneDrive/Desktop/exo.csv", comment="#")

# Display first 5 rows
df.head()


  df = pd.read_csv(r"C:/Users/india/OneDrive/Desktop/exo.csv", comment="#")


Unnamed: 0,rowid,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_dr2_id,gaia_dr3_id,default_flag,...,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,1,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,Gaia DR3 3946945413106333696,1,...,9/19/2023,2023-08,9/19/2023,2.0,1.0,2.0,0.0,0.0,0.0,0.0
1,2,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,Gaia DR3 3946945413106333696,0,...,5/14/2014,2008-01,5/14/2014,2.0,1.0,2.0,0.0,0.0,0.0,0.0
2,3,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,Gaia DR3 3946945413106333696,0,...,7/23/2014,2011-08,7/23/2014,2.0,1.0,2.0,0.0,0.0,0.0,0.0
3,4,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,Gaia DR3 1696798367260229376,0,...,4/25/2018,2009-10,5/14/2014,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,5,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,Gaia DR3 1696798367260229376,1,...,9/4/2018,2017-03,9/6/2018,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [3]:
# Check number of rows and columns

df.shape


(39386, 289)

In [4]:
# View column names and data types

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39386 entries, 0 to 39385
Columns: 289 entries, rowid to pl_ndispec
dtypes: float64(255), int64(6), object(28)
memory usage: 86.8+ MB


In [5]:
# Select only required features for ExoHabitAI

required_columns = [
    "pl_rade",      # Planet radius
    "pl_bmasse",    # Planet mass
    "pl_orbper",    # Orbital period
    "pl_orbsmax",   # Semi-major axis
    "pl_eqt",       # Equilibrium temperature
    "pl_dens",      # Planet density
    "st_teff",      # Host star temperature
    "st_lum",       # Star luminosity
    "st_met",       # Star metallicity
    "st_spectype"   # Star spectral type
]

df = df[required_columns]

df.head()


Unnamed: 0,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_eqt,pl_dens,st_teff,st_lum,st_met,st_spectype
0,,4914.898486,323.21,1.178,,,4874.0,1.97823,-0.26,G8 III
1,,6165.6,326.03,1.29,,,4742.0,2.243,-0.35,G8 III
2,,5434.7,,1.21,,,,,,
3,,3337.07,516.22,1.54,,,4340.0,,0.04,K4 III
4,,4684.8142,516.21997,1.53,,,4213.0,,-0.02,


In [6]:
# Count missing values per column

df.isnull().sum()


pl_rade        12275
pl_bmasse      32185
pl_orbper       3361
pl_orbsmax     17338
pl_eqt         22127
pl_dens        36635
st_teff         3543
st_lum         29695
st_met         14561
st_spectype    36466
dtype: int64

In [7]:
# Percentage of missing data per column

(df.isnull().sum() / len(df)) * 100


pl_rade        31.165897
pl_bmasse      81.716854
pl_orbper       8.533489
pl_orbsmax     44.020718
pl_eqt         56.179861
pl_dens        93.015285
st_teff         8.995582
st_lum         75.394810
st_met         36.969989
st_spectype    92.586198
dtype: float64

In [8]:
# Count duplicate rows

df.duplicated().sum()


np.int64(5586)

In [9]:
# Remove duplicates if any

df = df.drop_duplicates()


In [10]:
df.duplicated().sum()#check whether dupicates dropped or not


np.int64(0)

In [11]:
# Generate descriptive statistics for numeric features

df.describe()


Unnamed: 0,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_eqt,pl_dens,st_teff,st_lum,st_met
count,21734.0,7112.0,30641.0,16598.0,11888.0,2750.0,30452.0,9676.0,19453.0
mean,5.079607,741.477961,14272.35,6.248295,903.751281,6.136843,5441.200928,-0.150518,0.001127
std,57.720931,1560.580428,2297876.0,208.206295,448.886686,64.921689,1034.623732,0.719805,0.216915
min,0.27,0.015,0.09070629,0.0044,34.0,0.00074,415.0,-4.66,-2.5
25%,1.59,13.1255,4.302148,0.054,574.75,0.56,5069.5075,-0.46384,-0.12
50%,2.36,191.80616,10.22216,0.102365,811.0,1.3225,5594.0,-0.08379,0.0168
75%,3.4965,682.359252,26.72321,0.24315,1146.0,3.655736,5927.0,0.303762,0.14
max,4282.98,25426.4,402000000.0,19000.0,4050.0,2331.0,57000.0,3.26076,7.79


In [12]:
# Remove rows where all values are missing
df.dropna(how='all', inplace=True)


In [13]:
# Fill missing planetary values using median (safe method)
planetary_cols = ["pl_rade","pl_bmasse","pl_orbper","pl_orbsmax","pl_eqt","pl_dens"]

for col in planetary_cols:
    df[col] = df[col].fillna(df[col].median())


In [14]:
# Fill missing star temperature using median (safe version)

df["st_teff"] = df["st_teff"].fillna(df["st_teff"].median())


In [15]:
# Fill missing star luminosity using median (safe version)

df["st_lum"] = df["st_lum"].fillna(df["st_lum"].median())


In [16]:
# Fill missing star type using mode (safe version)

df["st_spectype"] = df["st_spectype"].fillna(df["st_spectype"].mode()[0])


In [17]:
# Fill missing star metallicity using median
df["st_met"] = df["st_met"].fillna(df["st_met"].median())


In [18]:
df.isnull().sum()

pl_rade        0
pl_bmasse      0
pl_orbper      0
pl_orbsmax     0
pl_eqt         0
pl_dens        0
st_teff        0
st_lum         0
st_met         0
st_spectype    0
dtype: int64

In [19]:
# Radius must be positive
df = df[df["pl_rade"] > 0]

# Mass must be positive
df = df[df["pl_bmasse"] > 0]

# Orbital period must be positive
df = df[df["pl_orbper"] > 0]

# Semi-major axis must be positive
df = df[df["pl_orbsmax"] > 0]

# Temperature must be above 0 Kelvin
df = df[df["pl_eqt"] > 0]


In [20]:
df.shape


(33799, 10)

In [21]:
# Cap extreme values at 1st and 99th percentile

numeric_cols = ["pl_rade","pl_bmasse","pl_orbper","pl_orbsmax",
                "pl_eqt","pl_dens","st_teff","st_lum","st_met"]

for col in numeric_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)


In [22]:
df.shape


(33799, 10)

In [23]:
# Temperature similarity to Earth (288 K)
df["temp_score"] = 1 - abs(df["pl_eqt"] - 288) / 288

# Radius similarity to Earth (1 Earth radius)
df["radius_score"] = 1 - abs(df["pl_rade"] - 1)

# Orbital distance similarity to Earth (1 AU)
df["distance_score"] = 1 - abs(df["pl_orbsmax"] - 1)

# Combine into Habitability Index (weighted average)
df["habitability_index"] = (
    0.4 * df["temp_score"] +
    0.3 * df["radius_score"] +
    0.3 * df["distance_score"]
)


In [24]:
# Star temperature similarity to Sun
df["stellar_temp_score"] = 1 - abs(df["st_teff"] - 5778) / 5778

# Star luminosity similarity to Sun
df["stellar_lum_score"] = 1 - abs(df["st_lum"] - 1)

# Combine into Stellar Compatibility Index
df["stellar_compatibility_index"] = (
    0.6 * df["stellar_temp_score"] +
    0.4 * df["stellar_lum_score"]
)


In [25]:
# Orbital stability ratio
df["orbital_stability_factor"] = df["pl_orbsmax"] / df["pl_orbper"]


In [26]:
# Convert star spectral type into dummy variables
df = pd.get_dummies(df, columns=["st_spectype"], drop_first=True)


In [27]:
# Create binary target variable
df["habitable"] = df["habitability_index"].apply(lambda x: 1 if x > 0.7 else 0)


In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = df.select_dtypes(include=['float64','int64']).columns
df[num_cols] = scaler.fit_transform(df[num_cols])


In [29]:
df["habitable_multi"] = pd.cut(
    df["habitability_index"],
    bins=[-np.inf, 0.4, 0.7, np.inf],
    labels=[0,1,2]
).astype(int)


In [30]:
df.shape

(33799, 310)

In [31]:
df.to_csv("preprocessed_dataset.csv", index=False)
