Load Dataset

In [46]:
import pandas as pd
import numpy as np

df = pd.read_csv("datacleaning2_final2.csv")
df.shape


(5954, 11)

Drop Useless Column

In [47]:
# Drop junk index column
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

df.shape


(5954, 10)

Detect Physically Impossible Values

In [48]:
# Define physical constraints
constraints = {
    "pl_rad": df["pl_rad"] <= 0,
    "pl_mass": df["pl_mass"] <= 0,
    "pl_sma": df["pl_sma"] <= 0,
    "pl_orbper": df["pl_orbper"] <= 0,
    "st_temp": df["st_temp"] < 0   
}

# Replace impossible values with NaN
for col, condition in constraints.items():
    df.loc[condition, col] = np.nan


Column-wise Median Imputation (Numerical)

In [49]:
# Select numerical columns
num_cols = df.select_dtypes(include=["int64", "float64"]).columns

# Median imputation
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

df.isnull().sum()


pl_rad       0
pl_mass      0
pl_orbper    0
pl_sma       0
pl_ecc       0
st_temp      0
st_met       0
st_rad       0
st_age       0
st_mass      0
dtype: int64

Categorical Mode Imputation (Future-proof)

In [50]:
cat_cols = df.select_dtypes(include=["object"]).columns

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])


In [51]:
print("Shape:", df.shape)
print("Total missing values:", df.isnull().sum().sum())
df.describe()


Shape: (5954, 10)
Total missing values: 0


Unnamed: 0,pl_rad,pl_mass,pl_orbper,pl_sma,pl_ecc,st_temp,st_met,st_rad,st_age,st_mass
count,5954.0,5954.0,5954.0,5954.0,5954.0,5954.0,5954.0,5954.0,5954.0,5954.0
mean,3.137738,321.535887,27.078043,0.376137,0.113066,5366.673559,0.018215,1.017287,4.211106,0.929957
std,2.110888,452.709073,32.499803,0.6181,0.105842,768.911643,0.143508,0.382789,1.634496,0.284697
min,0.3098,0.02,0.090706,0.0044,0.0,3442.0,-0.38,0.068,0.001,0.275
25%,1.92,176.076938,4.581149,0.082098,0.09,5013.25,-0.03,0.8,4.17,0.8
50%,2.438184,176.076938,11.087324,0.11415,0.09,5551.0,0.02,0.95,4.17,0.948
75%,3.07,176.076938,34.971796,0.168475,0.09,5854.0,0.06,1.18,4.17,1.07175
max,7.777125,1728.986438,92.086428,2.004842,0.5305,7380.0,0.42,1.942,9.225,1.595


Feature Engineering: Habitability Score

In [52]:
# Enforce realistic astrophysical ranges

df.loc[df["pl_rad"] > 20, "pl_rad"] = np.nan      # >20 Earth radii = likely error
df.loc[df["pl_mass"] > 5000, "pl_mass"] = np.nan # absurd mass
df.loc[df["st_temp"] > 60000, "st_temp"] = np.nan
df.loc[df["pl_sma"] > 100, "pl_sma"] = np.nan    # too far for habitability

# Re-impute after enforcing bounds
for col in df.select_dtypes(include=["float64", "int64"]).columns:
    df[col] = df[col].fillna(df[col].median())


 unit validity with sanity bounds

In [53]:
# Normalize helper
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

# Temperature score (habitable zone ~ 273â€“373 K)
temp_score = 1 - abs(df["st_temp"] - 288) / 288

# Planet size similarity to Earth
radius_score = 1 - abs(df["pl_rad"] - 1)

# Orbit stability proxy
orbit_score = normalize(df["pl_sma"])

# Final Habitability Score
df["habitability_score"] = (
    0.4 * temp_score +
    0.3 * radius_score +
    0.3 * orbit_score
)

df["habitability_score"].describe()


count    5954.000000
mean       -6.943094
std         1.358335
min       -11.180213
25%        -7.627041
50%        -6.990555
75%        -6.243836
max        -3.664097
Name: habitability_score, dtype: float64

In [54]:
df["habitable"] = (df["habitability_score"] >= 0.3).astype(int)
df["habitability_score"].min(), df["habitability_score"].max()
df["habitability_score"].isna().sum()
df["habitability_score"].dtype
df["habitability_score"] = pd.to_numeric(df["habitability_score"], errors="coerce")


threshold = df["habitability_score"].quantile(0.8)
df["habitable"] = (df["habitability_score"] >= threshold).astype(int)

df["habitable"].value_counts()


habitable
0    4763
1    1191
Name: count, dtype: int64

Saved clean data 

In [55]:

# Remove duplicate rows
df = df.drop_duplicates()

print("Final shape:", df.shape)
print("Remaining duplicates:", df.duplicated().sum())


Final shape: (5938, 12)
Remaining duplicates: 0


In [56]:
total_missing_values = df.isnull().sum().sum()
print("Total missing values:", total_missing_values)


Total missing values: 0


In [57]:
df.to_csv('pavan.csv')