In [1]:
#Imports
import pandas as pd
from sklearn.impute import KNNImputer

#Dataset 1 loading
file_path = "/kaggle/input/nasa-exoplanet/nasa_exoplanet.csv"

#loading into a dataframe
df=pd.read_csv(file_path,comment='#',engine="python",on_bad_lines="skip")
df.isnull().sum()



pl_name              0
hostname             0
default_flag         0
sy_snum              0
sy_pnum              0
                  ... 
sy_gaiamagerr1    1136
sy_gaiamagerr2    1136
rowupdate           36
pl_pubdate          35
releasedate         35
Length: 92, dtype: int64

In [2]:
needed = [
    "pl_rade", "pl_bmasse", "pl_dens", "pl_eqt",
    "pl_orbper", "sy_dist", "st_teff", 
    "st_lum", "st_spectype", "st_metfe",
    "pl_orbeccen", "pl_orbsmax"
]

df = df[[col for col in needed if col in df.columns]]

df = df.rename(columns={
    "pl_rade": "radius",
    "pl_bmasse": "mass",
    "pl_eqt": "temp",
    "pl_orbper": "orbital_period",
    "sy_dist": "distance_star",
    "st_teff": "star_temp",
    "st_spectype": "star_type",
    "pl_orbeccen": "eccentricity",
    "pl_orbsmax": "semi_major_axis"
})
df.isnull().sum()




radius             12169
mass               32101
temp               22007
orbital_period      3311
distance_star        879
star_temp           3486
star_type          36290
eccentricity       20455
semi_major_axis    17238
dtype: int64

In [3]:
selected_cols = [
    "radius", "mass", "temp", "orbital_period",
    "distance_star", "star_temp", "star_type",
    "eccentricity", "semi_major_axis"
]
df=df[selected_cols]

In [4]:
file2_path="/kaggle/input/exoplanetsdata1/exoplanetsdata1.csv"
df2 = pd.read_csv(file2_path, comment='#', engine="python", on_bad_lines="skip")
df2 = df2.loc[:, ~df2.columns.str.contains("^Unnamed")]
df2.isnull().sum()



#Removes all columns whose name starts with "Unnamed" (those are usually junk index columns in CSVs).

#df2.columns.str.contains("^Unnamed") → boolean mask for which columns are “Unnamed…”

#~ negates the mask.

#df2.loc[:, mask] → selects all rows (:) and only columns where mask is True.

pl_name              0
hostname             0
sy_snum              0
sy_pnum              0
discoverymethod      0
                  ... 
sy_kmagerr1        241
sy_kmagerr2        253
sy_gaiamag         259
sy_gaiamagerr1     261
sy_gaiamagerr2     261
Length: 85, dtype: int64

In [5]:
df2 = df2.rename(columns={
    "pl_rade": "radius",
    "pl_bmasse": "mass",
    "pl_eqt": "temp",
    "pl_orbper": "orbital_period",
    "sy_dist": "distance_star",
    "st_teff": "star_temp",
    "st_spectype": "star_type",
    "pl_orbeccen": "eccentricity",
    "pl_orbsmax": "semi_major_axis"
})
df2=df2[selected_cols]

In [6]:
combined_df=pd.concat([df,df2],ignore_index=True)

combined_df=combined_df.drop_duplicates()
combined_df["star_type"]=combined_df["star_type"].fillna("Unknown")
combined_df["eccentricity"] = combined_df["eccentricity"].fillna(combined_df["eccentricity"].mean())
combined_df["semi_major_axis"] = combined_df["semi_major_axis"].fillna(combined_df["semi_major_axis"].mean())
combined_df["distance_star"] = combined_df["distance_star"].fillna(combined_df["distance_star"].mean())



In [7]:
critical_cols = ["radius", "mass", "temp", "star_temp", "orbital_period"]
for col in critical_cols:
    combined_df[col]=combined_df.groupby("star_type")[col].transform(lambda x:x.fillna(x.median()))


In [8]:
missing_after_median=combined_df[critical_cols].isnull().sum()
print("Missing AFTER group-wise median:\n",missing_after_median)

if combined_df[critical_cols].isnull().sum().sum()>0:
    print("\nApplying KNN Imputation since some values are still missing...\n")
    numeric_df=combined_df.select_dtypes(include=['float64','int64'])
    imputer=KNNImputer(n_neighbors=5)
    numeric_filled=imputer.fit_transform(numeric_df)
    #fit->learns patterns of distance btw rows
    #transform->fill in naNs based on nearest neighbours

    numeric_filled = pd.DataFrame(numeric_filled, columns=numeric_df.columns)
    #created a numpy array with no NaNs combined
    combined_df[numeric_df.columns]=numeric_filled
    #overwriting with filled values
    

Missing AFTER group-wise median:
 radius             38
mass               15
temp              579
star_temp          18
orbital_period     37
dtype: int64

Applying KNN Imputation since some values are still missing...



In [9]:
missing_mask=combined_df[critical_cols].isnull().any(axis=1)
df_dropped=combined_df[missing_mask]
df_clean=combined_df[~missing_mask]
print("\nFinal Missing Values:\n", df_clean.isnull().sum())
print("\nFinal dataset shape:", df_clean.shape)



Final Missing Values:
 radius             0
mass               0
temp               0
orbital_period     0
distance_star      0
star_temp          0
star_type          0
eccentricity       0
semi_major_axis    0
dtype: int64

Final dataset shape: (33290, 9)
