In [1]:
import pandas as pd 
import numpy as np
import seaborn as sb

kepler = pd.read_csv("KOI_cleaned.csv")
tess = pd.read_csv("TOI_cleaned.csv")
k2 = pd.read_csv("K2_cleaned.csv")

In [2]:
print(kepler.columns.tolist())
print(tess.columns.tolist())
print(k2.columns.tolist())

['kepid', 'kepoi_name', 'koi_disposition', 'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_tce_delivname', 'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag', 'stellar_mass', 'distance']
['toi', 'tid', 'tfopwg_disp', 'rastr', 'ra', 'decstr', 'dec', 'st_pmra', 'st_pmdec', 'pl_tranmid', 'pl_orbper', 'pl_trandurh', 'pl_trandep', 'pl_rade', 'pl_insol', 'pl_eqt', 'st_tmag', 'st_dist', 'st_teff', 'st_logg', 'st_rad', 'toi_created', 'rowupdate', 'stellar_mass']
['pl_name', 'hostname', 'default_flag', 'disposition', 'disp_refname', 'pl_orbper', 'pl_rade', 'pl_radj', 'ttv_flag', 'st_teff', 'st_rad', 'st_mass', 'st_logg', 'sy_dist', 'sy_vmag', 'sy_kmag', 'sy_gaiamag']


In [3]:
kepler = kepler.rename(columns={
    "koi_period": "period",
    "koi_prad": "radius",
    "koi_steff": "stellar_teff",
    "koi_slogg": "stellar_logg",
    "koi_srad": "stellar_rad",
    "koi_kepmag": "stellar_mag",
    "koi_disposition": "disposition",
    "koi_insol": "insolation",
    "koi_teq": "equilibrium_temp"
})

kepler = kepler[["period","radius","stellar_teff","stellar_logg","stellar_rad","stellar_mag","disposition","insolation","equilibrium_temp","stellar_mass","distance"]]


In [4]:
tess = tess.rename(columns={
    "pl_orbper": "period",
    "pl_rade": "radius",
    "st_teff": "stellar_teff",
    "st_logg": "stellar_logg",
    "st_rad": "stellar_rad",
    "st_tmag": "stellar_mag",
    "st_dist": "distance",
    "pl_insol": "insolation",
    "pl_eqt": "equilibrium_temp",
    "tfopwg_disp": "disposition"
})

tess = tess[["period","radius","stellar_teff","stellar_logg","stellar_rad","stellar_mag","disposition",
    "insolation","equilibrium_temp","distance","stellar_mass"]]


In [5]:
k2 = k2.rename(columns={
    "pl_orbper": "period",
    "pl_rade": "radius",
    "st_teff": "stellar_teff",
    "st_logg": "stellar_logg",
    "st_rad": "stellar_rad",
    "sy_vmag": "stellar_mag",     
    "disposition": "disposition", 
    "st_mass": "stellar_mass",    
    "sy_dist": "distance"         
})

k2 = k2[[
    "period", "radius", "stellar_teff", "stellar_logg", "stellar_rad",
    "stellar_mag", "disposition", "stellar_mass", "distance"
]]


In [6]:
combined_dataset = pd.concat([kepler,tess,k2], ignore_index=True)

print(combined_dataset.shape)

(21223, 11)


In [7]:
print(combined_dataset.isnull().sum())

period                 0
radius                 0
stellar_teff           0
stellar_logg           0
stellar_rad            0
stellar_mag            0
disposition            0
insolation          3992
equilibrium_temp    3992
stellar_mass           0
distance               0
dtype: int64


In [8]:
missing_cols = ["insolation","equilibrium_temp"]
for col in missing_cols:
    combined_dataset[col].fillna(combined_dataset[col].median(),inplace=True)
print(combined_dataset.isnull().sum())

period              0
radius              0
stellar_teff        0
stellar_logg        0
stellar_rad         0
stellar_mag         0
disposition         0
insolation          0
equilibrium_temp    0
stellar_mass        0
distance            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_dataset[col].fillna(combined_dataset[col].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_dataset[col].fillna(combined_dataset[col].median(),inplace=True)


In [9]:
print(combined_dataset.columns.tolist())

['period', 'radius', 'stellar_teff', 'stellar_logg', 'stellar_rad', 'stellar_mag', 'disposition', 'insolation', 'equilibrium_temp', 'stellar_mass', 'distance']


In [10]:
combined_dataset.to_csv("Combined Dataset.csv",index=False)