## Exoplanets preprocessing

In [2]:
# imports and stuff
import warnings

# Install a blanket 'ignore' filter so warnings raised afterwards are not displayed
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [3]:
# Load the raw NASA table from disk; '#' is declared as the comment character,
# so text following it on a line is not parsed as data
nasa = pd.read_csv("../data/nasa_2025_02.csv", comment='#')

display(nasa)

Unnamed: 0,rowid,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,default_flag,sy_snum,...,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,1,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,2014-05-14,2008-01,2014-05-14,2.0,1.0,2.0,0.0,0.0,0.0,0.0
1,2,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,2014-07-23,2011-08,2014-07-23,2.0,1.0,2.0,0.0,0.0,0.0,0.0
2,3,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,1,2,...,2023-09-19,2023-08,2023-09-19,2.0,1.0,2.0,0.0,0.0,0.0,0.0
3,4,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,1,1,...,2018-09-04,2017-03,2018-09-06,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,5,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,0,1,...,2018-04-25,2009-10,2014-05-14,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38090,38091,ups And d,ups And,d,HD 9826,HIP 7513,TIC 189576919,Gaia DR2 348020448377061376,1,2,...,2019-01-28,2011-01,2019-01-31,5.0,1.0,10.0,1.0,0.0,0.0,0.0
38091,38092,ups Leo b,ups Leo,b,,,TIC 49430557,Gaia DR2 3794167001116433152,1,1,...,2022-01-10,2021-12,2022-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38092,38093,xi Aql b,xi Aql,b,HD 188310,HIP 97938,TIC 375464367,Gaia DR2 4298361114750843904,0,1,...,2014-07-23,2011-08,2014-07-23,1.0,1.0,1.0,0.0,0.0,0.0,0.0
38093,38094,xi Aql b,xi Aql,b,HD 188310,HIP 97938,TIC 375464367,Gaia DR2 4298361114750843904,0,1,...,2014-05-14,2008-06,2014-05-14,1.0,1.0,1.0,0.0,0.0,0.0,0.0


### Removing controversial planets

In [4]:
# Report the shape, keep only rows whose 'pl_controv_flag' is not 1, then report the shape again
print(nasa.shape)
nasa = nasa.loc[nasa['pl_controv_flag'].ne(1)]
print(nasa.shape)

(38095, 288)
(38020, 288)


## Merging Observations of the same planet into one planet
For now, if the rows to be merged hold different values, we simply take their average for numeric columns (non-numeric columns keep the value of the first row).\
Alternatives worth discussing are taking the median, doing a majority vote, or something similar.

In [5]:
# Count the distinct values in 'pl_name' (missing values, if any, are counted as one value too)
print("Amount of distinct planets: ", nasa['pl_name'].nunique(dropna=False))

Amount of distinct planets:  5803


Remove unwanted columns

In [6]:
# Show the full column index before trimming
print(nasa.columns)

# Alternative idea (not used): nasa.filter(like="pl_") would keep every column
# whose name contains "pl_" instead of an explicit list.

# Explicit list of the columns to keep; the order here is the column order of the result
relevant_columns = [
    'pl_name',
    'hostname',
    'pl_orbper',
    'pl_orbsmax',
    'pl_orbeccen',
    'pl_rade',
    'pl_bmasse',
    'pl_insol',
    'pl_eqt',
    'st_teff',
    'st_rad',
    'st_mass',
    'st_met',
    'pl_masse',
]

nasa = nasa.loc[:, relevant_columns]

# Show the column index again to confirm the selection
print(nasa.columns)

Index(['rowid', 'pl_name', 'hostname', 'pl_letter', 'hd_name', 'hip_name',
       'tic_id', 'gaia_id', 'default_flag', 'sy_snum',
       ...
       'rowupdate', 'pl_pubdate', 'releasedate', 'pl_nnotes', 'st_nphot',
       'st_nrvc', 'st_nspec', 'pl_nespec', 'pl_ntranspec', 'pl_ndispec'],
      dtype='object', length=288)
Index(['pl_name', 'hostname', 'pl_orbper', 'pl_orbsmax', 'pl_orbeccen',
       'pl_rade', 'pl_bmasse', 'pl_insol', 'pl_eqt', 'st_teff', 'st_rad',
       'st_mass', 'st_met', 'pl_masse'],
      dtype='object')


In [7]:
# Aggregation rule applied to each column when several observations are merged
def aggregate_observations(col):
    """Reduce one column of observations to a single value.

    Args:
        col: a pandas Series holding the values of one column.

    Returns:
        ``col.mean()`` when the Series has a numeric dtype, otherwise the
        first element of the Series (``col.iloc[0]``).
    """
    numeric = pd.api.types.is_numeric_dtype(col)
    return col.mean() if numeric else col.iloc[0]


# Collapse all rows sharing a 'pl_name' into one row, reducing every other column
# with aggregate_observations; reset_index turns 'pl_name' back into a regular column
nasa = (
    nasa
    .groupby('pl_name')
    .agg(aggregate_observations)
    .reset_index()
)
display(nasa)

Unnamed: 0,pl_name,hostname,pl_orbper,pl_orbsmax,pl_orbeccen,pl_rade,pl_bmasse,pl_insol,pl_eqt,st_teff,st_rad,st_mass,st_met,pl_masse
0,11 Com b,11 Com,324.620000,1.226000,0.234500,,5505.066163,,,4808.000000,16.380000,2.463333,-0.3050,
1,11 UMi b,11 UMi,516.219985,1.526667,0.080000,,3818.094733,,,4276.500000,26.935000,2.093333,0.0100,
2,14 And b,14 And,186.300000,0.761667,0.000000,,1224.550433,,,4850.500000,11.275000,1.726667,-0.2250,
3,14 Her b,14 Her,1766.378417,2.814750,0.362925,,1642.383591,,,5296.985000,0.976667,0.927143,0.4150,2559.47216
4,16 Cyg B b,16 Cyg B,799.375000,1.662833,0.676033,,533.514528,,,5728.594000,1.140000,1.016000,0.0564,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5798,ups And b,ups And,4.616229,0.058895,0.030200,,219.960419,,,6137.820000,1.615000,1.250000,0.1110,
5799,ups And c,ups And,240.728533,0.827712,0.238933,,1275.328758,,,6117.093333,1.615000,1.262500,0.1180,4443.24113
5800,ups And d,ups And,1285.346167,2.528382,0.281117,,1616.242590,,,6117.093333,1.615000,1.262500,0.1180,3257.74117
5801,ups Leo b,ups Leo,385.200000,1.180000,0.320000,,162.092490,,,4836.000000,11.220000,1.480000,-0.2000,


## Export the processed data

In [8]:
# Write the aggregated table to disk. The row index is written as well (pandas default,
# made explicit here), so the file starts with an unnamed index column.
nasa.to_csv('../data/nasa_aggregated.csv', index=True)