## Exoplanets preprocessing

In [56]:
# imports and stuff
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [57]:
# read in the data
nasa = pd.read_csv(
    "../data/nasa_2025_02.csv",
    comment='#',
    dtype=str
)

display(nasa)

Unnamed: 0,rowid,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,default_flag,sy_snum,...,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,1,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,2014-05-14,2008-01,2014-05-14,2,1,2,0,0,0,0
1,2,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,2014-07-23,2011-08,2014-07-23,2,1,2,0,0,0,0
2,3,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,1,2,...,2023-09-19,2023-08,2023-09-19,2,1,2,0,0,0,0
3,4,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,1,1,...,2018-09-04,2017-03,2018-09-06,0,1,1,0,0,0,0
4,5,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,0,1,...,2018-04-25,2009-10,2014-05-14,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38090,38091,ups And d,ups And,d,HD 9826,HIP 7513,TIC 189576919,Gaia DR2 348020448377061376,1,2,...,2019-01-28,2011-01,2019-01-31,5,1,10,1,0,0,0
38091,38092,ups Leo b,ups Leo,b,,,TIC 49430557,Gaia DR2 3794167001116433152,1,1,...,2022-01-10,2021-12,2022-01-10,0,0,0,0,0,0,0
38092,38093,xi Aql b,xi Aql,b,HD 188310,HIP 97938,TIC 375464367,Gaia DR2 4298361114750843904,0,1,...,2014-07-23,2011-08,2014-07-23,1,1,1,0,0,0,0
38093,38094,xi Aql b,xi Aql,b,HD 188310,HIP 97938,TIC 375464367,Gaia DR2 4298361114750843904,0,1,...,2014-05-14,2008-06,2014-05-14,1,1,1,0,0,0,0


### Removing controversial planets

In [58]:
#nasa = nasa.loc[nasa['pl_controv_flag'] == 0]

## Merging Observations of the same planet into one planet
For now, if there are different values in rows to be merged we will just take their average.\
We could also debate on rather taking the median, doing a majority vote or something similar

In [59]:
print("Amount of distinct planets: ", len(pd.unique(nasa['pl_name'])))

Amount of distinct planets:  5834


In [61]:
# Here we can simply list how aggregating should be done for each column
def aggregate_observations(col):
    if pd.api.types.is_numeric_dtype(col):
        return col.mean()
    else:
        return col.iloc[0] # Takes the first value


nasa = nasa.groupby('pl_name').agg(aggregate_observations).reset_index()
display(nasa)

Unnamed: 0,pl_name,rowid,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,default_flag,sy_snum,...,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec,pl_ndispec
0,11 Com b,1,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,2014-05-14,2008-01,2014-05-14,2,1,2,0,0,0,0
1,11 UMi b,4,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,1,1,...,2018-09-04,2017-03,2018-09-06,0,1,1,0,0,0,0
2,14 And b,7,14 And,b,HD 221345,HIP 116076,TIC 333225860,Gaia DR2 1920113512486282240,0,1,...,2014-05-14,2008-12,2014-05-14,0,1,1,0,0,0,0
3,14 Her b,10,14 Her,b,HD 145675,HIP 79248,TIC 219483057,Gaia DR2 1385293808145621504,0,1,...,2018-09-04,2017-03,2018-09-06,0,1,4,1,0,0,0
4,16 Cyg B b,18,16 Cyg B,b,HD 186427,HIP 96901,TIC 27533327,Gaia DR2 2135550755683407232,1,3,...,2018-09-04,2017-03,2018-09-06,5,1,4,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5829,ups And b,38073,ups And,b,HD 9826,HIP 7513,TIC 189576919,Gaia DR2 348020448377061376,0,2,...,2015-04-24,2009-03,2015-04-24,5,1,10,1,0,0,0
5830,ups And c,38080,ups And,c,HD 9826,HIP 7513,TIC 189576919,Gaia DR2 348020448377061376,0,2,...,2021-09-20,2021-05,2021-09-20,5,1,10,1,0,0,0
5831,ups And d,38086,ups And,d,HD 9826,HIP 7513,TIC 189576919,Gaia DR2 348020448377061376,0,2,...,2021-09-20,2021-05,2021-09-20,5,1,10,1,0,0,0
5832,ups Leo b,38092,ups Leo,b,,,TIC 49430557,Gaia DR2 3794167001116433152,1,1,...,2022-01-10,2021-12,2022-01-10,0,0,0,0,0,0,0
