In [47]:
import pandas as pd
import numpy as np

In [48]:
# Snapshot of the exoplanet.eu catalogue; the first CSV column becomes the index.
exoplanet_path = '../published_output/exoplanet.eu_catalog_23Nov.csv'

dataset_exo = pd.read_csv(filepath_or_buffer=exoplanet_path, index_col=0)
print(dataset_exo.shape)

(5529, 97)


In [49]:
# Base quantities used downstream (planet and host-star properties).
features = [
    'mass', 'semi_major_axis', 'eccentricity',
    'star_radius', 'star_teff', 'star_mass', 'radius',
]

# feature -> names of its lower/upper error columns
features_error_range_dir = {
    base: [f"{base}_error_min", f"{base}_error_max"] for base in features
}
# feature -> one-element list holding the name of its single combined error column
features_error_dir = {base: [f"{base}_error"] for base in features}

# Flat column orderings: every feature immediately followed by its error column(s).
features_error_range_list = [
    column
    for base, bounds in features_error_range_dir.items()
    for column in (base, *bounds)
]
features_error_list = [
    column
    for base, errors in features_error_dir.items()
    for column in (base, *errors)
]

print(features_error_range_list)
print(features_error_dir)

['mass', 'mass_error_min', 'mass_error_max', 'semi_major_axis', 'semi_major_axis_error_min', 'semi_major_axis_error_max', 'eccentricity', 'eccentricity_error_min', 'eccentricity_error_max', 'star_radius', 'star_radius_error_min', 'star_radius_error_max', 'star_teff', 'star_teff_error_min', 'star_teff_error_max', 'star_mass', 'star_mass_error_min', 'star_mass_error_max', 'radius', 'radius_error_min', 'radius_error_max']
{'mass': ['mass_error'], 'semi_major_axis': ['semi_major_axis_error'], 'eccentricity': ['eccentricity_error'], 'star_radius': ['star_radius_error'], 'star_teff': ['star_teff_error'], 'star_mass': ['star_mass_error'], 'radius': ['radius_error']}


In [50]:
# Both mappings are keyed by the same base feature names; list the keys of each
# (single-error mapping first, then the min/max mapping).
for error_mapping in (features_error_dir, features_error_range_dir):
    for key in error_mapping:
        print(key)

mass
semi_major_axis
eccentricity
star_radius
star_teff
star_mass
radius
mass
semi_major_axis
eccentricity
star_radius
star_teff
star_mass
radius


In [51]:
# Keep only the features plus their lower/upper error columns, then discard
# rows that lack a value for any of the features themselves.
dataset_exo = (
    dataset_exo
    .loc[:, features_error_range_list]
    .dropna(subset=features)
)

In [52]:
# Fill each missing lower/upper error with the 0.9 quantile of that same column.
for key, bound_columns in features_error_range_dir.items():
    for feature in bound_columns:
        print(feature)
        max_error = dataset_exo[feature].quantile(0.9)
        dataset_exo[feature] = dataset_exo[feature].fillna(max_error)

mass_error_min
mass_error_max
semi_major_axis_error_min
semi_major_axis_error_max
eccentricity_error_min
eccentricity_error_max
star_radius_error_min
star_radius_error_max
star_teff_error_min
star_teff_error_max
star_mass_error_min
star_mass_error_max
radius_error_min
radius_error_max


In [53]:
# Planet mass/radius columns (values plus upper/lower errors) passed to the
# Jupiter -> Earth unit conversion.
mass_and_radii_features = [
    'mass', 'mass_error_max', 'mass_error_min',
    'radius', 'radius_error_max', 'radius_error_min',
]


In [54]:
from astropy.units import earthMass, jupiterMass, earthRad, jupiterRad, solMass, solRad, AU

In [55]:
def jupiter_to_earth(dataset, feature):
    """Convert one planetary mass or radius column from Jupiter to Earth units.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing ``feature``; its values are interpreted as Jupiter
        masses (mass columns) or Jupiter radii (radius columns).
    feature : str
        Column to convert. Must be one of ``mass``, ``mass_error_max``,
        ``mass_error_min``, ``radius``, ``radius_error_max``,
        ``radius_error_min``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` whose ``feature`` column holds the converted
        values. The frame passed in is not modified, so calling the function
        never changes state behind the caller's back.

    Raises
    ------
    AssertionError
        If ``feature`` is not one of the supported columns.
    """
    assert feature in ['mass', 'mass_error_max', 'mass_error_min', 'radius', 'radius_error_max', 'radius_error_min']
    # Unit.to(other) returns the plain float scale factor between two units, so
    # the whole column is converted with one vectorised multiplication instead
    # of building an astropy Quantity for every row.
    if 'radius' in feature:
        factor = jupiterRad.to(earthRad)
    else:
        # The assert above guarantees every non-radius feature is a mass column.
        factor = jupiterMass.to(earthMass)
    converted = dataset.copy()
    converted[feature] = dataset[feature] * factor
    return converted

In [56]:
# Convert every planetary mass/radius column (values and errors) to Earth units.
# NOTE: this cell reads and overwrites `dataset_exo`, so running it a second
# time applies the conversion again.
for mr_feature in mass_and_radii_features:
    dataset_exo = jupiter_to_earth(dataset=dataset_exo, feature=mr_feature)

In [57]:
# Combined mass error = mean magnitude of the lower and upper bounds.
# abs() is applied before averaging so that bounds stored with opposite signs
# do not cancel each other out.
dataset_exo['mass_error'] = dataset_exo[['mass_error_min', 'mass_error_max']].abs().mean(axis=1)

In [58]:
# Collapse each feature's lower/upper error pair into one combined error column.
for key in features_error_range_dir:
    bound_columns = features_error_range_dir[key]
    error_column = features_error_dir[key][0]

    # Take magnitudes before averaging: with mean().abs() a negative lower bound
    # and a positive upper bound would cancel and understate the error.
    symmetric_error = dataset_exo[bound_columns].abs().mean(axis=1)

    # Progress / sanity output for each feature.
    print(features_error_dir[key], bound_columns)
    print(symmetric_error.shape)
    print(symmetric_error)
    print(dataset_exo.shape)
    dataset_exo[error_column] = symmetric_error
        

['mass_error'] ['mass_error_min', 'mass_error_max']
(938,)
name
2M 2140+16 b    6356.568132
2M 2206-20 b    6356.568132
51 Peg b          22.247988
55 Cnc e           0.429068
AF Lep b         190.697044
                   ...     
XO-3 b           187.518760
XO-4 b            31.782841
XO-5 b            11.759651
XO-6 b           158.914203
XO-7 b            12.077479
Length: 938, dtype: float64
(938, 22)
['semi_major_axis_error'] ['semi_major_axis_error_min', 'semi_major_axis_error_max']
(938,)
name
2M 2140+16 b    0.150000
2M 2206-20 b    0.400000
51 Peg b        0.007550
55 Cnc e        0.000015
AF Lep b        1.700000
                  ...   
XO-3 b          0.000820
XO-4 b          0.000420
XO-5 b          0.000600
XO-6 b          0.007700
XO-7 b          0.000720
Length: 938, dtype: float64
(938, 22)
['eccentricity_error'] ['eccentricity_error_min', 'eccentricity_error_max']
(938,)
name
2M 2140+16 b    0.0600
2M 2206-20 b    0.1918
51 Peg b        0.0066
55 Cnc e        0.0190


In [59]:
# From here on keep only each feature and its single combined error column;
# the separate *_error_min / *_error_max columns are dropped.
dataset_exo = dataset_exo.loc[:, features_error_list]

In [60]:
# Show the reduced table: each feature next to its single combined error column.
dataset_exo

Unnamed: 0_level_0,mass,mass_error,semi_major_axis,semi_major_axis_error,eccentricity,eccentricity_error,star_radius,star_radius_error,star_teff,star_teff_error,star_mass,star_mass_error,radius,radius_error
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2M 2140+16 b,6356.568132,6356.568132,3.530000,0.150000,0.2600,0.0600,0.100,0.043,2300.0,80.0,0.080,0.060,10.312262,4.371502
2M 2206-20 b,9534.852198,6356.568132,4.480000,0.400000,0.0000,0.1918,0.110,0.015,2350.0,80.0,0.130,0.050,14.571675,2.017617
51 Peg b,149.379351,22.247988,0.052000,0.007550,0.0069,0.0066,1.266,0.046,5793.0,70.0,1.110,0.060,21.297063,3.362694
55 Cnc e,8.590902,0.429068,0.015439,0.000015,0.0280,0.0190,0.980,0.016,5196.0,24.0,1.015,0.051,1.947000,0.037998
AF Lep b,889.919538,190.697044,8.200000,1.700000,0.0400,0.4000,1.250,0.060,6130.0,60.0,1.200,0.060,14.907944,6.725388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XO-3 b,3747.196914,187.518760,0.045400,0.000820,0.2600,0.0170,1.490,0.080,6781.0,44.0,1.410,0.080,13.641330,0.818256
XO-4 b,513.610705,31.782841,0.054850,0.000420,0.0000,0.1918,1.550,0.050,5700.0,70.0,1.320,0.020,14.762228,0.325060
XO-5 b,342.301194,11.759651,0.048700,0.000600,0.0000,0.1918,1.060,0.050,5510.0,44.0,0.880,0.030,11.545250,0.560449
XO-6 b,603.873973,158.914203,0.081500,0.007700,0.0000,0.1918,1.930,0.180,6720.0,100.0,1.470,0.060,23.202590,2.465976


In [73]:
# Solar-system planets catalogue; the first CSV column becomes the index.
solar_path = '../published_output/solar_system_planets_catalog.csv'

dataset_solar = pd.read_csv(filepath_or_buffer=solar_path, index_col=0)
print(dataset_solar.shape)

(8, 17)


In [74]:
# Column order for the solar-system table: every feature followed by its
# "<feature>_error" column.
solar_features_error = [
    column
    for base in features
    for column in (base, base + "_error")
]
print(solar_features_error)

['mass', 'mass_error', 'semi_major_axis', 'semi_major_axis_error', 'eccentricity', 'eccentricity_error', 'star_radius', 'star_radius_error', 'star_teff', 'star_teff_error', 'star_mass', 'star_mass_error', 'radius', 'radius_error']


In [75]:
# Restrict the solar-system table to the feature / error columns, in that order.
dataset_solar = dataset_solar.loc[:, solar_features_error]
dataset_solar.shape

(8, 14)

In [76]:
# Keep only the rows that have a value for every feature.
has_all_features = dataset_solar[features].notna().all(axis=1)
dataset_solar = dataset_solar[has_all_features]
dataset_solar.shape

(8, 14)

In [77]:
# Turn +/-inf into NaN. No `if np.inf in dataset_exo` guard is used: a
# membership test on a DataFrame checks the column labels, not the cell values,
# so such a guard is never true for this table. replace() simply leaves the
# frame unchanged when there is nothing to replace.
dataset_exo = dataset_exo.replace([np.inf, -np.inf], np.nan)

In [78]:
## TODO(review): I don't understand this part yet.

In [79]:

# Print the 0.9 quantile of every lower/upper error column.
# NOTE(review): this needs the *_error_min / *_error_max columns to be present
# in `dataset_exo`; the earlier cell that keeps only `features_error_list`
# removes them — confirm the intended cell order.
error_columns = [
    f"{base}_error_{bound}"
    for base in ('mass', 'radius', 'semi_major_axis', 'eccentricity',
                 'star_mass', 'star_radius', 'star_teff')
    for bound in ('min', 'max')
]
for error_col in error_columns:
    max_error = dataset_exo[error_col].quantile(q=0.9)
    print(max_error)

93.75937994542906
93.75937994542906
1.2329878804032548
1.2329878804032548
0.007549999999999999
0.007549999999999999
0.17937189999999986
0.17937189999999986
0.1
0.1
0.12
0.12
148.79999999999995
148.79999999999995


In [80]:
# Does any cell of the table hold NaN? (`np.nan in dataset_exo` would only test
# the column labels, so it reports False regardless of the data.)
bool(dataset_exo.isna().to_numpy().any())

False

In [81]:
# NOTE(review): the saved output below still lists the *_error_min / *_error_max
# columns, which an earlier cell drops when it keeps only `features_error_list`;
# this cell appears to have been run out of order — confirm with Restart & Run All.
dataset_exo

Unnamed: 0_level_0,mass,semi_major_axis,eccentricity,star_radius,star_teff,star_mass,radius,mass_error_min,mass_error_max,semi_major_axis_error_min,...,eccentricity_error_min,eccentricity_error_max,star_radius_error_min,star_radius_error_max,star_teff_error_min,star_teff_error_max,star_mass_error_min,star_mass_error_max,radius_error_min,radius_error_max
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2M 2140+16 b,6356.568132,3.530000,0.2600,0.100,2300.0,0.080,10.312262,6356.568132,6356.568132,0.150000,...,0.060000,0.060000,0.043,0.043,80.0,80.0,0.060,0.060,4.371502,4.371502
2M 2206-20 b,9534.852198,4.480000,0.0000,0.110,2350.0,0.130,14.571675,6356.568132,6356.568132,0.400000,...,0.179372,0.179372,0.015,0.015,80.0,80.0,0.050,0.050,2.017617,2.017617
51 Peg b,149.379351,0.052000,0.0069,1.266,5793.0,1.110,21.297063,22.247988,22.247988,0.007550,...,0.006600,0.006600,0.046,0.046,70.0,70.0,0.060,0.060,3.362694,3.362694
55 Cnc e,8.590902,0.015439,0.0280,0.980,5196.0,1.015,1.947000,0.429068,0.429068,0.000015,...,0.019000,0.019000,0.016,0.016,24.0,24.0,0.051,0.051,0.037998,0.037998
AF Lep b,889.919538,8.200000,0.0400,1.250,6130.0,1.200,14.907944,190.697044,190.697044,1.700000,...,0.400000,0.400000,0.060,0.060,60.0,60.0,0.060,0.060,6.725388,6.725388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XO-3 b,3747.196914,0.045400,0.2600,1.490,6781.0,1.410,13.641330,187.518760,187.518760,0.000820,...,0.017000,0.017000,0.080,0.080,44.0,44.0,0.080,0.080,0.818256,0.818256
XO-4 b,513.610705,0.054850,0.0000,1.550,5700.0,1.320,14.762228,31.782841,31.782841,0.000420,...,0.179372,0.179372,0.050,0.050,70.0,70.0,0.020,0.020,0.325060,0.325060
XO-5 b,342.301194,0.048700,0.0000,1.060,5510.0,0.880,11.545250,11.759651,11.759651,0.000600,...,0.179372,0.179372,0.050,0.050,44.0,44.0,0.030,0.030,0.560449,0.560449
XO-6 b,603.873973,0.081500,0.0000,1.930,6720.0,1.470,23.202590,158.914203,158.914203,0.007700,...,0.179372,0.179372,0.180,0.180,100.0,100.0,0.060,0.060,2.465976,2.465976


In [None]:
# Rows must carry a value for each of these columns (eccentricity is not
# part of this requirement).
required_columns = ['mass', 'semi_major_axis',
                    'star_radius', 'star_mass',
                    'star_teff', 'radius']

dataset_exo = dataset_exo.dropna(subset=required_columns)
# Read from `dataset_solar`, the solar-system frame loaded earlier in this
# notebook; `dataset_solar_system` is not defined before this cell, so reading
# from it raises NameError on a fresh kernel. The result keeps the
# `dataset_solar_system` name for any later cells that refer to it.
dataset_solar_system = dataset_solar.dropna(subset=required_columns)

In [None]:
# convert
