In [74]:
# imports and stuff
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Import pre-processed dataset from preproc1 (fully aggregated planets)
nasa = pd.read_csv(
    '../data/nasa_aggregated.csv',
)

to_be_estimated = list(nasa[nasa[['pl_rade', 'pl_bmasse', 'pl_eqt']].isnull().any(axis=1)].index)

display(nasa)
display(nasa.columns.values)
print(len(to_be_estimated))
display(nasa.loc[to_be_estimated, ['pl_name', 'pl_rade', 'pl_bmasse', 'pl_eqt']])

Unnamed: 0,pl_name,hostname,pl_orbper,pl_orbsmax,pl_orbeccen,pl_rade,pl_bmasse,pl_masse,pl_dens,pl_insol,...,st_rad,st_mass,st_lum,st_met,st_logg,sy_dist,sy_plx,sy_dist.1,discoverymethod,disc_year
0,14 Her b,14 Her,1765.038900,2.77400,0.3730,,2559.47216,2559.47216,,,...,1.00,0.91,-0.153,0.405,4.43,17.9323,55.73630,17.9323,Radial Velocity,2002
1,16 Cyg B b,16 Cyg B,799.450000,1.67600,0.6832,,556.83537,,,,...,1.16,0.98,0.097,0.074,4.30,21.1397,47.27540,21.1397,Radial Velocity,1996
2,1RXS J160929.1-210524 b,1RXS J160929.1-210524,,330.00000,,18.647,4000.00000,4000.00000,,,...,1.31,0.85,-0.370,,4.00,139.1350,7.15949,139.1350,Imaging,2008
3,47 UMa b,47 UMa,1076.600000,2.05900,0.0160,,774.86566,,,,...,1.14,1.01,0.198,0.026,4.33,13.7967,72.45280,13.7967,Radial Velocity,1996
4,51 Peg b,51 Peg,4.230797,0.05235,0.0042,,147.47238,146.20180,,,...,1.19,1.07,0.136,0.206,4.32,15.4614,64.64880,15.4614,Radial Velocity,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3659,pi Men c,HD 39091,6.267829,0.06900,0.0770,2.042,3.49611,3.63000,2.97,309.0,...,1.10,1.02,0.160,0.050,4.36,18.2702,54.70520,18.2702,Transit,2018
3660,tau Boo b,tau Boo,3.312453,0.04869,0.0074,,1366.66215,1891.00000,,,...,1.44,1.40,0.505,0.272,4.26,15.6521,63.86380,15.6521,Radial Velocity,1996
3661,ups And b,ups And,4.617122,0.05914,0.0069,,214.53417,,,,...,1.62,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1996
3662,ups And c,ups And,241.223000,0.82650,0.2660,,624.53282,4443.24113,,,...,1.62,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1999


array(['pl_name', 'hostname', 'pl_orbper', 'pl_orbsmax', 'pl_orbeccen',
       'pl_rade', 'pl_bmasse', 'pl_masse', 'pl_dens', 'pl_insol',
       'pl_eqt', 'pl_orbincl', 'st_teff', 'st_rad', 'st_mass', 'st_lum',
       'st_met', 'st_logg', 'sy_dist', 'sy_plx', 'sy_dist.1',
       'discoverymethod', 'disc_year'], dtype=object)

2753


Unnamed: 0,pl_name,pl_rade,pl_bmasse,pl_eqt
0,14 Her b,,2559.47216,
1,16 Cyg B b,,556.83537,
3,47 UMa b,,774.86566,
4,51 Peg b,,147.47238,
5,55 Cnc b,,267.29369,700.0
...,...,...,...,...
3658,kap CrB b,,483.73484,
3660,tau Boo b,,1366.66215,
3661,ups And b,,214.53417,
3662,ups And c,,624.53282,


## Value Estimations for mass, radius and temperature
We use Random Forests to estimate those 3 values in cases where there are missing, because we end up with a lot of unknown planets in our classification.\
Why random forest? They can handle missing values in their training better and are able to understand more complex non-linear relationships. Also they don't need as thorough hyperparamter tuning as other possbile methods like gradient boosting.

In [76]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build a regression model for each of the 3 target variables
def estimate_missing_values(df):
    target_columns = ['pl_rade', 'pl_bmasse', 'pl_eqt']
    df_estimated = df.copy()

    for target in target_columns:
        train_data = df.drop(columns=["pl_name", "hostname", "disc_year", "discoverymethod"])

        # Removes all column that misses the specific target value
        train_data = train_data.dropna(subset=[target])

        X = train_data.drop(columns=target)
        y = train_data[target]

        # Split into train and test sets
        X_train , X_test , y_train , y_test = train_test_split(X.values , y.values , test_size=0.2 , random_state=42)

        # Train the model
        rf_model = RandomForestRegressor(random_state=42)

        param_grid = {
            'n_estimators': [100, 200, 500],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }

        gs = GridSearchCV(estimator=rf_model,
                                   param_grid=param_grid,
                                   cv=5,
                                   scoring='neg_mean_squared_error',
                                   verbose=0)

        gs.fit(X_train , y_train)

        y_pred = gs.predict(X_test)

        ################# EVALUATION ##################################
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        mean_y_test = np.mean(y_test)
        var_y_test = np.var(y_test)
        range_y_test = np.max(y_test) - np.min(y_test)

        nmae = mae / mean_y_test
        nmse = mse / var_y_test
        nrmse = rmse / range_y_test

        print(f"Evaluating model for {target}:")
        print(f"MAE: {mae} | Normalized MAE: {nmae}")
        print(f"MSE: {mse} | Normalized MSE: {nmse}")
        print(f"RMSE: {rmse} | Normalized RMSE: {nrmse}")
        ###############################################################

        # Predictions
        missing_data = df_estimated[df_estimated[target].isna()]
        X_missing = missing_data.drop(columns=[target, "pl_name", "hostname", "disc_year", "discoverymethod"]).values

        predicted_values = gs.predict(X_missing)
        df_estimated.loc[df_estimated[target].isna(), target] = predicted_values


    return df_estimated

nasa = estimate_missing_values(nasa)
display(nasa)
print(nasa[['pl_rade', 'pl_bmasse', 'pl_eqt']].isnull().sum())

Evaluating model for pl_rade:
MAE: 1.38929612401577 | Normalized MAE: 0.34006157235102075
MSE: 101.3332446810597 | Normalized MSE: 5.370172594107446
RMSE: 10.066441510338183 | Normalized RMSE: 0.45524789753700184
Evaluating model for pl_bmasse:
MAE: 69.21371333665674 | Normalized MAE: 0.1435821911417189
MSE: 43569.74229834027 | Normalized MSE: 0.060932577875316786
RMSE: 208.73366354840869 | Normalized RMSE: 0.042925425134943615
Evaluating model for pl_eqt:
MAE: 65.26248026449947 | Normalized MAE: 0.07188663810378945
MSE: 14192.636122527825 | Normalized MSE: 0.07235485904788312
RMSE: 119.13285072778132 | Normalized RMSE: 0.03529862243786113


Unnamed: 0,pl_name,hostname,pl_orbper,pl_orbsmax,pl_orbeccen,pl_rade,pl_bmasse,pl_masse,pl_dens,pl_insol,...,st_rad,st_mass,st_lum,st_met,st_logg,sy_dist,sy_plx,sy_dist.1,discoverymethod,disc_year
0,14 Her b,14 Her,1765.038900,2.77400,0.3730,20.358294,2559.47216,2559.47216,,,...,1.00,0.91,-0.153,0.405,4.43,17.9323,55.73630,17.9323,Radial Velocity,2002
1,16 Cyg B b,16 Cyg B,799.450000,1.67600,0.6832,9.355590,556.83537,,,,...,1.16,0.98,0.097,0.074,4.30,21.1397,47.27540,21.1397,Radial Velocity,1996
2,1RXS J160929.1-210524 b,1RXS J160929.1-210524,,330.00000,,18.647000,4000.00000,4000.00000,,,...,1.31,0.85,-0.370,,4.00,139.1350,7.15949,139.1350,Imaging,2008
3,47 UMa b,47 UMa,1076.600000,2.05900,0.0160,9.095527,774.86566,,,,...,1.14,1.01,0.198,0.026,4.33,13.7967,72.45280,13.7967,Radial Velocity,1996
4,51 Peg b,51 Peg,4.230797,0.05235,0.0042,10.930067,147.47238,146.20180,,,...,1.19,1.07,0.136,0.206,4.32,15.4614,64.64880,15.4614,Radial Velocity,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3659,pi Men c,HD 39091,6.267829,0.06900,0.0770,2.042000,3.49611,3.63000,2.97,309.0,...,1.10,1.02,0.160,0.050,4.36,18.2702,54.70520,18.2702,Transit,2018
3660,tau Boo b,tau Boo,3.312453,0.04869,0.0074,11.985636,1366.66215,1891.00000,,,...,1.44,1.40,0.505,0.272,4.26,15.6521,63.86380,15.6521,Radial Velocity,1996
3661,ups And b,ups And,4.617122,0.05914,0.0069,9.650665,214.53417,,,,...,1.62,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1996
3662,ups And c,ups And,241.223000,0.82650,0.2660,12.212606,624.53282,4443.24113,,,...,1.62,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1999


pl_rade      0
pl_bmasse    0
pl_eqt       0
dtype: int64


In [77]:
display(nasa.loc[to_be_estimated, ['pl_name', 'pl_rade', 'pl_bmasse', 'pl_eqt']])

Unnamed: 0,pl_name,pl_rade,pl_bmasse,pl_eqt
0,14 Her b,20.358294,2559.47216,984.835584
1,16 Cyg B b,9.355590,556.83537,857.949457
3,47 UMa b,9.095527,774.86566,692.253856
4,51 Peg b,10.930067,147.47238,1108.755480
5,55 Cnc b,9.232873,267.29369,700.000000
...,...,...,...,...
3658,kap CrB b,12.545235,483.73484,1044.416857
3660,tau Boo b,11.985636,1366.66215,1408.490067
3661,ups And b,9.650665,214.53417,1301.722780
3662,ups And c,12.212606,624.53282,1034.315653


In [78]:
nasa.to_csv('../data/nasa_estimated.csv', index=False)