In [40]:
# imports and stuff
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Load the fully aggregated planet table written by the preproc1 notebook
nasa = pd.read_csv('../data/nasa_aggregated.csv')

# Render the frame first, then the plain array of its column names
display(nasa, nasa.columns.values)

Unnamed: 0,pl_name,hostname,pl_orbper,pl_orbsmax,pl_orbeccen,pl_rade,pl_bmasse,pl_masse,pl_dens,pl_insol,...,st_mass,st_lum,st_met,st_logg,sy_dist,sy_plx,sy_dist.1,discoverymethod,disc_year,pl_type
0,14 Her b,14 Her,1765.038900,2.77400,0.3730,,2559.47216,2559.47216,,,...,0.91,-0.153,0.405,4.43,17.9323,55.73630,17.9323,Radial Velocity,2002,Unknown
1,16 Cyg B b,16 Cyg B,799.450000,1.67600,0.6832,,556.83537,,,,...,0.98,0.097,0.074,4.30,21.1397,47.27540,21.1397,Radial Velocity,1996,Unknown
2,1RXS J160929.1-210524 b,1RXS J160929.1-210524,,330.00000,,18.647,4000.00000,4000.00000,,,...,0.85,-0.370,,4.00,139.1350,7.15949,139.1350,Imaging,2008,Gas Giant
3,47 UMa b,47 UMa,1076.600000,2.05900,0.0160,,774.86566,,,,...,1.01,0.198,0.026,4.33,13.7967,72.45280,13.7967,Radial Velocity,1996,Unknown
4,51 Peg b,51 Peg,4.230797,0.05235,0.0042,,147.47238,146.20180,,,...,1.07,0.136,0.206,4.32,15.4614,64.64880,15.4614,Radial Velocity,1995,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3659,pi Men c,HD 39091,6.267829,0.06900,0.0770,2.042,3.49611,3.63000,2.97,309.0,...,1.02,0.160,0.050,4.36,18.2702,54.70520,18.2702,Transit,2018,Unknown
3660,tau Boo b,tau Boo,3.312453,0.04869,0.0074,,1366.66215,1891.00000,,,...,1.40,0.505,0.272,4.26,15.6521,63.86380,15.6521,Radial Velocity,1996,Unknown
3661,ups And b,ups And,4.617122,0.05914,0.0069,,214.53417,,,,...,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1996,Unknown
3662,ups And c,ups And,241.223000,0.82650,0.2660,,624.53282,4443.24113,,,...,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1999,Unknown


array(['pl_name', 'hostname', 'pl_orbper', 'pl_orbsmax', 'pl_orbeccen',
       'pl_rade', 'pl_bmasse', 'pl_masse', 'pl_dens', 'pl_insol',
       'pl_eqt', 'pl_orbincl', 'st_teff', 'st_rad', 'st_mass', 'st_lum',
       'st_met', 'st_logg', 'sy_dist', 'sy_plx', 'sy_dist.1',
       'discoverymethod', 'disc_year', 'pl_type'], dtype=object)

## Value Estimations for mass, radius and temperature
We use Random Forests to estimate those 3 values in cases where they are missing, because otherwise we end up with a lot of unknown planets in our classification.\
Why random forests? They can handle missing values in their training data better and are able to capture more complex non-linear relationships. Also, they don't need as thorough hyperparameter tuning as other possible methods like gradient boosting.

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Build a regression model for each of the 3 target values
def estimate_missing_values(
    df,
    target_columns=('pl_rade', 'pl_bmasse', 'pl_eqt'),
    non_feature_columns=("pl_name", "hostname", "disc_year", "discoverymethod", "pl_type"),
):
    """Fill missing target values with per-target random forest predictions.

    For every column in ``target_columns`` a ``RandomForestRegressor`` is
    trained on the rows of ``df`` where that target is present, scored on a
    20% hold-out split (metrics are printed), and then used to fill the rows
    where the target is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a copy is filled and returned.
    target_columns : iterable of str, optional
        Columns to impute, processed in the given order.
    non_feature_columns : iterable of str, optional
        Columns that are dropped before modelling and never used as features.
        All remaining columns except the current target become features.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values of every target column filled.

    Notes
    -----
    * Training always uses the original ``df``, while prediction reads the
      features from the progressively filled copy, so a target imputed earlier
      in the loop serves as a feature for later targets.
    * Feature columns are passed on with their NaNs; this relies on the
      installed scikit-learn random forest accepting missing values
      (TODO confirm the minimum version the project pins).
    * NOTE(review): 'pl_masse' and 'sy_dist.1' look like near-duplicates of
      'pl_bmasse' / 'sy_dist' in the displayed rows - confirm they are meant
      to be used as features.
    """
    df_estimated = df.copy()

    # Loop-invariant: the modelling table without the non-feature columns
    model_data = df.drop(columns=list(non_feature_columns))

    for target in target_columns:
        # One shared feature list keeps the training and prediction matrices
        # aligned column by column (both are handed over as bare arrays).
        feature_columns = [col for col in model_data.columns if col != target]

        # Keep only the rows whose target value is known
        train_data = model_data.dropna(subset=[target])

        X = train_data[feature_columns]
        y = train_data[target]

        # Split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X.values, y.values, test_size=0.2, random_state=42
        )

        # Train the model
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)

        y_pred = rf_model.predict(X_test)

        ################# EVALUATION ##################################
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)

        mean_y_test = np.mean(y_test)
        var_y_test = np.var(y_test)
        range_y_test = np.max(y_test) - np.min(y_test)

        # Each error is scaled by a different statistic of the hold-out target
        nmae = mae / mean_y_test
        nmse = mse / var_y_test
        nrmse = rmse / range_y_test

        print(f"Evaluating model for {target}:")
        print(f"MAE: {mae} | Normalized MAE: {nmae}")
        print(f"MSE: {mse} | Normalized MSE: {nmse}")
        print(f"RMSE: {rmse} | Normalized RMSE: {nrmse}")
        ###############################################################

        # Predictions
        missing_mask = df_estimated[target].isna()
        if not missing_mask.any():
            # Nothing to fill (e.g. the cell was re-run on an already imputed
            # frame); predicting on a 0-row array would raise a ValueError.
            continue

        X_missing = df_estimated.loc[missing_mask, feature_columns].values
        df_estimated.loc[missing_mask, target] = rf_model.predict(X_missing)

    return df_estimated

# Impute the three target columns and show the resulting frame
nasa = estimate_missing_values(nasa)
display(nasa)

# Sanity check: count the values still missing in each imputed column
print(nasa.loc[:, ['pl_rade', 'pl_bmasse', 'pl_eqt']].isna().sum())

Evaluating model for pl_rade:
MAE: 1.1739710117302053 | Normalized MAE: 0.28735589284560664
MSE: 21.311838649641814 | Normalized MSE: 1.1294245260434057
RMSE: 4.616474699339509 | Normalized RMSE: 0.2087768948688273
Evaluating model for pl_bmasse:
MAE: 67.70266975940169 | Normalized MAE: 0.14044756741942838
MSE: 43369.061456952586 | Normalized MSE: 0.060651924367838846
RMSE: 208.25239844225706 | Normalized RMSE: 0.0428264544709262
Evaluating model for pl_eqt:
MAE: 65.47382916053019 | Normalized MAE: 0.07211943896488177
MSE: 13574.746992488956 | Normalized MSE: 0.06920482543008198
RMSE: 116.51071621309757 | Normalized RMSE: 0.034521693692769646


Unnamed: 0,pl_name,hostname,pl_orbper,pl_orbsmax,pl_orbeccen,pl_rade,pl_bmasse,pl_masse,pl_dens,pl_insol,...,st_mass,st_lum,st_met,st_logg,sy_dist,sy_plx,sy_dist.1,discoverymethod,disc_year,pl_type
0,14 Her b,14 Her,1765.038900,2.77400,0.3730,19.52448,2559.47216,2559.47216,,,...,0.91,-0.153,0.405,4.43,17.9323,55.73630,17.9323,Radial Velocity,2002,Unknown
1,16 Cyg B b,16 Cyg B,799.450000,1.67600,0.6832,9.76843,556.83537,,,,...,0.98,0.097,0.074,4.30,21.1397,47.27540,21.1397,Radial Velocity,1996,Unknown
2,1RXS J160929.1-210524 b,1RXS J160929.1-210524,,330.00000,,18.64700,4000.00000,4000.00000,,,...,0.85,-0.370,,4.00,139.1350,7.15949,139.1350,Imaging,2008,Gas Giant
3,47 UMa b,47 UMa,1076.600000,2.05900,0.0160,9.45360,774.86566,,,,...,1.01,0.198,0.026,4.33,13.7967,72.45280,13.7967,Radial Velocity,1996,Unknown
4,51 Peg b,51 Peg,4.230797,0.05235,0.0042,10.80028,147.47238,146.20180,,,...,1.07,0.136,0.206,4.32,15.4614,64.64880,15.4614,Radial Velocity,1995,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3659,pi Men c,HD 39091,6.267829,0.06900,0.0770,2.04200,3.49611,3.63000,2.97,309.0,...,1.02,0.160,0.050,4.36,18.2702,54.70520,18.2702,Transit,2018,Unknown
3660,tau Boo b,tau Boo,3.312453,0.04869,0.0074,12.04660,1366.66215,1891.00000,,,...,1.40,0.505,0.272,4.26,15.6521,63.86380,15.6521,Radial Velocity,1996,Unknown
3661,ups And b,ups And,4.617122,0.05914,0.0069,9.80055,214.53417,,,,...,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1996,Unknown
3662,ups And c,ups And,241.223000,0.82650,0.2660,12.73619,624.53282,4443.24113,,,...,1.29,0.525,0.122,4.13,13.4054,74.57110,13.4054,Radial Velocity,1999,Unknown


pl_rade      0
pl_bmasse    0
pl_eqt       0
dtype: int64
