# 1.1. Comparison with Tasker et al. 2020

Here we run the data imputation algorithms on the same dataset as Tasker et al. 2020 (hereafter TLG2020): 550 planets with a fixed train/test split (400 training planets, 150 test planets).  
We treat the transit case and the RV case separately, and save the results of each.

In [74]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor

from knnxkde import KNNxKDE
from utils import normalization, renormalization, compute_epsilon
from GAIN.gain import gain

In [3]:
# Pre-computed distribution arrays, one file per mask. The later cells index
# them as [test planet, sample, feature] with feature 0 = radius, 1 = mass.
# NOTE(review): presumably the sampled posteriors released with TLG2020 -- confirm provenance.
rv_dists_path = 'data/mdists__mask[001111]Ndist2000_2018-08-23_mcsteps3000_loss-0.32_multitrain0_0.npy'
transit_dists_path = 'data/mdists__mask[101111]Ndist2000_2018-08-23_mcsteps3000_loss-0.32_multitrain0_0.npy'

rv_dists = np.load(rv_dists_path)
transit_dists = np.load(transit_dists_path)

In [4]:
# Column labels for the catalogue; the file's own first row is skipped and
# these names are used instead.
colnames = ['idx', 'pl_name', 'pl_radj', 'pl_bmassj', 'pl_orbper', 'pl_teq', 'pl_pnum', 'st_mass']
all_planets = pd.read_csv('data/NEA_radmasstpersmasspnum.csv', names=colnames, sep=',', skiprows=1)

# Seeded shuffle of the row numbers: the first 400 entries are the training
# planets, everything after position 400 is the held-out test set.
# Original author's note: checked to be "the very same" permutation
# (presumably the one used by TLG2020).
rs = np.random.RandomState(12345)
dataidx = rs.permutation(all_planets.shape[0])

# Planet names of each subset, selected by row label.
train_names = all_planets.loc[dataidx[:400], 'pl_name']
test_names = all_planets.loc[dataidx[400:], 'pl_name']

# Ground-truth mass and radius of the test planets, in held-out order.
true_masses = np.array(all_planets['pl_bmassj'])[dataidx[400:]]
true_radii = np.array(all_planets['pl_radj'])[dataidx[400:]]

## A. Transit Case

In [5]:
# Feature matrix; column order: radius, mass, orbital period, T_eq,
# planet count, stellar mass.
original_data = np.array(all_planets[['pl_radj', 'pl_bmassj', 'pl_orbper', 'pl_teq', 'pl_pnum', 'st_mass']])

# Natural log of every column except pl_pnum (column 4), which is left as-is.
log_columns = [0, 1, 2, 3, 5]
log_original_data = np.copy(original_data)
log_original_data[:, log_columns] = np.log(original_data[:, log_columns])

# Transit case: only the mass (column 1) of the held-out planets is hidden.
miss_data = np.copy(log_original_data)
miss_data[dataidx[400:], 1] = np.nan

# Normalisation parameters are derived from the masked matrix and then
# re-applied to the complete one.
norm_miss_data, norm_params = normalization(miss_data)
norm_log_original_data, _ = normalization(log_original_data, parameters=norm_params)

In [6]:
# Hyper-parameters used by the imputers in the cells below.
MY_TAU = 1.0 / 50.0  # kNNxKDE: tau
MY_NB_NEIGH = 20  # kNNxKDE: nb_neigh
MY_NB_NEIGHBOURS = 15  # KNNImputer: n_neighbors
MY_NB_TREES = 20  # ExtraTreesRegressor: n_estimators
MY_NB_ITERS = 2500  # GAIN: iterations

# One zero-filled placeholder (150 test planets) per imputation method;
# each entry is overwritten once the corresponding method has run.
imputed_masses = {
    method: np.zeros(150)
    for method in ('rbm_TLG2020', 'knnxkde', 'knnimputer', 'missforest', 'gain', 'mice', 'mean')
}

In [7]:
# Reproduce the TLG2020 epsilon: each test planet's point estimate is the mode
# of its sampled mass distribution, read off a log-spaced histogram.
bins = 500
logbins = np.logspace(np.log10(0.0001), np.log10(100.0), num=bins)  # bin edges, in M_J
bin_centres = (logbins[:-1] + logbins[1:]) / 2.0  # arithmetic midpoint of every bin

# For each of the 150 test planets, take the centre of the most populated bin
# (feature 1 of the distribution array is the mass).
rbm_masses = [
    bin_centres[np.argmax(np.histogram(transit_dists[planet, :, 1], bins=logbins, density=False)[0])]
    for planet in range(150)
]
imputed_masses['rbm_TLG2020'] = np.array(rbm_masses)

eps = compute_epsilon(imputed_masses['rbm_TLG2020'], true_masses)
print(f'TLG2020 epsilon = {eps:.4f}')

TLG2020 epsilon = 0.9803


In [None]:
# Transit case: run every imputer on the same masked, normalised matrix, undo
# the normalisation, and keep exp(column 1) -- the mass in linear units -- for
# the held-out planets.
#
# The seed is pinned so that Restart & Run All reproduces the saved numbers;
# without it the tree-based and sampling-based imputers change on every run.
IMPUTATION_SEED = 12345  # arbitrary, but fixed

# --- kNNxKDE (project-local) ---
# NOTE(review): seeding NumPy's global RNG only makes this repeatable if
# KNNxKDE draws from np.random -- confirm in its source.
np.random.seed(IMPUTATION_SEED)
knnxkde = KNNxKDE(h=0.05, tau=MY_TAU, nb_neigh=MY_NB_NEIGH, metric='nan_std_eucl')
norm_imputed_data = knnxkde.impute_mean(norm_miss_data, nb_draws=1000)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['knnxkde'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- scikit-learn KNNImputer (no random component to seed) ---
knnimputer = KNNImputer(n_neighbors=MY_NB_NEIGHBOURS)
norm_imputed_data = knnimputer.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['knnimputer'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- MissForest-style: IterativeImputer around randomised trees ---
# The tree ensemble is the random part, so the estimator itself gets the seed
# in addition to the imputer.
estimator = ExtraTreesRegressor(n_estimators=MY_NB_TREES, random_state=IMPUTATION_SEED)
missforest = IterativeImputer(
    estimator=estimator, max_iter=10, tol=2e-1, verbose=0, random_state=IMPUTATION_SEED
)
norm_imputed_data = missforest.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['missforest'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- GAIN (project-local) ---
# NOTE(review): as for kNNxKDE, this controls only np.random-based draws;
# any randomness GAIN takes from its deep-learning backend is not pinned here.
np.random.seed(IMPUTATION_SEED)
gain_parameters = {'batch_size': 128, 'hint_rate': 0.9, 'alpha': 100, 'iterations': MY_NB_ITERS}
norm_imputed_data = gain(norm_miss_data, gain_parameters)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['gain'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- MICE-style: IterativeImputer around Bayesian ridge regression ---
mice = IterativeImputer(
    estimator=BayesianRidge(), max_iter=10, tol=2e-1, verbose=0, random_state=IMPUTATION_SEED
)
norm_imputed_data = mice.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['mice'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- Column-mean baseline ---
mean_imputer = SimpleImputer(strategy='mean')
norm_imputed_data = mean_imputer.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_masses['mean'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

In [16]:
# Persist the transit-case estimates (dict: method name -> array of test-planet masses).
# The output folder is created first so the cell also works on a fresh checkout,
# where opening the file for writing would otherwise raise FileNotFoundError.
out_dir = Path('results_and_figures/comparison_tlg2020')
out_dir.mkdir(parents=True, exist_ok=True)
with open(out_dir / 'transit_case_imputed_masses.pkl', 'wb') as f:
    pickle.dump(imputed_masses, f)

## B. RV Case

In [67]:
# Feature matrix; column order: radius, mass, orbital period, T_eq,
# planet count, stellar mass.
original_data = np.array(all_planets[['pl_radj', 'pl_bmassj', 'pl_orbper', 'pl_teq', 'pl_pnum', 'st_mass']])

# Natural log of every column except pl_pnum (column 4), which is left as-is.
log_columns = [0, 1, 2, 3, 5]
log_original_data = np.copy(original_data)
log_original_data[:, log_columns] = np.log(original_data[:, log_columns])

# RV case: both the radius (column 0) and the mass (column 1) of the
# held-out planets are hidden.
miss_data = np.copy(log_original_data)
miss_data[dataidx[400:], 0] = np.nan
miss_data[dataidx[400:], 1] = np.nan

# Normalisation parameters are derived from the masked matrix and then
# re-applied to the complete one.
norm_miss_data, norm_params = normalization(miss_data)
norm_log_original_data, _ = normalization(log_original_data, parameters=norm_params)

In [68]:
# Hyper-parameters used by the imputers in the cells below.
MY_TAU = 1.0 / 50.0  # kNNxKDE: tau
MY_NB_NEIGH = 20  # kNNxKDE: nb_neigh
MY_NB_NEIGHBOURS = 15  # KNNImputer: n_neighbors
MY_NB_TREES = 20  # ExtraTreesRegressor: n_estimators
MY_NB_ITERS = 2500  # GAIN: iterations

# Zero-filled placeholders (150 test planets) per imputation method, one
# independent set for masses and one for radii; each entry is overwritten
# once the corresponding method has run.
method_names = ('rbm_TLG2020', 'knnxkde', 'knnimputer', 'missforest', 'gain', 'mice', 'mean')
imputed_masses = {method: np.zeros(150) for method in method_names}
imputed_radii = {method: np.zeros(150) for method in method_names}

In [69]:
# Reproduce the TLG2020 epsilon: each test planet's point estimate is the mode
# of its sampled distribution, read off a log-spaced histogram.
bins = 500
logbins_radii = np.logspace(np.log10(0.01), np.log10(10.0), num=bins)  # bin edges, in R_J
logbins_masses = np.logspace(np.log10(0.0001), np.log10(100.0), num=bins)  # bin edges, in M_J

# Arithmetic midpoint of every bin.
centres_radii = (logbins_radii[:-1] + logbins_radii[1:]) / 2.0
centres_masses = (logbins_masses[:-1] + logbins_masses[1:]) / 2.0

# For each of the 150 test planets, take the centre of the most populated bin
# (feature 0 of the distribution array is the radius, feature 1 the mass).
rbm_radii = [
    centres_radii[np.argmax(np.histogram(rv_dists[planet, :, 0], bins=logbins_radii, density=False)[0])]
    for planet in range(150)
]
rbm_masses = [
    centres_masses[np.argmax(np.histogram(rv_dists[planet, :, 1], bins=logbins_masses, density=False)[0])]
    for planet in range(150)
]
imputed_radii['rbm_TLG2020'] = np.array(rbm_radii)
imputed_masses['rbm_TLG2020'] = np.array(rbm_masses)

eps_mass = compute_epsilon(imputed_masses['rbm_TLG2020'], true_masses)
eps_rad = compute_epsilon(imputed_radii['rbm_TLG2020'], true_radii)
print(f'TLG2020 radius epsilon = {eps_rad:.4f}')
print(f'TLG2020 mass epsilon = {eps_mass:.4f}')

TLG2020 radius epsilon = 0.5441
TLG2020 mass epsilon = 1.2697


In [70]:
# RV case: run every imputer on the same masked, normalised matrix, undo the
# normalisation, and keep exp(column 0) and exp(column 1) -- radius and mass
# in linear units -- for the held-out planets.
#
# The seed is pinned so that Restart & Run All reproduces the saved numbers;
# without it the tree-based and sampling-based imputers change on every run.
IMPUTATION_SEED = 12345  # arbitrary, but fixed

# --- kNNxKDE (project-local) ---
# NOTE(review): seeding NumPy's global RNG only makes this repeatable if
# KNNxKDE draws from np.random -- confirm in its source.
np.random.seed(IMPUTATION_SEED)
knnxkde = KNNxKDE(h=0.05, tau=MY_TAU, nb_neigh=MY_NB_NEIGH, metric='nan_std_eucl')
norm_imputed_data = knnxkde.impute_mean(norm_miss_data, nb_draws=1000)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['knnxkde'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['knnxkde'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- scikit-learn KNNImputer (no random component to seed) ---
knnimputer = KNNImputer(n_neighbors=MY_NB_NEIGHBOURS)
norm_imputed_data = knnimputer.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['knnimputer'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['knnimputer'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- MissForest-style: IterativeImputer around randomised trees ---
# The tree ensemble is the random part, so the estimator itself gets the seed
# in addition to the imputer.
estimator = ExtraTreesRegressor(n_estimators=MY_NB_TREES, random_state=IMPUTATION_SEED)
missforest = IterativeImputer(
    estimator=estimator, max_iter=10, tol=2e-1, verbose=0, random_state=IMPUTATION_SEED
)
norm_imputed_data = missforest.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['missforest'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['missforest'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- GAIN (project-local) ---
# NOTE(review): as for kNNxKDE, this controls only np.random-based draws;
# any randomness GAIN takes from its deep-learning backend is not pinned here.
np.random.seed(IMPUTATION_SEED)
gain_parameters = {'batch_size': 128, 'hint_rate': 0.9, 'alpha': 100, 'iterations': MY_NB_ITERS}
norm_imputed_data = gain(norm_miss_data, gain_parameters)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['gain'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['gain'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- MICE-style: IterativeImputer around Bayesian ridge regression ---
mice = IterativeImputer(
    estimator=BayesianRidge(), max_iter=10, tol=2e-1, verbose=0, random_state=IMPUTATION_SEED
)
norm_imputed_data = mice.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['mice'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['mice'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

# --- Column-mean baseline ---
mean_imputer = SimpleImputer(strategy='mean')
norm_imputed_data = mean_imputer.fit_transform(norm_miss_data)
renorm_imputed_data = renormalization(norm_imputed_data, norm_params)
imputed_radii['mean'] = np.exp(renorm_imputed_data[dataidx[400:], 0])
imputed_masses['mean'] = np.exp(renorm_imputed_data[dataidx[400:], 1])

2023-07-14 11:57:11.688406: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-14 11:57:11.701920: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled


In [71]:
# Persist the RV-case estimates (dicts: method name -> array of test-planet values).
# The output folder is created first so the cell also works on a fresh checkout,
# where opening the files for writing would otherwise raise FileNotFoundError.
out_dir = Path('results_and_figures/comparison_tlg2020')
out_dir.mkdir(parents=True, exist_ok=True)
with open(out_dir / 'rv_case_imputed_radii.pkl', 'wb') as f:
    pickle.dump(imputed_radii, f)
with open(out_dir / 'rv_case_imputed_masses.pkl', 'wb') as f:
    pickle.dump(imputed_masses, f)