# Test 3 on Kepler and TESS data

In [64]:
import pandas as pd
import sys
import os

# Added paths to import modules
sys.path.insert(0, os.path.abspath('../'))

# import custom modules
from data_processing.distribution import (plot_class_distribution, plot_feature_importances,
                                          plot_hist_feature_distributions,
                                          plot_hist_feature_distributions_0_1,
                                          compute_train_0_1)
from data_processing.data_analysis import (compute_all_columns_nan, print_nan_numbers_for_features,
                                           threshold_delete_nan)
from data_processing.data_scaling import plot_top_7_difference, data_scaling_normalization
from data_processing.features_prosessing import (remove_non_numeric_columns, rows_id_nan,
                                                 remove_nan_label, feature_processing_kepler_tess)
from data_processing.knn_imputer import k_nearest_neighbors_imputer
from data_processing.replace_disposition import replace_label
from dimensionality_reduction.features_selection import compute_feature_importance, feature_selection_rfc
from model_selection.grid_search import grid_search_param_optimization
from utils.mission import Mission
from utils.util import print_count_nan, print_feature_importance

# Read Data

Read the CSV files taken from:
1. Kepler: NASA Exoplanet Archive  http://exoplanetarchive.ipac.caltech.edu
2. TESS: ExoFOP https://exofop.ipac.caltech.edu/tess/view_toi.php

In [65]:
# Load the two raw catalogues; paths are relative to the notebook's directory.
kepler_data, tess_data = (
    pd.read_csv('../data/raw_data/cumulative_2023.11.04_08.48.13.csv'),
    pd.read_csv('../data/raw_data/tess_exofop.csv'),
)

# Report the dimensions (rows, columns) of each raw table.
print('Shape of Kepler Cumulative KOI', kepler_data.shape)
print('Shape of ExoFOP TESS data:', tess_data.shape)

# Report how many cells are NaN in each table; the helper's return value is
# bound to `_` because it is not needed here.
_ = print_count_nan(data=kepler_data, name='Kepler Cumulative KOI')
_ = print_count_nan(data=tess_data, name='TESS')

Shape of Kepler Cumulative KOI (9564, 141)
Shape of ExoFOP TESS data: (6977, 62)
Number of NaN values in Kepler Cumulative KOI: 237116 out of 1348524: 17.58%
Number of NaN values in TESS: 38201 out of 432574: 8.83%


# Replace Label

Initially, the disposition column of each dataset contains three or more classes.

Kepler:
1. CONFIRMED
2. CANDIDATE
3. FALSE POSITIVE

To build a binary classifier, the Kepler dispositions are replaced as follows:
1. CONFIRMED, CANDIDATE with the label: 1
2. FALSE POSITIVE with the label: 0

TESS: TFOPWG Disposition
1. APC = Ambiguous Planetary Candidate
2. CP = Confirmed Planet
3. FA = False Alarm
4. FP = False Positive
5. KP = Known Planet
6. PC = Planetary Candidate

To build a binary classifier, the TESS dispositions are replaced as follows:
1. KP, CP, PC with the label: 1
2. APC, FA, FP with the label: 0

NASA Exoplanet Archive documentation: https://exoplanetarchive.ipac.caltech.edu/docs/API_TOI_columns.html 

ExoFOP documentation: https://exofop.ipac.caltech.edu/tess/tsm.php

In [66]:
# Map each mission's disposition to a binary label (1 = planet-like, 0 = false
# positive), following the mapping described in the markdown above.
# Both frames are rebound in place, so re-running this cell feeds already
# processed data back into replace_label.
kepler_data = replace_label(data=kepler_data, mission=Mission.KEPLER)
tess_data = replace_label(data=tess_data, mission=Mission.TESS)
# Print the shapes after relabelling so they can be compared with the raw shapes.
print('Shape of kepler data: ', kepler_data.shape)
print('Shape of TESS data: ', tess_data.shape)

Shape of kepler data:  (9564, 141)
Shape of TESS data:  (6977, 62)


# Feature and Label Processing

The Kepler and TESS data have different features, so we want to find the intersection between the two feature sets. To do this, we perform several operations:

1. Removing non-numeric features
2. Removing identifying and follow-up features
3. Separating the features from the labels (X_train, y_train)
4. Removing the rows whose label in y_train is NaN, together with the corresponding X_train rows (if there are any)

In [67]:
# Build one feature matrix (X_train) and one label vector (y_train) from the two
# missions. The helper is imported from data_processing.features_prosessing and
# prints a per-mission NaN summary while it runs.
X_train, y_train = feature_processing_kepler_tess(kepler_data=kepler_data, tess_data=tess_data)


After the processing:
Number of NaN values in TESS   : 22338 out of 181402: 12.31%
Number of NaN values in Kepler : 7219 out of 229536: 3.15%


In [68]:
# Sanity check on the combined data: X_train and y_train should have the same
# number of rows. The labels are also checked for NaN, which are removed in the
# next step.
print('Shape of X_train data: ', X_train.shape)
print('Shape of y_train data: ', y_train.shape)
_ = print_count_nan(data=y_train, name='y_train')

Shape of X_train data:  (16541, 26)
Shape of y_train data:  (16541,)
Number of NaN values in y_train: 98 out of 16541: 0.59%


In [69]:
# Remove the observations whose label is NaN, dropping the same rows from X_train.
X_train, y_train = remove_nan_label(X_train, y_train)

# Fail fast if the helper left a missing label behind or broke the row alignment
# (this replaces a NaN count that was computed here but never used).
assert y_train.isna().sum() == 0, 'y_train still contains NaN labels'
assert X_train.shape[0] == y_train.shape[0], 'X_train and y_train have different lengths'

# Report the resulting shapes and the remaining NaN in features and labels.
print('Shape of X_train data: ', X_train.shape)
print('Shape of y_train data: ', y_train.shape)
_ = print_count_nan(data=X_train, name='X_train')
_ = print_count_nan(data=y_train, name='y_train')

Shape of X_train data:  (16443, 26)
Shape of y_train data:  (16443,)
Number of NaN values in X_train: 48359 out of 427518: 11.31%
Number of NaN values in y_train: 0 out of 16443: 0.00%


In [70]:
# Delete columns
# NOTE(review): these look like sky-position and transit-epoch columns, presumably
# dropped because they are not expected to help the classifier — confirm.
name_columns_to_delete = ['RA', 'Dec', 'Epoch (BJD)', 'Epoch (BJD) err']
# drop() raises KeyError if any of the listed columns is missing from X_train.
X_train = X_train.drop(columns=name_columns_to_delete)

# Data Scaling

Although it is not a strict prerequisite for machine learning models, data normalization is typically applied in order to:

1. Standardize the range of values of all the features in the dataset
2. Improve the numerical robustness of the algorithms used.

In [71]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

In [72]:
# scaler = Normalizer()
# X_train_normalized = scaler.fit_transform(X_train)
# X_train = pd.DataFrame(X_train_normalized, columns=X_train.columns)

In [73]:
# scaler = PowerTransformer()
# X_train_normalized = scaler.fit_transform(X_train)
# X_train = pd.DataFrame(X_train_normalized, columns=X_train.columns)

In [74]:
# Seed for the transformer: QuantileTransformer draws a random subsample of the
# rows when estimating the quantiles of a large dataset, so without a fixed seed
# the scaled values can change between runs.
SCALER_RANDOM_STATE = 42

# Map every feature onto a uniform distribution.
scaler = QuantileTransformer(output_distribution='uniform', random_state=SCALER_RANDOM_STATE)
X_train_normalized = scaler.fit_transform(X_train)

# fit_transform returns a bare array: rebuild the DataFrame with the original
# column names AND the original index, so that the rows stay aligned with
# y_train even if X_train's index is not a default 0..n-1 range.
X_train = pd.DataFrame(X_train_normalized, columns=X_train.columns, index=X_train.index)

# K-Nearest Neighbors

Missing values (NaN) must be handled carefully. Several solutions exist; one of them is K-Nearest Neighbors (KNN) imputation, which computes a distance (e.g. Euclidean or Manhattan) between the observations and derives the value to insert from the K nearest observations using a weighting scheme (e.g. IDWM or IRWM). K is fixed a priori: a common choice is the square root of N, where N is the number of observations, or an approximation of it.

In this case, for efficiency reasons, the optimized sklearn KNNImputer module was used.

In [75]:
# State of X_train after scaling: total NaN count and shape, used as the
# baseline for the column/row pruning that follows.
_ = print_count_nan(data=X_train, name='X_train')
print('Shape of X_train:', X_train.shape)

Number of NaN values in X_train: 47905 out of 361746: 13.24%
Shape of X_train: (16443, 22)


In [76]:
# Columns of X_train that contain only NaN
nan_columns = compute_all_columns_nan(data=X_train)

# NaN count of every column, ordered from the most to the least incomplete
# feature (ties keep the original column order)
number_of_nan_columns = dict(
    sorted(X_train.isna().sum().items(), key=lambda pair: pair[1], reverse=True)
)

print_nan_numbers_for_features(data=X_train, number_of_nan_columns=number_of_nan_columns)

Columns containing only not a number in X_train: []
Number of Observations: 16443
NaN for each feature
  1: Stellar Radius (R_Sun) err----> 11281
  2: Stellar Radius (R_Sun)--------> 10031
  3: Stellar Metallicity err-------> 6048
  4: Stellar Metallicity-----------> 6047
  5: Stellar log(g) (cm/s^2) err---> 2469
  6: Stellar Mass (M_Sun) err------> 2342
  7: Planet Radius (R_Earth) err---> 1819
  8: Stellar Mass (M_Sun)----------> 1278
  9: Stellar log(g) (cm/s^2)-------> 1152
 10: Stellar Eff Temp (K) err------> 889
 11: Planet Radius (R_Earth)-------> 830
 12: Planet Equil Temp (K)---------> 642
 13: Stellar Eff Temp (K)----------> 493
 14: Planet Insolation (Earth Flux)> 476
 15: Duration (hours) err----------> 467
 16: Depth (ppm) err---------------> 460
 17: Period (days) err-------------> 454
 18: Depth (ppm)-------------------> 363
 19: Planet SNR--------------------> 363
 20: TESS Mag----------------------> 1
 21: Period (days)-----------------> 0
 22: Duration (hours)--------

In [77]:
# Column threshold expressed as an absolute number of NaN values.
# NOTE(review): presumably columns with more NaN than TCOL are added to the list
# of columns to delete — confirm the exact comparison in threshold_delete_nan.
TCOL = 2400
nan_columns = threshold_delete_nan(
    number_of_nan_columns=number_of_nan_columns,
    nan_columns_name=nan_columns,
    threshold=TCOL,
)
X_train = X_train.drop(columns=nan_columns)

# Bind the helper's return value to `_`, as every other call in this notebook does.
_ = print_count_nan(data=X_train, name='X_train')
print('Shape of X_train:', X_train.shape)

Number of NaN values in X_train: 12029 out of 279531: 4.30%
Shape of X_train: (16443, 17)


In [78]:
# Compute the number of NaN values in each row
number_of_nan_rows = X_train.isna().sum(axis=1)
# NOTE(review): number_of_nan_rows holds integer counts. If rows_id_nan compares
# them directly with TROW, a threshold of 0.5 selects every row that has at least
# one NaN. The captured output is consistent with that (0 NaN are left after the
# drop), which would leave nothing for the KNN imputer to fill. If TROW is meant
# to be a fraction of the columns, confirm how rows_id_nan interprets it.
TROW = 0.5
# Identifiers of the rows to delete; they are used as index labels by the drop
# in the next cell.
id_rows = rows_id_nan(number_of_nan_rows=number_of_nan_rows,
                      X_train=X_train, threshold=TROW)

Total rows >= of T=0.5: 2645 out of a total of 16443 By eliminating them you obtain 13798 observations


In [79]:
# Drop the rows selected by the threshold (rows >= T) from both the features and
# the labels. id_rows is interpreted as index labels, so X_train and y_train must
# share the same index for the same observations to be removed from both.
# Both indexes are then renumbered from 0 so they stay aligned afterwards.
X_train = X_train.drop(id_rows).reset_index(drop=True)
y_train = y_train.drop(id_rows).reset_index(drop=True)

In [80]:
# Shape and remaining NaN count of X_train after the row pruning.
print('Shape of X_train:', X_train.shape)
_ = print_count_nan(data=X_train, name='X_train')

Shape of X_train: (13798, 17)
Number of NaN values in X_train: 0 out of 234566: 0.00%


In [81]:
# NOTE(review): presumably selects the K used by the imputer — confirm how
# index_of_k is interpreted in data_processing.knn_imputer.
INDEX_OF_K = 10
# Using KNNImputer
# In the captured run the helper reports "There are no NaN": X_train reaches this
# cell with no missing values left.
X_train = k_nearest_neighbors_imputer(X_train=X_train, index_of_k=INDEX_OF_K)
# Per-column NaN counts after imputation; their total is expected to be 0.
count = X_train.isna().sum()
print('Number of not a number in X_train is:', count.sum())
print('Shape of X_train data: ', X_train.shape)
print('Shape of y_train data: ', y_train.shape)

Number of NaN values in X_train: 0 out of 234566: 0.00%
There are no NaN
Number of not a number in X_train is: 0
Shape of X_train data:  (13798, 17)
Shape of y_train data:  (13798,)


In [82]:
# Final shapes of the data that is about to be saved (same check as the one at
# the end of the imputation cell).
print('Shape of X_train data: ', X_train.shape)
print('Shape of y_train data: ', y_train.shape)

Shape of X_train data:  (13798, 17)
Shape of y_train data:  (13798,)


In [83]:
# Save processed data
# to_csv is called without index=False, so the row index is written as the first
# column of each file; read them back with index_col=0 to avoid an extra
# unnamed column.
X_train.to_csv('../data/processed_data/X_kepler_tess_quantilet.csv')
y_train.to_csv('../data/processed_data/y_kepler_tess_quantilet.csv')