# Introduction

## Dataset review

We start by checking the datasets available:

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so the CSV below can be read.
drive.mount("/content/drive")

# Despite the "_url" suffix, this is a filesystem path on the mounted drive.
koi_url = (
    "/content/drive/My Drive/Nasa Space Apps Challenge/datos/"
    "cumulative_2025.09_28_15_59_12.csv"
)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Columns to read from the KOI CSV. The label is listed first, followed by the
# transit and stellar parameters used as features. Commented-out entries are
# columns that were considered but are currently excluded.
usecols = [
    "koi_disposition",  # Exoplanet Archive Disposition (our label)
    # "koi_score",        # Disposition Score
    "koi_period",       # Orbital Period [days]
    "koi_time0bk",      # Transit Epoch [BKJD]
    "koi_impact",       # Impact Parameter
    "koi_duration",     # Transit Duration [hrs]
    "koi_prad",         # Planetary Radius [Earth radii]
    "koi_depth",        # Transit Depth [ppm]
    "koi_teq",          # Equilibrium Temperature [K]
    "koi_insol",        # Insolation Flux [Earth flux]
    # "koi_dor",          # Planet-Star Distance over Star Radius
    "koi_model_snr",    # Transit Signal-to-Noise
    "koi_steff",        # Stellar Effective Temperature [K]
    "koi_slogg",        # Stellar Surface Gravity [log10(cm/s**2)]
    "koi_srad",         # Stellar Radius [Solar radii]
    "koi_kepmag",       # Kepler-band [mag]
]
print(f"total of {len(usecols)} columns to analyse")

# read_csv ignores the order of `usecols` and returns the columns in file
# order, so re-select with `[usecols]` to guarantee the layout declared above
# (label in the first position). Rows with any missing value are then dropped.
data = pd.read_csv(koi_url, comment="#", usecols=usecols)[usecols].dropna()

# Encode the label numerically. CANDIDATE gets -1 so that unresolved objects
# can be told apart from the two definitive classes.
data["koi_disposition"] = data["koi_disposition"].map({
    "CANDIDATE": -1,
    "FALSE POSITIVE": 0,
    "CONFIRMED": 1,
})

data.describe()

In [None]:
# Discard unresolved candidates (encoded as -1). What remains is a binary
# label: 0 = FALSE POSITIVE, 1 = CONFIRMED.
filter_mask = data["koi_disposition"] != -1
clean_data = data[filter_mask]

# Split features and label by column name rather than by position, so the
# split stays correct whatever order the columns were loaded in.
X = clean_data.drop(columns="koi_disposition")
y = clean_data["koi_disposition"]

X.shape

In [None]:
# Pairwise correlation between every column of the filtered table (label
# included), rendered as an annotated heatmap.
corr = clean_data.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)

## Joining datasets

We want to merge the TESS and Kepler datasets to obtain more information, so we begin the merging process:

In [None]:
# Locations on the mounted Drive. Note that the two "*_folder" names below hold
# CSV file paths, not directories. The spelling "folter" is kept as-is because
# other cells refer to the variable by this exact name.
base_folder = "/content/drive/MyDrive/Nasa Space Apps Challenge"
kepler_folder = "/".join([base_folder, "NASA_archive", "kepler.csv"])
tess_folder = "/".join([base_folder, "NASA_archive", "tess.csv"])
processed_data_folter = "/".join([base_folder, "processed_data"])

In [None]:
import pandas as pd

# Load the raw Kepler table; lines starting with "#" are treated as comments
# and ignored by the parser.
kepler_df = pd.read_csv(kepler_folder, comment="#")
# Load the raw TESS table, skipping the first line of the file
# (presumably a non-data preamble line -- TODO confirm against the CSV).
tess_df = pd.read_csv(tess_folder, skiprows=1)

In [None]:
# Summary statistics of the Kepler table as loaded (before rows are dropped).
kepler_df.describe()

In [None]:
# Summary statistics of the TESS table as loaded (before rows are dropped).
tess_df.describe()

In [None]:
# Shapes before cleaning (Kepler first, then TESS).
for frame in (kepler_df, tess_df):
    print(frame.shape)

# Remove, in place, every row that has at least one missing value.
for frame in (kepler_df, tess_df):
    frame.dropna(inplace=True)

# Shapes after cleaning, plus the row count the merged dataset will have.
for frame in (kepler_df, tess_df):
    print(frame.shape)
print(f"final length -> {len(kepler_df) + len(tess_df)}")

Transform Kepler dataset

In [None]:
# Reshape the Kepler table into the common schema used for the merged dataset.
# NOTE: this cell is not re-runnable on the same frame -- after the rename
# below, the "kepid" column no longer exists.

# Turn the numeric id into a tagged string such as "KIC <id>".
kepler_df["kepid"] = "KIC " + kepler_df["kepid"].astype(str)

# Positional rename: relies on the CSV providing exactly these nine columns,
# in this order.
kepler_df.columns = [
    "search_id",
    "num_planet",
    "disposition",
    "ror",
    "stellar_mass",
    "ss_gravity",
    "period",
    "duration",
    "transit_epoch",
]

# Shift the epoch by a fixed 2454833-day offset
# (NOTE(review): presumably converts BKJD to BJD -- confirm).
kepler_df["transit_epoch"] = kepler_df["transit_epoch"].add(2454833)

# Keep only the part after the "." of the object name, as an integer.
kepler_df["num_planet"] = (
    kepler_df["num_planet"].str.split(".").str.get(1).astype(int)
)

# Numeric label encoding; any value missing from the mapping becomes NaN.
kepler_mapping = {"FALSE POSITIVE": 0, "CONFIRMED": 1, "CANDIDATE": 2}
kepler_df["disposition"] = kepler_df["disposition"].map(kepler_mapping)

kepler_df.head()

Transform TESS dataset

In [None]:
# Reshape the TESS table into the common schema used for the merged dataset.
# NOTE: this cell is not re-runnable on the same frame -- after the drop and
# rename below, the original column names no longer exist.

# Turn the numeric id into a tagged string such as "TIC <id>".
tess_df["TIC ID"] = "TIC " + tess_df["TIC ID"].astype(str)

# Planet-to-star radius ratio. The two radii are in different units, so the
# quotient is scaled by 0.009168 (R_earth / R_sun) to make it dimensionless.
tess_df["ror"] = tess_df["Planet Radius (R_Earth)"] / tess_df["Stellar Radius (R_Sun)"] * 0.009168
tess_df.drop(["Planet Radius (R_Earth)", "Stellar Radius (R_Sun)"], axis=1, inplace=True)

# Positional rename: relies on the remaining columns arriving in exactly this
# order, with the freshly added "ror" last.
tess_df.columns = ["search_id", "num_planet", "disposition", "stellar_mass", "ss_gravity", "period", "duration", "transit_epoch", "ror"]

# Extract the planet index from the part after the decimal point.
toi = tess_df["num_planet"]
if pd.api.types.is_float_dtype(toi):
    # A float such as 1234.10 is rendered as "1234.1", so splitting its text
    # form on "." would give 1 instead of 10. Use arithmetic instead.
    # NOTE(review): assumes the suffix is always two digits (.01 to .99) -- confirm.
    tess_df["num_planet"] = toi.mul(100).round().astype(int).mod(100)
else:
    # Non-float columns (e.g. already strings) keep the text-based split.
    tess_df["num_planet"] = toi.astype(str).str.split(".").str[1].astype(int)

# Numeric label encoding: 1 = planet, 2 = candidate, 0 = not a planet.
# NOTE(review): any disposition code missing from this mapping becomes NaN
# and is not dropped afterwards -- confirm the list covers every code in the file.
tess_mapping = {
    "KP": 1,
    "CP": 1,
    "PC": 2,
    "IS": 0,
    "EB": 0,
    "V": 0,
    "O": 0,
    "FP": 0,
}
tess_df["disposition"] = tess_df["disposition"].map(tess_mapping)

tess_df.head()

In [None]:
# Stack the two harmonised tables row-wise under a fresh 0..n-1 index.
# concat aligns columns by name, so the differing column order of the two
# frames does not matter. Note that `data` replaces the KOI frame of the same
# name created earlier in the notebook.
merged_frames = [kepler_df, tess_df]
data = pd.concat(merged_frames, ignore_index=True)
data

In [None]:
import os
# Make sure the output directory exists, then write the merged table as CSV.
# to_csv is called with its defaults, so the row index is written as well.
os.makedirs(processed_data_folter, exist_ok=True)
output_csv = os.path.join(processed_data_folter, "kepler_tess_dataset.csv")
data.to_csv(output_csv)