# 🔄 Refresh Dataset from NASA Exoplanet Archive  
This section downloads the **latest Kepler KOI dataset** directly from the NASA Exoplanet Archive (Q1–Q17 DR25),  
renames columns to match our project, creates target labels, and saves fresh `train`/`test` splits.  

➡️ The outputs will replace the old files:  
- train.csv, test.csv, test_solution.csv  
- train2.csv, test2.csv, test_solution2.csv  

These files will then be used by `modified_model.ipynb` for modeling.


In [None]:
# --- Setup: folders & imports ---
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Raw downloads and processed splits are kept in sibling folders under data/.
DATA_DIR = Path("data") / "raw"
OUT_DIR = Path("data") / "processed"  # keep outputs where your project expects them

# Create both folders up front (no error if they already exist).
for folder in (DATA_DIR, OUT_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# NASA TAP CSV URL for KOI Q1–Q17 DR25 delivery:
# a synchronous TAP query selecting every column, returned as CSV.
KOI_URL = "{endpoint}?query={query}&format=csv".format(
    endpoint="https://exoplanetarchive.ipac.caltech.edu/TAP/sync",
    query="select+*+from+q1_q17_dr25_koi",
)


In [None]:
# --- Fetch the KOI table over TAP and keep an untouched local copy ---
raw_csv_path = DATA_DIR / "exoplanets_latest_raw.csv"

# pandas reads the CSV straight from the URL; low_memory=False parses the
# file in one pass instead of in chunks.
raw = pd.read_csv(KOI_URL, low_memory=False)

# Persist the download exactly as received (no index column).
raw.to_csv(raw_csv_path, index=False)

print(raw.shape)
raw.head(3)


In [None]:
# --- Exact renaming to match your existing notebooks ---
# Maps NASA archive column names to the descriptive names used by this project.
# Columns of `raw` that are not listed here keep their archive names.
rename_map = {
    'kepid': 'KepID',
    'kepoi_name': 'KOIName',
    'kepler_name': 'KeplerName',
    'koi_disposition': 'ExoplanetArchiveDisposition',
    'koi_pdisposition': 'DispositionUsingKeplerData',
    'koi_score': 'DispositionScore',
    'koi_fpflag_nt': 'NotTransitLikeFlag',
    'koi_fpflag_ss': 'StellarEclipseFlag',
    'koi_fpflag_co': 'CentroidOffsetFlag',
    'koi_fpflag_ec': 'EphemerisMatchFlag',
    'koi_period': 'OrbitalPeriod[days]',
    'koi_period_err1': 'OrbitalPeriodUpperUnc[days]',
    'koi_period_err2': 'OrbitalPeriodLowerUnc[days]',
    'koi_time0bk': 'TransitEpoch[BKJD]',
    'koi_time0bk_err1': 'TransitEpochUpperUnc[BKJD]',
    'koi_time0bk_err2': 'TransitEpochLowerUnc[BKJD]',
    'koi_impact': 'ImpactParameter',
    'koi_impact_err1': 'ImpactParameterUpperUnc',
    'koi_impact_err2': 'ImpactParameterLowerUnc',
    'koi_duration': 'TransitDuration[hrs]',
    'koi_duration_err1': 'TransitDurationUpperUnc[hrs]',
    'koi_duration_err2': 'TransitDurationLowerUnc[hrs]',
    'koi_depth': 'TransitDepth[ppm]',
    'koi_depth_err1': 'TransitDepthUpperUnc[ppm]',
    'koi_depth_err2': 'TransitDepthLowerUnc[ppm]',
    'koi_prad': 'PlanetaryRadius[EarthRadii]',
    'koi_prad_err1': 'PlanetaryRadiusUpperUnc[EarthRadii]',
    'koi_prad_err2': 'PlanetaryRadiusLowerUnc[EarthRadii]',
    'koi_teq': 'EquilibriumTemperature[K]',
    'koi_teq_err1': 'EquilibriumTemperatureUpperUnc[K]',
    'koi_teq_err2': 'EquilibriumTemperatureLowerUnc[K]',
    'koi_insol': 'InsolationFlux[EarthFlux]',
    'koi_insol_err1': 'InsolationFluxUpperUnc[EarthFlux]',
    'koi_insol_err2': 'InsolationFluxLowerUnc[EarthFlux]',
    'koi_model_snr': 'TransitSNR',
    'koi_tce_plnt_num': 'TCEPlanetNumber',
    'koi_tce_delivname': 'TCEDeliver',
    'koi_steff': 'StellarTeff[K]',
    'koi_steff_err1': 'StellarTeffUpperUnc[K]',
    'koi_steff_err2': 'StellarTeffLowerUnc[K]',
    'koi_slogg': 'StellarLogg[cm/s^2]',
    'koi_slogg_err1': 'StellarLoggUpperUnc',
    'koi_slogg_err2': 'StellarLoggLowerUnc',
    'koi_srad': 'StellarRadius[SolRadii]',
    'koi_srad_err1': 'StellarRadiusUpperUnc[SolRadii]',
    'koi_srad_err2': 'StellarRadiusLowerUnc[SolRadii]',
    'ra': 'RA[deg]',
    'dec': 'Dec[deg]',
    'koi_kepmag': 'KeplerMag',
}

# rename() returns a new frame, so `raw` itself is left untouched.
data = raw.rename(columns=rename_map)

# Labels exactly like your project
# ExoplanetCandidate: 1 when the Kepler-data disposition is 'CANDIDATE', else 0.
data['ExoplanetCandidate'] = (
    data['DispositionUsingKeplerData'].eq('CANDIDATE').astype(int)
)
# ExoplanetConfirmed: 2 = 'CONFIRMED', 1 = 'CANDIDATE', 0 = anything else
# (including missing values, which map() leaves as NaN before fillna).
data['ExoplanetConfirmed'] = (
    data['ExoplanetArchiveDisposition']
    .map({'CONFIRMED': 2, 'CANDIDATE': 1})
    .fillna(0)
    .astype(int)
)

# Drop the two disposition columns once labels are created
data = data.drop(columns=['ExoplanetArchiveDisposition', 'DispositionUsingKeplerData'])

# Drop name columns (you did this before)
data = data.drop(columns=['KeplerName', 'KOIName'], errors='ignore')

# Match your earlier pruning of these two uncertainty cols.
# The names must be spelled exactly as rename_map produces them; with
# errors='ignore' a misspelled name is skipped silently and the columns
# would still take part in the dropna() below.
# NOTE(review): these two columns are presumably empty in the archive table,
# which is why they are pruned before dropna -- confirm against the raw CSV.
data = data.drop(
    columns=[
        'EquilibriumTemperatureUpperUnc[K]',
        'EquilibriumTemperatureLowerUnc[K]',
    ],
    errors='ignore',
)

# Remove rows with any remaining NaNs (same as you did)
# NOTE(review): `raw` comes from a `select *` query, so any archive columns
# not listed in rename_map are still present here and also count towards
# dropna -- confirm that is intended.
before = data.shape[0]
data = data.dropna().copy()
after = data.shape[0]

# (Optional) coerce TCEDeliver to a numeric flag, exactly like your notebook.
# Every remaining row gets the constant value 1.
if 'TCEDeliver' in data.columns:
    data['TCEDeliver'] = 1

print(f"Rows before dropna: {before} | after: {after}")
print(data.shape)
data.head(3)


In [None]:
# Save the cleaned, labeled table next to the raw download (no index column).
data.to_csv(DATA_DIR / "exoplanets_latest_clean.csv", index=False)


In [None]:
# --- First split (for your original workflow) ---
# 60% train / 40% test, with a fixed random_state of 0.
train, test = train_test_split(data, test_size=0.4, random_state=0)

# Separate the target from the test features; the labels are kept as their
# own Series so they can be written to a solution file.
test_features = test.drop(columns=['ExoplanetCandidate'])
test_solution = test['ExoplanetCandidate'].copy()

# Write all three CSVs into the processed folder, without the index column.
first_split_outputs = (
    (train, "train.csv"),
    (test_features, "test.csv"),
    (test_solution, "test_solution.csv"),
)
for frame, filename in first_split_outputs:
    frame.to_csv(OUT_DIR / filename, index=False)

# Row counts of each output, shown as the cell result.
len(train), len(test_features), len(test_solution)


In [None]:
# --- Second split (to mirror your modified_model.ipynb) ---
# Same 60/40 proportions as the first split, but with random_state=1.
train2, test2 = train_test_split(data, test_size=0.4, random_state=1)

# Pull the target out of the test set: features in one object, labels in another.
test2_features = test2.drop(columns=['ExoplanetCandidate'])
test_solution2 = test2['ExoplanetCandidate'].copy()

# Write all three CSVs into the processed folder, without the index column.
second_split_outputs = (
    (train2, "train2.csv"),
    (test2_features, "test2.csv"),
    (test_solution2, "test_solution2.csv"),
)
for frame, filename in second_split_outputs:
    frame.to_csv(OUT_DIR / filename, index=False)

# Row counts of each output, shown as the cell result.
len(train2), len(test2_features), len(test_solution2)


In [None]:
# Quick sanity checks on the cleaned table.
# Show at most the first five distinct TCEDeliver values.
tce_deliver_values = data['TCEDeliver'].unique()
print("Unique TCEDeliver values:", tce_deliver_values[:5])

# Class balance of the binary target.
candidate_counts = data['ExoplanetCandidate'].value_counts()
print("ExoplanetCandidate value counts:\n", candidate_counts)


# ✅ Dataset Ready  
We now have updated training and testing files (`train.csv`, `test.csv`, `train2.csv`, etc.)  
built from the **most up-to-date NASA KOI archive**.  

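The next section (old preprocessing code) is kept for reference but is no longer required  
since we now start with cleaned, labeled splits.  

⚠️ Do not run it after the cells above: it reads the local file `data/raw/exoplanets_2025.csv`  
instead of the fresh download and overwrites `train2.csv`, `test2.csv`, and `test_solution2.csv`.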


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Legacy preprocessing: builds train2/test2/test_solution2 from a local CSV.
# Load dataset
data = pd.read_csv("data/raw/exoplanets_2025.csv")

# Rename columns (archive names -> descriptive names)
# NOTE(review): the uncertainty columns (`*_err1` / `*_err2`) are not renamed
# here; if the CSV contains them they keep their archive names and still take
# part in the dropna() below -- verify against the input file.
data = data.rename(columns={
    'kepid':'KepID',
    'kepoi_name':'KOIName',
    'kepler_name':'KeplerName',
    'koi_disposition':'ExoplanetArchiveDisposition',
    'koi_pdisposition':'DispositionUsingKeplerData',
    'koi_score':'DispositionScore',
    'koi_fpflag_nt':'NotTransit-LikeFalsePositiveFlag',
    'koi_fpflag_ss':'StellarEclipseFalsePositiveFlag',
    'koi_fpflag_co':'CentroidOffsetFalsePositiveFlag',
    'koi_fpflag_ec':'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    'koi_period':'OrbitalPeriod[days]',
    'koi_time0bk':'TransitEpoch[BKJD]',
    'koi_impact':'ImpactParameter',
    'koi_duration':'TransitDuration[hrs]',
    'koi_depth':'TransitDepth[ppm]',
    'koi_prad':'PlanetaryRadius[Earthradii]',
    'koi_teq':'EquilibriumTemperature[K]',
    'koi_insol':'InsolationFlux[Earthflux]',
    'koi_model_snr':'TransitSignal-to-Noise',
    'koi_tce_plnt_num':'TCEPlanetNumber',
    'koi_tce_delivname':'TCEDeliver',
    'koi_steff':'StellarEffectiveTemperature[K]',
    'koi_slogg':'StellarSurfaceGravity[log10(cm/s**2)]',
    'koi_srad':'StellarRadius[Solarradii]',
    'ra':'RA[decimaldegrees]',
    'dec':'Dec[decimaldegrees]',
    'koi_kepmag':'Kepler-band[mag]'
})

# Create target columns
# ExoplanetCandidate: 1 when the Kepler-data disposition is 'CANDIDATE', else 0.
data['ExoplanetCandidate'] = data['DispositionUsingKeplerData'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
# ExoplanetConfirmed: 2 = 'CONFIRMED', 1 = 'CANDIDATE', 0 = anything else.
data['ExoplanetConfirmed'] = data['ExoplanetArchiveDisposition'].apply(
    lambda x: 2 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0
)

# Drop unused columns (dispositions are now encoded in the labels above)
data = data.drop(columns=['ExoplanetArchiveDisposition','DispositionUsingKeplerData','KeplerName','KOIName'])

# Drop missing values (any row with at least one NaN is removed)
data = data.dropna()

# Split into train/test (60/40, fixed random_state)
train, test = train_test_split(data, test_size=0.4, random_state=1)

# Keep the labels separately, then remove the target from the test features so
# test2.csv does not contain the answer the model is supposed to predict.
test_solution = test[['ExoplanetCandidate']]
test_features = test.drop(columns=['ExoplanetCandidate'])

# Save splits
train.to_csv("data/processed/train2.csv", index=False)
test_features.to_csv("data/processed/test2.csv", index=False)

# Save solutions separately
test_solution.to_csv("data/processed/test_solution2.csv", index=False)

print("✅ Preprocessing complete: train2.csv, test2.csv, test_solution2.csv saved")
