# 🟩 Step 1 — Setup & Imports
> Initialize environment, import required libraries, and suppress warnings.

In [1]:
# -*- coding: utf-8 -*-
"""
Notebook: Kepler_Exoplanet_Modeling_FlaskReady_v2.ipynb
Purpose: Full ML workflow + feature selection + outlier handling + Flask integration.
"""

import warnings
warnings.filterwarnings('ignore')

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, f1_score, confusion_matrix, roc_curve
)
import joblib

# 🟦 Step 2 — Define Directory Paths
> Create directories for saving models, plots, and results that Flask will access later.

In [2]:
BASE_MODEL_DIR = '../static/models'
PLOTS_DIR = '../static/plots'
RESULTS_DIR = '../static/results'

os.makedirs(BASE_MODEL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

In [3]:
def load_raw_dataset(file_path):
    """Load dataset from CSV file."""
    return pd.read_csv(file_path)


# 🟨 Step 3 — Import Data

In [4]:
df = load_raw_dataset("C:\\Users\\Abdelrahman Bakr\\Desktop\\me\\project\\Nasa\\Exoplanets-Detection-Using-Machine-Learning\\Backend\\Note books\\Data Sources\\Kepler.csv")

display(df.head())

print(f"✅ Dataset loaded successfully. Shape: {df.shape}")

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,8/16/2018,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,8/16/2018,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,8/16/2018,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


✅ Dataset loaded successfully. Shape: (9564, 141)


# 🟩 Step 4 — Build Preprocessor
> Define preprocessing pipelines for numeric and categorical features.

In [5]:
def build_preprocessor(numeric_features, categorical_features):
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

# 🟦 Step 5 — Prepare Dataset
>select relevant columns, rename them, and prepare target variable.

In [6]:
selected_columns = [
    # --- Physical Parameters ---
    "koi_prad", "koi_prad_err1", "koi_prad_err2",
    "koi_ror", "koi_depth", "koi_srho",

    # --- Orbital Parameters ---
    "koi_period", "koi_sma", "koi_eccen","koi_incl",
    "koi_duration", "koi_ingress", "koi_dor",

    # --- Thermal / Habitability ---
    "koi_teq", "koi_insol",

    # --- Stellar Properties ---
    "koi_steff", "koi_slogg", "koi_smet",
    "koi_srad", "koi_smass", "koi_sage",

    # --- Detection / Validation ---
    "koi_disposition", "koi_pdisposition", "koi_score",
    "koi_model_snr", "koi_num_transits",

    # --- Coordinates & brightness ---
    "ra", "dec", "koi_kepmag"
]

df = df[selected_columns]
df.shape

(9564, 29)

In [7]:
rename_dict = {
    # Physical
    "koi_prad": "planet_radius_earth",
    "koi_prad_err1": "planet_radius_err_upper",
    "koi_prad_err2": "planet_radius_err_lower",
    "koi_ror": "radius_ratio_Rp_Rstar",
    "koi_depth": "transit_depth_ppm",
    "koi_srho": "stellar_density_gcm3",

    # Orbital
    "koi_period": "orbital_period_days",
    "koi_sma": "semi_major_axis_AU",
    "koi_eccen": "eccentricity",
    "koi_incl": "inclination_deg",
    "koi_duration": "transit_duration_hrs",
    "koi_ingress": "ingress_duration_hrs",
    "koi_dor": "scaled_distance_a_Rstar",

    # Thermal
    "koi_teq": "equilibrium_temp_K",
    "koi_insol": "insolation_flux_Earth",

    # Stellar
    "koi_steff": "stellar_temp_K",
    "koi_slogg": "stellar_logg",
    "koi_smet": "stellar_metallicity_FeH",
    "koi_srad": "stellar_radius_solar",
    "koi_smass": "stellar_mass_solar",
    "koi_sage": "stellar_age_Gyr",

    # Detection / Validation
    "koi_disposition": "final_disposition",
    "koi_pdisposition": "kepler_disposition",
    "koi_score": "disposition_score",
    "koi_model_snr": "signal_to_noise",
    "koi_num_transits": "num_transits",

    # Coordinates & brightness
    "ra": "RA_deg",
    "dec": "Dec_deg",
    "koi_kepmag": "kepler_mag"
}


if 'koi_disposition' not in selected_columns:
    selected_columns.append('koi_disposition') # Add if missing for filtering

cols_to_select = [c for c in selected_columns if c in df.columns] # Use df instead of df0
df = df[cols_to_select].rename(columns=rename_dict)

if 'final_disposition' in df.columns:
    # Filter only if the final_disposition column exists
    df = df[df['final_disposition'].isin(['CONFIRMED','CANDIDATE','FALSE POSITIVE'])].copy()
    df['Target'] = (df['final_disposition'] != 'FALSE POSITIVE').astype(int)
else:
    # If final_disposition is not in df after renaming, print the warning
    print('Warning: final_disposition column not found after selection/renaming. Cannot filter by disposition or create Target.')

print('Shape after selecting columns and attempting to filter dispositions:', df.shape)

Shape after selecting columns and attempting to filter dispositions: (9564, 30)


In [8]:
df.isnull().sum()

planet_radius_earth         363
planet_radius_err_upper     363
planet_radius_err_lower     363
radius_ratio_Rp_Rstar       363
transit_depth_ppm           363
stellar_density_gcm3        321
orbital_period_days           0
semi_major_axis_AU          363
eccentricity                363
inclination_deg             364
transit_duration_hrs          0
ingress_duration_hrs       9564
scaled_distance_a_Rstar     363
equilibrium_temp_K          363
insolation_flux_Earth       321
stellar_temp_K              363
stellar_logg                363
stellar_metallicity_FeH     386
stellar_radius_solar        363
stellar_mass_solar          363
stellar_age_Gyr            9564
final_disposition             0
kepler_disposition            0
disposition_score          1510
signal_to_noise             363
num_transits               1142
RA_deg                        0
Dec_deg                       0
kepler_mag                    1
Target                        0
dtype: int64

In [9]:
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

print("Numeric columns:", numeric_cols)
print("\nCategorical columns:", categorical_cols)

Numeric columns: ['planet_radius_earth', 'planet_radius_err_upper', 'planet_radius_err_lower', 'radius_ratio_Rp_Rstar', 'transit_depth_ppm', 'stellar_density_gcm3', 'orbital_period_days', 'semi_major_axis_AU', 'eccentricity', 'inclination_deg', 'transit_duration_hrs', 'ingress_duration_hrs', 'scaled_distance_a_Rstar', 'equilibrium_temp_K', 'insolation_flux_Earth', 'stellar_temp_K', 'stellar_logg', 'stellar_metallicity_FeH', 'stellar_radius_solar', 'stellar_mass_solar', 'stellar_age_Gyr', 'disposition_score', 'signal_to_noise', 'num_transits', 'RA_deg', 'Dec_deg', 'kepler_mag', 'Target']

Categorical columns: ['final_disposition', 'kepler_disposition']


In [10]:
for col in numeric_cols:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)

for col in categorical_cols:
    if df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)

print("✅ Null values filled for numeric and categorical features.")
df.shape

✅ Null values filled for numeric and categorical features.


(9564, 30)

# 🟨 Step 6 — Preprocessing

> Detect and remove outliers using the IQR method for numeric features.

In [11]:
df.nunique()

planet_radius_earth        2989
planet_radius_err_upper    1788
planet_radius_err_lower    1239
radius_ratio_Rp_Rstar      8503
transit_depth_ppm          6948
stellar_density_gcm3       9003
orbital_period_days        9564
semi_major_axis_AU         3797
eccentricity                  1
inclination_deg            2261
transit_duration_hrs       7834
ingress_duration_hrs          0
scaled_distance_a_Rstar    6211
equilibrium_temp_K         2512
insolation_flux_Earth      7802
stellar_temp_K             2446
stellar_logg               1558
stellar_metallicity_FeH     120
stellar_radius_solar       2290
stellar_mass_solar         1477
stellar_age_Gyr               0
final_disposition             3
kepler_disposition            2
disposition_score           651
signal_to_noise            2741
num_transits               1628
RA_deg                     8131
Dec_deg                    8195
kepler_mag                 3888
Target                        2
dtype: int64

In [12]:
columns_to_drop = ['final_disposition', 'stellar_age_Gyr', 'ingress_duration_hrs', 'eccentricity','orbital_period_days']
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=existing_columns_to_drop, axis=1)
print("Shape after dropping columns:", df.shape)

Shape after dropping columns: (9564, 25)


In [13]:
print('Columns:', df.columns.tolist())
print('\nMissing values per column:')
print(df.isnull().sum().sort_values(ascending=False).head(20))
print('\nTarget distribution:')
print(df['Target'].value_counts(normalize=True))

Columns: ['planet_radius_earth', 'planet_radius_err_upper', 'planet_radius_err_lower', 'radius_ratio_Rp_Rstar', 'transit_depth_ppm', 'stellar_density_gcm3', 'semi_major_axis_AU', 'inclination_deg', 'transit_duration_hrs', 'scaled_distance_a_Rstar', 'equilibrium_temp_K', 'insolation_flux_Earth', 'stellar_temp_K', 'stellar_logg', 'stellar_metallicity_FeH', 'stellar_radius_solar', 'stellar_mass_solar', 'kepler_disposition', 'disposition_score', 'signal_to_noise', 'num_transits', 'RA_deg', 'Dec_deg', 'kepler_mag', 'Target']

Missing values per column:
planet_radius_earth        0
planet_radius_err_upper    0
planet_radius_err_lower    0
radius_ratio_Rp_Rstar      0
transit_depth_ppm          0
stellar_density_gcm3       0
semi_major_axis_AU         0
inclination_deg            0
transit_duration_hrs       0
scaled_distance_a_Rstar    0
equilibrium_temp_K         0
insolation_flux_Earth      0
stellar_temp_K             0
stellar_logg               0
stellar_metallicity_FeH    0
stellar_rad

# 🟩 Step 7 — Split Data & Build Preprocessor
> Prepare training and testing datasets and preprocessing pipeline.

In [14]:
from sklearn.model_selection import train_test_split
X = df.drop('Target', axis=1)
y = df['Target']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Data splitting complete:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

Data splitting complete:
X_train shape: (6694, 24)
y_train shape: (6694,)
X_val shape: (1435, 24)
y_val shape: (1435,)
X_test shape: (1435, 24)
y_test shape: (1435,)


In [15]:
# Identify numeric and categorical features from the training data
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(include='object').columns.tolist()

# Build the preprocessor
preprocessor = build_preprocessor(numeric_features, categorical_features)

print("✅ Preprocessor built successfully.")

✅ Preprocessor built successfully.


# 🟦 Step 8 — Train Models
> Train multiple models and save them for Flask use.

In [16]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500)
}

trained_models = {}

for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)

    model_path = os.path.join(BASE_MODEL_DIR, f"{name}_pipeline.pkl")
    joblib.dump(pipe, model_path)

    trained_models[name] = pipe
    print(f"✅ {name} trained and saved at {model_path}")


✅ RandomForest trained and saved at ../static/models\RandomForest_pipeline.pkl
✅ XGBoost trained and saved at ../static/models\XGBoost_pipeline.pkl
✅ LogisticRegression trained and saved at ../static/models\LogisticRegression_pipeline.pkl


# 🟩 Step 9 — Evaluate Models and Save Metrics
> Evaluate each model and save performance metrics in JSON format.

In [17]:
def evaluate_models(models, X_test, y_test):
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]
        results[name] = {
            "accuracy": round(accuracy_score(y_test, y_pred), 3),
            "precision": round(precision_score(y_test, y_pred), 3),
            "recall": round(recall_score(y_test, y_pred), 3),
            "f1": round(f1_score(y_test, y_pred), 3),
            "auc": round(roc_auc_score(y_test, y_proba), 3)
        }
    return results

def save_metrics(metrics_dict, dataset_name):
    """Save metrics to JSON for Flask UI."""
    path = os.path.join(RESULTS_DIR, f"{dataset_name}_metrics.json")
    with open(path, "w") as f:
        json.dump(metrics_dict, f, indent=4)
    print(f"✅ Metrics saved to {path}")

results = evaluate_models(trained_models, X_test, y_test)
save_metrics(results, "Kepler")


✅ Metrics saved to ../static/results\Kepler_metrics.json


# 🟦 Step 10 — Extract and Save Top 5 Features
> Save top features per model for Flask frontend.

In [18]:
def extract_and_save_feature_importances(model, model_name):
    """
    Extracts and saves the top 5 feature importances (or coefficients) for a given model.

    Args:
        model: The trained model pipeline.
        model_name: The name of the model.
    """
    if hasattr(model.named_steps['model'], "feature_importances_"):
        importances = model.named_steps['model'].feature_importances_
    elif hasattr(model.named_steps['model'], "coef_"):
        # For linear models like Logistic Regression, coef_ is used
        # Ensure coef_ is 1D for multi-class if needed, currently assumes binary
        if model.named_steps['model'].coef_.ndim > 1:
             importances = abs(model.named_steps['model'].coef_[0])
        else:
             importances = abs(model.named_steps['model'].coef_)
    else:
        print(f"No feature importance available for {model_name}")
        return

    # Get feature names after preprocessing from the preprocessor step
    try:
        processed_feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    except AttributeError:
        print(f"Preprocessor in model {model_name} does not have get_feature_names_out method.")
        # Fallback or error handling if get_feature_names_out is not available
        return


    # Ensure importances and feature names align
    if len(importances) != len(processed_feature_names):
         print(f"Mismatch in feature importance length ({len(importances)}) and processed feature names ({len(processed_feature_names)}) for {model_name}")
         return

    # Get top 5 feature indices
    top_idx = importances.argsort()[-5:][::-1]
    top_features = [processed_feature_names[i] for i in top_idx]

    with open(os.path.join(RESULTS_DIR, f"{model_name}_top_features.json"), "w") as f:
        json.dump(top_features, f, indent=4)
    print(f"✅ Top 5 features for {model_name} saved.")

for name, model in trained_models.items():
    extract_and_save_feature_importances(model, name) # Pass only model and name

✅ Top 5 features for RandomForest saved.
✅ Top 5 features for XGBoost saved.
✅ Top 5 features for LogisticRegression saved.


# 🟩 Step 11 — Prediction Function (for Flask API)
> Predict class and probability from user-provided input values.

In [19]:
def predict_from_input(model_name, input_values):
    """Predict from user input (used by Flask)."""
    model_path = os.path.join(BASE_MODEL_DIR, f"{model_name}_pipeline.pkl")
    model = joblib.load(model_path)
    X = np.array(input_values).reshape(1, -1)
    pred = int(model.predict(X)[0])
    proba = float(model.predict_proba(X)[0][1])
    return {"prediction": pred, "probability": proba}


# 🟦 Step 12 — Final Summary
> Confirm successful execution and show saved artifact paths.

In [20]:
# 📊 Display Model Results and Metrics
print("=" * 60)
print("🔍 MODEL PERFORMANCE RESULTS")
print("=" * 60)

# Load and display metrics
metrics_file = os.path.join(RESULTS_DIR, "Kepler_metrics.json")
if os.path.exists(metrics_file):
    with open(metrics_file, 'r') as f:
        metrics = json.load(f)
    
    print("\n📈 PERFORMANCE METRICS:")
    print("-" * 40)
    
    # Create a DataFrame for better display
    metrics_df = pd.DataFrame(metrics).T
    print(metrics_df.round(3))
    
    # Find best model for each metric
    print("\n🏆 BEST PERFORMING MODELS:")
    print("-" * 30)
    for metric in ['accuracy', 'precision', 'recall', 'f1', 'auc']:
        best_model = metrics_df[metric].idxmax()
        best_score = metrics_df[metric].max()
        print(f"Best {metric.upper()}: {best_model} ({best_score:.3f})")
    
    # Overall best model (based on F1 score as it balances precision and recall)
    best_overall = metrics_df['f1'].idxmax()
    print(f"\n🥇 OVERALL BEST MODEL: {best_overall}")
    print(f"   F1 Score: {metrics_df.loc[best_overall, 'f1']:.3f}")
    print(f"   Accuracy: {metrics_df.loc[best_overall, 'accuracy']:.3f}")
    print(f"   AUC: {metrics_df.loc[best_overall, 'auc']:.3f}")
    
else:
    print("❌ Metrics file not found!")

print("\n" + "=" * 60)


🔍 MODEL PERFORMANCE RESULTS

📈 PERFORMANCE METRICS:
----------------------------------------
                    accuracy  precision  recall     f1    auc
RandomForest           0.999        1.0   0.997  0.999  0.999
XGBoost                0.999        1.0   0.997  0.999  1.000
LogisticRegression     0.999        1.0   0.997  0.999  1.000

🏆 BEST PERFORMING MODELS:
------------------------------
Best ACCURACY: RandomForest (0.999)
Best PRECISION: RandomForest (1.000)
Best RECALL: RandomForest (0.997)
Best F1: RandomForest (0.999)
Best AUC: XGBoost (1.000)

🥇 OVERALL BEST MODEL: RandomForest
   F1 Score: 0.999
   Accuracy: 0.999
   AUC: 0.999



In [21]:
# 🔍 Display Top Features for Each Model
print("🔍 TOP 5 FEATURES FOR EACH MODEL")
print("=" * 50)

model_names = ["RandomForest", "XGBoost", "LogisticRegression"]

for model_name in model_names:
    features_file = os.path.join(RESULTS_DIR, f"{model_name}_top_features.json")
    if os.path.exists(features_file):
        with open(features_file, 'r') as f:
            top_features = json.load(f)
        
        print(f"\n🎯 {model_name.upper()} - Top 5 Features:")
        print("-" * 30)
        for i, feature in enumerate(top_features, 1):
            print(f"  {i}. {feature}")
    else:
        print(f"\n❌ Top features file not found for {model_name}")

print("\n" + "=" * 50)


🔍 TOP 5 FEATURES FOR EACH MODEL

🎯 RANDOMFOREST - Top 5 Features:
------------------------------
  1. cat__kepler_disposition_CANDIDATE
  2. cat__kepler_disposition_FALSE POSITIVE
  3. num__disposition_score
  4. num__planet_radius_earth
  5. num__planet_radius_err_lower

🎯 XGBOOST - Top 5 Features:
------------------------------
  1. cat__kepler_disposition_CANDIDATE
  2. num__insolation_flux_Earth
  3. num__signal_to_noise
  4. num__disposition_score
  5. num__stellar_metallicity_FeH

🎯 LOGISTICREGRESSION - Top 5 Features:
------------------------------
  1. cat__kepler_disposition_CANDIDATE
  2. cat__kepler_disposition_FALSE POSITIVE
  3. num__disposition_score
  4. num__transit_depth_ppm
  5. num__stellar_temp_K



In [22]:
print("✅ Notebook completed successfully.")
print(f"Models saved to: {BASE_MODEL_DIR}")
print(f"Metrics saved to: {RESULTS_DIR}")
print(f"Plots saved to: {PLOTS_DIR}")

✅ Notebook completed successfully.
Models saved to: ../static/models
Metrics saved to: ../static/results
Plots saved to: ../static/plots


## Summary:

### Data Analysis Key Findings

* Applying the IQR-based outlier removal with a multiplier of 10 retained a significant portion of the data, resulting in a DataFrame shape of (5846, 30).
* The process successfully removed 3718 outliers while keeping the majority of the data points.
* The debug output of the outlier handling function confirmed that using a multiplier of 10 resulted in wider bounds for outlier detection compared to a smaller multiplier, preventing the removal of all data.

### Insights or Next Steps

* The choice of outlier handling method and its parameters (like the IQR multiplier) is crucial and highly dependent on the data distribution and the goal of the analysis. Using an overly aggressive outlier removal method can lead to the loss of valuable data or an empty dataset.
* Given the successful retention of data after outlier removal, the trained models and their performance metrics can now be analyzed to determine the best model for the classification task. Further steps could involve hyperparameter tuning for the models to potentially improve performance.