In [None]:
#pip install numpy pandas matplotlib seaborn scikit-learn joblib astropy scipy --quiet

In [None]:
#pip install tensorflow lightkurve shap streamlit --quiet

In [None]:
#Importing Libraries
import os
import time
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import joblib

from lightkurve import search_lightcurvefile
from lightkurve import search_lightcurve
from lightkurve import LightCurveFile 
from astropy.stats import sigma_clip
from scipy.signal import convolve
import shap
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
# Single source of truth for the random seed, so TensorFlow and NumPy are
# always seeded with the same value and later cells can reuse the constant.
SEED = 42
tf.random.set_seed(SEED)  # TensorFlow global seed
np.random.seed(SEED)      # NumPy legacy global RNG

In [None]:
import os
from astroquery.nasa_exoplanet_archive import NasaExoplanetArchive
import pandas as pd

# Local CSV used to cache the KOI table between runs.
FILE_NAME = "kepler_koi_clean.csv"
df = None  # stays None until one of the load paths below succeeds

# --- 1. Prefer the local copy when one is present ---
if os.path.exists(FILE_NAME):
    print(f"Loading KOI data from existing file: {FILE_NAME}")
    try:
        df = pd.read_csv(FILE_NAME)
    except Exception as err:
        # An unreadable file is treated like a missing one: fall through to the download.
        print(f"Error reading local file: {err}. Attempting to re-download...")
        df = None
    else:
        print("Data loaded successfully from local drive.")

# --- 2. Nothing usable on disk: query the NASA Exoplanet Archive ---
if df is None:
    print("Local file not found or corrupted. Fetching Kepler KOI data from NASA (5+ minutes)...")
    try:
        # Request every column of the "cumulative" table.
        koi_table = NasaExoplanetArchive.query_criteria(table="cumulative", select="*")
        df = koi_table.to_pandas()

        # Persist the result so the next run can skip the download.
        df.to_csv(FILE_NAME, index=False)
        print(f"Data successfully fetched and saved as {FILE_NAME}")
    except Exception as err:
        print("Error fetching KOI data from NASA:", err)
        df = None  # both load paths failed

# --- 3. Report what was loaded (or that nothing was) ---
if df is None:
    print("\nFATAL ERROR: Could not load data from local file or NASA server.")
else:
    print(f"\nFinal Dataset Shape: {df.shape}")
    print("\nFirst 5 rows:", df.head(), sep="\n")
    print("\nValue counts for disposition:", df['koi_disposition'].value_counts(), sep="\n")

In [None]:
# Parameters for the large-scale real Kepler pipeline

# --- Light-curve cache ---
CACHE_DIR = './kepler_cache'

# Export the cache location through the LIGHTKURVE_CACHE_DIR environment variable.
# NOTE(review): confirm that the installed lightkurve version reads this variable.
os.environ['LIGHTKURVE_CACHE_DIR'] = CACHE_DIR

# Start every run from an empty cache: delete whatever is there, then recreate the folder.
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR)
os.makedirs(CACHE_DIR, exist_ok=True)
print(
    f"Set LIGHTKURVE_CACHE_DIR to: {CACHE_DIR}",
    "Cache directory has been cleaned and reset.",
    sep="\n",
)

# --- Dataset and training sizes ---
N_SAMPLES_PER_CLASS, N_BINS = 200, 400
MAX_LEN = N_BINS  # kept equal to the bin count
BATCH_SIZE, EPOCHS = 16, 25

In [None]:
# New Section 8: Execute External Parallel Processing Script

import numpy as np
import joblib

# CRITICAL: This runs the process_data.py script outside the unstable kernel environment
print("---------------------------------------------------------------")
print("Starting external script. This should now run without I/O errors.")
print("Check the VS Code Terminal/Output tab for real-time progress.")
print("---------------------------------------------------------------")

# Execute the external Python file
!python process_data.py

# Load the saved results back into the notebook environment
try:
    X_arr, y_arr = joblib.load('processed_data_output.pkl')
    
    print("\n--- Processed Data Successfully Loaded Back into Notebook ---")
    print(f"Final loaded data shape: {X_arr.shape}")
    print(f"Class distribution: {np.bincount(y_arr)}")

except FileNotFoundError:
    print("\nERROR: Could not find 'processed_data_output.pkl'.")
    print("The script likely failed. Check the 'parallel_processing_script.log' file for the error.")
    
# Print log summary for debugging
print("\n--- LOG FILE SUMMARY (for debugging) ---")
try:
    with open('parallel_processing_script.log', 'r') as f:
        log_lines = f.readlines()
        for line in log_lines[-10:]:
            print(line.strip())
except FileNotFoundError:
    print("Log file not found.")

In [None]:
# Train/Test Split and Class Weights

# Fraction of samples held out for testing. scikit-learn treats an integer
# test_size as an absolute sample count, so the earlier value of 42 tied the
# split to the dataset size (and fails on small datasets); a fraction scales
# with however many light curves were processed.
# NOTE(review): 0.2 is assumed to be the intended hold-out share - adjust if a
# different split was meant.
TEST_FRACTION = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X_arr,
    y_arr,
    test_size=TEST_FRACTION,
    stratify=y_arr,      # keep the class ratio the same in both partitions
    random_state=42,
)

# 'balanced' class weights computed from the training labels only.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Plain-Python int -> float mapping, passed later to model.fit(class_weight=...).
class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}
print('Class Weights:', class_weight_dict)

In [None]:
# 1D CNN definition
def build_cnn(input_shape):
    """Build and compile a 1D convolutional binary classifier.

    The network stacks four ``Conv1D`` layers (32, 64, 128 and 128 filters,
    'same' padding, ReLU). The first three are each followed by batch
    normalization, and a max-pool of size 2 follows the second stage. The
    features are then globally average-pooled, passed through 50% dropout and
    fed to a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape handed to ``layers.Input``.

    Returns
    -------
    The model, compiled with Adam (learning rate 1e-4), binary cross-entropy
    loss, and accuracy plus an AUC metric named ``'auc'``.
    """
    # One row per convolution stage: (filters, kernel_size, batch_norm, pool_after)
    conv_stages = (
        (32, 9, True, False),
        (64, 5, True, True),
        (128, 3, True, False),
        (128, 3, False, False),
    )

    inputs = layers.Input(shape=input_shape)
    features = inputs
    for n_filters, kernel_size, use_batch_norm, pool_after in conv_stages:
        features = layers.Conv1D(n_filters, kernel_size, padding='same', activation='relu')(features)
        if use_batch_norm:
            features = layers.BatchNormalization()(features)
        if pool_after:
            features = layers.MaxPooling1D(2)(features)

    # Classification head
    features = layers.GlobalAveragePooling1D()(features)
    features = layers.Dropout(0.5)(features)
    outputs = layers.Dense(1, activation='sigmoid')(features)

    net = models.Model(inputs, outputs)
    optimizer = tf.keras.optimizers.Adam(1e-4)
    auc_metric = tf.keras.metrics.AUC(name='auc')  # reported as 'auc' / 'val_auc'
    net.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', auc_metric],
    )
    return net


# Everything after the sample axis of X_train is the per-sample input shape.
# NOTE(review): Conv1D needs a (steps, channels) shape - confirm X_train is 3-D.
model = build_cnn(X_train.shape[1:])
model.summary()

In [None]:
# Training with callbacks

# Stop when validation AUC has not improved for 6 epochs and restore the best weights.
early_stopping = callbacks.EarlyStopping(
    monitor='val_auc', patience=6, restore_best_weights=True, mode='max'
)
# Halve the learning rate after 3 epochs without a validation-loss improvement (floor 1e-6).
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1
)
# Write the model to disk only when validation AUC reaches a new maximum.
checkpoint = callbacks.ModelCheckpoint(
    'best_cnn_kepler.h5', monitor='val_auc', save_best_only=True, mode='max'
)
cb = [early_stopping, reduce_lr, checkpoint]

# validation_split reserves 15% of the training arrays for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight_dict,
    callbacks=cb,
    verbose=2,
)
      

In [None]:
# Evaluation: ROC curve, precision-recall curve, confusion matrix, top predictions
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    auc,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)

# Model outputs for the held-out set, flattened to one score per sample.
y_pred = model.predict(X_test).ravel()

# ROC curve and its area
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Precision-recall curve and its area
precision, recall, _ = precision_recall_curve(y_test, y_pred)
pr_auc = auc(recall, precision)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

ax_roc.plot(fpr, tpr, label=f'ROC AUC={roc_auc:.3f}')
ax_roc.plot([0, 1], [0, 1], '--', color='gray')  # diagonal reference line
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

ax_pr.plot(recall, precision, label=f'PR AUC={pr_auc:.3f}')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()

fig.tight_layout()
plt.show()

# Confusion matrix at a 0.5 decision threshold
y_pred_labels = (y_pred > 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['FalsePositive','Confirmed'])
disp.plot(cmap='Blues')
plt.show()

# Plot the six test samples with the highest predicted score, best first.
top_indices = np.argsort(y_pred)[::-1][:6]
for i in top_indices:
    plt.figure(figsize=(8, 3))
    plt.plot(X_test[i].squeeze(), label=f'pred={y_pred[i]:.3f}, label={y_test[i]}')
    plt.title('Folded / Resampled Light Curve')
    plt.xlabel('Phase bin')
    plt.legend()
    plt.show()

# Persist the train/test arrays and the trained model.
np.savez_compressed(
    'kepler_200_dataset.npz',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
model.save('cnn_kepler_200_v2.h5')
print('Saved dataset and CNN model.')