In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow lightkurve joblib astropy scipy shap streamlit --quiet

In [None]:
#Importing Libraries
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import joblib
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from lightkurve import search_lightcurvefile
from astropy.stats import sigma_clip
from scipy.signal import convolve
import shap
from concurrent.futures import ThreadPoolExecutor, as_completed
# Seed the NumPy and TensorFlow global random number generators with one shared value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the KOI table from CSV.
df = pd.read_csv('kepler_koi.csv')

# A notebook cell only renders its last expression, so the intermediate
# previews are passed to display() explicitly instead of being discarded.
display(df.head())
display(df.columns)

# Number of rows per disposition class (rendered as the cell output).
df['koi_disposition'].value_counts()

In [None]:
# Parameters for the large-scale real-Kepler pipeline.

# Directory used for downloaded light curves. The Colab location stays the
# default; set the KEPLER_CACHE_DIR environment variable to use another one.
CACHE_DIR = os.environ.get('KEPLER_CACHE_DIR', '/content/kepler_cache')
os.makedirs(CACHE_DIR, exist_ok=True)

N_SAMPLES_PER_CLASS = 200  # upper bound on rows sampled from each disposition class
N_BINS = 400               # number of points in every resampled light curve
MAX_LEN = N_BINS           # target length when resampling an unfolded light curve
BATCH_SIZE = 16            # batch size passed to model.fit
EPOCHS = 25                # maximum number of training epochs

In [None]:
#Downloading and Caching Kepler Light Curves
def download_lightcurve_cached(kepid, max_retries = 3, delay = 3):
    """Fetch the PDCSAP light curve of one Kepler target, retrying on failure.

    If a file already exists at ``CACHE_DIR/KIC<kepid>.fits`` it is opened
    directly. Otherwise the target is searched for with lightkurve and
    downloaded with ``CACHE_DIR`` as the download directory.
    NOTE(review): lightkurve chooses its own file layout below the download
    directory, so this function never creates ``KIC<kepid>.fits`` itself;
    re-use of earlier downloads presumably relies on lightkurve's own
    handling of existing files -- confirm for the installed version.

    Parameters
    ----------
    kepid : int or float
        Kepler Input Catalog identifier; converted with ``int()``.
    max_retries : int
        Maximum number of attempts before giving up.
    delay : int or float
        Seconds to sleep between two attempts.

    Returns
    -------
    The result of ``lcfile.PDCSAP_FLUX.remove_nans()``, or ``None`` when the
    search returned nothing or every attempt raised an exception.
    """
    kepid = int(kepid)
    cache_path = os.path.join(CACHE_DIR, f'KIC{kepid}.fits')
    for attempt in range(max_retries):
        try:
            if os.path.exists(cache_path):
                from lightkurve import LightCurveFile
                lcfile = LightCurveFile(cache_path)
            else:
                search = search_lightcurvefile(f'KIC {kepid}', mission='Kepler')
                # `download_dir` is the keyword accepted by SearchResult.download.
                lcfile = search.download(download_dir=CACHE_DIR)
                if lcfile is None:
                    # Nothing to download for this target: retrying cannot help.
                    print(f'No light curve file found for {kepid}')
                    return None
            lc = lcfile.PDCSAP_FLUX.remove_nans()
            return lc
        except Exception as e:
            print(f'[Attempt {attempt+1}] Failed for {kepid}: {e}')
            # Only wait when another attempt is still going to happen.
            if attempt + 1 < max_retries:
                time.sleep(delay)
    return None

In [None]:
#Preprocessing Helpers
def detrend_and_normalize(lc, window_length=401):
    """Flatten a light curve and return its flux relative to the median.

    Parameters
    ----------
    lc : object
        Light-curve-like object providing ``flatten(window_length=...)``,
        ``copy()`` and a ``flux`` attribute on the returned object.
    window_length : int
        Forwarded unchanged to ``lc.flatten``.

    Returns
    -------
    numpy.ndarray
        ``flux / nanmedian(flux) - 1.0`` of the flattened curve, or of an
        unmodified copy when flattening raised an exception.
    """
    try:
        detrended = lc.flatten(window_length=window_length)
    except Exception:
        # Flattening failed: carry on with a plain copy of the input.
        detrended = lc.copy()
    relative_flux = (detrended.flux / np.nanmedian(detrended.flux)) - 1.0
    return np.array(relative_flux)
def fold_and_resample(lc, period, epoch_time, n_bins=N_BINS):
    """Phase-fold a light curve and resample it onto a uniform phase grid.

    Parameters
    ----------
    lc : object
        Light curve providing ``fold(period=..., epoch_time=...)``; the folded
        result must expose ``time`` and ``flux``.
    period : float
        Folding period; also used to scale the folded times to phase.
    epoch_time : float
        Reference time forwarded to ``fold``.
    n_bins : int
        Number of evenly spaced grid points between phase -0.5 and 0.5.

    Returns
    -------
    numpy.ndarray or None
        Array of length ``n_bins`` holding ``resampled / nanmedian(resampled) - 1.0``.
        ``None`` is returned when the period is not a positive finite number,
        fewer than ``n_bins // 10`` finite points remain, the baseline is zero
        or non-finite, or any step raises an exception (best effort).
    """
    try:
        period = float(period)
        if not np.isfinite(period) or period <= 0:
            return None

        # `epoch_time` is the keyword documented for lightkurve 2.x LightCurve.fold,
        # the API generation that also provides the `.value` accessor read below.
        folded = lc.fold(period=period, epoch_time=epoch_time)

        # Strip Time/Quantity wrappers (when present) so that plain float arrays
        # are interpolated and returned.
        times = np.asarray(getattr(folded.time, 'value', folded.time), dtype=float)
        fluxes = np.asarray(getattr(folded.flux, 'value', folded.flux), dtype=float)

        phase_vals = (times - np.nanmedian(times)) / period
        mask = np.isfinite(phase_vals) & np.isfinite(fluxes)
        if mask.sum() < (n_bins // 10):
            return None

        # np.interp requires increasing x coordinates; sort explicitly instead of
        # relying on the order in which the folded points are stored.
        order = np.argsort(phase_vals[mask], kind='stable')
        phase_sorted = phase_vals[mask][order]
        flux_sorted = fluxes[mask][order]

        # Grid points outside the observed phase range are filled with the median flux.
        fill_value = np.nanmedian(fluxes)
        grid = np.linspace(-0.5, 0.5, n_bins)
        resampled = np.interp(grid, phase_sorted, flux_sorted,
                              left=fill_value, right=fill_value)

        baseline = np.nanmedian(resampled)
        if not np.isfinite(baseline) or baseline == 0:
            return None
        return (resampled / baseline) - 1.0
    except Exception:
        return None

In [None]:
#Safe Function to Extract Period and Epoch
def safe_get_period_and_epoch(row):
    """Extract the orbital period and transit epoch from one catalogue row.

    For each quantity the candidate columns are tried in order and the first
    one that exists, is not missing and converts to ``float`` is used. Values
    that cannot be converted are skipped instead of raising.

    Parameters
    ----------
    row : pandas.Series
        One row of the KOI table.

    Returns
    -------
    tuple
        ``(period, epoch)`` as floats; either element is ``None`` when no
        candidate column supplied a usable value. An epoch read from
        ``koi_time0`` is shifted by -2454833.0 so that it is expressed on the
        same scale as ``koi_time0bk``.
    """
    # Per the NASA Exoplanet Archive column definitions, koi_time0 is given in
    # BJD while koi_time0bk is BJD - 2454833.0.
    bkjd_offset = 2454833.0

    def _first_valid(candidates):
        """Return (column, float value) of the first usable candidate, else (None, None)."""
        for col in candidates:
            if col not in row.index:
                continue
            raw = row[col]
            if pd.isna(raw):
                continue
            try:
                return col, float(raw)
            except (TypeError, ValueError):
                continue
        return None, None

    _, period = _first_valid(['koi_period', 'period', 'kepoi_period'])
    epoch_col, epoch = _first_valid(['koi_time0bk', 'koi_time0', 'time0bk', 'epoch'])
    if epoch_col == 'koi_time0':
        epoch -= bkjd_offset
    return period, epoch
    

In [None]:
# Sample up to N_SAMPLES_PER_CLASS rows from each disposition class.
disposition = df['koi_disposition'].str.upper()
confirmed_df = df[disposition == "CONFIRMED"]
false_df = df[disposition.str.contains('FALSE', na = False)]

# min(...) keeps .sample() from asking for more rows than a class contains.
confirmed_sample = confirmed_df.sample(min(N_SAMPLES_PER_CLASS, len(confirmed_df)), random_state = 42)
false_sample = false_df.sample(min(N_SAMPLES_PER_CLASS, len(false_df)), random_state = 42)

In [None]:
# Parallel preprocessing for efficiency
X_list, y_list = [], []

def process_row(row, label) :
    """Turn one catalogue row into a fixed-length feature vector.

    The light curve is downloaded, then phase-folded and resampled when a
    period and an epoch are available. If folding is not possible (missing
    ephemeris or ``fold_and_resample`` returned ``None``) the detrended,
    unfolded curve is linearly resampled to ``MAX_LEN`` points instead.

    Parameters
    ----------
    row : pandas.Series
        Catalogue row; must contain ``'kepid'``.
    label : int
        Class label returned unchanged alongside the features.

    Returns
    -------
    tuple or None
        ``(features, label)``, or ``None`` when the light curve could not be
        obtained or contains no finite values.
    """
    kepid = int(row['kepid'])
    lc = download_lightcurve_cached(kepid)
    if lc is None:
        return None

    period, epoch = safe_get_period_and_epoch(row)
    # Compare against None: an epoch of exactly 0.0 is a legitimate value.
    if period is not None and epoch is not None:
        res = fold_and_resample(lc, period, epoch, n_bins=N_BINS)
        if res is not None:
            return res, label

    # Fallback for rows without ephemeris or with a failed fold.
    arr = np.asarray(detrend_and_normalize(lc), dtype=float)
    arr = arr[np.isfinite(arr)]
    if arr.size == 0:
        # np.interp cannot resample an empty series.
        return None
    arr_res = np.interp(np.linspace(0,1,MAX_LEN), np.linspace(0,1,len(arr)), arr)
    return arr_res, label

with ThreadPoolExecutor(max_workers = 8) as ex :
    # Label 1 = confirmed planets, label 0 = false positives.
    futures = [ex.submit(process_row, row, 1) for _, row in confirmed_sample.iterrows()]
    futures += [ex.submit(process_row, row, 0) for _, row in false_sample.iterrows()]
    # Collect in submission order rather than completion order so that the
    # sample order, and therefore the seeded train/test split, is reproducible.
    for f in futures:
        try:
            res = f.result()
        except Exception as e:
            # A single failing row must not abort the whole run.
            print(f'Skipping a row that failed during preprocessing: {e}')
            continue
        if res is not None:
            X_list.append(res[0])
            y_list.append(res[1])

if not X_list:
    raise RuntimeError('No light curves could be processed; the dataset is empty.')

# Add a trailing channel axis: (n_samples, n_bins) -> (n_samples, n_bins, 1).
X_arr = np.expand_dims(np.array(X_list, dtype='float32'), -1)
y_arr = np.array(y_list, dtype='int64')
print('Dataset shape:', X_arr.shape, 'Class distribution:', np.bincount(y_arr, minlength=2))

In [None]:
# Train/test split and class weights
# A fraction (instead of a fixed sample count) is held out so that the split
# scales with however many light curves were processed successfully.
X_train, X_test, y_train, y_test = train_test_split(
    X_arr, y_arr, test_size = 0.2, stratify = y_arr, random_state = 42)

# 'balanced' weights compensate for unequal class counts in the training set.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes = classes, y = y_train)
class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}
print('Class Weights:', class_weight_dict)

In [None]:
#1D CNN Definition
def build_cnn(input_shape):
    """Build and compile the 1D convolutional binary classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single sample, passed to ``layers.Input``.

    Returns
    -------
    A compiled Keras model with one sigmoid output, trained with binary
    cross-entropy (Adam, 1e-4) and reporting accuracy and AUC (named 'auc').
    """
    net_in = layers.Input(shape=input_shape)

    # Two convolutions, each followed by batch normalisation, then 2x pooling.
    h = layers.Conv1D(filters=32, kernel_size=9, padding='same', activation='relu')(net_in)
    h = layers.BatchNormalization()(h)
    h = layers.Conv1D(filters=64, kernel_size=5, padding='same', activation='relu')(h)
    h = layers.BatchNormalization()(h)
    h = layers.MaxPooling1D(pool_size=2)(h)

    # Two 128-filter convolutions; only the first one is batch-normalised.
    h = layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(h)
    h = layers.BatchNormalization()(h)
    h = layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(h)

    # Classification head: average over the sequence axis, dropout, sigmoid unit.
    h = layers.GlobalAveragePooling1D()(h)
    h = layers.Dropout(0.5)(h)
    net_out = layers.Dense(1, activation='sigmoid')(h)

    cnn = models.Model(net_in, net_out)
    cnn.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )
    return cnn


# Instantiate for the per-sample shape of the training data and print the layer table.
model = build_cnn(X_train.shape[1:])
model.summary()

In [None]:
# Training with callbacks
# Early stopping and checkpointing both track validation AUC (higher is better);
# the learning-rate reduction tracks validation loss.
early_stop = callbacks.EarlyStopping(
    monitor='val_auc', mode='max', patience=6, restore_best_weights=True)
lr_reduce = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
checkpoint = callbacks.ModelCheckpoint(
    'best_cnn_kepler.h5', monitor='val_auc', mode='max', save_best_only=True)
cb = [early_stop, lr_reduce, checkpoint]

history = model.fit(
    X_train,
    y_train,
    validation_split=0.15,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight_dict,
    callbacks=cb,
    verbose=2,
)
      

In [None]:
# Evaluation: ROC and precision-recall metrics
# roc_curve is not part of the sklearn.metrics import at the top of the notebook.
from sklearn.metrics import roc_curve, precision_recall_curve, auc

# Predicted scores for the test set, flattened to one value per sample.
y_pred = model.predict(X_test).ravel()

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred)
pr_auc = auc(recall, precision)

# ROC and precision-recall curves side by side.
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12,5))

ax_roc.plot(fpr, tpr, label=f'ROC AUC={roc_auc:.3f}')
ax_roc.plot([0,1], [0,1], '--', color='gray')  # diagonal reference line
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

ax_pr.plot(recall, precision, label=f'PR AUC={pr_auc:.3f}')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()

fig.tight_layout()
plt.show()

# Confusion matrix at a 0.5 decision threshold.
# confusion_matrix and ConfusionMatrixDisplay come from the import cell at the top.
y_pred_label = (y_pred>0.5).astype(int)
# labels=[0, 1] keeps the matrix 2x2 (matching the two display labels) even
# when the test set or the predictions contain a single class only.
cm = confusion_matrix(y_test, y_pred_label, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['FalsePositive','Confirmed'])
disp.plot(cmap='Blues'); plt.show()

# Plot the six test samples with the highest predicted score, highest first.
top_idx = np.argsort(y_pred)[-6:][::-1]
for i in top_idx:
    fig, ax = plt.subplots(figsize=(8,3))
    ax.plot(X_test[i].squeeze(), label=f'pred={y_pred[i]:.3f}, label={y_test[i]}')
    ax.set_title('Folded / Resampled Light Curve')
    ax.set_xlabel('Phase bin')
    ax.legend()
    plt.show()
    
# Persist the train/test arrays (compressed .npz) and the trained network.
split_arrays = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
np.savez_compressed('kepler_200_dataset.npz', **split_arrays)
model.save('cnn_kepler_200_v2.h5')
print('Saved dataset and CNN model.')