# CNN Model - Light curve data

## Datasets

We start by checking the dataset we will be working with.

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Mauricio/dataset_.csv").drop("Unnamed: 0", axis=1)

train_data = df[df["disposition"] != 2]
train_data.head()

Unnamed: 0,search_id,num_planet,disposition,ror,stellar_mass,ss_gravity,period,duration,transit_epoch,global_view,local_view
0,KIC 9838468,1,1,0.012628,0.954,4.309,54.409961,9.314,2455008.0,"[-0.11291305906538904, 0.10793178189212953, 0....","[0.2713626011471899, 0.27876775831018774, 0.40..."
1,KIC 9838414,1,0,0.043932,0.748,4.551,1.332615,5.161,2454965.0,"[-0.10636503616246262, 0.1578112142941182, -0....","[0.09742806203994805, 0.14907600709401858, 0.0..."
2,KIC 9838060,1,0,0.093998,0.915,4.572,23.815784,3.7591,2454975.0,"[0.000595742676004736, 0.0003238881800717846, ...","[0.02286672191496115, 0.03699818466571551, 0.0..."
3,KIC 9837685,1,1,0.027248,0.923,4.562,13.712185,2.437,2454969.0,"[0.0011361371758506302, 0.05481143227875999, 0...","[0.010731168333516785, -0.03394275005661493, -..."
4,KIC 9837661,2,1,0.038292,0.513,4.744,2.226496,1.7073,2454966.0,"[-0.11011312998955108, -0.30014839757803774, 0...","[0.4321008216256368, 0.2384604509332056, -0.34..."


In [4]:
import json
import numpy as np

Xg = np.array([np.array(json.loads(s)) for s in train_data.loc[:, "global_view"]])
Xl = np.array([np.array(json.loads(s)) for s in train_data.loc[:, "local_view"]])
Xs = train_data.loc[:, "ror":"ss_gravity"].to_numpy()
y = train_data["disposition"].to_numpy()

print(Xg.shape)
print(Xl.shape)
print(Xs.shape)
print(y.shape)

(1549, 1001)
(1549, 101)
(1549, 3)
(1549,)


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D,
    Dense, Dropout, BatchNormalization, Concatenate, Flatten
)
from tensorflow.keras.metrics import (
    Accuracy, BinaryAccuracy, Precision, Recall, AUC, FalsePositives,
    FalseNegatives, TruePositives, TrueNegatives
)

## Network architecture & training

Here we build the convolutional neuronal network inspired by the one described by Shallue & Vandenburg (2017)

In [6]:
# Rama Gloabl
global_input = Input(shape=(1001, 1), name="global_view")

xg = Conv1D(16,5, activation="relu")(global_input)
xg = Conv1D(16,5, activation="relu")(xg)
xg = MaxPooling1D(3, strides=2)(xg)

xg = Conv1D(32,5, activation="relu")(xg)
xg = Conv1D(32,5, activation="relu")(xg)
xg = MaxPooling1D(3, strides=2)(xg)

xg = Conv1D(64,5, activation="relu")(xg)
xg = Conv1D(64,5, activation="relu")(xg)
xg = MaxPooling1D(3, strides=2)(xg)

xg = Conv1D(128,5, activation="relu")(xg)
xg = Conv1D(128,5, activation="relu")(xg)
xg = MaxPooling1D(3, strides=2)(xg)

xg = Flatten()(xg)

# Local view
local_input = Input(shape=(101, 1), name="local_view")

xl = Conv1D(16,5, activation="relu")(local_input)
xl = Conv1D(16,5, activation="relu")(xl)
xl = MaxPooling1D(3, strides=2)(xl)

xl = Conv1D(32,5, activation="relu")(xl)
xl = Conv1D(32,5, activation="relu")(xl)
xl = MaxPooling1D(3, strides=2)(xl)

xl = Flatten()(xl)

scalar_input = Input(shape=(3,), name="scalar_features")

combined = Concatenate()([xg, xl, scalar_input])

z = Dense(512, activation="relu")(combined)
z = Dropout(0.3)(z)
z = Dense(512, activation="relu")(z)
z = Dropout(0.3)(z)
z = Dense(512, activation="relu")(z)
z = Dropout(0.3)(z)
z = Dense(512, activation="relu")(z)

output = Dense(1, activation="sigmoid", name="output")(z)

model = Model(inputs=[global_input, local_input, scalar_input], outputs=output)
model.summary()

Then split the data

In [11]:
# Separate dataset
import numpy as np
from sklearn.model_selection import train_test_split

# y: (n,) o (n,1)
y_bin = y.ravel().astype(int)

Xg_train, Xg_test, Xl_train, Xl_test, Xs_train, Xs_test, y_train, y_test = train_test_split(
    Xg, Xl, Xs, y_bin, test_size=0.15, random_state=42, stratify=y_bin
)
Xg_train, Xg_val,  Xl_train, Xl_val,  Xs_train, Xs_val,  y_train, y_val  = train_test_split(
    Xg_train, Xl_train, Xs_train, y_train, test_size=0.1765,  # ≈ 0.15 / 0.85
    random_state=42, stratify=y_train
)

print("Train:", Xg_train.shape, Xl_train.shape, Xs_train.shape, y_train.shape)
print("Valid:", Xg_val.shape,   Xl_val.shape,   Xs_val.shape,   y_val.shape)
print("Test :", Xg_test.shape,  Xl_test.shape,  Xs_test.shape,  y_test.shape)


Train: (1083, 1001) (1083, 101) (1083, 3) (1083,)
Valid: (233, 1001) (233, 101) (233, 3) (233,)
Test : (233, 1001) (233, 101) (233, 3) (233,)


And train the neural network

In [12]:
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="AUC"), tf.keras.metrics.Precision(name="Precision"), tf.keras.metrics.Recall(name="Recall")]
)

callbacks = [
    EarlyStopping(monitor="val_AUC", mode="max", patience=8, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_AUC", mode="max", factor=0.5, patience=4, min_lr=1e-6),
    ModelCheckpoint("best_model.keras", monitor="val_AUC", mode="max", save_best_only=True)
]

In [13]:
from sklearn.utils.class_weight import compute_class_weight
classes = np.array([0,1])
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {0: weights[0], 1: weights[1]}

In [14]:
history = model.fit(
    [Xg_train, Xl_train, Xs_train], y_train,
    validation_data=([Xg_val, Xl_val, Xs_val], y_val),
    epochs=60,
    batch_size=32,
    callbacks=callbacks,
    class_weight=class_weight,  # descomenta si lo usas
    verbose=1
)


Epoch 1/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 382ms/step - AUC: 0.9585 - Precision: 0.9218 - Recall: 0.9153 - accuracy: 0.9067 - loss: 0.2574 - val_AUC: 0.9724 - val_Precision: 0.9739 - val_Recall: 0.8485 - val_accuracy: 0.9013 - val_loss: 0.2593 - learning_rate: 0.0010
Epoch 2/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - AUC: 0.9604 - Precision: 0.9275 - Recall: 0.9208 - accuracy: 0.9112 - loss: 0.2455 - val_AUC: 0.9689 - val_Precision: 0.9457 - val_Recall: 0.9242 - val_accuracy: 0.9270 - val_loss: 0.2371 - learning_rate: 0.0010
Epoch 3/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - AUC: 0.9658 - Precision: 0.9294 - Recall: 0.9073 - accuracy: 0.9113 - loss: 0.2255 - val_AUC: 0.9658 - val_Precision: 0.8707 - val_Recall: 0.9697 - val_accuracy: 0.9013 - val_loss: 0.2624 - learning_rate: 0.0010
Epoch 4/60
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - AUC: 0.9702 - 

## Evaluation metrics

Finally, we test the validity of our model with the usual evaluation metrics as well as a visualization of the accuracy and loss change over the epochs when training.

In [15]:
from sklearn.metrics import f1_score, precision_recall_curve

# Probabilidades en valid
p_val = model.predict([Xg_val, Xl_val, Xs_val], verbose=0).ravel()

# Barrido de umbrales usando la curva Prec-Recall
prec, rec, thr = precision_recall_curve(y_val, p_val)
f1s = 2 * (prec*rec) / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1s)
best_thr = thr[best_idx] if best_idx < len(thr) else 0.5
print(f"Umbral óptimo (F1): {best_thr:.4f}")

Umbral óptimo (F1): 0.1978


In [16]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report
)

# Carga pesos óptimos por si EarlyStopping no restauró:
model.load_weights("best_model.keras")

p_test = model.predict([Xg_test, Xl_test, Xs_test], verbose=0).ravel()
yhat_test = (p_test >= best_thr).astype(int)

acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
roc  = roc_auc_score(y_test, p_test)               # AUC-ROC (umbral-independiente)
ap   = average_precision_score(y_test, p_test)     # AUC-PR (AP)

cm   = confusion_matrix(y_test, yhat_test)
report = classification_report(y_test, yhat_test, digits=3, zero_division=0)

print("=== TEST METRICS ===")
print(f"Accuracy:     {acc:.4f}")
print(f"Precision:    {prec:.4f}")
print(f"Recall:       {rec:.4f}")
print(f"F1:           {f1:.4f}")
print(f"AUC-ROC:      {roc:.4f}")
print(f"AUC-PR (AP):  {ap:.4f}")
print("Confusion matrix:\n", cm)
print("\nClassification report:\n", report)


=== TEST METRICS ===
Accuracy:     0.9270
Precision:    0.9197
Recall:       0.9545
F1:           0.9368
AUC-ROC:      0.9677
AUC-PR (AP):  0.9670
Confusion matrix:
 [[ 90  11]
 [  6 126]]

Classification report:
               precision    recall  f1-score   support

           0      0.938     0.891     0.914       101
           1      0.920     0.955     0.937       132

    accuracy                          0.927       233
   macro avg      0.929     0.923     0.925       233
weighted avg      0.927     0.927     0.927       233



In [17]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report
)

# Carga pesos óptimos por si EarlyStopping no restauró:
model.load_weights("best_model.keras")

p_train = model.predict([Xg_train, Xl_train, Xs_train], verbose=0).ravel()
yhat_train = (p_train >= best_thr).astype(int)

acc2  = accuracy_score(y_train, yhat_train)
prec2 = precision_score(y_train, yhat_train, zero_division=0)
rec2  = recall_score(y_train, yhat_train, zero_division=0)
f12   = f1_score(y_train, yhat_train, zero_division=0)
roc2  = roc_auc_score(y_train, p_train)               # AUC-ROC (umbral-independiente)
ap2   = average_precision_score(y_train, p_train)     # AUC-PR (AP)

cm2   = confusion_matrix(y_test, yhat_test)
report2 = classification_report(y_train, yhat_train, digits=3, zero_division=0)

print("=== TRAIN METRICS ===")
print(f"Accuracy:     {acc2:.4f}")
print(f"Precision:    {prec2:.4f}")
print(f"Recall:       {rec2:.4f}")
print(f"F1:           {f12:.4f}")
print(f"AUC-ROC:      {roc2:.4f}")
print(f"AUC-PR (AP):  {ap2:.4f}")
print("Confusion matrix:\n", cm2)
print("\nClassification report:\n", report2)


=== TRAIN METRICS ===
Accuracy:     0.9317
Precision:    0.9498
Recall:       0.9281
F1:           0.9388
AUC-ROC:      0.9735
AUC-PR (AP):  0.9744
Confusion matrix:
 [[ 90  11]
 [  6 126]]

Classification report:
               precision    recall  f1-score   support

           0      0.909     0.936     0.923       471
           1      0.950     0.928     0.939       612

    accuracy                          0.932      1083
   macro avg      0.930     0.932     0.931      1083
weighted avg      0.932     0.932     0.932      1083



In [None]:
# --- Simple train/val(test) split, train, evaluate, and plot ---
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 1) Hold out a test set
Xg_tr, Xg_te, Xl_tr, Xl_te, Xs_tr, Xs_te, y_tr, y_te = train_test_split(
    Xg, Xl, Xs, y, test_size=0.2, random_state=42, stratify=y
)

# 2) Train with an internal validation split
history = model.fit(
    [Xg_tr, Xl_tr, Xs_tr], y_tr,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    verbose=1
)

# 3) One-shot test evaluation
test_scores = model.evaluate([Xg_te, Xl_te, Xs_te], y_te, verbose=0)
test_loss = float(test_scores[0])      # 'loss'
test_acc  = float(test_scores[1])      # 'accuracy' (the first metric)

# 4) Plots (train vs val, plus a flat test line)
epochs = range(1, len(history.history['loss']) + 1)

plt.figure()
plt.plot(epochs, history.history['loss'], label='train')
plt.plot(epochs, history.history['val_loss'], label='val')
plt.hlines(test_loss, 1, len(epochs), linestyles='dashed', label=f'test={test_loss:.3f}')
plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.legend(); plt.tight_layout()
plt.show()

plt.figure()
plt.plot(epochs, history.history['accuracy'], label='train')
plt.plot(epochs, history.history['val_accuracy'], label='val')
plt.hlines(test_acc, 1, len(epochs), linestyles='dashed', label=f'test={test_acc:.3f}')
plt.xlabel('Epoch'); plt.ylabel('Accuracy'); plt.legend(); plt.tight_layout()
plt.show()


## Saving Model

In [None]:
model.save("/content/drive/MyDrive/Mauricio/exoplanet_model.h5")