In [1]:
# Import libraries
from pathlib import Path
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import pickle

In [2]:
# Load the feature table, the labels, and the train/val/test split assignment.
# All three inputs share a "kepid" column, which is used as the join key.
X = pd.read_parquet("table_v1.parquet")
y = pd.read_csv("labels_v1.csv")
spl = pd.read_csv("split_v1.csv")

# Merge into one dataframe. These are inner joins (the pandas default), so a
# kepid missing from any of the three inputs is dropped.
# validate="many_to_one" makes pandas raise MergeError if a kepid appears more
# than once in the labels or split file. Without it, such duplicates would
# silently multiply rows and could place the same kepid in several splits.
df = (
    X.merge(y, on="kepid", validate="many_to_one")
     .merge(spl, on="kepid", validate="many_to_one")
)

# Column used as the binary classification label
target_col = "label_lenient"

print("Data shape:", df.shape)
print("Planet-positive rate:", df[target_col].mean())

Data shape: (150762, 30)
Planet-positive rate: 0.02406441941603322


In [4]:
# Partition the merged frame into train / validation / test sets using the
# precomputed "split" column.

# Every column that is not the key, the split tag, or a label is a feature.
non_feature_cols = {"kepid", "split", "label_lenient", "label_strict"}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

X_all = df[feature_cols]
y_all = df[target_col]

# Build each row mask once and reuse it for both features and labels.
is_train = df["split"] == "train"
is_val = df["split"] == "val"
is_test = df["split"] == "test"

X_train, y_train = X_all.loc[is_train], y_all.loc[is_train]
X_val, y_val = X_all.loc[is_val], y_all.loc[is_val]
X_test, y_test = X_all.loc[is_test], y_all.loc[is_test]

# Report (rows, feature columns) for each partition.
print("Train size:", X_train.shape)
print("Val size:", X_val.shape)
print("Test size:", X_test.shape)

Train size: (105532, 26)
Val size: (15077, 26)
Test size: (30153, 26)


In [5]:
# Model definitions. Each model is a two-step pipeline: standardize the
# features, then classify. Nothing is fitted in this cell.

# Logistic Regression Model
logreg_params = dict(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=1000,
    n_jobs=-1,
)
log_reg = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(**logreg_params)),
])

# Neural Network (MLP) Model
# NOTE(review): unlike the logistic regression above, no class weighting is
# configured for the MLP -- confirm this is intended.
mlp_params = dict(
    hidden_layer_sizes=(64,),  # one dense hidden layer with 64 neurons
    activation="relu",
    solver="adam",
    max_iter=50,               # keep training quick
    random_state=42,
)
mlp_nn = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(**mlp_params)),
])

In [6]:
# Fit both pipelines on the training split, printing a message before and
# after each fit so progress is visible in the cell output.
training_runs = (
    ("\nTraining Logistic Regression...", log_reg, "Logistic Regression trained.\n"),
    ("Training Neural Network (MLP)...", mlp_nn, "Neural Network trained.\n"),
)

for start_msg, pipeline, done_msg in training_runs:
    print(start_msg)
    pipeline.fit(X_train, y_train)
    print(done_msg)


Training Logistic Regression...
Logistic Regression trained.

Training Neural Network (MLP)...
Neural Network trained.

