In [None]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# 2. Load & split
# The CSV is read without a header row; column 0 is used as the target and
# every remaining column as a feature.
higgs_path = 'data/HIGGS.csv'
df = pd.read_csv(higgs_path, header=None)
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# First carve off 30% of the rows, then halve that hold-out into validation
# and test, giving a 70 / 15 / 15 split. Both splits share the same seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed)

# 3. Baseline logistic regression
# Only a prefix of the training rows is used to fit (sub-sample for speed),
# and only a prefix of the test rows is scored.
lr_fit_rows = 100000
lr_eval_rows = 10000

lr = LogisticRegression(max_iter=500, n_jobs=-1)
lr.fit(X_train[:lr_fit_rows], y_train[:lr_fit_rows])

# predict_proba returns one column per class; column 1 is the second entry
# of lr.classes_ and is the score passed to roc_auc_score.
lr_eval_features = X_test[:lr_eval_rows]
lr_eval_labels = y_test[:lr_eval_rows]
y_pred_lr = lr.predict_proba(lr_eval_features)[:, 1]
print('Logistic AUC:', roc_auc_score(lr_eval_labels, y_pred_lr))

# 4. Small Keras ANN
# Fully connected network: two ReLU hidden layers (64 then 32 units) feeding
# a single sigmoid unit that outputs the positive-class probability.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
# The built-in 'AUC' metric is kept for the per-epoch training/validation
# logs only; it is not used for the final reported score (see below).
model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['AUC'])
# Train on the first 50k training rows, validating on the first 10k
# validation rows after every epoch.
model.fit(X_train[:50000], y_train[:50000],
          validation_data=(X_val[:10000], y_val[:10000]),
          epochs=20, batch_size=1024)
# Report the test AUC with roc_auc_score, the same function used for the
# logistic baseline. Keras' own AUC metric (what model.evaluate returns) is a
# thresholded approximation, so the two printed numbers would otherwise not
# be computed the same way. predict() yields shape (n, 1); flatten to (n,).
y_pred_ann = model.predict(X_test[:10000]).ravel()
print('ANN AUC:', roc_auc_score(y_test[:10000], y_pred_ann))
