In [27]:
# Cell 1: Data Loading & Preprocessing

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTEENN
import warnings

# Suppress TensorFlow and XGBoost warnings
warnings.filterwarnings("ignore", category=UserWarning, module="tensorflow")
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

# Load the training and testing data
train_data = pd.read_csv('exoTrain.csv')
test_data = pd.read_csv('exoTest.csv')

print(train_data.head())
print(train_data.info())

# 1. Scale data (assuming first column is LABEL)
scaler = StandardScaler()
train_data_scaled = train_data.copy()
train_data_scaled.iloc[:, 1:] = scaler.fit_transform(train_data.iloc[:, 1:])
test_data_scaled = test_data.copy()
test_data_scaled.iloc[:, 1:] = scaler.transform(test_data.iloc[:, 1:])

# 2. Handle outliers
def handle_outliers(data):
    for column in data.columns[1:]:
        data_column = data[column]
        median, std = data_column.median(), data_column.std()
        outliers = (data_column - median).abs() > 3 * std
        data.loc[outliers, column] = np.sign(data_column[outliers]) * 3 * std + median
    return data

train_data_scaled = handle_outliers(train_data_scaled)
test_data_scaled = handle_outliers(test_data_scaled)

# 3. PCA: Reduce dimensions (preserve 95% variance)
pca = PCA(n_components=0.95)
train_data_pca = pca.fit_transform(train_data_scaled.iloc[:, 1:])
test_data_pca = pca.transform(test_data_scaled.iloc[:, 1:])

# 4. Balance data using SMOTEENN
X_train = train_data_pca
y_train = train_data['LABEL'] - 1  # Convert LABEL to 0/1
smote_enn = SMOTEENN(random_state=42)
X_train_balanced, y_train_balanced = smote_enn.fit_resample(X_train, y_train)
print(f"Balanced features: {X_train_balanced.shape}, Balanced labels: {np.array(y_train_balanced).shape}")


   LABEL   FLUX.1   FLUX.2   FLUX.3   FLUX.4   FLUX.5   FLUX.6  FLUX.7  \
0      2    93.85    83.81    20.10   -26.98   -39.56  -124.71 -135.18   
1      2   -38.88   -33.83   -58.54   -40.09   -79.31   -72.81  -86.55   
2      2   532.64   535.92   513.73   496.92   456.45   466.00  464.50   
3      2   326.52   347.39   302.35   298.13   317.74   312.70  322.33   
4      2 -1107.21 -1112.59 -1118.95 -1095.10 -1057.55 -1034.48 -998.34   

    FLUX.8  FLUX.9  ...  FLUX.3188  FLUX.3189  FLUX.3190  FLUX.3191  \
0   -96.27  -79.89  ...     -78.07    -102.15    -102.15      25.13   
1   -85.33  -83.97  ...      -3.28     -32.21     -32.21     -24.89   
2   486.39  436.56  ...     -71.69      13.31      13.31     -29.89   
3   311.31  312.42  ...       5.71      -3.73      -3.73      30.05   
4 -1022.71 -989.57  ...    -594.37    -401.66    -401.66    -357.24   

   FLUX.3192  FLUX.3193  FLUX.3194  FLUX.3195  FLUX.3196  FLUX.3197  
0      48.57      92.54      39.32      61.42       5.08  

In [29]:
# Cell 2: Logistic Regression Baseline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, classification_report

y_test = test_data['LABEL'] - 1  # Convert LABEL to 0/1

log_reg = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)

y_proba = log_reg.predict_proba(test_data_pca)[:, 1]
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, y_proba)
f1_scores_lr = (2 * precision_lr[:-1] * recall_lr[:-1]) / (precision_lr[:-1] + recall_lr[:-1] + 1e-9)
optimal_idx_lr = np.argmax(f1_scores_lr)
best_threshold_lr = thresholds_lr[optimal_idx_lr]

y_pred_lr = (y_proba >= best_threshold_lr).astype(int)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr, zero_division=0))


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       565
           1       0.33      0.20      0.25         5

    accuracy                           0.99       570
   macro avg       0.66      0.60      0.62       570
weighted avg       0.99      0.99      0.99       570



In [31]:
# Cell 3: Neural Network Model & Evaluation

import tensorflow as tf
tf.keras.backend.clear_session()

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split

print("Unique classes in y_train_balanced:", np.unique(y_train_balanced))

model = Sequential([
    Input(shape=(X_train_balanced.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.02)),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(0.02)),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

early_stop = EarlyStopping(
    monitor='val_recall',
    patience=15,
    mode='max',
    restore_best_weights=True
)

X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_balanced, y_train_balanced,
    test_size=0.2,
    stratify=y_train_balanced,
    random_state=42
)

y_train_final = np.array(y_train_final)
y_val = np.array(y_val)

class_weight = {0: 1, 1: 15}

history = model.fit(
    X_train_final, y_train_final,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
    class_weight=class_weight,
    verbose=1
)

val_proba = model.predict(X_val).flatten()
precision_nn, recall_nn, thresholds_nn = precision_recall_curve(y_val, val_proba)
f1_scores_nn = (2 * precision_nn[:-1] * recall_nn[:-1]) / (precision_nn[:-1] + recall_nn[:-1] + 1e-9)
best_threshold_nn = thresholds_nn[np.argmax(f1_scores_nn)]
print(f"Optimal threshold for NN: {best_threshold_nn:.4f}")

nn_proba = model.predict(test_data_pca).flatten()
y_pred_nn = (nn_proba >= best_threshold_nn).astype(int)
print("\nNeural Network Results:")
print(classification_report(y_test, y_pred_nn, zero_division=0))


Unique classes in y_train_balanced: [0 1]
Epoch 1/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.5084 - loss: 5.5026 - precision: 0.5145 - recall: 0.9088 - val_accuracy: 0.5148 - val_loss: 2.3105 - val_precision: 0.5148 - val_recall: 1.0000
Epoch 2/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5149 - loss: 2.6886 - precision: 0.5144 - recall: 0.9999 - val_accuracy: 0.5173 - val_loss: 1.7958 - val_precision: 0.5161 - val_recall: 1.0000
Epoch 3/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5243 - loss: 2.2412 - precision: 0.5221 - recall: 1.0000 - val_accuracy: 0.5194 - val_loss: 1.6067 - val_precision: 0.5172 - val_recall: 1.0000
Epoch 4/100
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5229 - loss: 2.0609 - precision: 0.5195 - recall: 1.0000 - val_accuracy: 0.5219 - val_loss: 1.5168 - val_precision: 0.5185

In [33]:
# Cell 4: XGBoost Model & Evaluation

from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve, classification_report

# Compute scale_pos_weight for imbalanced data
neg_count = np.sum(np.array(y_train_balanced) == 0)
pos_count = np.sum(np.array(y_train_balanced) == 1)
scale_pos_weight = neg_count / pos_count
print("Scale_pos_weight:", scale_pos_weight)

# Initialize XGBClassifier without using the "use_label_encoder" parameter
xgb_model = XGBClassifier(
    random_state=42,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss'
)

xgb_model.fit(X_train_balanced, y_train_balanced)

xgb_proba = xgb_model.predict_proba(test_data_pca)[:, 1]
precision_xgb, recall_xgb, thresholds_xgb = precision_recall_curve(y_test, xgb_proba)
f1_scores_xgb = (2 * precision_xgb[:-1] * recall_xgb[:-1]) / (precision_xgb[:-1] + recall_xgb[:-1] + 1e-9)
optimal_idx_xgb = np.argmax(f1_scores_xgb)
best_threshold_xgb = thresholds_xgb[optimal_idx_xgb]
print("Optimal threshold for XGB:", best_threshold_xgb)

y_pred_xgb = (xgb_proba >= best_threshold_xgb).astype(int)
print("XGBoost Model Results:")
print(classification_report(y_test, y_pred_xgb, zero_division=0))


Scale_pos_weight: 0.942936397860115
Optimal threshold for XGB: 0.35649315
XGBoost Model Results:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       565
           1       0.33      0.20      0.25         5

    accuracy                           0.99       570
   macro avg       0.66      0.60      0.62       570
weighted avg       0.99      0.99      0.99       570



In [35]:
# Cell 5: Random Forest Model & Evaluation

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, classification_report

# Initialize the RandomForestClassifier with balanced class weights
rf_model = RandomForestClassifier(n_estimators=200, 
                                  class_weight='balanced',
                                  random_state=42)

# Train on the balanced training data
rf_model.fit(X_train_balanced, y_train_balanced)

# Predict probabilities on the test data
rf_proba = rf_model.predict_proba(test_data_pca)[:, 1]

# Generate the precision-recall curve and compute F1-scores for threshold optimization
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, rf_proba)
# Note: thresholds_rf has one less element than precision_rf and recall_rf.
f1_scores_rf = (2 * precision_rf[:-1] * recall_rf[:-1]) / (precision_rf[:-1] + recall_rf[:-1] + 1e-9)
optimal_idx_rf = np.argmax(f1_scores_rf)
best_threshold_rf = thresholds_rf[optimal_idx_rf]
print("Optimal threshold for RF:", best_threshold_rf)

# Apply the optimal threshold to get binary predictions
y_pred_rf = (rf_proba >= best_threshold_rf).astype(int)

# Display the classification report
print("Random Forest Model Results:")
print(classification_report(y_test, y_pred_rf, zero_division=0))


Optimal threshold for RF: 0.395
Random Forest Model Results:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       565
           1       1.00      0.20      0.33         5

    accuracy                           0.99       570
   macro avg       1.00      0.60      0.66       570
weighted avg       0.99      0.99      0.99       570

