In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from scipy.stats import uniform, randint
from lightgbm import LGBMClassifier
import shap


In [2]:
df = pd.read_csv("AppML_InitialProject_train.csv")
real_data = pd.read_csv("AppML_InitialProject_test_classification.csv")

In [3]:
target_column = 'p_Truth_isElectron'
all_features = []
for col in df.columns:
    if col != target_column and col != "p_Truth_Energy":
        all_features.append(col)

# Normalize data for feature selection
print("Normalizing data for feature selection...")
scaler_fs = StandardScaler()
X_normalized = scaler_fs.fit_transform(df[all_features])
X_normalized_df = pd.DataFrame(X_normalized, columns=all_features)

Normalizing data for feature selection...


In [5]:
X_temp = X_normalized
y_temp = df[target_column]
X_temp_train, X_temp_test, y_temp_train, y_temp_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)


xgb_model = xgb.XGBClassifier(
    n_estimators=100, 
    max_depth=10, 
    random_state=42,
    device='cuda',
    objective="binary:logistic"
)
xgb_model.fit(X_temp_train, y_temp_train)

# Get feature importance from XGBoost
xgb_importance = xgb_model.feature_importances_
top_xgb_indices = np.argsort(xgb_importance)[-70:]  # Top 70 features
xgb_features = [all_features[i] for i in top_xgb_indices]
print(f"Features after XGBoost analysis: {len(xgb_features)}")

Features after XGBoost analysis: 70


In [None]:
X_xgb = X_normalized_df[xgb_features]
X_xgb_train, X_xgb_test, y_xgb_train, y_xgb_test = train_test_split(X_xgb, df[target_column], test_size=0.3, random_state=42)

# Train LGBMClassifier for SHAP analysis
temp_model = LGBMClassifier(
    n_estimators=100,
    num_leaves=31,    
    device='gpu'
)
temp_model.fit(X_xgb_train, y_xgb_train)

# Calculate SHAP values
explainer = shap.TreeExplainer(temp_model)
shap_values = explainer.shap_values(X_xgb_test[:1000])
if isinstance(shap_values, list):
    shap_importance = np.abs(shap_values[1]).mean(0)
else:
    shap_importance = np.abs(shap_values).mean(0)

# Select top features based on SHAP
top_shap_indices = np.argsort(shap_importance)[-20:]
selected_columns = [xgb_features[i] for i in top_shap_indices]

[LightGBM] [Info] Number of positive: 26556, number of negative: 99444
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 15206
[LightGBM] [Info] Number of data points in the train set: 126000, number of used features: 70
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (7.69 MB) transferred to GPU in 0.006957 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.210762 -> initscore=-1.320339
[LightGBM] [Info] Start training from score -1.320339




In [7]:
print(f"Final selected features: {len(selected_columns)}")
print("Selected features:", selected_columns)

Final selected features: 20
Selected features: ['pX_f1core', 'p_Reta', 'pX_ambiguityType', 'pX_deltaEta0', 'p_f3', 'p_pt_track', 'p_d0', 'p_f1', 'p_deltaPhiRescaled2', 'p_etcone30', 'p_Rhad', 'p_deltaEta1', 'pX_deltaEta1', 'p_etcone20', 'p_numberOfInnermostPixelHits', 'p_dPOverP', 'p_TRTPID', 'p_ptPU30', 'p_sigmad0', 'pX_MultiLepton']


In [8]:
X_df = df[selected_columns]
y = df[target_column]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)

X = np.array(X_scaled)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# XGBoost solution

In [9]:
param_distributions = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),  
    'reg_alpha': uniform(0, 1), 
    'reg_lambda': uniform(0, 1) 
}

base_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    n_jobs=-1
    
)

# Perform randomized search with 3-fold CV
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_distributions,
    n_iter=10,  # Number of parameter combinations to try
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [10]:
best_params = random_search.best_params_
print(f"\nBest parameters found:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

print(f"Best cross-validation score (AUC): {random_search.best_score_:.4f}")

model = random_search.best_estimator_
model.set_params(device='cuda')

print("\nPerforming 3-Fold Cross Validation with optimized hyperparameters...")

cv_model = random_search.best_estimator_
cv_model.set_params(device='cpu')

cv_scores_accuracy = cross_val_score(cv_model, X, y, cv=3, scoring='accuracy', n_jobs=-1)
cv_scores_auc = cross_val_score(cv_model, X, y, cv=3, scoring='roc_auc', n_jobs=-1)
cv_scores_f1 = cross_val_score(cv_model, X, y, cv=3, scoring='f1', n_jobs=-1)

print("Cross Validation Results:")
print(f"Accuracy - Mean: {cv_scores_accuracy.mean():.4f}, Std: {cv_scores_accuracy.std():.4f}")
print(f"AUC - Mean: {cv_scores_auc.mean():.4f}, Std: {cv_scores_auc.std():.4f}")
print(f"F1 Score - Mean: {cv_scores_f1.mean():.4f}, Std: {cv_scores_f1.std():.4f}")



Best parameters found:
  colsample_bytree: 0.9329770563201687
  learning_rate: 0.052467822135655234
  max_depth: 14
  n_estimators: 376
  reg_alpha: 0.6174815096277165
  reg_lambda: 0.6116531604882809
  subsample: 0.6028265220878869
Best cross-validation score (AUC): 0.9933

Performing 3-Fold Cross Validation with optimized hyperparameters...
Cross Validation Results:
Accuracy - Mean: 0.9691, Std: 0.0000
AUC - Mean: 0.9935, Std: 0.0000
F1 Score - Mean: 0.9249, Std: 0.0002


# Neural network solution

In [11]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch
from sklearn.model_selection import KFold

In [12]:
b_size=2**13

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA GeForce RTX 2060


In [None]:
X_df = df[selected_columns]
y = df[target_column]

# Apply preprocessing to your features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)

# Convert to numpy arrays first
X = np.array(X_scaled, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# Split data into train and final test set (80% for CV, 20% for final testing)
X_train_cv, X_test, y_train_cv, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

kfold = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = []

In [None]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

In [None]:
input_dim = X_train_cv.shape[1]
criterion = nn.BCELoss() 
num_epochs = 10

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_cv)):
    print(f"\nFold {fold + 1}/3")
    
    X_fold_train, X_fold_val = X_train_cv[train_idx], X_train_cv[val_idx]
    y_fold_train, y_fold_val = y_train_cv[train_idx], y_train_cv[val_idx]
    
    X_fold_train_tensor = torch.tensor(X_fold_train, dtype=torch.float32).to(device)
    y_fold_train_tensor = torch.tensor(y_fold_train, dtype=torch.float32).to(device)
    X_fold_val_tensor = torch.tensor(X_fold_val, dtype=torch.float32).to(device)
    y_fold_val_tensor = torch.tensor(y_fold_val, dtype=torch.float32).to(device)
    
    fold_train_dataset = TensorDataset(X_fold_train_tensor, y_fold_train_tensor)
    fold_train_loader = DataLoader(fold_train_dataset, batch_size=b_size, shuffle=True)
    fold_val_dataset = TensorDataset(X_fold_val_tensor, y_fold_val_tensor)
    fold_val_loader = DataLoader(fold_val_dataset, batch_size=b_size, shuffle=False)
    
    model = SimpleNN(input_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in fold_train_loader:
            # Data is already on GPU from TensorDataset
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        if epoch == num_epochs - 1: 
            epoch_loss = running_loss / len(fold_train_loader.dataset)
            print(f"  Final training loss: {epoch_loss:.4f}")
    
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for inputs, labels in fold_val_loader:
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()
            # Move predictions back to CPU for sklearn metrics
            val_predictions.extend(predicted.squeeze().cpu().tolist())
            val_true_labels.extend(labels.squeeze().cpu().tolist())
    
    fold_accuracy = accuracy_score(val_true_labels, val_predictions)
    cv_scores.append(fold_accuracy)
    print(f"  Fold {fold + 1} Validation Accuracy: {fold_accuracy:.4f}")



Starting 3-fold Cross Validation...

Fold 1/3
  Final training loss: 0.1368
  Fold 1 Validation Accuracy: 0.9483

Fold 2/3
  Final training loss: 0.1409
  Fold 2 Validation Accuracy: 0.9498

Fold 3/3
  Final training loss: 0.1374
  Fold 3 Validation Accuracy: 0.9494

Cross-Validation Results:
Individual fold scores: ['0.9483', '0.9498', '0.9494']
Mean CV Accuracy: 0.9492 (+/- 0.0013)

Training final model on all training data...
Epoch [1/10], Loss: 0.5702
Epoch [2/10], Loss: 0.3234
Epoch [3/10], Loss: 0.2011
Epoch [4/10], Loss: 0.1628
Epoch [5/10], Loss: 0.1510
Epoch [6/10], Loss: 0.1449
Epoch [7/10], Loss: 0.1402
Epoch [8/10], Loss: 0.1363
Epoch [9/10], Loss: 0.1333
Epoch [10/10], Loss: 0.1307

Final Test Set Accuracy: 0.9516


In [None]:
print(f"\nCross-Validation Results:")
print(f"Individual fold scores: {[f'{score:.4f}' for score in cv_scores]}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

print(f"\nTraining final model on all training data...")
X_train_cv_tensor = torch.tensor(X_train_cv, dtype=torch.float32).to(device)
y_train_cv_tensor = torch.tensor(y_train_cv, dtype=torch.float32).to(device)

final_train_dataset = TensorDataset(X_train_cv_tensor, y_train_cv_tensor)
final_train_loader = DataLoader(final_train_dataset, batch_size=b_size, shuffle=True)

# Initialize final model and move to GPU
final_model = SimpleNN(input_dim).to(device)
final_optimizer = optim.Adam(final_model.parameters(), lr=0.001)



In [None]:
for epoch in range(num_epochs):
    final_model.train()
    running_loss = 0.0
    for inputs, labels in final_train_loader:
        # Data is already on GPU from TensorDataset
        final_optimizer.zero_grad()
        outputs = final_model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        final_optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(final_train_loader.dataset)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

final_test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
final_test_loader = DataLoader(final_test_dataset, batch_size=b_size, shuffle=False)

final_model.eval()
test_predictions = []
test_true_labels = []
with torch.no_grad():
    for inputs, labels in final_test_loader:
        outputs = final_model(inputs)
        predicted = (outputs > 0.5).float()
        # Move predictions back to CPU for sklearn metrics
        test_predictions.extend(predicted.squeeze().cpu().tolist())
        test_true_labels.extend(labels.squeeze().cpu().tolist())

final_test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f"\nFinal Test Set Accuracy: {final_test_accuracy:.4f}")

def predict(model, input_tensor):
    model.eval()
    with torch.no_grad():
        # Ensure input tensor is on the same device as the model
        if input_tensor.device != next(model.parameters()).device:
            input_tensor = input_tensor.to(next(model.parameters()).device)
        output = model(input_tensor)
    return output  

predicted_labels = predict(final_model, X_test_tensor)

Epoch [1/10], Loss: 0.1285
Epoch [2/10], Loss: 0.1266
Epoch [3/10], Loss: 0.1250
Epoch [4/10], Loss: 0.1233
Epoch [5/10], Loss: 0.1215
Epoch [6/10], Loss: 0.1199
Epoch [7/10], Loss: 0.1181
Epoch [8/10], Loss: 0.1166
Epoch [9/10], Loss: 0.1153
Epoch [10/10], Loss: 0.1142

Final Test Set Accuracy: 0.9572
