In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r"C:\Users\avnis\My_Projects\Higgs-Boson-Classifier\data\Data science dataset.csv")

In [3]:
# Sample check
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (250000, 33)
   EventId  DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  DER_pt_h  \
0   100000       138.470                       51.655        97.827    27.980   
1   100001       160.937                       68.768       103.235    48.146   
2   100002      -999.000                      162.172       125.953    35.635   
3   100003       143.905                       81.417        80.943     0.414   
4   100004       175.864                       16.915       134.805    16.405   

   DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0                  0.91           124.711                2.666   
1               -999.00          -999.000             -999.000   
2               -999.00          -999.000             -999.000   
3                  9.00          -999.000             -999.000   
4               -999.00          -999.000             -999.000   

   DER_deltar_tau_lep  DER_pt_tot  ...  PRI_jet_num  PRI_jet_leading_pt  \
0            

In [4]:
# converting lable bin to 0/1
if "Label_bin" not in df.columns:
    df["Label_bin"] = df["Label"].apply(lambda x: 1 if x == "s" else 0)


In [5]:
X = df.drop(columns=["EventId", "Label", "Label_bin", "Weight"], errors='ignore')
y = df["Label_bin"]
sample_weights = df["Weight"] if "Weight" in df.columns else None


In [6]:
for col in X.columns:
    # Adding missing flags
    if X[col].isna().sum() > 0 or (-999.0 in X[col].values):
        X[col + "_missing_flag"] = ((X[col] == -999.0) | X[col].isna()).astype(int)
    
    # Replacing -999 with NaN first
    X[col] = X[col].replace(-999.0, np.nan)
    
    # Filling missing with median
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)


In [7]:
# 3. Train-Test Split
# ------------------------
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, sample_weights, test_size=0.3, stratify=y, random_state=42
)

In [8]:
# 4. Scaling
# ------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Saving scaler
joblib.dump(scaler, rf"C:\Users\avnis\My_Projects\Higgs-Boson-Classifier\models\scaler_higgs.pkl")

['C:\\Users\\avnis\\My_Projects\\Higgs-Boson-Classifier\\models\\scaler_higgs.pkl']

In [10]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

models = {
    "LogisticRegression": LogisticRegression(max_iter=200, n_jobs=-1, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=50, max_depth=12, n_jobs=-1, random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(max_iter=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        eval_metric='logloss'  # will not give warning now
        
    )
}

In [11]:
# 6. Train and Evaluate
# ------------------------
import joblib
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report

results = {}

for name, model in models.items():
    print(f"\nTraining {name} ...")
    
    # Train model
    model.fit(X_train_scaled, y_train, sample_weight=w_train)
    
    # Predict
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Metrics
    acc = accuracy_score(y_test, y_pred, sample_weight=w_test)
    f1 = f1_score(y_test, y_pred, sample_weight=w_test)
    roc = roc_auc_score(y_test, y_proba, sample_weight=w_test)
    precision = precision_score(y_test, y_pred, sample_weight=w_test)
    recall = recall_score(y_test, y_pred, sample_weight=w_test)
    
    # Confusion matrix & report
    cm = confusion_matrix(y_test, y_pred, sample_weight=w_test)
    report = classification_report(y_test, y_pred, sample_weight=w_test)
    
    # Storing results
    results[name] = {
        "Accuracy": acc,
        "F1": f1,
        "ROC-AUC": roc,
        "Precision": precision,
        "Recall": recall,
        "Confusion Matrix": cm,
        "Classification Report": report
    }
    trained_features = list(X_train.columns)
    joblib.dump(trained_features, rf"C:\Users\avnis\My_Projects\Higgs-Boson-Classifier\models\{name}_features.pkl")
    
    # Printing metrics
    print(f"{name} -> Accuracy: {acc:.4f}, F1: {f1:.4f}, ROC-AUC: {roc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    print("Confusion Matrix:")
    for row in cm:
        print([f"{x:.0f}" for x in row])  # rounding to integer
    print("Classification Report:\n", report)
    
    # Saving model
    joblib.dump(model, rf"C:\Users\avnis\My_Projects\Higgs-Boson-Classifier\models\{name}_higgs_model.pkl")


Training LogisticRegression ...
LogisticRegression -> Accuracy: 0.9983, F1: 0.0013, ROC-AUC: 0.8779, Precision: 0.2145, Recall: 0.0007
Confusion Matrix:
['123323', '1']
['208', '0']
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00 123323.07339733066
           1       0.21      0.00      0.00 207.6524791019955

    accuracy                           1.00 123530.72587643265
   macro avg       0.61      0.50      0.50 123530.72587643265
weighted avg       1.00      1.00      1.00 123530.72587643265


Training RandomForest ...
RandomForest -> Accuracy: 0.9983, F1: 0.0162, ROC-AUC: 0.8901, Precision: 0.2166, Recall: 0.0084
Confusion Matrix:
['123317', '6']
['206', '2']
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00 123323.07339733066
           1       0.22      0.01      0.02 207.6524791019955

    accuracy                           1.00 12