# Initialization

In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import csv
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [40]:
df = pd.read_csv('higgs-boson/training.csv')

In [88]:
df.describe()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,224999.5,-49.023079,49.239819,81.181982,57.895962,-708.420675,-601.237051,-709.356603,2.3731,18.917332,...,209.797178,0.979176,-348.329567,-399.254314,-399.259788,-692.381204,-709.121609,-709.118631,73.064591,1.646767
std,72168.927986,406.345647,35.344886,40.828691,63.655682,454.480565,657.972302,453.019877,0.782911,22.273494,...,126.499506,0.977426,532.962789,489.338286,489.333883,479.875496,453.384624,453.389017,98.015662,1.875103
min,100000.0,-999.0,0.0,6.329,0.0,-999.0,-999.0,-999.0,0.208,0.0,...,13.678,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,0.001502
25%,162499.75,78.10075,19.241,59.38875,14.06875,-999.0,-999.0,-999.0,1.81,2.841,...,123.0175,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,0.018636
50%,224999.5,105.012,46.524,73.752,38.4675,-999.0,-999.0,-999.0,2.4915,12.3155,...,179.739,1.0,38.96,-1.872,-2.093,-999.0,-999.0,-999.0,40.5125,1.156188
75%,287499.25,130.60625,73.598,92.259,79.169,0.49,83.446,-4.593,2.961,27.591,...,263.37925,2.0,75.349,0.433,0.503,33.703,-2.457,-2.275,109.93375,2.404128
max,349999.0,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,...,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433,7.822543


In [41]:
def reset_variables(df): 
    global x_train, x_test, y_train, y_test, df_n
    columns = list(df.columns)
    columns.remove("EventId")
    columns.remove("Weight")
    columns.remove("Label")
    scaler = StandardScaler()
    scaler.fit(df.drop(["Label", "EventId", "Weight"], axis=1))
    df_n = scaler.transform(df.drop(["Label", "EventId", "Weight"], axis=1))
    df_n = pd.DataFrame(df_n, columns=columns)
    x_train, x_test, y_train, y_test = train_test_split(df_n, df['Label'], test_size=0.1, random_state=101)

In [42]:
def reset_for_test(test_file): 
    global test_file_n
    columns = list(test_file.columns)
    columns.remove("EventId")
    scaler = StandardScaler()
    scaler.fit(test_file.drop(["EventId"], axis=1))
    test_file_n = scaler.transform(test_file.drop(["EventId"], axis=1))
    test_file_n = pd.DataFrame(test_file_n, columns=columns)

# Logisitic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
reset_variables(df)
lm = LogisticRegression()
lm.fit(x_train, y_train)

In [44]:
predictions = lm.predict(x_test)

In [45]:
metrics.accuracy_score(y_test, predictions)

0.74976

In [46]:
metrics.confusion_matrix(y_test, predictions)

array([[14193,  2242],
       [ 4014,  4551]])

In [47]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           b       0.78      0.86      0.82     16435
           s       0.67      0.53      0.59      8565

    accuracy                           0.75     25000
   macro avg       0.72      0.70      0.71     25000
weighted avg       0.74      0.75      0.74     25000



# KNN

In [48]:
from sklearn.neighbors import KNeighborsClassifier

reset_variables(df)
for i in range(1,11): 
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    predictions = knn.predict(x_test)
    print("Accuracy for k = ", i, " is ", metrics.accuracy_score(y_test, predictions))
# knn = KNeighborsClassifier(n_neighbors=10)

# knn.fit(x_train, y_train)

Accuracy for k =  1  is  0.74364
Accuracy for k =  2  is  0.75652
Accuracy for k =  3  is  0.77188
Accuracy for k =  4  is  0.77772
Accuracy for k =  5  is  0.7858
Accuracy for k =  6  is  0.79016
Accuracy for k =  7  is  0.79316
Accuracy for k =  8  is  0.79604
Accuracy for k =  9  is  0.79604
Accuracy for k =  10  is  0.7982


In [49]:
reset_variables(df)
for i in range(11,21): 
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    predictions = knn.predict(x_test)
    print("Accuracy for k = ", i, " is ", metrics.accuracy_score(y_test, predictions))
# knn = KNeighborsClassifier(n_neighbors=10)

# knn.fit(x_train, y_train)

Accuracy for k =  11  is  0.79888
Accuracy for k =  12  is  0.79804
Accuracy for k =  13  is  0.79872
Accuracy for k =  14  is  0.79968
Accuracy for k =  15  is  0.79968
Accuracy for k =  16  is  0.80072
Accuracy for k =  17  is  0.80168
Accuracy for k =  18  is  0.80204
Accuracy for k =  19  is  0.80104
Accuracy for k =  20  is  0.80052


In [50]:
reset_variables(df)
for i in range(31,51): 
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    predictions = knn.predict(x_test)
    print("Accuracy for k = ", i, " is ", metrics.accuracy_score(y_test, predictions))
# knn = KNeighborsClassifier(n_neighbors=10)

# knn.fit(x_train, y_train)

Accuracy for k =  31  is  0.8026
Accuracy for k =  32  is  0.8032
Accuracy for k =  33  is  0.803
Accuracy for k =  34  is  0.80244
Accuracy for k =  35  is  0.80308
Accuracy for k =  36  is  0.80372
Accuracy for k =  37  is  0.80456
Accuracy for k =  38  is  0.80468
Accuracy for k =  39  is  0.80404
Accuracy for k =  40  is  0.80432
Accuracy for k =  41  is  0.80508
Accuracy for k =  42  is  0.8036
Accuracy for k =  43  is  0.8048
Accuracy for k =  44  is  0.80444
Accuracy for k =  45  is  0.80476
Accuracy for k =  46  is  0.80492
Accuracy for k =  47  is  0.80436
Accuracy for k =  48  is  0.80456
Accuracy for k =  49  is  0.80376
Accuracy for k =  50  is  0.80544


In [51]:
knn = KNeighborsClassifier(n_neighbors=40)
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)
metrics.accuracy_score(y_test, predictions)

0.80432

In [52]:
metrics.confusion_matrix(y_test, predictions)

array([[14411,  2024],
       [ 2868,  5697]])

# SVM

SVM seems too inefficient and takes too long to use it by itself

In [53]:
# from sklearn.svm import SVC
# svc = SVC()
# reset_variables(df)
# model = svc.fit(x_train, y_train)
# predictions = model.predict(x_test)
# print(metrics.accuracy_score(y_test, predictions))

# Decision Trees

In [54]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
reset_variables(df)
model = dt.fit(x_train, y_train)
predictions = model.predict(x_test)
print(metrics.accuracy_score(y_test, predictions))

0.76284


In [55]:
print(metrics.confusion_matrix(y_test, predictions))

[[13498  2937]
 [ 2992  5573]]


# Random forest

In [56]:
from sklearn.ensemble import RandomForestClassifier
rf_v1 = RandomForestClassifier(n_estimators=60)
reset_variables(df)
model = rf_v1.fit(x_train, y_train)
predictions = model.predict(x_test)
print(metrics.accuracy_score(y_test, predictions))

0.8378


# Predictions on Test File v1

In [57]:
test_file = pd.read_csv('higgs-boson/test.csv')
reset_for_test(test_file)

In [58]:
final_predictions = rf_v1.predict(test_file_n)

In [59]:
prediction_confidences_temp = rf_v1.predict_proba(test_file_n)
prediction_confidences = []
for i in range(prediction_confidences_temp.shape[0]): 
    prediction_confidences.append(max([prediction_confidences_temp[i, 0], prediction_confidences_temp[i, 1]]))
prediction_confidences = np.array(prediction_confidences)
ranks = np.argsort(np.argsort(-prediction_confidences)) + 1

In [60]:
test_predictions = pd.DataFrame({'EventId': test_file['EventId'],
                   'RankOrder': ranks,
                   'Class': final_predictions})
test_predictions.to_csv('test_predictions/test_predictions_v1.csv', index=False)

# Filtering Data

In [61]:
df = pd.read_csv('higgs-boson/training.csv')

In [62]:
columns = list(df.columns)
filtered_data = []

for row in range(len(df['EventId'])):
    for column in columns[:-1]:  
        if df[column][row] < -990:
            break  
    else:
        filtered_data.append(df.loc[row])

filtered_data_df = pd.DataFrame(filtered_data, columns=columns)

In [63]:
from sklearn.ensemble import RandomForestClassifier
rf_v2 = RandomForestClassifier(n_estimators=60)
reset_variables(filtered_data_df)
model = rf_v2.fit(x_train, y_train)
predictions = model.predict(x_test)
print(metrics.accuracy_score(y_test, predictions))

0.8389606576629477


# Predictions on Test File v2

In [64]:
test_file = pd.read_csv('higgs-boson/test.csv')
reset_for_test(test_file)

In [65]:
final_predictions = rf_v2.predict(test_file_n)

In [66]:
prediction_confidences_temp = rf_v2.predict_proba(test_file_n)
prediction_confidences = []
for i in range(prediction_confidences_temp.shape[0]): 
    prediction_confidences.append(max([prediction_confidences_temp[i, 0], prediction_confidences_temp[i, 1]]))
prediction_confidences = np.array(prediction_confidences)
ranks = np.argsort(np.argsort(-prediction_confidences)) + 1

In [67]:
test_predictions = pd.DataFrame({'EventId': test_file['EventId'],
                   'RankOrder': ranks,
                   'Class': final_predictions})
test_predictions.to_csv('test_predictions/test_predictions_v2.csv', index=False)

# Grid Search with Cross Validation

In [68]:
df = pd.read_csv('higgs-boson/training.csv')
reset_variables(df)

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [40, 60, 80],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create Random Forest model
rf = RandomForestClassifier(random_state=42)

# Setup Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # Use all available cores
    verbose=2,
    scoring='accuracy'  # Can change based on your needs
)

# Fit Grid Search
grid_search.fit(x_train, y_train)

# Best parameters and results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=  33.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=  33.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=  33.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=  33.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=  33.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time=  51.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time=  51.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time=

KeyboardInterrupt: 

In [70]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
print(classification_report(y_test, y_pred))

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [71]:
df = pd.read_csv('higgs-boson/training.csv')

In [72]:
reset_variables(df)
rf_v3 = RandomForestClassifier(bootstrap=False, 
                            max_depth=None, 
                            min_samples_leaf=4, 
                            min_samples_split=10, 
                            n_estimators=80)
model = rf_v3.fit(x_train, y_train)
predictions = model.predict(x_test)
print(metrics.accuracy_score(y_test, predictions))

0.83884


# Predictions on Test File v3

In [73]:
test_file = pd.read_csv('higgs-boson/test.csv')
reset_for_test(test_file)

In [74]:
final_predictions = rf_v3.predict(test_file_n)

In [75]:
prediction_confidences_temp = rf_v3.predict_proba(test_file_n)
prediction_confidences = []
for i in range(prediction_confidences_temp.shape[0]): 
    prediction_confidences.append(max([prediction_confidences_temp[i, 0], prediction_confidences_temp[i, 1]]))
prediction_confidences = np.array(prediction_confidences)
ranks = np.argsort(np.argsort(-prediction_confidences)) + 1

In [76]:
test_predictions = pd.DataFrame({'EventId': test_file['EventId'],
                   'RankOrder': ranks,
                   'Class': final_predictions})
test_predictions.to_csv('test_predictions/test_predictions_v3.csv', index=False)

# PCA decomposition

In [77]:
df = pd.read_csv('higgs-boson/training.csv')  
reset_variables(df)

In [78]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
x_pca = pca.fit_transform(df_n)

print("Explained variance ratio:", np.sum(pca.explained_variance_ratio_))
print("Number of components chosen:", pca.n_components_)

Explained variance ratio: 0.9585544215038367
Number of components chosen: 14


In [79]:
x_train, x_test, y_train, y_test = train_test_split(x_pca, df['Label'], test_size=0.1, random_state=42)

rf_v4 = RandomForestClassifier(n_estimators=100, 
                            random_state=42, 
                            bootstrap=False, 
                            max_depth=None, 
                            min_samples_leaf=4, 
                            min_samples_split=10)
rf_v4.fit(x_train, y_train)

y_pred = rf_v4.predict(x_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy with PCA: {accuracy:.4f}")


Accuracy with PCA: 0.8213


# Predictions on Test File v4

In [80]:
test_file = pd.read_csv('higgs-boson/test.csv')
reset_for_test(test_file)
test_file_pca = pca.transform(test_file_n)

In [81]:
final_predictions = rf_v4.predict(test_file_pca)

In [82]:
prediction_confidences_temp = rf_v4.predict_proba(test_file_pca)
prediction_confidences = []
for i in range(prediction_confidences_temp.shape[0]): 
    prediction_confidences.append(max([prediction_confidences_temp[i, 0], prediction_confidences_temp[i, 1]]))
prediction_confidences = np.array(prediction_confidences)
ranks = np.argsort(np.argsort(-prediction_confidences)) + 1

In [83]:
test_predictions = pd.DataFrame({'EventId': test_file['EventId'],
                   'RankOrder': ranks,
                   'Class': final_predictions})
test_predictions.to_csv('test_predictions/test_predictions_v4.csv', index=False)

# Weighting Scheme in accordance to correlation to label

In [84]:
df = pd.read_csv('higgs-boson/training.csv')
x = df.drop(["Label", "EventId", "Weight"], axis=1)
y = df['Label'].map({'b': 0, 's': 1})

In [85]:
correlation_weights = x.corrwith(pd.Series(y))
correlation_weights = correlation_weights.abs()
correlation_weights = correlation_weights / correlation_weights.max()
x_weighted = x * correlation_weights
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_weighted)
x_train, x_test, y_train, y_test = train_test_split(x_scaled, df['Label'], test_size=0.1, random_state=42)
x_train = pd.DataFrame(x_train, columns=x.columns) 
x_test = pd.DataFrame(x_test, columns=x.columns)

In [86]:
rf_v5 = RandomForestClassifier(n_estimators=100, 
                            random_state=42, 
                            bootstrap=False, 
                            max_depth=None, 
                            min_samples_leaf=4, 
                            min_samples_split=10)
rf_v5.fit(x_train, y_train)

y_pred = rf_v5.predict(x_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy with Feature Weighting: {accuracy:.4f}")

print("Feature Weights:\n", correlation_weights)

Accuracy with Feature Weighting: 0.8390
Feature Weights:
 DER_mass_MMC                   0.680507
DER_mass_transverse_met_lep    1.000000
DER_mass_vis                   0.039995
DER_pt_h                       0.547840
DER_deltaeta_jet_jet           0.403058
DER_mass_jet_jet               0.545677
DER_prodeta_jet_jet            0.399952
DER_deltar_tau_lep             0.034845
DER_pt_tot                     0.043501
DER_sum_pt                     0.436038
DER_pt_ratio_lep_tau           0.556011
DER_met_phi_centrality         0.773279
DER_lep_eta_centrality         0.402205
PRI_tau_pt                     0.669378
PRI_tau_eta                    0.002684
PRI_tau_phi                    0.012528
PRI_lep_pt                     0.090908
PRI_lep_eta                    0.004314
PRI_lep_phi                    0.011739
PRI_met                        0.063927
PRI_met_phi                    0.021271
PRI_met_sumet                  0.385627
PRI_jet_num                    0.380018
PRI_jet_leading_pt    

# Predictions on Test File v5

In [None]:
test_file = pd.read_csv('higgs-boson/test.csv')
reset_for_test(test_file)

In [None]:
final_predictions = rf_v5.predict(test_file_n)

In [None]:
prediction_confidences_temp = rf_v5.predict_proba(test_file_n)
prediction_confidences = []
for i in range(prediction_confidences_temp.shape[0]): 
    prediction_confidences.append(max([prediction_confidences_temp[i, 0], prediction_confidences_temp[i, 1]]))
prediction_confidences = np.array(prediction_confidences)
ranks = np.argsort(np.argsort(-prediction_confidences)) + 1

In [None]:
test_predictions = pd.DataFrame({'EventId': test_file['EventId'],
                   'RankOrder': ranks,
                   'Class': final_predictions})
test_predictions.to_csv('test_predictions/test_predictions_v5.csv', index=False)