In [50]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler


In [51]:
Census = pd.read_csv('adult.data', header=None, names=[
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
    'hours-per-week', 'native-country', 'class'
])
Census

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [52]:
Census['class'] = Census['class'].str.strip()
Census['class'] = Census['class'].map({'>50K': 1, '<=50K': 0})

In [53]:
relevant_columns = [
    'age', 
    'workclass', 
    'education-num', 
    'marital-status', 
    'occupation', 
    'relationship', 
    'sex', 
    'capital-gain', 
    'capital-loss', 
    'hours-per-week', 
    'class'
]
Census = Census[relevant_columns]
Census


Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,class
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,0
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,0
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,0
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,0
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,0
...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,12,Married-civ-spouse,Tech-support,Wife,Female,0,0,38,0
32557,40,Private,9,Married-civ-spouse,Machine-op-inspct,Husband,Male,0,0,40,1
32558,58,Private,9,Widowed,Adm-clerical,Unmarried,Female,0,0,40,0
32559,22,Private,9,Never-married,Adm-clerical,Own-child,Male,0,0,20,0


In [54]:
categorical_cols = ['workclass', 'marital-status', 'occupation', 'relationship', 'sex']
Census = pd.get_dummies(Census, columns=categorical_cols, drop_first=True)


In [55]:
from sklearn.preprocessing import StandardScaler
numerical_cols = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
scaler = StandardScaler()
Census[numerical_cols] = scaler.fit_transform(Census[numerical_cols])


In [56]:
boolean_cols = Census.select_dtypes(include='bool').columns

# Convert Boolean columns to integers (1 for True, 0 for False)
Census[boolean_cols] = Census[boolean_cols].astype(int)


# Random Forest

#### Hyperparam Tuning

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Features and target
X = Census.drop("class", axis=1)  # Replace with your dataset
y = Census["class"]               # Replace with your dataset

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy"]
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,  # Number of combinations to try
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_resampled, y_resampled)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate on test set using the best model
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))




Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 100, 'max_features': 'log2', 'criterion': 'gini'}
Test Set Accuracy: 83.60%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4942
           1       0.65      0.68      0.67      1571

    accuracy                           0.84      6513
   macro avg       0.78      0.78      0.78      6513
weighted avg       0.84      0.84      0.84      6513



### 80/20 Split

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_features = 'log2', criterion = 'gini', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.80%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4918
           1       0.66      0.70      0.68      1595

    accuracy                           0.84      6513
   macro avg       0.78      0.79      0.79      6513
weighted avg       0.84      0.84      0.84      6513


--- Trial 2 ---




Accuracy: 83.96%
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5026
           1       0.64      0.69      0.66      1487

    accuracy                           0.84      6513
   macro avg       0.77      0.79      0.78      6513
weighted avg       0.85      0.84      0.84      6513


--- Trial 3 ---




Accuracy: 82.68%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4928
           1       0.63      0.68      0.66      1585

    accuracy                           0.83      6513
   macro avg       0.76      0.78      0.77      6513
weighted avg       0.83      0.83      0.83      6513


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.838016             0.659953          0.698433            0.678648
1      2  0.839552             0.636252          0.694015            0.663879
2      3  0.826808             0.634649          0.679495            0.656307
Average Accuracy across the 3 trials: 83.48%


### 50/50

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np


# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.5, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_features = 'log2', criterion = 'gini', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.76%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     12323
           1       0.66      0.68      0.67      3958

    accuracy                           0.84     16281
   macro avg       0.78      0.78      0.78     16281
weighted avg       0.84      0.84      0.84     16281


--- Trial 2 ---




Accuracy: 83.62%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     12476
           1       0.64      0.69      0.66      3805

    accuracy                           0.84     16281
   macro avg       0.77      0.78      0.78     16281
weighted avg       0.84      0.84      0.84     16281


--- Trial 3 ---




Accuracy: 83.38%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     12345
           1       0.65      0.68      0.66      3936

    accuracy                           0.83     16281
   macro avg       0.77      0.78      0.78     16281
weighted avg       0.84      0.83      0.83     16281


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.837602             0.660872          0.681910            0.671226
1      2  0.836189             0.639256          0.686465            0.662020
2      3  0.833794             0.650073          0.676829            0.663181
Average Accuracy across the 3 trials: 83.59%


### 20/80

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np


# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.8, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_features = 'log2', criterion = 'gini', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.74%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     19729
           1       0.66      0.68      0.67      6320

    accuracy                           0.84     26049
   macro avg       0.78      0.78      0.78     26049
weighted avg       0.84      0.84      0.84     26049


--- Trial 2 ---




Accuracy: 83.42%
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     19845
           1       0.64      0.69      0.66      6204

    accuracy                           0.83     26049
   macro avg       0.77      0.78      0.78     26049
weighted avg       0.84      0.83      0.84     26049


--- Trial 3 ---




Accuracy: 83.15%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     19752
           1       0.65      0.66      0.66      6297

    accuracy                           0.83     26049
   macro avg       0.77      0.77      0.77     26049
weighted avg       0.83      0.83      0.83     26049


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.837422             0.661253          0.676424            0.668752
1      2  0.834235             0.641422          0.689394            0.664543
2      3  0.831548             0.647916          0.663967            0.655843
Average Accuracy across the 3 trials: 83.44%


## SVM

#### Hyperparam Tuning

In [64]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd


# Define the parameter grid for tuning
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # Regularization parameter
}

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM model
svm_model = SVC(class_weight='balanced', random_state=42)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of random combinations to try
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)



Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'C': 10}
Accuracy: 81.31%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.80      0.87      4942
           1       0.58      0.85      0.69      1571

    accuracy                           0.81      6513
   macro avg       0.76      0.82      0.78      6513
weighted avg       0.85      0.81      0.82      6513



### 80/20

In [65]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd


# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 10, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 78.86%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.77      0.85      4918
           1       0.54      0.85      0.66      1595

    accuracy                           0.79      6513
   macro avg       0.74      0.81      0.75      6513
weighted avg       0.84      0.79      0.80      6513


--- Trial 2 ---
Accuracy: 78.73%
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.77      0.85      5026
           1       0.52      0.86      0.65      1487

    accuracy                           0.79      6513
   macro avg       0.73      0.81      0.75      6513
weighted avg       0.85      0.79      0.80      6513


--- Trial 3 ---
Accuracy: 79.52%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.78      0.85      4928
           1       0.55      0.85      0.67      1585

    accurac

### 50/50

In [66]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd


# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 10, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---
Accuracy: 79.76%
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.78      0.85     12323
           1       0.55      0.87      0.68      3958

    accuracy                           0.80     16281
   macro avg       0.75      0.82      0.76     16281
weighted avg       0.85      0.80      0.81     16281


--- Trial 2 ---
Accuracy: 79.36%
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.78      0.85     12476
           1       0.54      0.85      0.66      3805

    accuracy                           0.79     16281
   macro avg       0.74      0.81      0.76     16281
weighted avg       0.85      0.79      0.81     16281


--- Trial 3 ---
Accuracy: 79.63%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.78      0.85     12345
           1       0.55      0.85      0.67      3936

    accurac

### 20/80

In [68]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd


# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced',C =10, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 80.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.78      0.86     19729
           1       0.56      0.86      0.68      6320

    accuracy                           0.80     26049
   macro avg       0.75      0.82      0.77     26049
weighted avg       0.85      0.80      0.81     26049


--- Trial 2 ---
Accuracy: 79.83%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.78      0.86     19845
           1       0.55      0.85      0.67      6204

    accuracy                           0.80     26049
   macro avg       0.75      0.82      0.76     26049
weighted avg       0.85      0.80      0.81     26049


--- Trial 3 ---
Accuracy: 80.10%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.78      0.86     19752
           1       0.56      0.86      0.68      6297

    accurac

## ANN

#### Hyperparam Tuning

In [70]:
from sklearn.model_selection import ParameterGrid
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define hyperparameter grid
param_grid = {
    "lr": [0.001, 0.01, 0.1],                # Learning rates to tune
    "hidden_size_1": [32, 64, 128],          # Hidden layer 1 sizes to tune
    "hidden_size_2": [16, 32, 64],           # Hidden layer 2 sizes to tune
}

# Convert the grid to a list of parameter combinations
grid = list(ParameterGrid(param_grid))

# To store results
tuning_results = []

# Iterate over all combinations of hyperparameters
for params in grid:
    print(f"\nTesting Parameters: {params}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Define model with hyperparameters
    class ANN(nn.Module):
        def __init__(self, input_size):
            super(ANN, self).__init__()
            self.fc1 = nn.Linear(input_size, params["hidden_size_1"])
            self.fc2 = nn.Linear(params["hidden_size_1"], params["hidden_size_2"])
            self.fc3 = nn.Linear(params["hidden_size_2"], 2)  # 2 output classes

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params["lr"])

    # Training loop
    epochs = 50  # Fixed number of epochs
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()

    # Append results
    tuning_results.append({
        "Params": params,
        "Accuracy": accuracy
    })

# Convert results to DataFrame and display the best parameters
results_df = pd.DataFrame(tuning_results)
best_result = results_df.loc[results_df["Accuracy"].idxmax()]
print("\nBest Parameters and Results:")
print(best_result)

best_params = best_result["Params"]
best_accuracy = best_result["Accuracy"]

print("\nBest Parameters:")
print(f"Hidden Layer 1 Size: {best_params['hidden_size_1']}")
print(f"Hidden Layer 2 Size: {best_params['hidden_size_2']}")
print(f"Learning Rate: {best_params['lr']}")
print(f"\nBest Accuracy: {best_accuracy * 100:.2f}%")




Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 32, 'lr': 0.001}

Testing Pa

### 80/20

In [71]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.4118
Epoch [20/100], Loss: 0.3511
Epoch [30/100], Loss: 0.3376
Epoch [40/100], Loss: 0.3171
Epoch [50/100], Loss: 0.3082
Epoch [60/100], Loss: 0.3034
Epoch [70/100], Loss: 0.2995
Epoch [80/100], Loss: 0.2974
Epoch [90/100], Loss: 0.2951
Epoch [100/100], Loss: 0.2932
Accuracy: 0.8440

--- Trial 2 ---
Epoch [10/100], Loss: 0.4440
Epoch [20/100], Loss: 0.4176
Epoch [30/100], Loss: 0.3627
Epoch [40/100], Loss: 0.3345
Epoch [50/100], Loss: 0.3221
Epoch [60/100], Loss: 0.3142
Epoch [70/100], Loss: 0.3073
Epoch [80/100], Loss: 0.3029
Epoch [90/100], Loss: 0.3016
Epoch [100/100], Loss: 0.2988
Accuracy: 0.8558

--- Trial 3 ---
Epoch [10/100], Loss: 0.4408
Epoch [20/100], Loss: 0.4211
Epoch [30/100], Loss: 0.3764
Epoch [40/100], Loss: 0.3505
Epoch [50/100], Loss: 0.3336
Epoch [60/100], Loss: 0.3230
Epoch [70/100], Loss: 0.3141
Epoch [80/100], Loss: 0.3078
Epoch [90/100], Loss: 0.3029
Epoch [100/100], Loss: 0.2996
Accuracy: 0.8412

Summary of Results Acros

### 50/50

In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.4434
Epoch [20/100], Loss: 0.4066
Epoch [30/100], Loss: 0.3590
Epoch [40/100], Loss: 0.3408
Epoch [50/100], Loss: 0.3261
Epoch [60/100], Loss: 0.3156
Epoch [70/100], Loss: 0.3075
Epoch [80/100], Loss: 0.3011
Epoch [90/100], Loss: 0.2949
Epoch [100/100], Loss: 0.2874
Accuracy: 0.8476

--- Trial 2 ---
Epoch [10/100], Loss: 0.5551
Epoch [20/100], Loss: 0.3614
Epoch [30/100], Loss: 0.3341
Epoch [40/100], Loss: 0.3225
Epoch [50/100], Loss: 0.3145
Epoch [60/100], Loss: 0.3082
Epoch [70/100], Loss: 0.3033
Epoch [80/100], Loss: 0.2986
Epoch [90/100], Loss: 0.2939
Epoch [100/100], Loss: 0.2900
Accuracy: 0.8542

--- Trial 3 ---
Epoch [10/100], Loss: 0.5512
Epoch [20/100], Loss: 0.4488
Epoch [30/100], Loss: 0.4004
Epoch [40/100], Loss: 0.3624
Epoch [50/100], Loss: 0.3423
Epoch [60/100], Loss: 0.3296
Epoch [70/100], Loss: 0.3206
Epoch [80/100], Loss: 0.3130
Epoch [90/100], Loss: 0.3074
Epoch [100/100], Loss: 0.3030
Accuracy: 0.8473

Summary of Results Acros

### 20/80

In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.6022
Epoch [20/100], Loss: 0.3669
Epoch [30/100], Loss: 0.3059
Epoch [40/100], Loss: 0.2832
Epoch [50/100], Loss: 0.2679
Epoch [60/100], Loss: 0.2545
Epoch [70/100], Loss: 0.2446
Epoch [80/100], Loss: 0.2438
Epoch [90/100], Loss: 0.2315
Epoch [100/100], Loss: 0.2407
Accuracy: 0.8388

--- Trial 2 ---
Epoch [10/100], Loss: 0.4330
Epoch [20/100], Loss: 0.4407
Epoch [30/100], Loss: 0.3844
Epoch [40/100], Loss: 0.3616
Epoch [50/100], Loss: 0.3419
Epoch [60/100], Loss: 0.3288
Epoch [70/100], Loss: 0.3185
Epoch [80/100], Loss: 0.3086
Epoch [90/100], Loss: 0.2977
Epoch [100/100], Loss: 0.2959
Accuracy: 0.8453

--- Trial 3 ---
Epoch [10/100], Loss: 0.5560
Epoch [20/100], Loss: 0.4584
Epoch [30/100], Loss: 0.3913
Epoch [40/100], Loss: 0.3521
Epoch [50/100], Loss: 0.3324
Epoch [60/100], Loss: 0.3150
Epoch [70/100], Loss: 0.3031
Epoch [80/100], Loss: 0.2928
Epoch [90/100], Loss: 0.2834
Epoch [100/100], Loss: 0.2748
Accuracy: 0.8411

Summary of Results Acros

## XG Boost

In [76]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.metrics import roc_auc_score


### Hyperparam Tuning Using Normal Looping

In [77]:
best_params = None
best_auc = 0
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8
for n_estimators in [100, 200, 300]:
    for max_depth in [3, 5, 7]:
        for learning_rate in [0.01, 0.05, 0.1]:
            xgb = XGBClassifier(
                scale_pos_weight=scale_pos_weight,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                random_state=42,
                eval_metric='logloss'
            )
            xgb.fit(X_train, y_train)
            y_proba = xgb.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            if auc > best_auc:
                best_auc = auc
                best_params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'learning_rate': learning_rate
                }
print(f"Best Parameters: {best_params}")


Best Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}


### 80/20

In [82]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.1,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 84.03%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.84      0.89      4918
           1       0.63      0.84      0.72      1595

    accuracy                           0.84      6513
   macro avg       0.79      0.84      0.80      6513
weighted avg       0.87      0.84      0.85      6513


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 83.37%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.83      0.89      5026
           1       0.60      0.84      0.70      1487

    accuracy                           0.83      6513
   macro avg       0.77      0.84      0.79      6513
weighted avg       0.87      0.83      0.84      6513


--- Trial 3 ---
Accuracy: 83.92%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89      4928
           1       0.63      0.82      0.71      1585

    accuracy                           0.84      6513
   macro avg       0.78      0.83      0.80      6513
weighted avg       0.86      0.84      0.85      6513


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.840319  0.924611             0.630711          0.839498   
1      2  0.833717  0.922657    

Parameters: { "use_label_encoder" } are not used.



### 50/50

In [81]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.5, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.1,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 84.56%
ROC-AUC Score: 0.93
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89     12323
           1       0.64      0.83      0.72      3958

    accuracy                           0.85     16281
   macro avg       0.79      0.84      0.81     16281
weighted avg       0.87      0.85      0.85     16281


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 83.99%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.84      0.89     12476
           1       0.62      0.83      0.71      3805

    accuracy                           0.84     16281
   macro avg       0.78      0.84      0.80     16281
weighted avg       0.87      0.84      0.85     16281


--- Trial 3 ---
Accuracy: 84.19%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89     12345
           1       0.64      0.81      0.71      3936

    accuracy                           0.84     16281
   macro avg       0.78      0.83      0.80     16281
weighted avg       0.86      0.84      0.85     16281


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.845587  0.927194             0.640194          0.832996   
1      2  0.839936  0.924317    

Parameters: { "use_label_encoder" } are not used.



### 20/80

In [80]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.8, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.1,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 84.15%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89     19729
           1       0.63      0.82      0.72      6320

    accuracy                           0.84     26049
   macro avg       0.79      0.84      0.80     26049
weighted avg       0.86      0.84      0.85     26049


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 84.15%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89     19845
           1       0.63      0.81      0.71      6204

    accuracy                           0.84     26049
   macro avg       0.78      0.83      0.80     26049
weighted avg       0.86      0.84      0.85     26049


--- Trial 3 ---
Accuracy: 84.20%
ROC-AUC Score: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89     19752
           1       0.63      0.82      0.72      6297

    accuracy                           0.84     26049
   macro avg       0.79      0.84      0.80     26049
weighted avg       0.86      0.84      0.85     26049


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.841491  0.923277             0.633386          0.823101   
1      2  0.841453  0.920960    

Parameters: { "use_label_encoder" } are not used.

