#### COGS 118 Project - Bank Marketing

In [35]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler


In [36]:
Bank_Marketing = pd.read_csv('bank-full.csv', sep=";")

### Data Exploration

In [37]:
Bank_Marketing

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [38]:
# Check Null Values
print(Bank_Marketing.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [39]:
# Check data types
print(Bank_Marketing.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


## Convert our Output Variable to Binary

In [40]:
# Convert target variable to binary
Bank_Marketing['y'] = Bank_Marketing['y'].map({'yes': 1, 'no': 0})

In [41]:
Bank_Marketing['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

## Select Relavant columns

In [42]:
Bank_Marketing.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [43]:
columns = ['age','housing','education','balance','loan','duration','y']
Bank = Bank_Marketing[columns]

In [44]:
Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['housing'] = Bank['housing'].map({'yes': 1, 'no': 0, 'unknown': np.nan})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bank['loan'] = Bank['loan'].map({'yes': 1, 'no': 0, 'unknown': np.nan})


In [45]:
Bank = Bank.replace('unknown',np.nan)
Bank = Bank.replace('primary',0)
Bank = Bank.replace('secondary',0.5)
Bank = Bank.replace('tertiary',1)


  Bank = Bank.replace('tertiary',1)


In [46]:
from sklearn.preprocessing import StandardScaler

# Columns to scale
columns_to_scale = ['age', 'balance', 'duration']

# Initialize scaler
scaler = StandardScaler()

# Scale selected columns
Bank[columns_to_scale] = scaler.fit_transform(Bank[columns_to_scale])


In [47]:
Bank

Unnamed: 0,age,housing,education,balance,loan,duration,y
0,1.606965,1.0,1.0,0.256419,0.0,0.011016,0
1,0.288529,1.0,0.5,-0.437895,0.0,-0.416127,0
2,-0.747384,1.0,0.5,-0.446762,1.0,-0.707361,0
3,0.571051,1.0,,0.047205,0.0,-0.645231,0
4,-0.747384,0.0,,-0.447091,0.0,-0.233620,0
...,...,...,...,...,...,...,...
45206,0.947747,0.0,1.0,-0.176460,0.0,2.791329,1
45207,2.831227,0.0,0.0,0.120447,0.0,0.768224,1
45208,2.925401,0.0,0.5,1.429593,0.0,3.373797,1
45209,1.512791,0.0,0.5,-0.228024,0.0,0.970146,0


In [48]:
rows_b = Bank.shape[0]
Bank = Bank.dropna()
rows_a = Bank.shape[0]
print(rows_b,rows_a)

45211 43354


In [49]:
Bank.value_counts('y')

y
0    38317
1     5037
Name: count, dtype: int64

# Random Forest

#### Hyperparam Tuning

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Features and target
X = Bank.drop("y", axis=1)  # Replace with your dataset
y = Bank["y"]               # Replace with your dataset

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": ["sqrt", "log2", None],
    "criterion": ["gini", "entropy"]
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,  # Number of combinations to try
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_resampled, y_resampled)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate on test set using the best model
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))




Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 500, 'max_features': 'log2', 'criterion': 'entropy'}
Test Set Accuracy: 83.72%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      7638
           1       0.38      0.58      0.46      1033

    accuracy                           0.84      8671
   macro avg       0.66      0.72      0.68      8671
weighted avg       0.87      0.84      0.85      8671



### 80/20 Split

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=500, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.80%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.91      7692
           1       0.36      0.58      0.44       979

    accuracy                           0.84      8671
   macro avg       0.65      0.72      0.68      8671
weighted avg       0.88      0.84      0.85      8671


--- Trial 2 ---




Accuracy: 83.65%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90      7651
           1       0.37      0.57      0.45      1020

    accuracy                           0.84      8671
   macro avg       0.66      0.72      0.68      8671
weighted avg       0.87      0.84      0.85      8671


--- Trial 3 ---




Accuracy: 83.14%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      7629
           1       0.38      0.61      0.46      1042

    accuracy                           0.83      8671
   macro avg       0.66      0.73      0.68      8671
weighted avg       0.87      0.83      0.85      8671


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.837966             0.362758          0.575077            0.444883
1      2  0.836466             0.372436          0.569608            0.450388
2      3  0.831392             0.375445          0.607486            0.464076
Average Accuracy across the 3 trials: 83.53%


### 50/50

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.5, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=500, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.60%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90     19122
           1       0.37      0.58      0.45      2555

    accuracy                           0.84     21677
   macro avg       0.66      0.72      0.68     21677
weighted avg       0.87      0.84      0.85     21677


--- Trial 2 ---




Accuracy: 83.29%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90     19171
           1       0.36      0.58      0.45      2506

    accuracy                           0.83     21677
   macro avg       0.65      0.72      0.67     21677
weighted avg       0.87      0.83      0.85     21677


--- Trial 3 ---




Accuracy: 82.98%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     19125
           1       0.36      0.58      0.45      2552

    accuracy                           0.83     21677
   macro avg       0.65      0.72      0.67     21677
weighted avg       0.87      0.83      0.85     21677


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.836001             0.373225          0.576125            0.452993
1      2  0.832910             0.361401          0.580607            0.445499
2      3  0.829820             0.361847          0.583464            0.446678
Average Accuracy across the 3 trials: 83.29%


### 20/80

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Train-test split with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.8, random_state=i)
    
    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=i)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    # Train Random Forest Classifier
    classifier = RandomForestClassifier(n_estimators=500, class_weight='balanced', max_features = 'log2', criterion = 'entropy', random_state=i)
    classifier.fit(X_resampled, y_resampled)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
import pandas as pd
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---




Accuracy: 83.36%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90     30662
           1       0.36      0.58      0.45      4022

    accuracy                           0.83     34684
   macro avg       0.65      0.72      0.67     34684
weighted avg       0.87      0.83      0.85     34684


--- Trial 2 ---




Accuracy: 82.93%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90     30608
           1       0.36      0.57      0.44      4076

    accuracy                           0.83     34684
   macro avg       0.65      0.72      0.67     34684
weighted avg       0.87      0.83      0.85     34684


--- Trial 3 ---




Accuracy: 83.07%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.87      0.90     30631
           1       0.36      0.57      0.44      4053

    accuracy                           0.83     34684
   macro avg       0.65      0.72      0.67     34684
weighted avg       0.87      0.83      0.85     34684


Summary of Results Across Trials:
   Trial  Accuracy  Precision (Class 1)  Recall (Class 1)  F1-Score (Class 1)
0      1  0.833641             0.363009          0.575833            0.445299
1      2  0.829316             0.358415          0.572620            0.440876
2      3  0.830671             0.357946          0.565754            0.438474
Average Accuracy across the 3 trials: 83.12%


## SVM

#### Hyperparam Tuning

In [80]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

# Separate features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# Define the parameter grid for tuning
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # Regularization parameter
}

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the SVM model
svm_model = SVC(class_weight='balanced', random_state=42)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of random combinations to try
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)



Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'C': 1}
Accuracy: 79.07%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87      7638
           1       0.34      0.80      0.48      1033

    accuracy                           0.79      8671
   macro avg       0.65      0.80      0.67      8671
weighted avg       0.89      0.79      0.82      8671



### 80/20

In [81]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 1, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 78.84%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87      7692
           1       0.31      0.73      0.44       979

    accuracy                           0.79      8671
   macro avg       0.64      0.76      0.65      8671
weighted avg       0.89      0.79      0.82      8671


--- Trial 2 ---
Accuracy: 79.62%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87      7651
           1       0.33      0.73      0.46      1020

    accuracy                           0.80      8671
   macro avg       0.65      0.77      0.67      8671
weighted avg       0.88      0.80      0.83      8671


--- Trial 3 ---
Accuracy: 78.50%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.79      0.87      7629
           1       0.33      0.74      0.45      1042

    accurac

### 50/50

In [82]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced', C = 1, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")



--- Trial 1 ---
Accuracy: 78.92%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.79      0.87     19122
           1       0.33      0.75      0.46      2555

    accuracy                           0.79     21677
   macro avg       0.64      0.77      0.66     21677
weighted avg       0.88      0.79      0.82     21677


--- Trial 2 ---
Accuracy: 79.21%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87     19171
           1       0.32      0.73      0.45      2506

    accuracy                           0.79     21677
   macro avg       0.64      0.76      0.66     21677
weighted avg       0.88      0.79      0.82     21677


--- Trial 3 ---
Accuracy: 78.95%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87     19125
           1       0.33      0.74      0.45      2552

    accurac

### 20/80

In [85]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pandas as pd

# Separate features and target
X = Bank.drop("y", axis=1)  # Features
y = Bank["y"]               # Target

# To store results for each test
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Perform train-test split with different random seeds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=i)
    
    # Train SVM classifier
    classifier = SVC(kernel='linear', class_weight='balanced',C =1, random_state=i)
    classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })
    
    # Print results for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Display overall results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
average_accuracy = results_df["Accuracy"].mean()
print(f"Average Accuracy across the 3 trials: {average_accuracy * 100:.2f}%")




--- Trial 1 ---
Accuracy: 79.22%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87     30662
           1       0.33      0.74      0.45      4022

    accuracy                           0.79     34684
   macro avg       0.64      0.77      0.66     34684
weighted avg       0.89      0.79      0.82     34684


--- Trial 2 ---
Accuracy: 79.11%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87     30608
           1       0.33      0.75      0.46      4076

    accuracy                           0.79     34684
   macro avg       0.64      0.77      0.66     34684
weighted avg       0.89      0.79      0.82     34684


--- Trial 3 ---
Accuracy: 79.43%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87     30631
           1       0.33      0.73      0.45      4053

    accurac

## ANN

#### Hyperparam Tuning

In [88]:
from sklearn.model_selection import ParameterGrid
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define hyperparameter grid
param_grid = {
    "lr": [0.001, 0.01, 0.1],                # Learning rates to tune
    "hidden_size_1": [32, 64, 128],          # Hidden layer 1 sizes to tune
    "hidden_size_2": [16, 32, 64],           # Hidden layer 2 sizes to tune
}

# Convert the grid to a list of parameter combinations
grid = list(ParameterGrid(param_grid))

# To store results
tuning_results = []

# Iterate over all combinations of hyperparameters
for params in grid:
    print(f"\nTesting Parameters: {params}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Define model with hyperparameters
    class ANN(nn.Module):
        def __init__(self, input_size):
            super(ANN, self).__init__()
            self.fc1 = nn.Linear(input_size, params["hidden_size_1"])
            self.fc2 = nn.Linear(params["hidden_size_1"], params["hidden_size_2"])
            self.fc3 = nn.Linear(params["hidden_size_2"], 2)  # 2 output classes

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = torch.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params["lr"])

    # Training loop
    epochs = 50  # Fixed number of epochs
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()

    # Append results
    tuning_results.append({
        "Params": params,
        "Accuracy": accuracy
    })

# Convert results to DataFrame and display the best parameters
results_df = pd.DataFrame(tuning_results)
best_result = results_df.loc[results_df["Accuracy"].idxmax()]
print("\nBest Parameters and Results:")
print(best_result)

best_params = best_result["Params"]
best_accuracy = best_result["Accuracy"]

print("\nBest Parameters:")
print(f"Hidden Layer 1 Size: {best_params['hidden_size_1']}")
print(f"Hidden Layer 2 Size: {best_params['hidden_size_2']}")
print(f"Learning Rate: {best_params['lr']}")
print(f"\nBest Accuracy: {best_accuracy * 100:.2f}%")




Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 32, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 32, 'hidden_size_2': 64, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.001}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.01}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 16, 'lr': 0.1}

Testing Parameters: {'hidden_size_1': 64, 'hidden_size_2': 32, 'lr': 0.001}

Testing Pa

### 80/20

In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.3120
Epoch [20/100], Loss: 0.2946
Epoch [30/100], Loss: 0.2834
Epoch [40/100], Loss: 0.2711
Epoch [50/100], Loss: 0.2649
Epoch [60/100], Loss: 0.2610
Epoch [70/100], Loss: 0.2582
Epoch [80/100], Loss: 0.2563
Epoch [90/100], Loss: 0.2550
Epoch [100/100], Loss: 0.2542
Accuracy: 0.8902

--- Trial 2 ---
Epoch [10/100], Loss: 0.3112
Epoch [20/100], Loss: 0.2989
Epoch [30/100], Loss: 0.2831
Epoch [40/100], Loss: 0.2700
Epoch [50/100], Loss: 0.2627
Epoch [60/100], Loss: 0.2590
Epoch [70/100], Loss: 0.2566
Epoch [80/100], Loss: 0.2553
Epoch [90/100], Loss: 0.2545
Epoch [100/100], Loss: 0.2538
Accuracy: 0.8932

--- Trial 3 ---
Epoch [10/100], Loss: 0.3095
Epoch [20/100], Loss: 0.2938
Epoch [30/100], Loss: 0.2805
Epoch [40/100], Loss: 0.2682
Epoch [50/100], Loss: 0.2616
Epoch [60/100], Loss: 0.2581
Epoch [70/100], Loss: 0.2561
Epoch [80/100], Loss: 0.2546
Epoch [90/100], Loss: 0.2535
Epoch [100/100], Loss: 0.2527
Accuracy: 0.8854

Summary of Results Acros

### 50/50

In [90]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.3498
Epoch [20/100], Loss: 0.2931
Epoch [30/100], Loss: 0.2774
Epoch [40/100], Loss: 0.2661
Epoch [50/100], Loss: 0.2602
Epoch [60/100], Loss: 0.2561
Epoch [70/100], Loss: 0.2536
Epoch [80/100], Loss: 0.2521
Epoch [90/100], Loss: 0.2510
Epoch [100/100], Loss: 0.2500
Accuracy: 0.8900

--- Trial 2 ---
Epoch [10/100], Loss: 0.3192
Epoch [20/100], Loss: 0.2892
Epoch [30/100], Loss: 0.2752
Epoch [40/100], Loss: 0.2656
Epoch [50/100], Loss: 0.2598
Epoch [60/100], Loss: 0.2567
Epoch [70/100], Loss: 0.2550
Epoch [80/100], Loss: 0.2538
Epoch [90/100], Loss: 0.2527
Epoch [100/100], Loss: 0.2518
Accuracy: 0.8916

--- Trial 3 ---
Epoch [10/100], Loss: 0.3046
Epoch [20/100], Loss: 0.2962
Epoch [30/100], Loss: 0.2838
Epoch [40/100], Loss: 0.2705
Epoch [50/100], Loss: 0.2629
Epoch [60/100], Loss: 0.2583
Epoch [70/100], Loss: 0.2555
Epoch [80/100], Loss: 0.2536
Epoch [90/100], Loss: 0.2521
Epoch [100/100], Loss: 0.2512
Accuracy: 0.8892

Summary of Results Acros

### 20/80

In [91]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the ANN
class ANN(nn.Module):
    def __init__(self, input_size):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# To store results
results = []

# Perform 3 random tests
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")
    
    # Split data with a different random seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=i
    )
    
    # Scale data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long)
    y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long)

    # Initialize the model
    model = ANN(input_size=X_train.shape[1])

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the model
    for epoch in range(100):  # Adjust the number of epochs as needed
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}")

    # Evaluate the model
    with torch.no_grad():
        y_pred = model(X_test).argmax(dim=1)
        accuracy = (y_pred == y_test).float().mean().item()
        print(f"Accuracy: {accuracy:.4f}")

    # Append results
    results.append({"Trial": i + 1, "Accuracy": accuracy})

# Summary of results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)
print(f"Average Accuracy: {results_df['Accuracy'].mean() * 100:.2f}%")



--- Trial 1 ---
Epoch [10/100], Loss: 0.3288
Epoch [20/100], Loss: 0.3059
Epoch [30/100], Loss: 0.2867
Epoch [40/100], Loss: 0.2742
Epoch [50/100], Loss: 0.2657
Epoch [60/100], Loss: 0.2596
Epoch [70/100], Loss: 0.2553
Epoch [80/100], Loss: 0.2518
Epoch [90/100], Loss: 0.2488
Epoch [100/100], Loss: 0.2460
Accuracy: 0.8908

--- Trial 2 ---
Epoch [10/100], Loss: 0.3144
Epoch [20/100], Loss: 0.2809
Epoch [30/100], Loss: 0.2671
Epoch [40/100], Loss: 0.2588
Epoch [50/100], Loss: 0.2532
Epoch [60/100], Loss: 0.2493
Epoch [70/100], Loss: 0.2461
Epoch [80/100], Loss: 0.2438
Epoch [90/100], Loss: 0.2416
Epoch [100/100], Loss: 0.2393
Accuracy: 0.8867

--- Trial 3 ---
Epoch [10/100], Loss: 0.3156
Epoch [20/100], Loss: 0.2808
Epoch [30/100], Loss: 0.2629
Epoch [40/100], Loss: 0.2529
Epoch [50/100], Loss: 0.2488
Epoch [60/100], Loss: 0.2461
Epoch [70/100], Loss: 0.2435
Epoch [80/100], Loss: 0.2414
Epoch [90/100], Loss: 0.2394
Epoch [100/100], Loss: 0.2381
Accuracy: 0.8880

Summary of Results Acros

## XG Boost

### Hyperparam Tuning Using Normal Looping

In [94]:
best_params = None
best_auc = 0

for n_estimators in [100, 200, 300]:
    for max_depth in [3, 5, 7]:
        for learning_rate in [0.01, 0.05, 0.1]:
            xgb = XGBClassifier(
                scale_pos_weight=scale_pos_weight,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                random_state=42,
                eval_metric='logloss'
            )
            xgb.fit(X_train, y_train)
            y_proba = xgb.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)
            if auc > best_auc:
                best_auc = auc
                best_params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'learning_rate': learning_rate
                }
print(f"Best Parameters: {best_params}")


Best Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}


### 80/20

In [95]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Bank.drop("y", axis=1)
y = Bank["y"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 80.68%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88      7692
           1       0.34      0.76      0.47       979

    accuracy                           0.81      8671
   macro avg       0.65      0.79      0.68      8671
weighted avg       0.89      0.81      0.84      8671


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 80.75%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88      7651
           1       0.35      0.77      0.48      1020

    accuracy                           0.81      8671
   macro avg       0.66      0.79      0.68      8671
weighted avg       0.89      0.81      0.83      8671


--- Trial 3 ---
Accuracy: 80.05%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.80      0.88      7629
           1       0.35      0.78      0.48      1042

    accuracy                           0.80      8671
   macro avg       0.66      0.79      0.68      8671
weighted avg       0.89      0.80      0.83      8671


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.806827  0.872791             0.340805          0.760981   
1      2  0.807519  0.868802    

Parameters: { "use_label_encoder" } are not used.



### 50/50

In [96]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Bank.drop("y", axis=1)
y = Bank["y"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.5, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 80.96%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.88     19122
           1       0.36      0.76      0.49      2555

    accuracy                           0.81     21677
   macro avg       0.66      0.79      0.68     21677
weighted avg       0.89      0.81      0.84     21677


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 80.12%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88     19171
           1       0.34      0.77      0.47      2506

    accuracy                           0.80     21677
   macro avg       0.65      0.79      0.67     21677
weighted avg       0.89      0.80      0.83     21677


--- Trial 3 ---
Accuracy: 80.14%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88     19125
           1       0.35      0.77      0.48      2552

    accuracy                           0.80     21677
   macro avg       0.65      0.79      0.68     21677
weighted avg       0.89      0.80      0.83     21677


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.809568  0.869648             0.356242          0.762818   
1      2  0.801218  0.866845    

Parameters: { "use_label_encoder" } are not used.



### 20/80

In [97]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assume 'Bank' is your DataFrame with the target column 'y'
# Separate features and target
X = Bank.drop("y", axis=1)
y = Bank["y"]

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store results
results = []

# Perform 3 trials
for i in range(3):
    print(f"\n--- Trial {i + 1} ---")

    # Split the data with a different random seed each time
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.8, random_state=i
    )

    # Calculate scale_pos_weight to handle class imbalance
    scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) * 0.8

    # Initialize and train the XGBoost classifier
    xgb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=i,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb.predict(X_test)
    y_proba = xgb.predict_proba(X_test)[:, 1]  # For ROC-AUC

    # Calculate metrics
    accuracy = (y_pred == y_test).mean()
    roc_auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Append results
    results.append({
        "Trial": i + 1,
        "Accuracy": accuracy,
        "ROC-AUC": roc_auc,
        "Precision (Class 1)": report["1"]["precision"],
        "Recall (Class 1)": report["1"]["recall"],
        "F1-Score (Class 1)": report["1"]["f1-score"]
    })

    # Print metrics for this trial
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"ROC-AUC Score: {roc_auc:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Summarize results
results_df = pd.DataFrame(results)
print("\nSummary of Results Across Trials:")
print(results_df)

# Calculate and print the average metrics across trials
average_accuracy = results_df["Accuracy"].mean()
average_roc_auc = results_df["ROC-AUC"].mean()
print(f"\nAverage Accuracy across trials: {average_accuracy * 100:.2f}%")
print(f"Average ROC-AUC across trials: {average_roc_auc:.2f}")



--- Trial 1 ---
Accuracy: 81.22%
ROC-AUC Score: 0.86
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.89     30662
           1       0.35      0.75      0.48      4022

    accuracy                           0.81     34684
   macro avg       0.66      0.78      0.68     34684
weighted avg       0.89      0.81      0.84     34684


--- Trial 2 ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 80.31%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.81      0.88     30608
           1       0.35      0.78      0.48      4076

    accuracy                           0.80     34684
   macro avg       0.66      0.79      0.68     34684
weighted avg       0.89      0.80      0.83     34684


--- Trial 3 ---
Accuracy: 81.07%
ROC-AUC Score: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.88     30631
           1       0.35      0.75      0.48      4053

    accuracy                           0.81     34684
   macro avg       0.66      0.79      0.68     34684
weighted avg       0.89      0.81      0.84     34684


Summary of Results Across Trials:
   Trial  Accuracy   ROC-AUC  Precision (Class 1)  Recall (Class 1)  \
0      1  0.812190  0.864746             0.353412          0.746892   
1      2  0.803079  0.865314    

Parameters: { "use_label_encoder" } are not used.

