# CIND 820: Big Data Analytics Project - Initial Results and the Code

## Importing Libraries

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

## Loading Dataset, Creating Features and Target, and Feature Selection

In [19]:
# Load your dataset
file_path = "/Users/ajanthanjoseph/Documents/GitHub/CIND820/diabetes_012_health_indicators_BRFSS2015.csv"
df = pd.read_csv(file_path)

# Create interaction features
df["BMI_Age"] = df["BMI"] * df["Age"]
df["HighBP_HighChol"] = df["HighBP"] * df["HighChol"]
df["PhysActivity_BMI"] = df["PhysActivity"] * df["BMI"]

# Separate features and target
X = df.drop(columns=["Diabetes_012"])
y = df["Diabetes_012"]

# Apply Chi-Squared test for feature selection
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X, y)
chi2_scores = chi2_selector.scores_

# Apply ANOVA F-test for feature selection
anova_selector = SelectKBest(score_func=f_classif, k='all')
anova_selector.fit(X, y)
anova_scores = anova_selector.scores_

# Create a DataFrame with the feature selection results
feature_selection_results = pd.DataFrame({
    'Feature': X.columns,
    'Chi2 Score': chi2_scores,
    'ANOVA F Score': anova_scores
}).sort_values(by=['Chi2 Score', 'ANOVA F Score'], ascending=False)

# Display Chi-Square and ANOVA scores
print("Chi-Square and ANOVA F-test Scores:")
print(feature_selection_results)

# Identify the 6 lowest scoring features based on the combined scores
lowest_features = feature_selection_results.nsmallest(6, ['Chi2 Score', 'ANOVA F Score'])['Feature'].tolist()

# Display the 6 lowest-scoring features to be dropped
print("\n6 Lowest-Scoring Features to be Dropped:")
print(lowest_features)

# Drop these features from the dataset
X_reduced = X.drop(columns=lowest_features)

# Display the final 15 features used
print("\nFinal 15 Features Used:")
print(X_reduced.columns.tolist())

# Standardize numerical columns
numerical_features = ["BMI", "Age", "BMI_Age", "PhysActivity_BMI"]
scaler = StandardScaler()
X_reduced[numerical_features] = scaler.fit_transform(X_reduced[numerical_features])

Chi-Square and ANOVA F-test Scores:
                 Feature     Chi2 Score  ANOVA F Score
21               BMI_Age  922213.400451   11905.944685
15              PhysHlth  141598.783225    4078.699854
14              MentHlth   24607.463010     717.117372
3                    BMI   19775.252090    6768.361067
22       HighBP_HighChol   14392.694424   10453.965178
0                 HighBP   10731.721009   10149.140418
16              DiffWalk   10627.556856    6727.221134
13               GenHlth   10595.234173   12832.660123
18                   Age   10225.159975    4560.441068
6   HeartDiseaseorAttack    7468.339377    4260.879233
1               HighChol    6483.776499    5890.843228
20                Income    5380.434934    3913.752954
23      PhysActivity_BMI    3981.998963     244.921534
5                 Stroke    2798.417025    1475.321639
7           PhysActivity     922.529401    1923.358158
19             Education     849.169260    2245.725730
10     HvyAlcoholConsump     

## Splitting unbalanced dataset into Training and Test Sets

In [9]:
# Split the original (unbalanced) dataset into training and testing sets
X_train_unbalanced, X_test_unbalanced, y_train_unbalanced, y_test_unbalanced = train_test_split(
    X_reduced, y, test_size=0.3, random_state=42, stratify=y
)

# Display the shapes of the resulting datasets
print("Training and Testing Dataset Shapes (Unbalanced):")
print(f"X_train_unbalanced: {X_train_unbalanced.shape}, y_train_unbalanced: {y_train_unbalanced.shape}")
print(f"X_test_unbalanced: {X_test_unbalanced.shape}, y_test_unbalanced: {y_test_unbalanced.shape}")

Training and Testing Dataset Shapes (Unbalanced):
X_train_unbalanced: (177576, 18), y_train_unbalanced: (177576,)
X_test_unbalanced: (76104, 18), y_test_unbalanced: (76104,)


# Unbalanced Data Machine Learning Models + Evaluation of Models

## Logistic Regression and Evaluation

In [10]:
# Logistic Regression
lr_unbalanced = LogisticRegression(max_iter=1000)
lr_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = lr_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, lr_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, lr_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("Logistic Regression Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Evaluation (Unbalanced Data):
Accuracy: 0.8470908230842006
Precision: 0.8005741870177647
Recall: 0.8470908230842006
ROC AUC: 0.7788126815121236
PR AUC: 0.8637734630283154
Confusion Matrix:
[[62587     0  1524]
 [ 1252     0   137]
 [ 8724     0  1880]]


## Random Forest and Evaluation

In [11]:
# Random Forest
rf_unbalanced = RandomForestClassifier(random_state=42)
rf_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = rf_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, rf_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, rf_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("Random Forest Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Random Forest Evaluation (Unbalanced Data):
Accuracy: 0.8354357195416798
Precision: 0.7919413847306005
Recall: 0.8354357195416798
ROC AUC: 0.7242430981275967
PR AUC: 0.84344441378793
Confusion Matrix:
[[61267   136  2708]
 [ 1192     4   193]
 [ 8266    29  2309]]


## Decision Tree and Evaluation

In [12]:
# Decision Tree
dt_unbalanced = DecisionTreeClassifier(random_state=42)
dt_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = dt_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, dt_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, dt_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("Decision Tree Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Decision Tree Evaluation (Unbalanced Data):
Accuracy: 0.7787895511405445
Precision: 0.7819235410992261
Recall: 0.7787895511405445
ROC AUC: 0.5700011456996527
PR AUC: 0.7621174387535227
Confusion Matrix:
[[55890  1207  7014]
 [ 1005    47   337]
 [ 6908   364  3332]]


## Catboost and Evaluation

In [13]:
# CatBoost
cb_unbalanced = CatBoostClassifier(random_state=42, verbose=0)
cb_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = cb_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, cb_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, cb_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("CatBoost Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

CatBoost Evaluation (Unbalanced Data):
Accuracy: 0.8485756333438452
Precision: 0.8032305623999324
Recall: 0.8485756333438452
ROC AUC: 0.7759753797830875
PR AUC: 0.8675871220801562
Confusion Matrix:
[[62688     0  1423]
 [ 1249     0   140]
 [ 8710     2  1892]]


## Gradient Boosting and Evaluation

In [14]:
# Gradient Boosting
gb_unbalanced = GradientBoostingClassifier(random_state=42)
gb_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = gb_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, gb_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, gb_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("Gradient Boosting Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Gradient Boosting Evaluation (Unbalanced Data):
Accuracy: 0.8497845054136445
Precision: 0.805954937607854
Recall: 0.8497845054136445
ROC AUC: 0.7858889878472901
PR AUC: 0.8688348849470646
Confusion Matrix:
[[62629     0  1482]
 [ 1244     0   145]
 [ 8560     1  2043]]


## XGBoost and Evaluation

In [15]:
# XGBoost
xgb_unbalanced = XGBClassifier(random_state=42)
xgb_unbalanced.fit(X_train_unbalanced, y_train_unbalanced)
y_pred_unbalanced = xgb_unbalanced.predict(X_test_unbalanced)

# Evaluation
accuracy = accuracy_score(y_test_unbalanced, y_pred_unbalanced)
precision = precision_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
recall = recall_score(y_test_unbalanced, y_pred_unbalanced, average='weighted')
roc_auc = roc_auc_score(y_test_unbalanced, xgb_unbalanced.predict_proba(X_test_unbalanced), multi_class='ovr')
pr_auc = average_precision_score(y_test_unbalanced, xgb_unbalanced.predict_proba(X_test_unbalanced), average='weighted')
conf_matrix = confusion_matrix(y_test_unbalanced, y_pred_unbalanced)

print("XGBoost Evaluation (Unbalanced Data):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

XGBoost Evaluation (Unbalanced Data):
Accuracy: 0.8481025964469673
Precision: 0.8024858420065205
Recall: 0.8481025964469673
ROC AUC: 0.7750223569889069
PR AUC: 0.8667490981502702
Confusion Matrix:
[[62637     2  1472]
 [ 1249     0   140]
 [ 8697     0  1907]]


# Applying SMOTE to balance the target classes

# Machine Learning Models and Evaluation using the balanced data

In [7]:
# Apply SMOTE to balance the target classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_reduced, y)

# Check the distribution of the target variable after SMOTE
print(y_resampled.value_counts())

# Split the data into training and testing sets with a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled
)

# Display the shapes of the resulting datasets
print("Training and Testing Dataset Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

Diabetes_012
0.0    213703
2.0    213703
1.0    213703
Name: count, dtype: int64
Training and Testing Dataset Shapes:
X_train: (448776, 18), y_train: (448776,)
X_test: (192333, 18), y_test: (192333,)


## Logistic Regression and Evaluation

In [17]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, lr.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, lr.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Logistic Regression Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Logistic Regression Evaluation:
Accuracy: 0.5308189442269398
Precision: 0.5217847413620351
Recall: 0.5308189442269398
ROC AUC: 0.7199222321201696
PR AUC: 0.5587721390821514
Confusion Matrix:
[[42541 11314 10256]
 [17537 21368 25206]
 [10423 15503 38185]]


## Random Forest and Evaluation

In [18]:
# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, rf.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Random Forest Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Random Forest Evaluation:
Accuracy: 0.9145076507931557
Precision: 0.9146563682283299
Recall: 0.9145076507931557
ROC AUC: 0.9791985358879257
PR AUC: 0.9605713326310968
Confusion Matrix:
[[58995   240  4876]
 [ 1142 61750  1219]
 [ 6598  2368 55145]]


## Decision Tree and Evaluation

In [19]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, dt.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, dt.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Decision Tree Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Decision Tree Evaluation:
Accuracy: 0.8501661181388529
Precision: 0.849375632513043
Recall: 0.8501661181388529
ROC AUC: 0.8898436677623539
PR AUC: 0.7787489927215748
Confusion Matrix:
[[54830  1474  7807]
 [ 1281 58550  4280]
 [ 8158  5818 50135]]


## CatBoost and Evaluation

In [20]:
# CatBoost
cb = CatBoostClassifier(random_state=42, verbose=0)
cb.fit(X_train, y_train)
y_pred = cb.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, cb.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, cb.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("CatBoost Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

CatBoost Evaluation:
Accuracy: 0.8329563829400051
Precision: 0.8307461699270817
Recall: 0.8329563829400051
ROC AUC: 0.9449059727991486
PR AUC: 0.9038757312961042
Confusion Matrix:
[[61705     1  2405]
 [ 1552 54417  8142]
 [ 8825 11203 44083]]


## Gradient Boosting and Evaluation

In [21]:
# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, gb.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, gb.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Gradient Boosting Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Gradient Boosting Evaluation:
Accuracy: 0.7169180535841483
Precision: 0.7146309475644166
Recall: 0.7169180535841483
ROC AUC: 0.8769130632125709
PR AUC: 0.773102404677562
Confusion Matrix:
[[54754    96  9261]
 [ 4326 44667 15118]
 [ 8264 17381 38466]]


## XGBoost and Evaluation

In [22]:
# XGBoost
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, xgb.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, xgb.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("XGBoost Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

XGBoost Evaluation:
Accuracy: 0.8218350465078795
Precision: 0.818991530395379
Recall: 0.8218350465078795
ROC AUC: 0.9395522710431691
PR AUC: 0.8936095904496358
Confusion Matrix:
[[61367     0  2744]
 [ 1820 53263  9028]
 [ 8850 11825 43436]]


# Including HyperParameter Boosting using RandomizedSearchCV

 ## Logistic Regression and Evaluation

In [27]:
# Logistic Regression with RandomizedSearchCV
param_dist_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_random = RandomizedSearchCV(
    LogisticRegression(max_iter=1000),
    param_distributions=param_dist_lr,
    n_iter=10,  
    cv=3,       
    scoring='accuracy',
    n_jobs=-1,  
    random_state=42
)

lr_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for Logistic Regression:", lr_random.best_params_)
y_pred = lr_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, lr_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, lr_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Logistic Regression with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.01}
Logistic Regression with RandomizedSearchCV Evaluation:
Accuracy: 0.5265815018743534
Precision: 0.5145368328994473
Recall: 0.5265815018743534
ROC AUC: 0.7155030946008178
PR AUC: 0.55349657492202
Confusion Matrix:
[[43600  9634 10877]
 [19011 18124 26976]
 [11583 12973 39555]]


## Random Forest and Evaluation

In [28]:
# Random Forest with RandomizedSearchCV
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf_random = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=10,  
    cv=3,       
    scoring='accuracy',
    n_jobs=-1,  
    random_state=42
)

rf_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for Random Forest:", rf_random.best_params_)
y_pred = rf_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, rf_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, rf_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Random Forest with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 30}
Random Forest with RandomizedSearchCV Evaluation:
Accuracy: 0.9145336473720058
Precision: 0.9146205070922732
Recall: 0.9145336473720058
ROC AUC: 0.9805480842860151
PR AUC: 0.9652321666079307
Confusion Matrix:
[[59024   139  4948]
 [ 1127 61594  1390]
 [ 6354  2480 55277]]


## Decision Tree and Evaluation

In [29]:
# Decision Tree with RandomizedSearchCV
param_dist_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_random = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist_dt,
    n_iter=10,  
    cv=3,       
    scoring='accuracy',
    n_jobs=-1,  
    random_state=42
)

dt_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for Decision Tree:", dt_random.best_params_)
y_pred = dt_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, dt_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, dt_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Decision Tree with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for Decision Tree: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 30}
Decision Tree with RandomizedSearchCV Evaluation:
Accuracy: 0.8473896835176491
Precision: 0.8463380415331306
Recall: 0.8473896835176491
ROC AUC: 0.9118613155441437
PR AUC: 0.8227700865167803
Confusion Matrix:
[[56961   868  6282]
 [ 1609 58572  3930]
 [ 9386  7277 47448]]


## CatBoost and Evaluation

In [8]:
# CatBoost with RandomizedSearchCV
param_dist_cb = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2]
}

cb_random = RandomizedSearchCV(
    CatBoostClassifier(random_state=42, verbose=0),
    param_distributions=param_dist_cb,
    n_iter=10,  
    cv=3,       
    scoring='accuracy',
    n_jobs=-1,  
    random_state=42
)

cb_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for CatBoost:", cb_random.best_params_)
y_pred = cb_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, cb_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, cb_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("CatBoost with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for CatBoost: {'learning_rate': 0.2, 'iterations': 300, 'depth': 6}
CatBoost with RandomizedSearchCV Evaluation:
Accuracy: 0.8072353678255941
Precision: 0.8033370692636146
Recall: 0.8072353678255941
ROC AUC: 0.9311714309519452
PR AUC: 0.876704444791425
Confusion Matrix:
[[61301     0  2810]
 [ 1775 51985 10351]
 [ 8939 13200 41972]]


## Gradient Boosting and Evaluation

In [None]:
# Gradient Boosting with RandomizedSearchCV
param_dist_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4]
}

gb_random = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=param_dist_gb,
    n_iter=5,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Fit on the smaller dataset
gb_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for Gradient Boosting:", gb_random.best_params_)
y_pred = gb_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, gb_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, gb_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Gradient Boosting with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for Gradient Boosting: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.2}
Gradient Boosting with RandomizedSearchCV Evaluation:
Accuracy: 0.8034086714188413
Precision: 0.7989758449276145
Recall: 0.8034086714188413
ROC AUC: 0.928072320967925
PR AUC: 0.8699826224074192
Confusion Matrix:
[[61681     0  2430]
 [ 1599 51366 11146]
 [ 8834 13802 41475]]


## XGBoost and Evaluation

In [15]:
# XGBoost with RandomizedSearchCV
param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}

xgb_random = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions=param_dist_xgb,
    n_iter=10,  
    cv=3,       
    scoring='accuracy',
    n_jobs=-1,  
    random_state=42
)

xgb_random.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters for XGBoost:", xgb_random.best_params_)
y_pred = xgb_random.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test, xgb_random.predict_proba(X_test), multi_class='ovr')
pr_auc = average_precision_score(y_test, xgb_random.predict_proba(X_test), average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("XGBoost with RandomizedSearchCV Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC: {roc_auc}")
print(f"PR AUC: {pr_auc}")
print(f"Confusion Matrix:\n{conf_matrix}")

Best Parameters for XGBoost: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1}
XGBoost with RandomizedSearchCV Evaluation:
Accuracy: 0.7985161152792293
Precision: 0.7942678467390154
Recall: 0.7985161152792293
ROC AUC: 0.9262938062909094
PR AUC: 0.8671469144690347
Confusion Matrix:
[[60880     0  3231]
 [ 2028 51033 11050]
 [ 8797 13646 41668]]
