In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected3_last3years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 119 entries, company_name to nasdaq_last3year
dtypes: float64(115), int64(1), object(3)
memory usage: 8.1+ MB


In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X1_last3year  X2_last1year  X2_last2year  ...  \
0         888.5         873.1       1524.70        1504.1  ...   
1         900.2        1077.4       1474.50        1343.6  ...   
2       13454.0       13582.0      21401.00       27171.0  ...   
3      353541.0     1037047.0    1288165.00      927239.0  ...   
4           NaN           NaN         42.21           NaN  ...   

   nyse_last1year  nyse_last2year  nyse_last3year  nasdaq_last1year  \
0    11912.848307    10451.377523    10606.906738       6293.024211  

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 123 entries, company_name to status_label_encoded
dtypes: float64(115), int32(3), int64(2), object(3)
memory usage: 8.3+ MB


In [5]:
data.shape

(8971, 123)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 2673


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(6298, 117)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X1_last3year  X2_last1year  X2_last2year  \
0            942.7         888.5         873.1       1524.70       1504.10   
1           1107.7         900.2        1077.4       1474.50       1343.60   
2          12686.0       13454.0       13582.0      21401.00      27171.00   
3         581502.0      353541.0     1037047.0    1288165.00     927239.00   
5           6838.0        6642.0        5935.0      25088.00      25438.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0        9599.0      28278.00      26206.00   
8967        3369.0        9049.0       21381.0       3466.00       9198.00   
8968        2482.2        2340.6        2071.2       9401.50      10252.40   
8969         931.6        1032.7         829.3       2810.20       2542.00   
8970       82589.0      135207.0       63971.0       1625.37       1736.11   

      X2_last3year  X3_last1year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    5784
1     514
Name: status_label_encoded, dtype: int64


### 2. XGBoost

In [12]:
# imbalance 

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


[0]	train-logloss:0.30063	eval-logloss:0.28198
[1]	train-logloss:0.28363	eval-logloss:0.27503
[2]	train-logloss:0.26813	eval-logloss:0.26965
[3]	train-logloss:0.25341	eval-logloss:0.26353
[4]	train-logloss:0.24007	eval-logloss:0.25761
[5]	train-logloss:0.22809	eval-logloss:0.25261
[6]	train-logloss:0.21697	eval-logloss:0.24839
[7]	train-logloss:0.20590	eval-logloss:0.24379
[8]	train-logloss:0.19573	eval-logloss:0.24034
[9]	train-logloss:0.18614	eval-logloss:0.23656
[10]	train-logloss:0.17759	eval-logloss:0.23374
[11]	train-logloss:0.16961	eval-logloss:0.23047
[12]	train-logloss:0.16210	eval-logloss:0.22758
[13]	train-logloss:0.15464	eval-logloss:0.22540
[14]	train-logloss:0.14782	eval-logloss:0.22245
[15]	train-logloss:0.14144	eval-logloss:0.22020
[16]	train-logloss:0.13543	eval-logloss:0.21812
[17]	train-logloss:0.12972	eval-logloss:0.21682
[18]	train-logloss:0.12409	eval-logloss:0.21481
[19]	train-logloss:0.11893	eval-logloss:0.21325
[20]	train-logloss:0.11408	eval-logloss:0.21276
[2

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9293650793650794
Precision: 0.8
Recall: 0.12244897959183673
F1 Score: 0.21238938053097345
Micro F1 Score: 0.9293650793650794
Macro F1 Score: 0.5877069461857194
ROC AUC: 0.8784379500509325
Confusion Matrix:
[[1159    3]
 [  86   12]]


### ensemble methods (bagging and boosting) , stacking effect is not good

### 3. boost

In [15]:
# imbalanced + cv

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train Gradient Boosting model
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validat ion scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# # Retrain the model on the train_val set
# gb_classifier.fit(X_train_val, y_train_val)

# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9333333333333333
Cross-validation Scores: [0.92261905 0.91964286 0.91964286 0.92552135 0.9225422 ]
Mean CV Accuracy: 0.9219936634037925
Test Accuracy: 0.9285714285714286


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9285714285714286
Macro F1 Score: 0.5933501635205691
Test Accuracy: 0.9285714285714286
Precision: 0.7222222222222222
Recall: 0.1326530612244898
F1 Score: 0.22413793103448276
ROC AUC Score: 0.8521549755874811
Confusion Matrix:
 [[1157    5]
 [  85   13]]


In [18]:
# GradientBoostingClassifier
# SMOTE + CV

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import AdaBoostClassifier

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result


# Train Gradient Boosting model
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# # Retrain the model on the train_val set
# gb_classifier.fit(X_train_val_reshaped, y_train_val_reshaped)


# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3449
1    3449
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9182539682539682
Cross-validation Scores: [0.88357843 0.98223039 0.98284314 0.98835071 0.94849785]
Mean CV Accuracy: 0.9571001039900938
Test Accuracy: 0.919047619047619


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.919047619047619
Macro F1 Score: 0.647275801121955
Test Accuracy: 0.919047619047619
Precision: 0.4642857142857143
Recall: 0.2653061224489796
F1 Score: 0.3376623376623377
ROC AUC Score: 0.8762162352032036
Confusion Matrix:
 [[1132   30]
 [  72   26]]


In [21]:
# SMOTE + FS + GRID

In [22]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# # Combine resampled training set and validation set
# X_train_val_combined = pd.concat([pd.DataFrame(X_train_selected), pd.DataFrame(X_val_selected)], axis=0)
# y_train_val_combined = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)

# # Train XGBoost model with train_val
# best_gb_model.fit(X_train_val_combined, y_train_val_combined)

# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3449
1    3449
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X4_last2year', 'X4_last3year', 'X7_last1year', 'X9_last1year',
       'X9_last3year', 'X10_last2year', 'X10_last3year', 'X11_last1year',
       'X11_last2year', 'X18_last1year', 'X18_last2year', 'X1_last3year_ycr',
       'X5_last3year_ycr', 'X9_last1year_ycr', 'X9_last2year_ycr',
       'X11_last1year_ycr', 'X13_last1year_ycr', 'X15_last1year_ycr',
       'nyse_last1year', 'nyse_last3year', 'nasdaq_last3year',
       'Division_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Validation Accuracy: 0.9142857142857143
Test Accuracy: 0.9182539682539682


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9182539682539682
Macro F1 Score: 0.6733607000923691
Test Accuracy: 0.9182539682539682
Precision: 0.4647887323943662
Recall: 0.336734693877551
F1 Score: 0.3905325443786982
ROC AUC Score: 0.8445150865854087
Confusion Matrix:
 [[1124   38]
 [  65   33]]


In [24]:
# UNDERSAMPLING + FS + GRID

In [25]:

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# # Combine resampled training set and validation set
# X_train_val_combined = pd.concat([pd.DataFrame(X_train_selected), pd.DataFrame(X_val_selected)], axis=0)
# y_train_val_combined = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)

# # Train XGBoost model with train_val
# best_gb_model.fit(X_train_val_combined, y_train_val_combined)

# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    329
1    329
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X2_last1year', 'X3_last1year', 'X5_last2year', 'X11_last1year',
       'X11_last2year', 'X12_last1year', 'X13_last1year', 'X14_last1year',
       'X14_last2year', 'X14_last3year', 'X15_last1year', 'X15_last3year',
       'X17_last1year', 'X17_last3year', 'X6_last3year_ycr',
       'X8_last2year_ycr', 'X8_last3year_ycr', 'X9_last1year_ycr',
       'X10_last1year_ycr', 'X10_last3year_ycr', 'X11_last3year_ycr',
       'X12_last1year_ycr', 'X12_last2year_ycr', 'X15_last1year_ycr',
       'X15_last2year_ycr', 'X18_last3year_ycr', 'nyse_last3year',
       'nasdaq_last2year', 'nasdaq_last3year', 'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Validation Accuracy: 0.7317460317460317
Test Accuracy: 0.7388888888888889


In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7388888888888889
Macro F1 Score: 0.5826045980552963
Test Accuracy: 0.7388888888888889
Precision: 0.20460358056265984
Recall: 0.8163265306122449
F1 Score: 0.32719836400818
ROC AUC Score: 0.841388879131687
Confusion Matrix:
 [[851 311]
 [ 18  80]]


### 4. bagging method

In [27]:
#XGBOOST :base_estimator
#imbalance dataset + CV 


In [28]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)  #better than base_estimator=DecisionTreeClassifier
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42)

# Train the model on the training set
bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# # Retrain the model on the train_val set
# bagging_classifier.fit(X_train_val, y_train_val)

# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.930952380952381
Cross-validation Scores: [0.92162698 0.92261905 0.91865079 0.92552135 0.9225422 ]
Mean CV Accuracy: 0.9221920761022051
Test Accuracy: 0.9325396825396826


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9325396825396827
Macro F1 Score: 0.6190659112007426
Test Accuracy: 0.9325396825396826
Precision: 0.8421052631578947
Recall: 0.16326530612244897
F1 Score: 0.2735042735042735
ROC AUC Score: 0.8770416944746918
Confusion Matrix:
 [[1159    3]
 [  82   16]]


In [30]:
#XGBOOST :base_estimator
#smote + CV 


In [31]:
# from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)

# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Train the model on the training set
bagging_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# # Retrain the model on the train_val set
# bagging_classifier.fit(X_train_val_reshaped, y_train_val_reshaped)

# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3449
1    3449
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.930952380952381
Cross-validation Scores: [0.85232843 0.98590686 0.98897059 0.99080319 0.94665849]
Mean CV Accuracy: 0.9529335124607783
Test Accuracy: 0.9301587301587302


In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9301587301587302
Macro F1 Score: 0.6759259259259259
Test Accuracy: 0.9301587301587302
Precision: 0.6086956521739131
Recall: 0.2857142857142857
F1 Score: 0.3888888888888889
ROC AUC Score: 0.8782184130106431
Confusion Matrix:
 [[1144   18]
 [  70   28]]


In [33]:
#XGBOOST :base_estimator 
#smote +  fs + grid research

In [34]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# # Combine resampled training set and validation set
# X_train_val_combined = pd.concat([pd.DataFrame(X_train_selected), pd.DataFrame(X_val_selected)], axis=0)
# y_train_val_combined = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)

# # Train XGBoost model with train_val
# best_bagging_classifier.fit(X_train_val_combined, y_train_val_combined)


# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3449
1    3449
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last3year', 'X4_last2year', 'X4_last3year', 'X9_last1year',
       'X10_last2year', 'X10_last3year', 'X11_last1year', 'X11_last2year',
       'X12_last1year', 'X16_last3year', 'X18_last1year', 'X18_last2year',
       'X9_last2year_ycr', 'X13_last1year_ycr', 'X13_last2year_ycr',
       'X15_last1year_ycr', 'nyse_last1year', 'nyse_last3year',
       'nasdaq_last2year', 'nasdaq_last3year', 'Division_encoded',
       'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'base_estimator__max_depth': 15, 'bootstrap': False, 'max_samples': 1.0, 'n_estimators': 75}
Validation Accuracy: 0.9277777777777778
Test Accuracy: 0.9261904761904762


In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9261904761904762
Macro F1 Score: 0.680338266384778
Test Accuracy: 0.9261904761904762
Precision: 0.543859649122807
Recall: 0.3163265306122449
F1 Score: 0.4
ROC AUC Score: 0.8704204573395624
Confusion Matrix:
 [[1136   26]
 [  67   31]]


In [36]:
#XGBOOST :base_estimator
#undersampling +  fs + grid research

In [37]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation


# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# change NumPy into pandas DataFrame
#df_train_resampled = pd.DataFrame(X_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# # Combine resampled training set and validation set
# X_train_val_combined = pd.concat([pd.DataFrame(X_train_selected), pd.DataFrame(X_val_selected)], axis=0)
# y_train_val_combined = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)

# # Train XGBoost model with train_val
# best_bagging_classifier.fit(X_train_val_combined, y_train_val_combined)


# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    329
1    329
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last1year', 'X2_last1year', 'X3_last2year', 'X4_last2year',
       'X5_last1year', 'X5_last3year', 'X7_last1year', 'X10_last1year',
       'X10_last3year', 'X11_last1year', 'X11_last2year', 'X12_last1year',
       'X12_last3year', 'X13_last1year', 'X14_last2year', 'X14_last3year',
       'X15_last2year', 'X16_last1year', 'X16_last2year', 'X5_last3year_ycr',
       'X6_last3year_ycr', 'X7_last2year_ycr', 'X8_last3year_ycr',
       'X9_last1year_ycr', 'X10_last1year_ycr', 'X12_last1year_ycr',
       'X13_last1year_ycr', 'X15_last1year_ycr', 'X16_last1year_ycr',
       'X18_last1year_ycr', 'X18_last2year_ycr', 'nyse_last1year',
       'nyse_last2year', 'nyse_last3year', 'nasdaq_last1year',
       'nasdaq_last2year', 'nasdaq_last3year', 'Division_encoded'],
      dtype='object')
Best Parameters: {'base_estimator__max_depth': 5, 'bootstrap': False, 'max_samples': 1.0, 'n_estimators': 75}
Validation Accu

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7547619047619047
Macro F1 Score: 0.5978852154083065
Test Accuracy: 0.7547619047619047
Precision: 0.21866666666666668
Recall: 0.8367346938775511
F1 Score: 0.34672304439746293
ROC AUC Score: 0.856106642312691
Confusion Matrix:
 [[869 293]
 [ 16  82]]
