In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected4_last4years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 157 entries, company_name to nasdaq_last4year
dtypes: float64(153), int64(1), object(3)
memory usage: 10.7+ MB


In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X1_last3year  X1_last4year  X2_last1year  ...  \
0         888.5         873.1         954.1       1524.70  ...   
1         900.2        1077.4        1008.2       1474.50  ...   
2       13454.0       13582.0        7726.0      21401.00  ...   
3      353541.0     1037047.0      672072.0    1288165.00  ...   
4           NaN           NaN           NaN         42.21  ...   

   nyse_last3year  nyse_last4year  nasdaq_last1year  nasdaq_last2year  \
0    10606.906738    10699.956624       6293.024211       5015.9267

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 161 entries, company_name to status_label_encoded
dtypes: float64(153), int32(3), int64(2), object(3)
memory usage: 10.9+ MB


In [5]:
data.shape

(8971, 161)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 3371


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(5600, 155)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X1_last3year  X1_last4year  X2_last1year  \
0            942.7         888.5         873.1       954.100       1524.70   
1           1107.7         900.2        1077.4      1008.200       1474.50   
2          12686.0       13454.0       13582.0      7726.000      21401.00   
3         581502.0      353541.0     1037047.0    672072.000    1288165.00   
5           6838.0        6642.0        5935.0      7229.000      25088.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0        9599.0      9789.000      28278.00   
8967        3369.0        9049.0       21381.0        58.314       3466.00   
8968        2482.2        2340.6        2071.2      2270.500       9401.50   
8969         931.6        1032.7         829.3       735.100       2810.20   
8970       82589.0      135207.0       63971.0    105559.000       1625.37   

      X2_last2year  X2_last3year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    5124
1     476
Name: status_label_encoded, dtype: int64


### 2. XGBoost

In [12]:
# imbalance 

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


[0]	train-logloss:0.30629	eval-logloss:0.29318
[1]	train-logloss:0.28823	eval-logloss:0.28284
[2]	train-logloss:0.27100	eval-logloss:0.27448
[3]	train-logloss:0.25533	eval-logloss:0.26657
[4]	train-logloss:0.24119	eval-logloss:0.25948
[5]	train-logloss:0.22811	eval-logloss:0.25254
[6]	train-logloss:0.21598	eval-logloss:0.24637
[7]	train-logloss:0.20498	eval-logloss:0.24135
[8]	train-logloss:0.19439	eval-logloss:0.23546
[9]	train-logloss:0.18472	eval-logloss:0.23053
[10]	train-logloss:0.17625	eval-logloss:0.22592
[11]	train-logloss:0.16801	eval-logloss:0.22255
[12]	train-logloss:0.16003	eval-logloss:0.21940
[13]	train-logloss:0.15250	eval-logloss:0.21689
[14]	train-logloss:0.14569	eval-logloss:0.21417
[15]	train-logloss:0.13928	eval-logloss:0.21134
[16]	train-logloss:0.13283	eval-logloss:0.20914
[17]	train-logloss:0.12696	eval-logloss:0.20658
[18]	train-logloss:0.12128	eval-logloss:0.20541
[19]	train-logloss:0.11632	eval-logloss:0.20357
[20]	train-logloss:0.11120	eval-logloss:0.20204
[2

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9294642857142857
Precision: 0.7857142857142857
Recall: 0.12643678160919541
F1 Score: 0.21782178217821782
Micro F1 Score: 0.9294642857142857
Macro F1 Score: 0.5904443179240786
ROC AUC: 0.8547028518654515
Confusion Matrix:
[[1030    3]
 [  76   11]]


### ensemble methods (bagging and boosting) , stacking effect is not good

### 3. boost

In [15]:
# imbalanced + cv

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train Gradient Boosting model
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validat ion scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.93125
Cross-validation Scores: [0.91629464 0.91517857 0.91741071 0.92522321 0.91629464]
Mean CV Accuracy: 0.9180803571428573
Test Accuracy: 0.9294642857142857


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9294642857142857
Macro F1 Score: 0.6053083528493365
Test Accuracy: 0.9294642857142857
Precision: 0.7222222222222222
Recall: 0.14942528735632185
F1 Score: 0.24761904761904766
ROC AUC Score: 0.837544925504334
Confusion Matrix:
 [[1028    5]
 [  74   13]]


In [18]:
# GradientBoostingClassifier
# SMOTE + CV

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import AdaBoostClassifier

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result


# Train Gradient Boosting model
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3057
1    3057
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9205357142857142
Cross-validation Scores: [0.89564616 0.97650311 0.97857636 0.98064962 0.94674965]
Mean CV Accuracy: 0.955624982675082
Test Accuracy: 0.9151785714285714


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9151785714285714
Macro F1 Score: 0.6203132415757
Test Accuracy: 0.9151785714285714
Precision: 0.41304347826086957
Recall: 0.21839080459770116
F1 Score: 0.2857142857142857
ROC AUC Score: 0.8288991999643934
Confusion Matrix:
 [[1006   27]
 [  68   19]]


In [21]:
# SMOTE + FS + GRID

In [22]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3057
1    3057
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last1year', 'X1_last4year', 'X4_last1year', 'X4_last2year',
       'X4_last4year', 'X11_last1year', 'X11_last2year', 'X13_last4year',
       'X18_last2year', 'X1_last2year_ycr', 'X1_last3year_ycr',
       'X3_last1year_ycr', 'X3_last4year_ycr', 'X4_last4year_ycr',
       'X6_last4year_ycr', 'X7_last1year_ycr', 'X7_last3year_ycr',
       'X9_last4year_ycr', 'X10_last4year_ycr', 'X13_last1year_ycr',
       'X15_last1year_ycr', 'X15_last4year_ycr', 'X16_last4year_ycr',
       'nyse_last3year', 'nyse_last4year', 'Division_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Validation Accuracy: 0.9098214285714286
Test Accuracy: 0.9017857142857143


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9017857142857143
Macro F1 Score: 0.6118168985682598
Test Accuracy: 0.9017857142857143
Precision: 0.3230769230769231
Recall: 0.2413793103448276
F1 Score: 0.27631578947368424
ROC AUC Score: 0.8118859253819364
Confusion Matrix:
 [[989  44]
 [ 66  21]]


In [24]:
# UNDERSAMPLING + FS + GRID

In [25]:

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    303
1    303
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last3year', 'X2_last3year', 'X4_last2year', 'X5_last3year',
       'X10_last1year', 'X10_last3year', 'X11_last1year', 'X11_last3year',
       'X12_last1year', 'X12_last2year', 'X12_last4year', 'X13_last1year',
       'X13_last2year', 'X14_last2year', 'X17_last2year', 'X18_last4year',
       'X2_last1year_ycr', 'X5_last4year_ycr', 'X8_last4year_ycr',
       'X9_last1year_ycr', 'X10_last3year_ycr', 'X11_last2year_ycr',
       'X12_last4year_ycr', 'X13_last2year_ycr', 'X14_last3year_ycr',
       'X15_last2year_ycr', 'X18_last1year_ycr', 'X18_last3year_ycr',
       'nyse_last3year', 'nasdaq_last2year', 'nasdaq_last4year',
       'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Validation Accuracy: 0.7375
Test Accuracy: 0.7098214285714286


In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7098214285714286
Macro F1 Score: 0.5518207282913166
Test Accuracy: 0.7098214285714286
Precision: 0.1766304347826087
Recall: 0.7471264367816092
F1 Score: 0.2857142857142857
ROC AUC Score: 0.8097495298817194
Confusion Matrix:
 [[730 303]
 [ 22  65]]


### 4. bagging method

In [27]:
#XGBOOST :base_estimator
#imbalance dataset + CV 


In [28]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)  #better than base_estimator=DecisionTreeClassifier
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42)

# Train the model on the training set
bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9357142857142857
Cross-validation Scores: [0.92299107 0.91741071 0.92410714 0.92857143 0.91294643]
Mean CV Accuracy: 0.9212053571428571
Test Accuracy: 0.93125


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.93125
Macro F1 Score: 0.6081976820785877
Test Accuracy: 0.93125
Precision: 0.8125
Recall: 0.14942528735632185
F1 Score: 0.2524271844660194
ROC AUC Score: 0.863426466824671
Confusion Matrix:
 [[1030    3]
 [  74   13]]


In [30]:
#XGBOOST :base_estimator
#smote + CV 


In [31]:
# from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)

# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Train the model on the training set
bagging_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3057
1    3057
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9276785714285715
Cross-validation Scores: [0.8569454  0.98341396 0.9861783  0.9861783  0.94951591]
Mean CV Accuracy: 0.9524463740022042
Test Accuracy: 0.9223214285714286


In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9223214285714286
Macro F1 Score: 0.642184349971908
Test Accuracy: 0.9223214285714286
Precision: 0.5
Recall: 0.2413793103448276
F1 Score: 0.32558139534883723
ROC AUC Score: 0.8613679607437327
Confusion Matrix:
 [[1012   21]
 [  66   21]]


In [33]:
#XGBOOST :base_estimator 
#smote +  fs + grid research

In [34]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3057
1    3057
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last2year', 'X1_last4year', 'X3_last3year', 'X4_last1year',
       'X4_last2year', 'X4_last4year', 'X5_last3year', 'X5_last4year',
       'X6_last1year', 'X9_last1year', 'X9_last2year', 'X10_last2year',
       'X11_last1year', 'X11_last2year', 'X13_last4year', 'X16_last2year',
       'X18_last2year', 'X1_last2year_ycr', 'X1_last3year_ycr',
       'X2_last3year_ycr', 'X3_last1year_ycr', 'X3_last4year_ycr',
       'X4_last4year_ycr', 'X7_last1year_ycr', 'X8_last2year_ycr',
       'X8_last3year_ycr', 'X9_last2year_ycr', 'X9_last4year_ycr',
       'X10_last3year_ycr', 'X10_last4year_ycr', 'X13_last4year_ycr',
       'X15_last4year_ycr', 'X18_last4year_ycr', 'nyse_last1year',
       'nyse_last3year', 'nyse_last4year', 'Division_encoded'],
      dtype='object')
Best Parameters: {'base_estimator__max_depth': 15, 'bootstrap': False, 'max_samples': 1.0, 'n_estimators': 100}
Validation Accuracy: 0.91964285

In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9080357142857143
Macro F1 Score: 0.6049842658001158
Test Accuracy: 0.9080357142857143
Precision: 0.34615384615384615
Recall: 0.20689655172413793
F1 Score: 0.2589928057553957
ROC AUC Score: 0.8437983331664274
Confusion Matrix:
 [[999  34]
 [ 69  18]]


In [36]:
#XGBOOST :base_estimator
#undersampling +  fs + grid research

In [37]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation


# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# change NumPy into pandas DataFrame
#df_train_resampled = pd.DataFrame(X_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)


# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    303
1    303
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last2year', 'X1_last3year', 'X2_last3year', 'X3_last1year',
       'X3_last2year', 'X4_last1year', 'X4_last2year', 'X5_last3year',
       'X5_last4year', 'X6_last2year', 'X7_last3year', 'X8_last4year',
       'X9_last1year', 'X9_last4year', 'X10_last1year', 'X10_last2year',
       'X10_last3year', 'X10_last4year', 'X11_last1year', 'X11_last2year',
       'X12_last2year', 'X12_last4year', 'X13_last1year', 'X13_last2year',
       'X14_last2year', 'X15_last2year', 'X15_last3year', 'X16_last2year',
       'X16_last3year', 'X18_last4year', 'X3_last3year_ycr',
       'X4_last1year_ycr', 'X4_last4year_ycr', 'X6_last3year_ycr',
       'X7_last2year_ycr', 'X7_last4year_ycr', 'X8_last4year_ycr',
       'X10_last1year_ycr', 'X11_last2year_ycr', 'X11_last4year_ycr',
       'X13_last2year_ycr', 'X14_last2year_ycr', 'X15_last1year_ycr',
       'X15_last4year_ycr', 'X18_last4year_ycr', 'nyse_last1year',
       'n

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7321428571428571
Macro F1 Score: 0.5698131760078663
Test Accuracy: 0.7321428571428571
Precision: 0.19130434782608696
Recall: 0.7586206896551724
F1 Score: 0.3055555555555555
ROC AUC Score: 0.837533798444437
Confusion Matrix:
 [[754 279]
 [ 21  66]]
