In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected2_last2years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       8971 non-null   object 
 1   status_label       8971 non-null   object 
 2   Division           8971 non-null   object 
 3   MajorGroup         8971 non-null   int64  
 4   last_year          8971 non-null   float64
 5   X1_last1year       8971 non-null   float64
 6   X1_last2year       8078 non-null   float64
 7   X2_last1year       8971 non-null   float64
 8   X2_last2year       8078 non-null   float64
 9   X3_last1year       8971 non-null   float64
 10  X3_last2year       8078 non-null   float64
 11  X4_last1year       8971 non-null   float64
 12  X4_last2year       8078 non-null   float64
 13  X5_last1year       8971 non-null   float64
 14  X5_last2year       8078 non-null   float64
 15  X6_last1year       8971 non-null   float64
 16  X6_last2year       8078 

In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X2_last1year  X2_last2year  X3_last1year  ...  \
0         888.5       1524.70        1504.1       1413.20  ...   
1         900.2       1474.50        1343.6        677.20  ...   
2       13454.0      21401.00       27171.0      19334.00  ...   
3      353541.0    1288165.00      927239.0        267.81  ...   
4           NaN         42.21           NaN      79567.00  ...   

   X18_last1year_ycr  X18_last2year_ycr  nyse_last1year  nyse_last2year  \
0           0.001482           0.061414    11912.848307    10451.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 85 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   company_name          8971 non-null   object 
 1   status_label          8971 non-null   object 
 2   Division              8971 non-null   object 
 3   MajorGroup            8971 non-null   int64  
 4   last_year             8971 non-null   float64
 5   X1_last1year          8971 non-null   float64
 6   X1_last2year          8078 non-null   float64
 7   X2_last1year          8971 non-null   float64
 8   X2_last2year          8078 non-null   float64
 9   X3_last1year          8971 non-null   float64
 10  X3_last2year          8078 non-null   float64
 11  X4_last1year          8971 non-null   float64
 12  X4_last2year          8078 non-null   float64
 13  X5_last1year          8971 non-null   float64
 14  X5_last2year          8078 non-null   float64
 15  X6_last1year         

In [5]:
data.shape

(8971, 85)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 1870


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(7101, 79)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X2_last1year  X2_last2year  X3_last1year  \
0            942.7         888.5       1524.70       1504.10       1413.20   
1           1107.7         900.2       1474.50       1343.60        677.20   
2          12686.0       13454.0      21401.00      27171.00      19334.00   
3         581502.0      353541.0    1288165.00     927239.00        267.81   
5           6838.0        6642.0      25088.00      25438.00      18138.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0      28278.00      26206.00      31288.00   
8967        3369.0        9049.0       3466.00       9198.00        208.00   
8968        2482.2        2340.6       9401.50      10252.40        966.70   
8969         931.6        1032.7       2810.20       2542.00       1475.90   
8970       82589.0      135207.0       1625.37       1736.11      68817.00   

      X3_last2year  X4_last1year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    6537
1     564
Name: status_label_encoded, dtype: int64


### 2. XGBoost

In [12]:
# imbalance 

In [13]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


[0]	train-logloss:0.28783	eval-logloss:0.29108
[1]	train-logloss:0.27209	eval-logloss:0.28307
[2]	train-logloss:0.25781	eval-logloss:0.27546
[3]	train-logloss:0.24398	eval-logloss:0.26880
[4]	train-logloss:0.23179	eval-logloss:0.26295
[5]	train-logloss:0.22033	eval-logloss:0.25813
[6]	train-logloss:0.20902	eval-logloss:0.25301
[7]	train-logloss:0.19883	eval-logloss:0.24824
[8]	train-logloss:0.18936	eval-logloss:0.24449
[9]	train-logloss:0.18026	eval-logloss:0.24014
[10]	train-logloss:0.17224	eval-logloss:0.23587
[11]	train-logloss:0.16399	eval-logloss:0.23285
[12]	train-logloss:0.15652	eval-logloss:0.22951
[13]	train-logloss:0.14972	eval-logloss:0.22711
[14]	train-logloss:0.14320	eval-logloss:0.22429
[15]	train-logloss:0.13727	eval-logloss:0.22179
[16]	train-logloss:0.13149	eval-logloss:0.21975
[17]	train-logloss:0.12624	eval-logloss:0.21763
[18]	train-logloss:0.12116	eval-logloss:0.21596
[19]	train-logloss:0.11633	eval-logloss:0.21486
[20]	train-logloss:0.11174	eval-logloss:0.21358
[2

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9289232934553132
Precision: 0.8421052631578947
Recall: 0.14035087719298245
F1 Score: 0.24060150375939848
Micro F1 Score: 0.9289232934553132
Macro F1 Score: 0.6016591867265062
ROC AUC: 0.855286648142928
Confusion Matrix:
[[1304    3]
 [  98   16]]


### ensemble methods (bagging and boosting) , stacking effect is not good

### 3. boost

In [15]:
# imbalanced + cv

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train Gradient Boosting model
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validat ion scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9267605633802817
Cross-validation Scores: [0.92253521 0.92693662 0.92693662 0.92693662 0.92341549]
Mean CV Accuracy: 0.9253521126760564
Test Accuracy: 0.9211822660098522


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9211822660098522
Macro F1 Score: 0.5349369988545246
Test Accuracy: 0.9211822660098522
Precision: 0.5833333333333334
Recall: 0.06140350877192982
F1 Score: 0.1111111111111111
ROC AUC Score: 0.827695673767433
Confusion Matrix:
 [[1302    5]
 [ 107    7]]


In [18]:
# GradientBoostingClassifier
# SMOTE + CV

In [19]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import AdaBoostClassifier

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result


# Train Gradient Boosting model
gb_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())



# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3919
1    3919
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9161971830985915
Cross-validation Scores: [0.89092873 0.97948164 0.98596112 0.98595354 0.94219341]
Mean CV Accuracy: 0.9569036875753344
Test Accuracy: 0.9247009148486981


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9247009148486981
Macro F1 Score: 0.663416988203313
Test Accuracy: 0.9247009148486981
Precision: 0.5636363636363636
Recall: 0.2719298245614035
F1 Score: 0.3668639053254438
ROC AUC Score: 0.8394877783594411
Confusion Matrix:
 [[1283   24]
 [  83   31]]


In [21]:
# SMOTE + FS + GRID

In [22]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3919
1    3919
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X4_last1year', 'X4_last2year', 'X6_last1year', 'X9_last1year',
       'X11_last1year', 'X12_last1year', 'X16_last2year', 'X15_last1year_ycr',
       'X16_last2year_ycr', 'nyse_last1year', 'nyse_last2year',
       'nasdaq_last1year', 'nasdaq_last2year', 'Division_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
Validation Accuracy: 0.9169014084507042
Test Accuracy: 0.9155524278676987


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9155524278676987
Macro F1 Score: 0.6441272226396193
Test Accuracy: 0.9155524278676987
Precision: 0.45454545454545453
Recall: 0.2631578947368421
F1 Score: 0.3333333333333333
ROC AUC Score: 0.8323333199103342
Confusion Matrix:
 [[1271   36]
 [  84   30]]


In [24]:
# UNDERSAMPLING + FS + GRID

In [25]:

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
  #AdaBoost less better result

# Perform feature selection 
selector = SelectFromModel(estimator=gb_classifier, threshold='mean')  # median: worse recall, f1, AUC
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Train Gradient Boosting model with the best parameters
best_gb_model = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_gb_model.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_gb_model.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    341
1    341
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last2year', 'X2_last1year', 'X4_last2year', 'X5_last2year',
       'X10_last1year', 'X11_last1year', 'X12_last2year', 'X15_last1year',
       'X15_last2year', 'X16_last2year', 'X18_last2year', 'X7_last2year_ycr',
       'X8_last1year_ycr', 'X9_last1year_ycr', 'X9_last2year_ycr',
       'X11_last2year_ycr', 'X15_last1year_ycr', 'X16_last2year_ycr',
       'nyse_last1year', 'nasdaq_last1year', 'nasdaq_last2year',
       'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}
Validation Accuracy: 0.7429577464788732
Test Accuracy: 0.7424349049964813


In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_gb_model.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7424349049964813
Macro F1 Score: 0.5839684764245123
Test Accuracy: 0.7424349049964813
Precision: 0.2069767441860465
Recall: 0.7807017543859649
F1 Score: 0.3272058823529412
ROC AUC Score: 0.8427898361051827
Confusion Matrix:
 [[966 341]
 [ 25  89]]


### 4. bagging method

In [27]:
#XGBOOST :base_estimator
#imbalance dataset + CV 


In [28]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)  #better than base_estimator=DecisionTreeClassifier
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42)

# Train the model on the training set
bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())



# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9288732394366197
Cross-validation Scores: [0.92517606 0.9278169  0.92957746 0.9278169  0.92605634]
Mean CV Accuracy: 0.9272887323943662
Test Accuracy: 0.928219563687544


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.928219563687544
Macro F1 Score: 0.5888869979577944
Test Accuracy: 0.928219563687544
Precision: 0.875
Recall: 0.12280701754385964
F1 Score: 0.2153846153846154
ROC AUC Score: 0.8554947046269078
Confusion Matrix:
 [[1305    2]
 [ 100   14]]


In [30]:
#XGBOOST :base_estimator
#smote + CV 


In [31]:
# from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)

# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Train the model on the training set
bagging_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Merge resampled training and validation data
X_train_val_reshaped = np.concatenate((X_train_resampled, X_val), axis=0)
y_train_val_reshaped = np.concatenate((y_train_resampled, y_val), axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val_reshaped, y_train_val_reshaped, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())



# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3919
1    3919
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9225352112676056
Cross-validation Scores: [0.86987041 0.98866091 0.9924406  0.98649379 0.94597515]
Mean CV Accuracy: 0.9566881715913294
Test Accuracy: 0.9232934553131598


In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9232934553131598
Macro F1 Score: 0.6188201535163815
Test Accuracy: 0.9232934553131598
Precision: 0.5675675675675675
Recall: 0.18421052631578946
F1 Score: 0.2781456953642384
ROC AUC Score: 0.8397897958361857
Confusion Matrix:
 [[1291   16]
 [  93   21]]


In [33]:
#XGBOOST :base_estimator 
#smote +  fs + grid research

In [34]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3919
1    3919
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X3_last2year', 'X4_last1year', 'X4_last2year', 'X6_last1year',
       'X9_last1year', 'X11_last1year', 'X11_last2year', 'X12_last1year',
       'X16_last2year', 'X18_last2year', 'X9_last1year_ycr',
       'X13_last1year_ycr', 'X13_last2year_ycr', 'X15_last1year_ycr',
       'X16_last2year_ycr', 'X18_last2year_ycr', 'nyse_last1year',
       'nyse_last2year', 'nasdaq_last2year', 'Division_encoded'],
      dtype='object')
Best Parameters: {'base_estimator__max_depth': 15, 'bootstrap': False, 'max_samples': 1.0, 'n_estimators': 100}
Validation Accuracy: 0.9211267605633803
Test Accuracy: 0.9232934553131598


In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9232934553131598
Macro F1 Score: 0.6281042534545062
Test Accuracy: 0.9232934553131598
Precision: 0.5609756097560976
Recall: 0.20175438596491227
F1 Score: 0.2967741935483871
ROC AUC Score: 0.8468637162914938
Confusion Matrix:
 [[1289   18]
 [  91   23]]


In [36]:
#XGBOOST :base_estimator
#undersampling +  fs + grid research

In [37]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Remove feature names from X
X = X.values

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation


# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())


# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42, bootstrap_features=True)

# Perform feature selection
selector = SelectFromModel(estimator=base_estimator, threshold='mean')
selector.fit(X_train_resampled, y_train_resampled)
# change NumPy into pandas DataFrame
#df_train_resampled = pd.DataFrame(X_train_resampled)
# print the selected colomn name
support = selector.get_support()
original_feature_names = data_cleaned.drop('status_label_encoded', axis=1).columns
selected_feature_names = original_feature_names[support]
print("Selected Features:")
print(selected_feature_names)


X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)



# Define parameter grid for GridSearchCV
param_grid = {
    'base_estimator__max_depth': [5, 10, 15],
    'n_estimators': [50, 75, 100],
    'bootstrap': [True, False],
    'max_samples': [0.5, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train_resampled)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)
best_bagging_classifier = grid_search.best_estimator_


# Make predictions on the validation set
val_predictions = best_bagging_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_bagging_classifier.predict(X_test_selected)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    341
1    341
Name: status_label_encoded, dtype: int64
Selected Features:
Index(['X1_last2year', 'X2_last1year', 'X2_last2year', 'X3_last2year',
       'X4_last2year', 'X5_last1year', 'X5_last2year', 'X8_last1year',
       'X10_last2year', 'X11_last1year', 'X11_last2year', 'X12_last1year',
       'X12_last2year', 'X14_last2year', 'X15_last1year', 'X15_last2year',
       'X16_last1year', 'X18_last2year', 'X1_last1year_ycr',
       'X3_last2year_ycr', 'X5_last2year_ycr', 'X7_last2year_ycr',
       'X8_last1year_ycr', 'X9_last2year_ycr', 'X11_last2year_ycr',
       'X14_last2year_ycr', 'X15_last1year_ycr', 'X18_last1year_ycr',
       'nyse_last1year', 'nasdaq_last1year', 'nasdaq_last2year',
       'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'base_estimator__max_depth': 10, 'bootstrap': False, 'max_samples': 1.0, 'n_estimators': 50}
Validation Accuracy: 0.7464788732394366
Test Accuracy: 0.7459535538353272


In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = best_bagging_classifier.predict_proba(X_test_selected)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.7459535538353272
Macro F1 Score: 0.586744562428453
Test Accuracy: 0.7459535538353272
Precision: 0.20941176470588235
Recall: 0.7807017543859649
F1 Score: 0.3302411873840445
ROC AUC Score: 0.8487295131478275
Confusion Matrix:
 [[971 336]
 [ 25  89]]
