In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected2_last2years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       8971 non-null   object 
 1   status_label       8971 non-null   object 
 2   Division           8971 non-null   object 
 3   MajorGroup         8971 non-null   int64  
 4   last_year          8971 non-null   float64
 5   X1_last1year       8971 non-null   float64
 6   X1_last2year       8078 non-null   float64
 7   X2_last1year       8971 non-null   float64
 8   X2_last2year       8078 non-null   float64
 9   X3_last1year       8971 non-null   float64
 10  X3_last2year       8078 non-null   float64
 11  X4_last1year       8971 non-null   float64
 12  X4_last2year       8078 non-null   float64
 13  X5_last1year       8971 non-null   float64
 14  X5_last2year       8078 non-null   float64
 15  X6_last1year       8971 non-null   float64
 16  X6_last2year       8078 

In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X2_last1year  X2_last2year  X3_last1year  ...  \
0         888.5       1524.70        1504.1       1413.20  ...   
1         900.2       1474.50        1343.6        677.20  ...   
2       13454.0      21401.00       27171.0      19334.00  ...   
3      353541.0    1288165.00      927239.0        267.81  ...   
4           NaN         42.21           NaN      79567.00  ...   

   X18_last1year_ycr  X18_last2year_ycr  nyse_last1year  nyse_last2year  \
0           0.001482           0.061414    11912.848307    10451.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 85 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   company_name          8971 non-null   object 
 1   status_label          8971 non-null   object 
 2   Division              8971 non-null   object 
 3   MajorGroup            8971 non-null   int64  
 4   last_year             8971 non-null   float64
 5   X1_last1year          8971 non-null   float64
 6   X1_last2year          8078 non-null   float64
 7   X2_last1year          8971 non-null   float64
 8   X2_last2year          8078 non-null   float64
 9   X3_last1year          8971 non-null   float64
 10  X3_last2year          8078 non-null   float64
 11  X4_last1year          8971 non-null   float64
 12  X4_last2year          8078 non-null   float64
 13  X5_last1year          8971 non-null   float64
 14  X5_last2year          8078 non-null   float64
 15  X6_last1year         

In [5]:
data.shape

(8971, 85)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 1870


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(7101, 79)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X2_last1year  X2_last2year  X3_last1year  \
0            942.7         888.5       1524.70       1504.10       1413.20   
1           1107.7         900.2       1474.50       1343.60        677.20   
2          12686.0       13454.0      21401.00      27171.00      19334.00   
3         581502.0      353541.0    1288165.00     927239.00        267.81   
5           6838.0        6642.0      25088.00      25438.00      18138.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0      28278.00      26206.00      31288.00   
8967        3369.0        9049.0       3466.00       9198.00        208.00   
8968        2482.2        2340.6       9401.50      10252.40        966.70   
8969         931.6        1032.7       2810.20       2542.00       1475.90   
8970       82589.0      135207.0       1625.37       1736.11      68817.00   

      X3_last2year  X4_last1year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    6537
1     564
Name: status_label_encoded, dtype: int64


### 1. RF

In [12]:
#the impact of imbalanced datasets


#### 1.1 imbalance dataset + CV

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = rf_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the training set
cv_scores = cross_val_score(rf_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
# Retrain the model on the entire training set


# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.926056338028169
Cross-validation Scores: [0.92517606 0.92341549 0.92165493 0.92693662 0.92253521]
Mean CV Accuracy: 0.923943661971831
Test Accuracy: 0.9204785362420831


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculation precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = rf_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9204785362420831
Precision: 0.6666666666666666
Recall: 0.017543859649122806
F1 Score: 0.03418803418803419
Micro F1 Score: 0.9204785362420831
Macro F1 Score: 0.4963600721398886
ROC AUC: 0.811735727996349
Confusion Matrix:
[[1306    1]
 [ 112    2]]


#### 1.2 SMOTE

##### 1.2.1 SMOTE + cv

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training set
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = rf_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Combine resampled training set and validation set
X_train_val_reshape = pd.concat([pd.DataFrame(X_train_resampled), pd.DataFrame(X_val)], axis=0)
y_train_val_reshape = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(rf_classifier, X_train_val_reshape, y_train_val_reshape, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
# Retrain the model on the train_val set



# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)



0    3919
1    3919
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9169014084507042
Cross-validation Scores: [0.90226782 0.97678186 0.98542117 0.98487304 0.94111291]
Mean CV Accuracy: 0.9580913591742481
Test Accuracy: 0.9183673469387755


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = rf_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9183673469387755
Precision: 0.48333333333333334
Recall: 0.2543859649122807
F1 Score: 0.33333333333333337
Micro F1 Score: 0.9183673469387755
Macro F1 Score: 0.644927536231884
ROC AUC: 0.8333702465804911
Confusion Matrix:
[[1276   31]
 [  85   29]]


##### 1.2.2 SMOTE + CV + FS

##### 1.2.3 SMOTE + GRID + FS

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Use SelectFromModel for feature selection
selector = SelectFromModel(estimator=rf_classifier, threshold='median')  #better precision, worse recall&f1, compare with mean
X_train_resampled_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]   #Select a list of column names for the characteristics.
print("Selected Features:")
print(selected_features)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_resampled_selected, y_train_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model
best_rf_classifier = grid_search.best_estimator_

# Make predictions on the validation set with selected features
X_val_selected = selector.transform(X_val)
val_predictions = best_rf_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy with Best Model:", val_accuracy)




# Make predictions on the test set with the best model
X_test_selected = selector.transform(X_test)
test_predictions = best_rf_classifier.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy with Best Model:", test_accuracy)


Selected Features:
Index(['X1_last2year', 'X3_last1year', 'X4_last1year', 'X4_last2year',
       'X5_last2year', 'X6_last1year', 'X9_last1year', 'X9_last2year',
       'X10_last1year', 'X10_last2year', 'X11_last1year', 'X11_last2year',
       'X12_last1year', 'X12_last2year', 'X15_last1year', 'X16_last1year',
       'X16_last2year', 'X18_last2year', 'X1_last1year_ycr',
       'X1_last2year_ycr', 'X4_last1year_ycr', 'X4_last2year_ycr',
       'X5_last2year_ycr', 'X9_last1year_ycr', 'X9_last2year_ycr',
       'X10_last1year_ycr', 'X10_last2year_ycr', 'X13_last1year_ycr',
       'X13_last2year_ycr', 'X14_last2year_ycr', 'X15_last1year_ycr',
       'X16_last2year_ycr', 'X18_last2year_ycr', 'nyse_last1year',
       'nyse_last2year', 'nasdaq_last1year', 'nasdaq_last2year',
       'Division_encoded', 'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.9566268737871664
Validation Acc

In [18]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_rf_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9106263194933145
Precision: 0.4084507042253521
Recall: 0.2543859649122807
F1 Score: 0.31351351351351353
Micro F1 Score: 0.9106263194933145
Macro F1 Score: 0.6328576223946942
ROC AUC: 0.840454234285024
Confusion Matrix:
[[1265   42]
 [  85   29]]


#### 1.3 processing imbalanced dataset, Undersampling

In [19]:
# undersampling +grid +fs

In [20]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Define a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Use SelectFromModel for feature selection
selector = SelectFromModel(estimator=rf_classifier, threshold='median')  #better precision, worse recall&f1, compare with mean
X_train_resampled_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]   #Select a list of column names for the characteristics.
print("Selected Features:")
print(selected_features)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_resampled_selected, y_train_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model
best_rf_classifier = grid_search.best_estimator_

# Make predictions on the validation set with selected features
X_val_selected = selector.transform(X_val)
val_predictions = best_rf_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy with Best Model:", val_accuracy)


# Make predictions on the test set with the best model
X_test_selected = selector.transform(X_test)
test_predictions = best_rf_classifier.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy with Best Model:", test_accuracy)


Selected Features:
Index(['X2_last1year', 'X3_last1year', 'X4_last2year', 'X5_last1year',
       'X5_last2year', 'X6_last1year', 'X6_last2year', 'X7_last1year',
       'X11_last1year', 'X11_last2year', 'X12_last1year', 'X12_last2year',
       'X15_last1year', 'X15_last2year', 'X16_last1year', 'X16_last2year',
       'X18_last2year', 'X1_last1year_ycr', 'X2_last1year_ycr',
       'X2_last2year_ycr', 'X4_last1year_ycr', 'X4_last2year_ycr',
       'X7_last2year_ycr', 'X8_last1year_ycr', 'X10_last1year_ycr',
       'X11_last1year_ycr', 'X11_last2year_ycr', 'X13_last2year_ycr',
       'X15_last1year_ycr', 'X15_last2year_ycr', 'X16_last1year_ycr',
       'X16_last2year_ycr', 'X18_last1year_ycr', 'X18_last2year_ycr',
       'nyse_last1year', 'nyse_last2year', 'nasdaq_last1year',
       'nasdaq_last2year', 'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.7786389008158007
Validatio

In [21]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_rf_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7325826882477129
Precision: 0.20045045045045046
Recall: 0.7807017543859649
F1 Score: 0.31899641577060933
Micro F1 Score: 0.7325826882477129
Macro F1 Score: 0.5763108173423975
ROC AUC: 0.847289896508678
Confusion Matrix:
[[952 355]
 [ 25  89]]


### 2. XGBOOST

In [22]:
#2.1 SMOTE


In [23]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    #'num_rounds': 100,                # Number of boosting rounds
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    3919
1    3919
Name: status_label_encoded, dtype: int64
[0]	train-logloss:0.63582	eval-logloss:0.64850
[1]	train-logloss:0.58642	eval-logloss:0.61046
[2]	train-logloss:0.54256	eval-logloss:0.57689
[3]	train-logloss:0.50294	eval-logloss:0.54653
[4]	train-logloss:0.46687	eval-logloss:0.51802
[5]	train-logloss:0.43478	eval-logloss:0.49362
[6]	train-logloss:0.40536	eval-logloss:0.47202
[7]	train-logloss:0.37844	eval-logloss:0.45221
[8]	train-logloss:0.35386	eval-logloss:0.43448
[9]	train-logloss:0.33148	eval-logloss:0.41826
[10]	train-logloss:0.31078	eval-logloss:0.40376
[11]	train-logloss:0.29201	eval-logloss:0.39025
[12]	train-logloss:0.27442	eval-logloss:0.37828
[13]	train-logloss:0.25815	eval-logloss:0.36630
[14]	train-logloss:0.24311	eval-logloss:0.35527
[15]	train-logloss:0.22933	eval-logloss:0.34650
[16]	train-logloss:0.21677	eval-logloss:0.33846
[17]	train-logloss:0.20485	eval-logloss:0.33027
[18]	train-logloss:0.19347	eval-logloss:0.32245
[19]	train-logloss:0.18304	eval-loglo

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9261083743842364
Precision: 0.5818181818181818
Recall: 0.2807017543859649
F1 Score: 0.378698224852071
Micro F1 Score: 0.9261083743842364
Macro F1 Score: 0.6697082594518492
ROC AUC: 0.8486556866535121
Confusion Matrix:
[[1284   23]
 [  82   32]]


In [25]:
# 2.2 SMOTE + FS

In [26]:
# 2.3 SMOTE + FS + GRID

In [27]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define a XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Perform feature selection 
selector = SelectFromModel(estimator=xgb_classifier, threshold='mean')  # threshold: mean
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)


# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 25, 70],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0], 
}

# Create cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit GridSearchCV to data
grid_search.fit(X_train_selected, y_train_resampled)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Train XGBoost model with best parameters
best_params = grid_search.best_params_
best_xgb_classifier = grid_search.best_estimator_



# Make predictions on the validation set
val_predictions = best_xgb_classifier.predict(X_val_selected)

# Calculate accuracy on test set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Val Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_xgb_classifier.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Selected Features:
Index(['X4_last1year', 'X4_last2year', 'X9_last1year', 'X9_last2year',
       'X11_last1year', 'X11_last2year', 'X16_last2year', 'X18_last2year',
       'X4_last2year_ycr', 'X10_last1year_ycr', 'X15_last1year_ycr',
       'nyse_last1year', 'nyse_last2year', 'nasdaq_last2year',
       'Division_encoded'],
      dtype='object')
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 70, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1.0}
Best Score: 0.9584086809580246
Val Accuracy: 0.9126760563380282
Test Accuracy: 0.9148486980999296


In [28]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_xgb_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9148486980999296
Precision: 0.45977011494252873
Recall: 0.3508771929824561
F1 Score: 0.39800995024875624
Micro F1 Score: 0.9148486980999296
Macro F1 Score: 0.6760969857264228
ROC AUC: 0.8511926334581671
Confusion Matrix:
[[1260   47]
 [  74   40]]


In [29]:
#2.4 UNDERSAMPLING + FS +GRID

In [30]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Define a XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Perform feature selection 
selector = SelectFromModel(estimator=xgb_classifier, threshold='mean')  # threshold: mean
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)


# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 25, 70],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],     
}

# Create cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit GridSearchCV to data
grid_search.fit(X_train_selected, y_train_resampled)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Train XGBoost model with best parameters
best_params = grid_search.best_params_
best_xgb_classifier = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_xgb_classifier.predict(X_val_selected)

# Calculate accuracy on test set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Val Accuracy:", val_accuracy)


# Make predictions on the test set
test_predictions = best_xgb_classifier.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Selected Features:
Index(['X1_last2year', 'X2_last1year', 'X2_last2year', 'X3_last2year',
       'X4_last2year', 'X5_last1year', 'X5_last2year', 'X8_last1year',
       'X10_last2year', 'X11_last1year', 'X11_last2year', 'X12_last1year',
       'X12_last2year', 'X14_last2year', 'X15_last1year', 'X15_last2year',
       'X16_last1year', 'X18_last2year', 'X1_last1year_ycr',
       'X3_last2year_ycr', 'X5_last2year_ycr', 'X7_last2year_ycr',
       'X8_last1year_ycr', 'X9_last2year_ycr', 'X11_last2year_ycr',
       'X14_last2year_ycr', 'X15_last1year_ycr', 'X18_last1year_ycr',
       'nyse_last1year', 'nasdaq_last1year', 'nasdaq_last2year',
       'MajorGroup_encoded'],
      dtype='object')
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.9}
Best Score: 0.7830721339630743
Val Accuracy: 0.7323943661971831
Test Accuracy: 0.7361013370865588


In [31]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_xgb_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7361013370865588
Precision: 0.2054176072234763
Recall: 0.7982456140350878
F1 Score: 0.3267504488330341
Micro F1 Score: 0.7361013370865588
Macro F1 Score: 0.5813183316375237
ROC AUC: 0.8414609592075061
Confusion Matrix:
[[955 352]
 [ 23  91]]
