In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected5_last5years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 195 entries, company_name to nasdaq_last5year
dtypes: float64(191), int64(1), object(3)
memory usage: 13.3+ MB


In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X1_last3year  X1_last4year  X1_last5year  ...  \
0         888.5         873.1         954.1      1116.900  ...   
1         900.2        1077.4        1008.2       942.700  ...   
2       13454.0       13582.0        7726.0      5807.000  ...   
3      353541.0     1037047.0      672072.0       692.991  ...   
4           NaN           NaN           NaN           NaN  ...   

   nyse_last5year  nasdaq_last1year  nasdaq_last2year  nasdaq_last3year  \
0     9467.185872       6293.024211       5015.926717       4932.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Columns: 199 entries, company_name to status_label_encoded
dtypes: float64(191), int32(3), int64(2), object(3)
memory usage: 13.5+ MB


In [5]:
data.shape

(8971, 199)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 3993


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(4978, 193)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X1_last3year  X1_last4year  X1_last5year  \
0           942.70        888.50       873.100         954.1      1116.900   
1          1107.70        900.20      1077.400        1008.2       942.700   
3        581502.00     353541.00   1037047.000      672072.0       692.991   
5          6838.00       6642.00      5935.000        7229.0      6902.000   
6        160865.00     173942.00    212978.000      228456.0    142967.000   
...            ...           ...           ...           ...           ...   
8959       8218.00         21.33     16699.000       18523.0     16814.000   
8963        362.33     310358.00    405282.000      359824.0    331465.000   
8965      22026.00      26515.00        13.256        1801.0      5941.000   
8966      10566.00      11738.00      9599.000        9789.0     11645.000   
8969        931.60       1032.70       829.300         735.1       973.800   

      X2_last1year  X2_last2year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    4548
1     430
Name: status_label_encoded, dtype: int64


### 1. RF

In [12]:
#the impact of imbalanced datasets


#### 1.1 imbalance dataset + CV

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = rf_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the training set
cv_scores = cross_val_score(rf_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
# Retrain the model on the entire training set


# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9076305220883534
Cross-validation Scores: [0.91969887 0.92220828 0.91708543 0.92211055 0.92085427]
Mean CV Accuracy: 0.9203914806151208
Test Accuracy: 0.9046184738955824


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculation precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = rf_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9046184738955824
Precision: 1.0
Recall: 0.020618556701030927
F1 Score: 0.04040404040404041
Micro F1 Score: 0.9046184738955824
Macro F1 Score: 0.49510957434887704
ROC AUC: 0.8492311044344805
Confusion Matrix:
[[899   0]
 [ 95   2]]


#### 1.2 SMOTE

##### 1.2.1 SMOTE + cv

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training set
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the validation set
val_predictions = rf_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Combine resampled training set and validation set
X_train_val_reshape = pd.concat([pd.DataFrame(X_train_resampled), pd.DataFrame(X_val)], axis=0)
y_train_val_reshape = pd.concat([pd.Series(y_train_resampled), pd.Series(y_val)], axis=0)


# Perform cross-validation on the train_val set
cv_scores = cross_val_score(rf_classifier, X_train_val_reshape, y_train_val_reshape, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
# Retrain the model on the train_val set



# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)



0    2747
1    2747
Name: status_label_encoded, dtype: int64
Validation Accuracy: 0.9076305220883534
Cross-validation Scores: [0.92372881 0.98459168 0.98228043 0.98844376 0.93066256]
Mean CV Accuracy: 0.9619414483821263
Test Accuracy: 0.9066265060240963


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = rf_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9066265060240963
Precision: 0.5714285714285714
Recall: 0.16494845360824742
F1 Score: 0.256
Micro F1 Score: 0.9066265060240963
Macro F1 Score: 0.6030937332619175
ROC AUC: 0.8306652294072452
Confusion Matrix:
[[887  12]
 [ 81  16]]


##### 1.2.2 SMOTE + CV + FS

##### 1.2.3 SMOTE + GRID + FS

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Use SelectFromModel for feature selection
selector = SelectFromModel(estimator=rf_classifier, threshold='median')  #better precision, worse recall&f1, compare with mean
X_train_resampled_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]   #Select a list of column names for the characteristics.
print("Selected Features:")
print(selected_features)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_resampled_selected, y_train_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model
best_rf_classifier = grid_search.best_estimator_

# Make predictions on the validation set with selected features
X_val_selected = selector.transform(X_val)
val_predictions = best_rf_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy with Best Model:", val_accuracy)




# Make predictions on the test set with the best model
X_test_selected = selector.transform(X_test)
test_predictions = best_rf_classifier.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy with Best Model:", test_accuracy)


Selected Features:
Index(['X1_last3year', 'X3_last3year', 'X3_last4year', 'X3_last5year',
       'X4_last1year', 'X4_last2year', 'X4_last3year', 'X4_last4year',
       'X4_last5year', 'X5_last3year', 'X5_last5year', 'X6_last1year',
       'X6_last2year', 'X7_last1year', 'X9_last1year', 'X9_last2year',
       'X9_last3year', 'X9_last4year', 'X9_last5year', 'X10_last1year',
       'X10_last2year', 'X10_last3year', 'X10_last5year', 'X11_last1year',
       'X11_last2year', 'X11_last3year', 'X11_last4year', 'X11_last5year',
       'X12_last1year', 'X12_last2year', 'X12_last3year', 'X12_last4year',
       'X13_last3year', 'X13_last4year', 'X15_last1year', 'X16_last2year',
       'X16_last3year', 'X16_last4year', 'X18_last2year', 'X18_last3year',
       'X18_last4year', 'X1_last2year_ycr', 'X1_last3year_ycr',
       'X1_last4year_ycr', 'X1_last5year_ycr', 'X2_last1year_ycr',
       'X2_last5year_ycr', 'X3_last1year_ycr', 'X4_last2year_ycr',
       'X4_last3year_ycr', 'X5_last2year_ycr', 'X5_l

In [18]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_rf_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9076305220883534
Precision: 0.5714285714285714
Recall: 0.20618556701030927
F1 Score: 0.30303030303030304
Micro F1 Score: 0.9076305220883534
Macro F1 Score: 0.6267839687194526
ROC AUC: 0.8447186450007454
Confusion Matrix:
[[884  15]
 [ 77  20]]


#### 1.3 processing imbalanced dataset, Undersampling

In [19]:
# undersampling +grid +fs

In [20]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Define a random forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Use SelectFromModel for feature selection
selector = SelectFromModel(estimator=rf_classifier, threshold='median')  #better precision, worse recall&f1, compare with mean
X_train_resampled_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]   #Select a list of column names for the characteristics.
print("Selected Features:")
print(selected_features)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train_resampled_selected, y_train_resampled)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Get the best model
best_rf_classifier = grid_search.best_estimator_

# Make predictions on the validation set with selected features
X_val_selected = selector.transform(X_val)
val_predictions = best_rf_classifier.predict(X_val_selected)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy with Best Model:", val_accuracy)


# Make predictions on the test set with the best model
X_test_selected = selector.transform(X_test)
test_predictions = best_rf_classifier.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy with Best Model:", test_accuracy)


Selected Features:
Index(['X1_last4year', 'X1_last5year', 'X3_last2year', 'X3_last3year',
       'X3_last4year', 'X4_last1year', 'X4_last2year', 'X4_last3year',
       'X4_last5year', 'X5_last3year', 'X6_last1year', 'X6_last4year',
       'X7_last1year', 'X8_last2year', 'X8_last4year', 'X9_last2year',
       'X9_last3year', 'X10_last5year', 'X11_last1year', 'X11_last2year',
       'X11_last3year', 'X11_last4year', 'X11_last5year', 'X12_last1year',
       'X12_last2year', 'X12_last3year', 'X12_last4year', 'X13_last1year',
       'X13_last3year', 'X13_last4year', 'X15_last1year', 'X15_last2year',
       'X15_last3year', 'X18_last1year', 'X18_last2year', 'X18_last4year',
       'X1_last1year_ycr', 'X1_last2year_ycr', 'X1_last3year_ycr',
       'X1_last4year_ycr', 'X1_last5year_ycr', 'X2_last1year_ycr',
       'X2_last3year_ycr', 'X2_last4year_ycr', 'X3_last2year_ycr',
       'X4_last1year_ycr', 'X5_last2year_ycr', 'X5_last3year_ycr',
       'X5_last4year_ycr', 'X6_last1year_ycr', 'X6_last

In [21]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_rf_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7670682730923695
Precision: 0.26804123711340205
Recall: 0.8041237113402062
F1 Score: 0.4020618556701031
Micro F1 Score: 0.7670682730923695
Macro F1 Score: 0.6287117258400391
ROC AUC: 0.8441624714746052
Confusion Matrix:
[[686 213]
 [ 19  78]]


### 2. XGBOOST

In [22]:
#2.1 SMOTE


In [23]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Print the number of samples after oversampling
print(pd.Series(y_train_resampled).value_counts())

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    #'num_rounds': 100,                # Number of boosting rounds
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


0    2747
1    2747
Name: status_label_encoded, dtype: int64
[0]	train-logloss:0.63392	eval-logloss:0.64883
[1]	train-logloss:0.58283	eval-logloss:0.61033
[2]	train-logloss:0.53739	eval-logloss:0.57775
[3]	train-logloss:0.49657	eval-logloss:0.54928
[4]	train-logloss:0.45999	eval-logloss:0.52286
[5]	train-logloss:0.42704	eval-logloss:0.49926
[6]	train-logloss:0.39665	eval-logloss:0.47686
[7]	train-logloss:0.36937	eval-logloss:0.45837
[8]	train-logloss:0.34414	eval-logloss:0.44103
[9]	train-logloss:0.32140	eval-logloss:0.42586
[10]	train-logloss:0.30027	eval-logloss:0.41063
[11]	train-logloss:0.28114	eval-logloss:0.39761
[12]	train-logloss:0.26349	eval-logloss:0.38446
[13]	train-logloss:0.24687	eval-logloss:0.37368
[14]	train-logloss:0.23180	eval-logloss:0.36414
[15]	train-logloss:0.21758	eval-logloss:0.35438
[16]	train-logloss:0.20463	eval-logloss:0.34600
[17]	train-logloss:0.19276	eval-logloss:0.33837
[18]	train-logloss:0.18150	eval-logloss:0.33127
[19]	train-logloss:0.17130	eval-loglo

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.8995983935742972
Precision: 0.46511627906976744
Recall: 0.20618556701030927
F1 Score: 0.2857142857142857
Micro F1 Score: 0.8995983935742972
Macro F1 Score: 0.6158593026843566
ROC AUC: 0.8464273018130108
Confusion Matrix:
[[876  23]
 [ 77  20]]


In [25]:
# 2.2 SMOTE + FS

In [26]:
# 2.3 SMOTE + FS + GRID

In [27]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define a XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Perform feature selection 
selector = SelectFromModel(estimator=xgb_classifier, threshold='mean')  # threshold: mean
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)


# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 25, 70],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0], 
}

# Create cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit GridSearchCV to data
grid_search.fit(X_train_selected, y_train_resampled)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Train XGBoost model with best parameters
best_params = grid_search.best_params_
best_xgb_classifier = grid_search.best_estimator_



# Make predictions on the validation set
val_predictions = best_xgb_classifier.predict(X_val_selected)

# Calculate accuracy on test set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Val Accuracy:", val_accuracy)



# Make predictions on the test set
test_predictions = best_xgb_classifier.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Selected Features:
Index(['X3_last3year', 'X3_last4year', 'X4_last2year', 'X4_last4year',
       'X6_last1year', 'X8_last3year', 'X9_last2year', 'X9_last4year',
       'X10_last5year', 'X11_last1year', 'X11_last2year', 'X12_last1year',
       'X12_last3year', 'X16_last2year', 'X16_last4year', 'X2_last2year_ycr',
       'X4_last3year_ycr', 'X5_last3year_ycr', 'X6_last2year_ycr',
       'X7_last1year_ycr', 'X7_last5year_ycr', 'X8_last2year_ycr',
       'X9_last3year_ycr', 'X9_last4year_ycr', 'X10_last2year_ycr',
       'X10_last3year_ycr', 'X13_last1year_ycr', 'X13_last3year_ycr',
       'X15_last1year_ycr', 'nyse_last1year', 'nyse_last3year',
       'nyse_last4year', 'nyse_last5year', 'nasdaq_last4year',
       'Division_encoded'],
      dtype='object')
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 25, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
Best Score: 0.9632325130811088
Val Accuracy: 0.9036144578313253
Test Accuracy: 0.913654618473

In [28]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_xgb_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9136546184738956
Precision: 0.6
Recall: 0.3402061855670103
F1 Score: 0.4342105263157895
Micro F1 Score: 0.9136546184738956
Macro F1 Score: 0.6937356979405035
ROC AUC: 0.8486749309083402
Confusion Matrix:
[[877  22]
 [ 64  33]]


In [29]:
#2.4 UNDERSAMPLING + FS +GRID

In [30]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd
from sklearn.feature_selection import SelectFromModel

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define an undersampler
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Define a XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Perform feature selection 
selector = SelectFromModel(estimator=xgb_classifier, threshold='mean')  # threshold: mean
selector.fit(X_train_resampled, y_train_resampled)
selected_features = X_train_resampled.columns[selector.get_support()]
print("Selected Features:")
print(selected_features)

# Transform the datasets
X_train_selected = selector.transform(X_train_resampled)
X_val_selected = selector.transform(X_val)
X_test_selected = selector.transform(X_test)


# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 25, 70],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],     
}

# Create cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv, scoring='accuracy')

# Fit GridSearchCV to data
grid_search.fit(X_train_selected, y_train_resampled)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Train XGBoost model with best parameters
best_params = grid_search.best_params_
best_xgb_classifier = grid_search.best_estimator_

# Make predictions on the validation set
val_predictions = best_xgb_classifier.predict(X_val_selected)

# Calculate accuracy on test set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Val Accuracy:", val_accuracy)


# Make predictions on the test set
test_predictions = best_xgb_classifier.predict(X_test_selected)

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Selected Features:
Index(['X1_last2year', 'X1_last3year', 'X1_last5year', 'X2_last1year',
       'X2_last2year', 'X2_last4year', 'X2_last5year', 'X3_last3year',
       'X3_last5year', 'X4_last2year', 'X4_last3year', 'X5_last1year',
       'X5_last3year', 'X6_last3year', 'X8_last3year', 'X8_last5year',
       'X9_last2year', 'X10_last5year', 'X11_last1year', 'X11_last5year',
       'X12_last2year', 'X13_last3year', 'X14_last1year', 'X14_last2year',
       'X15_last1year', 'X15_last3year', 'X16_last4year', 'X18_last1year',
       'X1_last1year_ycr', 'X1_last2year_ycr', 'X3_last2year_ycr',
       'X4_last3year_ycr', 'X4_last4year_ycr', 'X4_last5year_ycr',
       'X5_last1year_ycr', 'X5_last2year_ycr', 'X5_last4year_ycr',
       'X6_last1year_ycr', 'X6_last4year_ycr', 'X7_last1year_ycr',
       'X7_last2year_ycr', 'X7_last3year_ycr', 'X8_last2year_ycr',
       'X8_last3year_ycr', 'X8_last4year_ycr', 'X8_last5year_ycr',
       'X9_last1year_ycr', 'X9_last3year_ycr', 'X9_last4year_ycr',
    

In [31]:
#The evaluation metrics on the test set are calculated directly without threshold selection and re-prediction.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

#Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = best_xgb_classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7710843373493976
Precision: 0.2600732600732601
Recall: 0.7319587628865979
F1 Score: 0.38378378378378375
Micro F1 Score: 0.7710843373493976
Macro F1 Score: 0.6216082913986736
ROC AUC: 0.8381477701455224
Confusion Matrix:
[[697 202]
 [ 26  71]]
