In [1]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected2_last2years_adjusted.csv")

unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       8971 non-null   object 
 1   status_label       8971 non-null   object 
 2   Division           8971 non-null   object 
 3   MajorGroup         8971 non-null   int64  
 4   last_year          8971 non-null   float64
 5   X1_last1year       8971 non-null   float64
 6   X1_last2year       8078 non-null   float64
 7   X2_last1year       8971 non-null   float64
 8   X2_last2year       8078 non-null   float64
 9   X3_last1year       8971 non-null   float64
 10  X3_last2year       8078 non-null   float64
 11  X4_last1year       8971 non-null   float64
 12  X4_last2year       8078 non-null   float64
 13  X5_last1year       8971 non-null   float64
 14  X5_last2year       8078 non-null   float64
 15  X6_last1year       8971 non-null   float64
 16  X6_last2year       8078 

In [3]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X2_last1year  X2_last2year  X3_last1year  ...  \
0         888.5       1524.70        1504.1       1413.20  ...   
1         900.2       1474.50        1343.6        677.20  ...   
2       13454.0      21401.00       27171.0      19334.00  ...   
3      353541.0    1288165.00      927239.0        267.81  ...   
4           NaN         42.21           NaN      79567.00  ...   

   X18_last1year_ycr  X18_last2year_ycr  nyse_last1year  nyse_last2year  \
0           0.001482           0.061414    11912.848307    10451.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8971 entries, 0 to 8970
Data columns (total 85 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   company_name          8971 non-null   object 
 1   status_label          8971 non-null   object 
 2   Division              8971 non-null   object 
 3   MajorGroup            8971 non-null   int64  
 4   last_year             8971 non-null   float64
 5   X1_last1year          8971 non-null   float64
 6   X1_last2year          8078 non-null   float64
 7   X2_last1year          8971 non-null   float64
 8   X2_last2year          8078 non-null   float64
 9   X3_last1year          8971 non-null   float64
 10  X3_last2year          8078 non-null   float64
 11  X4_last1year          8971 non-null   float64
 12  X4_last2year          8078 non-null   float64
 13  X5_last1year          8971 non-null   float64
 14  X5_last2year          8078 non-null   float64
 15  X6_last1year         

In [5]:
data.shape

(8971, 85)

In [6]:
unique_company_names = data['company_name_encoded'].nunique()
unique_status_labels = data['status_label_encoded'].nunique()
unique_divisions = data['Division_encoded'].nunique()
unique_majorgroup = data['MajorGroup_encoded'].nunique()

print("Number of unique values in 'company_name_encoded' column:", unique_company_names)
print("Number of unique values in 'status_label_encoded' column:", unique_status_labels)
print("Number of unique values in 'Division_encoded' column:", unique_divisions)
print("Number of unique values in 'MajorGroup_encoded' column:", unique_majorgroup)


Number of unique values in 'company_name_encoded' column: 8971
Number of unique values in 'status_label_encoded' column: 2
Number of unique values in 'Division_encoded' column: 10
Number of unique values in 'MajorGroup_encoded' column: 73


In [7]:
unique_divisions = data['Division_encoded'].unique()
print("Unique values in 'Division_encoded' column:", unique_divisions)


Unique values in 'Division_encoded' column: [3 4 2 8 5 6 1 0 7 9]


In [8]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 1870


In [9]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# List of static columns to drop
static_columns_to_drop = [
    'company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded',
    'Division_encoded', 'MajorGroup_encoded'
]

# List of dynamic columns to drop (nyse and nasdaq columns for 1 year and 2 years)
nyse_nasdaq_columns_to_drop = [f'{exchange}_last{year}year' for exchange in ('nyse', 'nasdaq') for year in (1, 2)]

# Add X1_last1year_ycr to X18_last1year_ycr and X1_last2year_ycr to X18_last2year_ycr columns to the list
ycr_columns_to_drop = [f'X{i}_last{year}year_ycr' for i in range(1, 19) for year in (1, 2)]

# Combine all columns to drop
columns_to_drop = static_columns_to_drop + nyse_nasdaq_columns_to_drop + ycr_columns_to_drop

# Drop the columns from the data
data_cleaned = data_cleaned.drop(columns=columns_to_drop)



data_cleaned.shape

(7101, 37)

In [10]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X2_last1year  X2_last2year  X3_last1year  \
0            942.7         888.5       1524.70       1504.10       1413.20   
1           1107.7         900.2       1474.50       1343.60        677.20   
2          12686.0       13454.0      21401.00      27171.00      19334.00   
3         581502.0      353541.0    1288165.00     927239.00        267.81   
5           6838.0        6642.0      25088.00      25438.00      18138.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0      28278.00      26206.00      31288.00   
8967        3369.0        9049.0       3466.00       9198.00        208.00   
8968        2482.2        2340.6       9401.50      10252.40        966.70   
8969         931.6        1032.7       2810.20       2542.00       1475.90   
8970       82589.0      135207.0       1625.37       1736.11      68817.00   

      X3_last2year  X4_last1year 

In [11]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    6537
1     564
Name: status_label_encoded, dtype: int64


### 1. RF

In [12]:
#the impact of imbalanced datasets


#### 1.1 imbalance dataset + CV

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a random forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = rf_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the training set
cv_scores = cross_val_score(rf_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
# Retrain the model on the entire training set


# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9253521126760563
Cross-validation Scores: [0.92253521 0.92077465 0.92077465 0.92341549 0.92517606]
Mean CV Accuracy: 0.9225352112676056
Test Accuracy: 0.9218859957776214


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculation precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
y_prob = rf_classifier.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9218859957776214
Precision: 0.8
Recall: 0.03508771929824561
F1 Score: 0.06722689075630252
Micro F1 Score: 0.9218859957776214
Macro F1 Score: 0.5132315136851656
ROC AUC: 0.7864031731969556
Confusion Matrix:
[[1306    1]
 [ 110    4]]


### 2. XGBOOST

In [15]:
#2.1 imbalance


In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',         # Logarithmic loss
    'eta': 0.07,                       # Learning rate
    'max_depth': 60,                   # Maximum depth of the tree
    'subsample': 0.9,                 # Subsample ratio of the training instances
    'colsample_bytree': 0.9,          # Subsample ratio of columns when constructing each tree
    'lambda': 1,                      # L2 regularization term (default is 1)
    'alpha': 0,                       # L1 regularization term (default is 0)
    'seed': 42                        # Random seed
}

# Train XGBoost model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10)

# Make predictions on the test set
test_predictions_proba = xgb_model.predict(dtest)
test_predictions = [1 if x > 0.5 else 0 for x in test_predictions_proba]

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


[0]	train-logloss:0.29022	eval-logloss:0.29280
[1]	train-logloss:0.27606	eval-logloss:0.28575
[2]	train-logloss:0.26279	eval-logloss:0.27930
[3]	train-logloss:0.25095	eval-logloss:0.27275
[4]	train-logloss:0.23990	eval-logloss:0.26773
[5]	train-logloss:0.22953	eval-logloss:0.26338
[6]	train-logloss:0.21956	eval-logloss:0.25906
[7]	train-logloss:0.21001	eval-logloss:0.25567
[8]	train-logloss:0.20104	eval-logloss:0.25277
[9]	train-logloss:0.19286	eval-logloss:0.24876
[10]	train-logloss:0.18520	eval-logloss:0.24527
[11]	train-logloss:0.17809	eval-logloss:0.24293
[12]	train-logloss:0.17081	eval-logloss:0.24083
[13]	train-logloss:0.16420	eval-logloss:0.23837
[14]	train-logloss:0.15746	eval-logloss:0.23613
[15]	train-logloss:0.15179	eval-logloss:0.23415
[16]	train-logloss:0.14624	eval-logloss:0.23146
[17]	train-logloss:0.14132	eval-logloss:0.22987
[18]	train-logloss:0.13618	eval-logloss:0.22856
[19]	train-logloss:0.13118	eval-logloss:0.22768
[20]	train-logloss:0.12652	eval-logloss:0.22641
[2

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, test_predictions)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, test_predictions)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)
print("F1 Score:", f1)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, test_predictions_proba)
print("ROC AUC:", roc_auc)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9197748064743139
Precision: 0.5
Recall: 0.043859649122807015
F1 Score: 0.08064516129032258
Micro F1 Score: 0.9197748064743139
Macro F1 Score: 0.5193512782169052
ROC AUC: 0.7948294607981314
Confusion Matrix:
[[1302    5]
 [ 109    5]]


### 3. GB


In [18]:
#imbalance+cv

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data into training + validation and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split the training + validation set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Define a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)

# Train Gradient Boosting model
gb_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = gb_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(gb_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validat ion scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


# Make predictions on the test set
test_predictions = gb_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9246478873239437
Cross-validation Scores: [0.92077465 0.91989437 0.92429577 0.92429577 0.92429577]
Mean CV Accuracy: 0.9227112676056338
Test Accuracy: 0.9204785362420831


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = gb_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9204785362420831
Macro F1 Score: 0.5198707080855288
Test Accuracy: 0.9204785362420831
Precision: 0.5555555555555556
Recall: 0.043859649122807015
F1 Score: 0.08130081300813007
ROC AUC Score: 0.789930066175385
Confusion Matrix:
 [[1303    4]
 [ 109    5]]


### 4. Bagging

In [21]:
# imbalanced + cv

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into features and labels
X = data_cleaned.drop('status_label_encoded', axis=1)
y = data_cleaned['status_label_encoded']

# Split the data set into training set, validation set and test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 80% training, 20% validation

# Define a XGBoost base estimator
base_estimator = XGBClassifier(n_estimators=100, random_state=42)  #better than base_estimator=DecisionTreeClassifier
# Define a bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_estimator, n_estimators=10, random_state=42)

# Train the model on the training set
bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = bagging_classifier.predict(X_val)
# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

# Perform cross-validation on the train_val set
cv_scores = cross_val_score(bagging_classifier, X_train_val, y_train_val, cv=5)
# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())



# Make predictions on the test set
test_predictions = bagging_classifier.predict(X_test)
# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)


Validation Accuracy: 0.9267605633802817
Cross-validation Scores: [0.91989437 0.92605634 0.92517606 0.92605634 0.92341549]
Mean CV Accuracy: 0.9241197183098592
Test Accuracy: 0.9247009148486981


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


# Calculate accuracy on test set
test_accuracy = accuracy_score(y_test, test_predictions)

# Calculate precision
precision = precision_score(y_test, test_predictions)

# Calculate recall
recall = recall_score(y_test, test_predictions)

# Calculate F1 score
f1 = f1_score(y_test, test_predictions)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, test_predictions, average='micro')
print("Micro F1 Score:", micro_f1)

# Calculate macro F1 score
macro_f1 = f1_score(y_test, test_predictions, average='macro')
print("Macro F1 Score:", macro_f1)

# Calculate ROC AUC score
test_predictions_proba = bagging_classifier.predict_proba(X_test)[:, 1]  # Probabilities for positive class
roc_auc = roc_auc_score(y_test, test_predictions_proba)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)


Micro F1 Score: 0.9247009148486981
Macro F1 Score: 0.5453642988066512
Test Accuracy: 0.9247009148486981
Precision: 0.8888888888888888
Recall: 0.07017543859649122
F1 Score: 0.13008130081300814
ROC AUC Score: 0.7904267171371427
Confusion Matrix:
 [[1306    1]
 [ 106    8]]
