In [1]:
#LSTM


In [2]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected5_last5years_adjusted.csv")


unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [3]:
data.shape

(8971, 195)

In [4]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X1_last3year  X1_last4year  X1_last5year  ...  \
0         888.5         873.1         954.1      1116.900  ...   
1         900.2        1077.4        1008.2       942.700  ...   
2       13454.0       13582.0        7726.0      5807.000  ...   
3      353541.0     1037047.0      672072.0       692.991  ...   
4           NaN           NaN           NaN           NaN  ...   

   nyse_last5year  nasdaq_last1year  nasdaq_last2year  nasdaq_last3year  \
0     9467.185872       6293.024211       5015.926717       4932.

In [5]:
data.shape

(8971, 199)

In [6]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 3993


In [7]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(4978, 193)

In [8]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X1_last3year  X1_last4year  X1_last5year  \
0           942.70        888.50       873.100         954.1      1116.900   
1          1107.70        900.20      1077.400        1008.2       942.700   
3        581502.00     353541.00   1037047.000      672072.0       692.991   
5          6838.00       6642.00      5935.000        7229.0      6902.000   
6        160865.00     173942.00    212978.000      228456.0    142967.000   
...            ...           ...           ...           ...           ...   
8959       8218.00         21.33     16699.000       18523.0     16814.000   
8963        362.33     310358.00    405282.000      359824.0    331465.000   
8965      22026.00      26515.00        13.256        1801.0      5941.000   
8966      10566.00      11738.00      9599.000        9789.0     11645.000   
8969        931.60       1032.70       829.300         735.1       973.800   

      X2_last1year  X2_last2year 

In [9]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    4548
1     430
Name: status_label_encoded, dtype: int64


In [10]:
#the impact of imbalanced datasets

### 1.1 imbalance dataset

In [11]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []
last5year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')
    last3year_features.append(f'X{i}_last3year')
    last3year_features.append(f'X{i}_last3year_ycr')
    last4year_features.append(f'X{i}_last4year')
    last4year_features.append(f'X{i}_last4year_ycr')
    last5year_features.append(f'X{i}_last5year')
    last5year_features.append(f'X{i}_last5year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])
last3year_features.extend(['nyse_last3year', 'nasdaq_last3year', 'Division_encoded', 'MajorGroup_encoded'])
last4year_features.extend(['nyse_last4year', 'nasdaq_last4year', 'Division_encoded', 'MajorGroup_encoded'])
last5year_features.extend(['nyse_last5year', 'nasdaq_last5year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
X_last5year = data_cleaned[last5year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of X_last5year:", X_last5year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year,  X_last5year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)
X_last5year_scaled = scaler.transform(X_last5year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))
X_last5year_reshaped = np.reshape(X_last5year_scaled, (X_last5year_scaled.shape[0], 1, X_last5year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last5year_reshaped, X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_last5year_scaled:", X_last5year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(5, X_train.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (4978, 40)
Shape of X_last2year: (4978, 40)
Shape of X_last3year: (4978, 40)
Shape of X_last4year: (4978, 40)
Shape of X_last5year: (4978, 40)
Shape of y: (4978,)
Shape of X_last1year_scaled: (4978, 40)
Shape of X_last2year_scaled: (4978, 40)
Shape of X_last3year_scaled: (4978, 40)
Shape of X_last4year_scaled: (4978, 40)
Shape of X_last5year_scaled: (4978, 40)
Shape of X_combined_reshaped: (4978, 5, 40)
Shape of X_train: (2986, 5, 40)
Shape of X_val: (996, 5, 40)
Shape of X_test: (996, 5, 40)
Epoch 1/100
187/187 - 3s - 16ms/step - accuracy: 0.8999 - loss: 0.3003 - val_accuracy: 0.9046 - val_loss: 0.2554
Epoch 2/100
187/187 - 1s - 4ms/step - accuracy: 0.9213 - loss: 0.2336 - val_accuracy: 0.9096 - val_loss: 0.2547
Epoch 3/100
187/187 - 1s - 4ms/step - accuracy: 0.9240 - loss: 0.2198 - val_accuracy: 0.9056 - val_loss: 0.2556
Epoch 4/100
187/187 - 1s - 4ms/step - accuracy: 0.9246 - loss: 0.2128 - val_accuracy: 0.9056 - val_loss: 0.2598
Epoch 5/100
187/187 - 1s - 4ms/

In [12]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
Accuracy: 0.9026104417670683
Recall: 0.26804123711340205
Precision: 0.5
F1 Score: 0.348993288590604
Micro F1 Score: 0.9026104417670683
Macro F1 Score: 0.6481808548216178
ROC AUC: 0.761808653372017
Confusion Matrix:
[[873  26]
 [ 71  26]]


### 1.2 SMOTE

In [13]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []
last5year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')
    last3year_features.append(f'X{i}_last3year')
    last3year_features.append(f'X{i}_last3year_ycr')
    last4year_features.append(f'X{i}_last4year')
    last4year_features.append(f'X{i}_last4year_ycr')
    last5year_features.append(f'X{i}_last5year')
    last5year_features.append(f'X{i}_last5year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])
last3year_features.extend(['nyse_last3year', 'nasdaq_last3year', 'Division_encoded', 'MajorGroup_encoded'])
last4year_features.extend(['nyse_last4year', 'nasdaq_last4year', 'Division_encoded', 'MajorGroup_encoded'])
last5year_features.extend(['nyse_last5year', 'nasdaq_last5year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
X_last5year = data_cleaned[last5year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of X_last5year:", X_last5year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year,  X_last5year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)
X_last5year_scaled = scaler.transform(X_last5year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))
X_last5year_reshaped = np.reshape(X_last5year_scaled, (X_last5year_scaled.shape[0], 1, X_last5year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last5year_reshaped, X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_last5year_scaled:", X_last5year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.over_sampling import SMOTE
# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 5, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(5, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (4978, 40)
Shape of X_last2year: (4978, 40)
Shape of X_last3year: (4978, 40)
Shape of X_last4year: (4978, 40)
Shape of X_last5year: (4978, 40)
Shape of y: (4978,)
Shape of X_last1year_scaled: (4978, 40)
Shape of X_last2year_scaled: (4978, 40)
Shape of X_last3year_scaled: (4978, 40)
Shape of X_last4year_scaled: (4978, 40)
Shape of X_last5year_scaled: (4978, 40)
Shape of X_combined_reshaped: (4978, 5, 40)
Shape of X_train: (2986, 5, 40)
Shape of X_val: (996, 5, 40)
Shape of X_test: (996, 5, 40)
Shape of X_train_resampled: (5494, 200)
Shape of X_train_resampled: (5494, 5, 40)
Epoch 1/100
344/344 - 3s - 10ms/step - accuracy: 0.7630 - loss: 0.4920 - val_accuracy: 0.7108 - val_loss: 0.4860
Epoch 2/100
344/344 - 1s - 4ms/step - accuracy: 0.8209 - loss: 0.3957 - val_accuracy: 0.7500 - val_loss: 0.4548
Epoch 3/100
344/344 - 1s - 4ms/step - accuracy: 0.8495 - loss: 0.3428 - val_accuracy: 0.8052 - val_loss: 0.3911
Epoch 4/100
344/344 - 1s - 4ms/step - accuracy: 0.8813 - loss

In [14]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Accuracy: 0.8865461847389559
Recall: 0.36082474226804123
Precision: 0.4069767441860465
F1 Score: 0.3825136612021858
Micro F1 Score: 0.8865461847389557
Macro F1 Score: 0.660024105338517
ROC AUC: 0.7564303980367647
Confusion Matrix:
[[848  51]
 [ 62  35]]


### 1.3  Undersampling

In [15]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []
last5year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')
    last3year_features.append(f'X{i}_last3year')
    last3year_features.append(f'X{i}_last3year_ycr')
    last4year_features.append(f'X{i}_last4year')
    last4year_features.append(f'X{i}_last4year_ycr')
    last5year_features.append(f'X{i}_last5year')
    last5year_features.append(f'X{i}_last5year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])
last3year_features.extend(['nyse_last3year', 'nasdaq_last3year', 'Division_encoded', 'MajorGroup_encoded'])
last4year_features.extend(['nyse_last4year', 'nasdaq_last4year', 'Division_encoded', 'MajorGroup_encoded'])
last5year_features.extend(['nyse_last5year', 'nasdaq_last5year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
X_last5year = data_cleaned[last5year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of X_last5year:", X_last5year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year,  X_last5year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)
X_last5year_scaled = scaler.transform(X_last5year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))
X_last5year_reshaped = np.reshape(X_last5year_scaled, (X_last5year_scaled.shape[0], 1, X_last5year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last5year_reshaped, X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_last5year_scaled:", X_last5year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)


# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 5, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(5, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (4978, 40)
Shape of X_last2year: (4978, 40)
Shape of X_last3year: (4978, 40)
Shape of X_last4year: (4978, 40)
Shape of X_last5year: (4978, 40)
Shape of y: (4978,)
Shape of X_last1year_scaled: (4978, 40)
Shape of X_last2year_scaled: (4978, 40)
Shape of X_last3year_scaled: (4978, 40)
Shape of X_last4year_scaled: (4978, 40)
Shape of X_last5year_scaled: (4978, 40)
Shape of X_combined_reshaped: (4978, 5, 40)
Shape of X_train: (2986, 5, 40)
Shape of X_val: (996, 5, 40)
Shape of X_test: (996, 5, 40)
Shape of X_train_resampled: (478, 200)
Shape of X_train_resampled: (478, 5, 40)
Epoch 1/100
30/30 - 3s - 84ms/step - accuracy: 0.6423 - loss: 0.6378 - val_accuracy: 0.5994 - val_loss: 0.6928
Epoch 2/100
30/30 - 0s - 9ms/step - accuracy: 0.7155 - loss: 0.5618 - val_accuracy: 0.6506 - val_loss: 0.6012
Epoch 3/100
30/30 - 0s - 10ms/step - accuracy: 0.7259 - loss: 0.5302 - val_accuracy: 0.6677 - val_loss: 0.5544
Epoch 4/100
30/30 - 0s - 11ms/step - accuracy: 0.7469 - loss: 0.5050

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Accuracy: 0.7208835341365462
Recall: 0.7010309278350515
Precision: 0.21451104100946372
F1 Score: 0.3285024154589372
Micro F1 Score: 0.7208835341365462
Macro F1 Score: 0.5761650226851087
ROC AUC: 0.7455248099262641
Confusion Matrix:
[[650 249]
 [ 29  68]]


## undersampling, CNN-LSTM

In [18]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []
last5year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')
    last3year_features.append(f'X{i}_last3year')
    last3year_features.append(f'X{i}_last3year_ycr')
    last4year_features.append(f'X{i}_last4year')
    last4year_features.append(f'X{i}_last4year_ycr')
    last5year_features.append(f'X{i}_last5year')
    last5year_features.append(f'X{i}_last5year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])
last3year_features.extend(['nyse_last3year', 'nasdaq_last3year', 'Division_encoded', 'MajorGroup_encoded'])
last4year_features.extend(['nyse_last4year', 'nasdaq_last4year', 'Division_encoded', 'MajorGroup_encoded'])
last5year_features.extend(['nyse_last5year', 'nasdaq_last5year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
X_last5year = data_cleaned[last5year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of X_last5year:", X_last5year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year,  X_last5year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)
X_last5year_scaled = scaler.transform(X_last5year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))
X_last5year_reshaped = np.reshape(X_last5year_scaled, (X_last5year_scaled.shape[0], 1, X_last5year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last5year_reshaped, X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_last5year_scaled:", X_last5year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 5, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Reshape, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D



#Define input layer
inputs = Input(shape=(5, X_train_resampled.shape[2]))
# Reshape input to fit Conv1D layer
inputs_reshaped = Reshape((X_train_resampled.shape[2], 5))(inputs)  
# Define CNN-LSTM model
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs_reshaped)
# Add Padding1D layer
padded_layer = ZeroPadding1D(padding=1)(cnn_layer)  #Consider flatten being divisible by the LSTM time step, so add padding
pooling_layer = MaxPooling1D(pool_size=2)(padded_layer)
flattened_layer = Flatten()(pooling_layer)
print("Shape of flattened_layer:", flattened_layer.shape)
reshaped_layer = Reshape((5, -1))(flattened_layer)  # Reshape the output to fit LSTM input  #The input shape expected by the LSTM layer is (timesteps, features)
lstm_layer = LSTM(100)(reshaped_layer)   
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
# Define the model
model = Model(inputs=inputs, outputs=output_layer)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)



Shape of X_last1year: (4978, 40)
Shape of X_last2year: (4978, 40)
Shape of X_last3year: (4978, 40)
Shape of X_last4year: (4978, 40)
Shape of X_last5year: (4978, 40)
Shape of y: (4978,)
Shape of X_last1year_scaled: (4978, 40)
Shape of X_last2year_scaled: (4978, 40)
Shape of X_last3year_scaled: (4978, 40)
Shape of X_last4year_scaled: (4978, 40)
Shape of X_last5year_scaled: (4978, 40)
Shape of X_combined_reshaped: (4978, 5, 40)
Shape of X_train: (2986, 5, 40)
Shape of X_val: (996, 5, 40)
Shape of X_test: (996, 5, 40)
Shape of X_train_resampled: (478, 200)
Shape of X_train_resampled: (478, 5, 40)
Shape of flattened_layer: (None, 1280)
Epoch 1/100
30/30 - 3s - 114ms/step - accuracy: 0.6548 - loss: 0.6363 - val_accuracy: 0.6285 - val_loss: 0.6714
Epoch 2/100
30/30 - 0s - 13ms/step - accuracy: 0.7238 - loss: 0.5532 - val_accuracy: 0.6847 - val_loss: 0.5397
Epoch 3/100
30/30 - 0s - 12ms/step - accuracy: 0.7155 - loss: 0.5279 - val_accuracy: 0.6275 - val_loss: 0.6290
Epoch 4/100
30/30 - 0s - 13

In [19]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
Accuracy: 0.7299196787148594
Recall: 0.5670103092783505
Precision: 0.1950354609929078
F1 Score: 0.29023746701846964
Micro F1 Score: 0.7299196787148593
Macro F1 Score: 0.5617337366090489
ROC AUC: 0.7290861552928225
Confusion Matrix:
[[672 227]
 [ 42  55]]
