In [1]:
#LSTM


In [2]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected2_last2years_adjusted.csv")


unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [3]:
data.shape

(8971, 81)

In [4]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X2_last1year  X2_last2year  X3_last1year  ...  \
0         888.5       1524.70        1504.1       1413.20  ...   
1         900.2       1474.50        1343.6        677.20  ...   
2       13454.0      21401.00       27171.0      19334.00  ...   
3      353541.0    1288165.00      927239.0        267.81  ...   
4           NaN         42.21           NaN      79567.00  ...   

   X18_last1year_ycr  X18_last2year_ycr  nyse_last1year  nyse_last2year  \
0           0.001482           0.061414    11912.848307    10451.

In [5]:
data.shape

(8971, 85)

In [6]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 1870


In [7]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(7101, 79)

In [8]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X2_last1year  X2_last2year  X3_last1year  \
0            942.7         888.5       1524.70       1504.10       1413.20   
1           1107.7         900.2       1474.50       1343.60        677.20   
2          12686.0       13454.0      21401.00      27171.00      19334.00   
3         581502.0      353541.0    1288165.00     927239.00        267.81   
5           6838.0        6642.0      25088.00      25438.00      18138.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0      28278.00      26206.00      31288.00   
8967        3369.0        9049.0       3466.00       9198.00        208.00   
8968        2482.2        2340.6       9401.50      10252.40        966.70   
8969         931.6        1032.7       2810.20       2542.00       1475.90   
8970       82589.0      135207.0       1625.37       1736.11      68817.00   

      X3_last2year  X4_last1year 

In [9]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    6537
1     564
Name: status_label_encoded, dtype: int64


In [10]:
#the impact of imbalanced datasets

### 1.1 imbalance dataset

In [11]:
# Feature column names
last1year_features = []
last2year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(2, X_train.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (7101, 40)
Shape of X_last2year: (7101, 40)
Shape of y: (7101,)
Shape of X_last1year_scaled: (7101, 40)
Shape of X_last2year_scaled: (7101, 40)
Shape of X_combined_reshaped: (7101, 2, 40)
Shape of X_train: (4260, 2, 40)
Shape of X_val: (1420, 2, 40)
Shape of X_test: (1421, 2, 40)
Epoch 1/100
267/267 - 4s - 14ms/step - accuracy: 0.9016 - loss: 0.3275 - val_accuracy: 0.9218 - val_loss: 0.2372
Epoch 2/100
267/267 - 1s - 4ms/step - accuracy: 0.9207 - loss: 0.2395 - val_accuracy: 0.9225 - val_loss: 0.2324
Epoch 3/100
267/267 - 1s - 3ms/step - accuracy: 0.9239 - loss: 0.2277 - val_accuracy: 0.9232 - val_loss: 0.2272
Epoch 4/100
267/267 - 1s - 3ms/step - accuracy: 0.9242 - loss: 0.2220 - val_accuracy: 0.9218 - val_loss: 0.2340
Epoch 5/100
267/267 - 1s - 3ms/step - accuracy: 0.9251 - loss: 0.2192 - val_accuracy: 0.9225 - val_loss: 0.2295
Epoch 6/100
267/267 - 1s - 3ms/step - accuracy: 0.9251 - loss: 0.2156 - val_accuracy: 0.9261 - val_loss: 0.2263
Epoch 7/100
267/267 - 1s

In [12]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Accuracy: 0.9014778325123153
Recall: 0.21929824561403508
Precision: 0.32894736842105265
F1 Score: 0.2631578947368421
Micro F1 Score: 0.9014778325123153
Macro F1 Score: 0.605183773914424
ROC AUC: 0.7375199667109626
Confusion Matrix:
[[1256   51]
 [  89   25]]


### 1.2 SMOTE

In [13]:
# Feature column names
last1year_features = []
last2year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.over_sampling import SMOTE
# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 2, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(2, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (7101, 40)
Shape of X_last2year: (7101, 40)
Shape of y: (7101,)
Shape of X_last1year_scaled: (7101, 40)
Shape of X_last2year_scaled: (7101, 40)
Shape of X_combined_reshaped: (7101, 2, 40)
Shape of X_train: (4260, 2, 40)
Shape of X_val: (1420, 2, 40)
Shape of X_test: (1421, 2, 40)
Shape of X_train_resampled: (7838, 80)
Shape of X_train_resampled: (7838, 2, 40)
Epoch 1/100
490/490 - 3s - 7ms/step - accuracy: 0.7198 - loss: 0.5422 - val_accuracy: 0.7035 - val_loss: 0.5010
Epoch 2/100
490/490 - 1s - 3ms/step - accuracy: 0.7678 - loss: 0.4697 - val_accuracy: 0.6585 - val_loss: 0.5356
Epoch 3/100
490/490 - 1s - 3ms/step - accuracy: 0.7937 - loss: 0.4358 - val_accuracy: 0.7282 - val_loss: 0.4982
Epoch 4/100
490/490 - 1s - 3ms/step - accuracy: 0.8154 - loss: 0.4067 - val_accuracy: 0.8394 - val_loss: 0.3617
Epoch 5/100
490/490 - 1s - 3ms/step - accuracy: 0.8334 - loss: 0.3779 - val_accuracy: 0.7507 - val_loss: 0.4773
Epoch 6/100
490/490 - 1s - 3ms/step - accuracy: 0.8422 -

In [14]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
Accuracy: 0.8810696692470091
Recall: 0.22807017543859648
Precision: 0.24299065420560748
F1 Score: 0.23529411764705882
Micro F1 Score: 0.8810696692470091
Macro F1 Score: 0.5854074556186458
ROC AUC: 0.7174559390058928
Confusion Matrix:
[[1226   81]
 [  88   26]]


### 1.3  Undersampling

In [15]:
# Feature column names
last1year_features = []
last2year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)


# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 2, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(2, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (7101, 40)
Shape of X_last2year: (7101, 40)
Shape of y: (7101,)
Shape of X_last1year_scaled: (7101, 40)
Shape of X_last2year_scaled: (7101, 40)
Shape of X_combined_reshaped: (7101, 2, 40)
Shape of X_train: (4260, 2, 40)
Shape of X_val: (1420, 2, 40)
Shape of X_test: (1421, 2, 40)
Shape of X_train_resampled: (682, 80)
Shape of X_train_resampled: (682, 2, 40)
Epoch 1/100
43/43 - 2s - 56ms/step - accuracy: 0.6672 - loss: 0.6395 - val_accuracy: 0.5866 - val_loss: 0.6678
Epoch 2/100
43/43 - 0s - 7ms/step - accuracy: 0.7126 - loss: 0.5848 - val_accuracy: 0.6077 - val_loss: 0.6312
Epoch 3/100
43/43 - 0s - 7ms/step - accuracy: 0.7185 - loss: 0.5573 - val_accuracy: 0.6246 - val_loss: 0.6065
Epoch 4/100
43/43 - 0s - 6ms/step - accuracy: 0.7170 - loss: 0.5390 - val_accuracy: 0.6225 - val_loss: 0.5880
Epoch 5/100
43/43 - 0s - 7ms/step - accuracy: 0.7346 - loss: 0.5245 - val_accuracy: 0.6268 - val_loss: 0.5652
Epoch 6/100
43/43 - 0s - 6ms/step - accuracy: 0.7331 - loss: 0.5119

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy: 0.6995073891625616
Recall: 0.5614035087719298
Precision: 0.14512471655328799
F1 Score: 0.23063063063063066
Micro F1 Score: 0.6995073891625616
Macro F1 Score: 0.521961576793234
ROC AUC: 0.7087645471751299
Confusion Matrix:
[[930 377]
 [ 50  64]]


## undersampling, CNN-LSTM

In [17]:
# Feature column names
last1year_features = []
last2year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last1year_features.append(f'X{i}_last1year_ycr')
    last2year_features.append(f'X{i}_last2year')
    last2year_features.append(f'X{i}_last2year_ycr')

last1year_features.extend(['nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'])
last2year_features.extend(['nyse_last2year', 'nasdaq_last2year', 'Division_encoded', 'MajorGroup_encoded'])

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)

# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 2, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Reshape, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D



#Define input layer
inputs = Input(shape=(2, X_train_resampled.shape[2]))
# Reshape input to fit Conv1D layer
inputs_reshaped = Reshape((X_train_resampled.shape[2], 2))(inputs)  
# Define CNN-LSTM model
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs_reshaped)
# Add Padding1D layer
padded_layer = ZeroPadding1D(padding=2)(cnn_layer)  #Consider flatten being divisible by the LSTM time step, so add padding
pooling_layer = MaxPooling1D(pool_size=2)(padded_layer)
flattened_layer = Flatten()(pooling_layer)
print("Shape of flattened_layer:", flattened_layer.shape)
reshaped_layer = Reshape((2, -1))(flattened_layer)  # Reshape the output to fit LSTM input  #The input shape expected by the LSTM layer is (timesteps, features)
lstm_layer = LSTM(100)(reshaped_layer)   
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
# Define the model
model = Model(inputs=inputs, outputs=output_layer)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)



Shape of X_last1year: (7101, 40)
Shape of X_last2year: (7101, 40)
Shape of y: (7101,)
Shape of X_last1year_scaled: (7101, 40)
Shape of X_last2year_scaled: (7101, 40)
Shape of X_combined_reshaped: (7101, 2, 40)
Shape of X_train: (4260, 2, 40)
Shape of X_val: (1420, 2, 40)
Shape of X_test: (1421, 2, 40)
Shape of X_train_resampled: (682, 80)
Shape of X_train_resampled: (682, 2, 40)
Shape of flattened_layer: (None, 1344)
Epoch 1/100
43/43 - 3s - 74ms/step - accuracy: 0.6437 - loss: 0.6462 - val_accuracy: 0.6514 - val_loss: 0.6065
Epoch 2/100
43/43 - 0s - 11ms/step - accuracy: 0.6994 - loss: 0.5871 - val_accuracy: 0.5704 - val_loss: 0.7057
Epoch 3/100
43/43 - 1s - 13ms/step - accuracy: 0.6965 - loss: 0.5649 - val_accuracy: 0.7444 - val_loss: 0.4528
Epoch 4/100
43/43 - 0s - 10ms/step - accuracy: 0.7053 - loss: 0.5478 - val_accuracy: 0.6317 - val_loss: 0.6127
Epoch 5/100
43/43 - 0s - 10ms/step - accuracy: 0.7243 - loss: 0.5240 - val_accuracy: 0.6211 - val_loss: 0.6179
Epoch 6/100
43/43 - 0s -

In [18]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Accuracy: 0.6565798733286418
Recall: 0.7456140350877193
Precision: 0.15625
F1 Score: 0.25835866261398177
Micro F1 Score: 0.6565798733286418
Macro F1 Score: 0.5174577195853791
ROC AUC: 0.7521745258325614
Confusion Matrix:
[[848 459]
 [ 29  85]]
