In [1]:
#LSTM


In [2]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected4_last4years_adjusted.csv")


unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [3]:
data.shape

(8971, 157)

In [4]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X1_last2year  X1_last3year  X1_last4year  X2_last1year  ...  \
0         888.5         873.1         954.1       1524.70  ...   
1         900.2        1077.4        1008.2       1474.50  ...   
2       13454.0       13582.0        7726.0      21401.00  ...   
3      353541.0     1037047.0      672072.0    1288165.00  ...   
4           NaN           NaN           NaN         42.21  ...   

   nyse_last3year  nyse_last4year  nasdaq_last1year  nasdaq_last2year  \
0    10606.906738    10699.956624       6293.024211       5015.9267

In [5]:
data.shape

(8971, 161)

In [6]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 3371


In [7]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# List of static columns to drop
static_columns_to_drop = [
    'company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded',
    'Division_encoded', 'MajorGroup_encoded'
]

# List of dynamic columns to drop (nyse and nasdaq columns for 1 year and 2 years)
nyse_nasdaq_columns_to_drop = [f'{exchange}_last{year}year' for exchange in ('nyse', 'nasdaq') for year in (1, 2, 3, 4)]

# Add X1_last1year_ycr to X18_last1year_ycr and X1_last2year_ycr to X18_last2year_ycr columns to the list
ycr_columns_to_drop = [f'X{i}_last{year}year_ycr' for i in range(1, 19) for year in (1, 2, 3, 4)]

# Combine all columns to drop
columns_to_drop = static_columns_to_drop + nyse_nasdaq_columns_to_drop + ycr_columns_to_drop

# Drop the columns from the data
data_cleaned = data_cleaned.drop(columns=columns_to_drop)


data_cleaned.shape

(5600, 73)

In [8]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X1_last2year  X1_last3year  X1_last4year  X2_last1year  \
0            942.7         888.5         873.1       954.100       1524.70   
1           1107.7         900.2        1077.4      1008.200       1474.50   
2          12686.0       13454.0       13582.0      7726.000      21401.00   
3         581502.0      353541.0     1037047.0    672072.000    1288165.00   
5           6838.0        6642.0        5935.0      7229.000      25088.00   
...            ...           ...           ...           ...           ...   
8966       10566.0       11738.0        9599.0      9789.000      28278.00   
8967        3369.0        9049.0       21381.0        58.314       3466.00   
8968        2482.2        2340.6        2071.2      2270.500       9401.50   
8969         931.6        1032.7         829.3       735.100       2810.20   
8970       82589.0      135207.0       63971.0    105559.000       1625.37   

      X2_last2year  X2_last3year 

In [9]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    5124
1     476
Name: status_label_encoded, dtype: int64


In [10]:
#the impact of imbalanced datasets

### 1.1 imbalance dataset

In [11]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last2year_features.append(f'X{i}_last2year')
    last3year_features.append(f'X{i}_last3year')
    last4year_features.append(f'X{i}_last4year')

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(4, X_train.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (5600, 18)
Shape of X_last2year: (5600, 18)
Shape of X_last3year: (5600, 18)
Shape of X_last4year: (5600, 18)
Shape of y: (5600,)
Shape of X_last1year_scaled: (5600, 18)
Shape of X_last2year_scaled: (5600, 18)
Shape of X_last3year_scaled: (5600, 18)
Shape of X_last4year_scaled: (5600, 18)
Shape of X_combined_reshaped: (5600, 4, 18)
Shape of X_train: (3360, 4, 18)
Shape of X_val: (1120, 4, 18)
Shape of X_test: (1120, 4, 18)
Epoch 1/100
210/210 - 3s - 12ms/step - accuracy: 0.8878 - loss: 0.3425 - val_accuracy: 0.9232 - val_loss: 0.2508
Epoch 2/100
210/210 - 1s - 3ms/step - accuracy: 0.9101 - loss: 0.2807 - val_accuracy: 0.9214 - val_loss: 0.2452
Epoch 3/100
210/210 - 1s - 3ms/step - accuracy: 0.9119 - loss: 0.2747 - val_accuracy: 0.9196 - val_loss: 0.2462
Epoch 4/100
210/210 - 1s - 3ms/step - accuracy: 0.9128 - loss: 0.2685 - val_accuracy: 0.9223 - val_loss: 0.2395
Epoch 5/100
210/210 - 1s - 3ms/step - accuracy: 0.9137 - loss: 0.2627 - val_accuracy: 0.9223 - val_los

In [12]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy: 0.9
Recall: 0.14942528735632185
Precision: 0.2549019607843137
F1 Score: 0.18840579710144928
Micro F1 Score: 0.9
Macro F1 Score: 0.5675616045450158
ROC AUC: 0.7238375004172648
Confusion Matrix:
[[995  38]
 [ 74  13]]


### 1.2 SMOTE

In [13]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last2year_features.append(f'X{i}_last2year')
    last3year_features.append(f'X{i}_last3year')
    last4year_features.append(f'X{i}_last4year')

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.over_sampling import SMOTE
# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 4, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(4, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (5600, 18)
Shape of X_last2year: (5600, 18)
Shape of X_last3year: (5600, 18)
Shape of X_last4year: (5600, 18)
Shape of y: (5600,)
Shape of X_last1year_scaled: (5600, 18)
Shape of X_last2year_scaled: (5600, 18)
Shape of X_last3year_scaled: (5600, 18)
Shape of X_last4year_scaled: (5600, 18)
Shape of X_combined_reshaped: (5600, 4, 18)
Shape of X_train: (3360, 4, 18)
Shape of X_val: (1120, 4, 18)
Shape of X_test: (1120, 4, 18)
Shape of X_train_resampled: (6114, 72)
Shape of X_train_resampled: (6114, 4, 18)
Epoch 1/100
383/383 - 5s - 13ms/step - accuracy: 0.6523 - loss: 0.6093 - val_accuracy: 0.5277 - val_loss: 0.5936
Epoch 2/100
383/383 - 2s - 4ms/step - accuracy: 0.6943 - loss: 0.5480 - val_accuracy: 0.4866 - val_loss: 0.6523
Epoch 3/100
383/383 - 2s - 4ms/step - accuracy: 0.7116 - loss: 0.5213 - val_accuracy: 0.8402 - val_loss: 0.4536
Epoch 4/100
383/383 - 2s - 4ms/step - accuracy: 0.7280 - loss: 0.4989 - val_accuracy: 0.6036 - val_loss: 0.5551
Epoch 5/100
383/383 -

In [14]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.8008928571428572
Recall: 0.4367816091954023
Precision: 0.1792452830188679
F1 Score: 0.25418060200668896
Micro F1 Score: 0.8008928571428572
Macro F1 Score: 0.5696456848261162
ROC AUC: 0.7233367827219014
Confusion Matrix:
[[859 174]
 [ 49  38]]


### 1.3  Undersampling

In [15]:
# Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last2year_features.append(f'X{i}_last2year')
    last3year_features.append(f'X{i}_last3year')
    last4year_features.append(f'X{i}_last4year')

# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)


# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 4, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(4, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X_last1year: (5600, 18)
Shape of X_last2year: (5600, 18)
Shape of X_last3year: (5600, 18)
Shape of X_last4year: (5600, 18)
Shape of y: (5600,)
Shape of X_last1year_scaled: (5600, 18)
Shape of X_last2year_scaled: (5600, 18)
Shape of X_last3year_scaled: (5600, 18)
Shape of X_last4year_scaled: (5600, 18)
Shape of X_combined_reshaped: (5600, 4, 18)
Shape of X_train: (3360, 4, 18)
Shape of X_val: (1120, 4, 18)
Shape of X_test: (1120, 4, 18)
Shape of X_train_resampled: (606, 72)
Shape of X_train_resampled: (606, 4, 18)
Epoch 1/100
38/38 - 2s - 58ms/step - accuracy: 0.6188 - loss: 0.6667 - val_accuracy: 0.6830 - val_loss: 0.6734
Epoch 2/100
38/38 - 0s - 7ms/step - accuracy: 0.6733 - loss: 0.6309 - val_accuracy: 0.7464 - val_loss: 0.6643
Epoch 3/100
38/38 - 0s - 7ms/step - accuracy: 0.6634 - loss: 0.6111 - val_accuracy: 0.6759 - val_loss: 0.6721
Epoch 4/100
38/38 - 0s - 7ms/step - accuracy: 0.6535 - loss: 0.5991 - val_accuracy: 0.7839 - val_loss: 0.6103
Epoch 5/100
38/38 - 0s - 7ms/st

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.7107142857142857
Recall: 0.5977011494252874
Precision: 0.15249266862170088
F1 Score: 0.24299065420560748
Micro F1 Score: 0.7107142857142857
Macro F1 Score: 0.5320913535928699
ROC AUC: 0.7375794193900146
Confusion Matrix:
[[744 289]
 [ 35  52]]


## undersampling, CNN-LSTM

In [17]:
## Feature column names
last1year_features = []
last2year_features = []
last3year_features = []
last4year_features = []

# Loop to build list of feature column names
for i in range(1, 19):
    last1year_features.append(f'X{i}_last1year')
    last2year_features.append(f'X{i}_last2year')
    last3year_features.append(f'X{i}_last3year')
    last4year_features.append(f'X{i}_last4year')
    
# Define target column name
target_column = 'status_label_encoded'

# Extract features and target
X_last1year = data_cleaned[last1year_features].values
X_last2year = data_cleaned[last2year_features].values
X_last3year = data_cleaned[last3year_features].values
X_last4year = data_cleaned[last4year_features].values
y = data_cleaned[target_column].values

# Print data structure
print("Shape of X_last1year:", X_last1year.shape)
print("Shape of X_last2year:", X_last2year.shape)
print("Shape of X_last3year:", X_last3year.shape)
print("Shape of X_last4year:", X_last4year.shape)
print("Shape of y:", y.shape)



# Data Standardization
import numpy as np
from sklearn.preprocessing import StandardScaler

# Define the scaler
scaler = StandardScaler()
# Fit the scaler on the entire data set
scaler.fit(np.concatenate((X_last1year, X_last2year, X_last3year, X_last4year), axis=0))
# Transform each year's data using the same scaler
X_last1year_scaled = scaler.transform(X_last1year)
X_last2year_scaled = scaler.transform(X_last2year)
X_last3year_scaled = scaler.transform(X_last3year)
X_last4year_scaled = scaler.transform(X_last4year)

# Reshape features to 3D arrays [samples, time steps, features]
X_last1year_reshaped = np.reshape(X_last1year_scaled, (X_last1year_scaled.shape[0], 1, X_last1year_scaled.shape[1]))
X_last2year_reshaped = np.reshape(X_last2year_scaled, (X_last2year_scaled.shape[0], 1, X_last2year_scaled.shape[1]))
X_last3year_reshaped = np.reshape(X_last3year_scaled, (X_last3year_scaled.shape[0], 1, X_last3year_scaled.shape[1]))
X_last4year_reshaped = np.reshape(X_last4year_scaled, (X_last4year_scaled.shape[0], 1, X_last4year_scaled.shape[1]))

# Combine last1year and last2year features along the time step dimension
X_combined_reshaped = np.concatenate((X_last4year_reshaped, X_last3year_reshaped, X_last2year_reshaped, X_last1year_reshaped), axis=1)
#First use data from the past two years (X_last3year_reshaped), then data from the past years 

# Print data structure
print("Shape of X_last1year_scaled:", X_last1year_scaled.shape)
print("Shape of X_last2year_scaled:", X_last2year_scaled.shape)
print("Shape of X_last3year_scaled:", X_last3year_scaled.shape)
print("Shape of X_last4year_scaled:", X_last4year_scaled.shape)
print("Shape of X_combined_reshaped:", X_combined_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_combined_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 4, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Reshape, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D



#Define input layer
inputs = Input(shape=(4, X_train_resampled.shape[2]))
# Reshape input to fit Conv1D layer
inputs_reshaped = Reshape((X_train_resampled.shape[2], 4))(inputs)  
# Define CNN-LSTM model
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs_reshaped)
# Add Padding1D layer
padded_layer = ZeroPadding1D(padding=2)(cnn_layer)  #Consider flatten being divisible by the LSTM time step, so add padding
pooling_layer = MaxPooling1D(pool_size=2)(padded_layer)
flattened_layer = Flatten()(pooling_layer)
print("Shape of flattened_layer:", flattened_layer.shape)
reshaped_layer = Reshape((4, -1))(flattened_layer)  # Reshape the output to fit LSTM input  #The input shape expected by the LSTM layer is (timesteps, features)
lstm_layer = LSTM(100)(reshaped_layer)   
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
# Define the model
model = Model(inputs=inputs, outputs=output_layer)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)



Shape of X_last1year: (5600, 18)
Shape of X_last2year: (5600, 18)
Shape of X_last3year: (5600, 18)
Shape of X_last4year: (5600, 18)
Shape of y: (5600,)
Shape of X_last1year_scaled: (5600, 18)
Shape of X_last2year_scaled: (5600, 18)
Shape of X_last3year_scaled: (5600, 18)
Shape of X_last4year_scaled: (5600, 18)
Shape of X_combined_reshaped: (5600, 4, 18)
Shape of X_train: (3360, 4, 18)
Shape of X_val: (1120, 4, 18)
Shape of X_test: (1120, 4, 18)
Shape of X_train_resampled: (606, 72)
Shape of X_train_resampled: (606, 4, 18)
Shape of flattened_layer: (None, 640)
Epoch 1/100
38/38 - 3s - 86ms/step - accuracy: 0.5314 - loss: 0.6820 - val_accuracy: 0.3250 - val_loss: 0.6872
Epoch 2/100
38/38 - 0s - 9ms/step - accuracy: 0.6254 - loss: 0.6433 - val_accuracy: 0.3902 - val_loss: 0.6972
Epoch 3/100
38/38 - 0s - 9ms/step - accuracy: 0.6419 - loss: 0.6169 - val_accuracy: 0.8571 - val_loss: 0.5781
Epoch 4/100
38/38 - 0s - 9ms/step - accuracy: 0.6749 - loss: 0.5988 - val_accuracy: 0.7893 - val_loss: 

In [18]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.7098214285714286
Recall: 0.632183908045977
Precision: 0.15804597701149425
F1 Score: 0.25287356321839083
Micro F1 Score: 0.7098214285714286
Macro F1 Score: 0.5364090807781705
ROC AUC: 0.750854001847092
Confusion Matrix:
[[740 293]
 [ 32  55]]
