In [1]:
#LSTM


In [2]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected1_onlylastyear_adjusted.csv")


unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [3]:
data.shape

(8971, 43)

In [4]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X2_last1year  X3_last1year  X4_last1year  X5_last1year  ...  \
0       1524.70       1413.20         177.2          40.5  ...   
1       1474.50        677.20         650.8          61.5  ...   
2      21401.00      19334.00          23.0        1686.0  ...   
3    1288165.00        267.81         300.0       46338.0  ...   
4         42.21      79567.00         591.0        2024.0  ...   

   X15_last1year_ycr  X16_last1year_ycr  X17_last1year_ycr  X18_last1year_ycr  \
0        1291.527514          -0.002543          -0.010919 

In [5]:
data.shape

(8971, 47)

In [6]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 893


In [7]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
columns_to_drop = [
    'company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded',
    'nyse_last1year', 'nasdaq_last1year', 'Division_encoded', 'MajorGroup_encoded'
]

# Add X1_last1year_ycr to X18_last1year_ycr columns to the list
columns_to_drop.extend([f'X{i}_last1year_ycr' for i in range(1, 19)])

data_cleaned = data_cleaned.drop(columns=columns_to_drop)


data_cleaned.shape

(8078, 19)

In [8]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X2_last1year  X3_last1year  X4_last1year  X5_last1year  \
0            942.7       1524.70       1413.20         177.2        40.500   
1           1107.7       1474.50        677.20         650.8        61.500   
2          12686.0      21401.00      19334.00          23.0      1686.000   
3         581502.0    1288165.00        267.81         300.0     46338.000   
5           6838.0      25088.00      18138.00        9253.0       995.000   
...            ...           ...           ...           ...           ...   
8966       10566.0      28278.00      31288.00        8497.0      1200.000   
8967        3369.0       3466.00        208.00           0.0        57.000   
8968        2482.2       9401.50        966.70        5350.7       156.600   
8969         931.6       2810.20       1475.90        1409.5        82.500   
8970       82589.0       1625.37      68817.00      632122.0        65.201   

      X6_last1year  X7_last1year 

In [9]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    7484
1     594
Name: status_label_encoded, dtype: int64


In [10]:
#the impact of imbalanced datasets

### 1.1 imbalance dataset

In [11]:

# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 18)
Shape of y: (8078,)
Shape of X_scaled: (8078, 18)
Shape of X_reshaped: (8078, 1, 18)
Shape of X_train: (4846, 1, 18)
Shape of X_val: (1616, 1, 18)
Shape of X_test: (1616, 1, 18)
Epoch 1/100
303/303 - 3s - 11ms/step - accuracy: 0.9051 - loss: 0.3823 - val_accuracy: 0.9313 - val_loss: 0.2497
Epoch 2/100
303/303 - 1s - 3ms/step - accuracy: 0.9274 - loss: 0.2478 - val_accuracy: 0.9319 - val_loss: 0.2401
Epoch 3/100
303/303 - 1s - 3ms/step - accuracy: 0.9276 - loss: 0.2413 - val_accuracy: 0.9313 - val_loss: 0.2382
Epoch 4/100
303/303 - 1s - 3ms/step - accuracy: 0.9282 - loss: 0.2378 - val_accuracy: 0.9301 - val_loss: 0.2381
Epoch 5/100
303/303 - 1s - 3ms/step - accuracy: 0.9284 - loss: 0.2356 - val_accuracy: 0.9313 - val_loss: 0.2370
Epoch 6/100
303/303 - 1s - 3ms/step - accuracy: 0.9294 - loss: 0.2341 - val_accuracy: 0.9301 - val_loss: 0.2366
Epoch 7/100
303/303 - 1s - 3ms/step - accuracy: 0.9296 - loss: 0.2325 - val_accuracy: 0.9288 - val_loss: 0.2373
Epoch 8/100
30

In [12]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy: 0.9158415841584159
Recall: 0.08064516129032258
Precision: 0.3125
F1 Score: 0.1282051282051282
Micro F1 Score: 0.9158415841584159
Macro F1 Score: 0.5419959321129673
ROC AUC: 0.7420598028193376
Confusion Matrix:
[[1470   22]
 [ 114   10]]


### 1.2 SMOTE

In [13]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.over_sampling import SMOTE
# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 18)
Shape of y: (8078,)
Shape of X_scaled: (8078, 18)
Shape of X_reshaped: (8078, 1, 18)
Shape of X_train: (4846, 1, 18)
Shape of X_val: (1616, 1, 18)
Shape of X_test: (1616, 1, 18)
Shape of X_train_resampled: (8980, 18)
Shape of X_train_resampled: (8980, 1, 18)
Epoch 1/100
562/562 - 4s - 7ms/step - accuracy: 0.6494 - loss: 0.6349 - val_accuracy: 0.7958 - val_loss: 0.5998
Epoch 2/100
562/562 - 1s - 3ms/step - accuracy: 0.6636 - loss: 0.5980 - val_accuracy: 0.7735 - val_loss: 0.6014
Epoch 3/100
562/562 - 1s - 2ms/step - accuracy: 0.6668 - loss: 0.5817 - val_accuracy: 0.6838 - val_loss: 0.6201
Epoch 4/100
562/562 - 1s - 3ms/step - accuracy: 0.6774 - loss: 0.5717 - val_accuracy: 0.7908 - val_loss: 0.5640
Epoch 5/100
562/562 - 2s - 3ms/step - accuracy: 0.6816 - loss: 0.5643 - val_accuracy: 0.7723 - val_loss: 0.5813
Epoch 6/100
562/562 - 1s - 2ms/step - accuracy: 0.6846 - loss: 0.5599 - val_accuracy: 0.7618 - val_loss: 0.5797
Epoch 7/100
562/562 - 1s - 2ms/step - accuracy

In [14]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy: 0.7889851485148515
Recall: 0.5080645161290323
Precision: 0.1836734693877551
F1 Score: 0.2698072805139186
Micro F1 Score: 0.7889851485148515
Macro F1 Score: 0.573239987454066
ROC AUC: 0.7178122027155582
Confusion Matrix:
[[1212  280]
 [  61   63]]


### 1.3  Undersampling

In [15]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 18)
Shape of y: (8078,)
Shape of X_scaled: (8078, 18)
Shape of X_reshaped: (8078, 1, 18)
Shape of X_train: (4846, 1, 18)
Shape of X_val: (1616, 1, 18)
Shape of X_test: (1616, 1, 18)
Shape of X_train_resampled: (712, 18)
Shape of X_train_resampled: (712, 1, 18)
Epoch 1/100
45/45 - 2s - 54ms/step - accuracy: 0.5871 - loss: 0.6759 - val_accuracy: 0.3199 - val_loss: 0.6869
Epoch 2/100
45/45 - 0s - 6ms/step - accuracy: 0.6138 - loss: 0.6550 - val_accuracy: 0.3571 - val_loss: 0.6823
Epoch 3/100
45/45 - 0s - 6ms/step - accuracy: 0.6152 - loss: 0.6398 - val_accuracy: 0.5186 - val_loss: 0.6772
Epoch 4/100
45/45 - 0s - 6ms/step - accuracy: 0.6798 - loss: 0.6284 - val_accuracy: 0.7259 - val_loss: 0.6681
Epoch 5/100
45/45 - 0s - 6ms/step - accuracy: 0.6475 - loss: 0.6207 - val_accuracy: 0.6671 - val_loss: 0.6761
Epoch 6/100
45/45 - 0s - 6ms/step - accuracy: 0.6742 - loss: 0.6134 - val_accuracy: 0.7543 - val_loss: 0.6623
Epoch 7/100
45/45 - 0s - 6ms/step - accuracy: 0.6629 - loss

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy: 0.744430693069307
Recall: 0.6129032258064516
Precision: 0.17233560090702948
F1 Score: 0.26902654867256637
Micro F1 Score: 0.744430693069307
Macro F1 Score: 0.5570854528139735
ROC AUC: 0.7450596730952176
Confusion Matrix:
[[1127  365]
 [  48   76]]


## undersampling, CNN-LSTM

In [17]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Reshape, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D



#Define input layer
inputs = Input(shape=(1, X_train_resampled.shape[2]))
# Reshape input to fit Conv1D layer
inputs_reshaped = Reshape((X_train_resampled.shape[2], 1))(inputs)  
# Define CNN-LSTM model
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs_reshaped)
# Add Padding1D layer
padded_layer = ZeroPadding1D(padding=2)(cnn_layer)  #Consider flatten being divisible by the LSTM time step, so add padding
pooling_layer = MaxPooling1D(pool_size=2)(padded_layer)
flattened_layer = Flatten()(pooling_layer)
print("Shape of flattened_layer:", flattened_layer.shape)
reshaped_layer = Reshape((1, -1))(flattened_layer)  # Reshape the output to fit LSTM input  #The input shape expected by the LSTM layer is (timesteps, features)
lstm_layer = LSTM(100)(reshaped_layer)   
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
# Define the model
model = Model(inputs=inputs, outputs=output_layer)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)



Shape of X: (8078, 18)
Shape of y: (8078,)
Shape of X_scaled: (8078, 18)
Shape of X_reshaped: (8078, 1, 18)
Shape of X_train: (4846, 1, 18)
Shape of X_val: (1616, 1, 18)
Shape of X_test: (1616, 1, 18)
Shape of X_train_resampled: (712, 18)
Shape of X_train_resampled: (712, 1, 18)
Shape of flattened_layer: (None, 640)
Epoch 1/100
45/45 - 3s - 76ms/step - accuracy: 0.5885 - loss: 0.6728 - val_accuracy: 0.3608 - val_loss: 0.6914
Epoch 2/100
45/45 - 0s - 10ms/step - accuracy: 0.6306 - loss: 0.6404 - val_accuracy: 0.6677 - val_loss: 0.7030
Epoch 3/100
45/45 - 0s - 10ms/step - accuracy: 0.6376 - loss: 0.6182 - val_accuracy: 0.7723 - val_loss: 0.6185
Epoch 4/100
45/45 - 0s - 10ms/step - accuracy: 0.6489 - loss: 0.6054 - val_accuracy: 0.8007 - val_loss: 0.5985
Epoch 5/100
45/45 - 0s - 11ms/step - accuracy: 0.6503 - loss: 0.6060 - val_accuracy: 0.7308 - val_loss: 0.6526
Epoch 6/100
45/45 - 0s - 9ms/step - accuracy: 0.6531 - loss: 0.5972 - val_accuracy: 0.7475 - val_loss: 0.6097
Epoch 7/100
45/45

In [18]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
Accuracy: 0.6732673267326733
Recall: 0.5806451612903226
Precision: 0.13138686131386862
F1 Score: 0.2142857142857143
Micro F1 Score: 0.6732673267326733
Macro F1 Score: 0.5040178571428571
ROC AUC: 0.6951375075672406
Confusion Matrix:
[[1016  476]
 [  52   72]]
