In [1]:
#LSTM


In [2]:
import pandas as pd

# Read the data set
data = pd.read_csv("new_df_selected1_onlylastyear_adjusted.csv")


unique_company_names = data['company_name'].nunique()
unique_status_labels = data['status_label'].nunique()
unique_divisions = data['Division'].nunique()
unique_majorgroup = data['MajorGroup'].nunique()
unique_last_year = data['last_year'].nunique()

print("Number of unique values in 'company_name' column:", unique_company_names)
print("Number of unique values in 'status_label' column:", unique_status_labels)
print("Number of unique values in 'Division' column:", unique_divisions)
print("Number of unique values in 'MajorGroup' column:", unique_majorgroup)
print("Number of unique values in 'last_year' column:", unique_last_year)



Number of unique values in 'company_name' column: 8971
Number of unique values in 'status_label' column: 2
Number of unique values in 'Division' column: 10
Number of unique values in 'MajorGroup' column: 73
Number of unique values in 'last_year' column: 20


In [3]:
data.shape

(8971, 43)

In [4]:
# Encoding non-numeric columns

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder object
label_encoder = LabelEncoder()


# Label-encode the company_name column
data['company_name_encoded'] = label_encoder.fit_transform(data['company_name'])


# Label-encode the Division column
data['Division_encoded'] = label_encoder.fit_transform(data['Division'])

# Label-encode MajorGroup columns
data['MajorGroup_encoded'] = label_encoder.fit_transform(data['MajorGroup'])
#When using label encoding for feature encoding, the sequential relationship between categories will not be introduced and will not have an impact on prediction.

# Encode the label of the status_label column
data['status_label_encoded'] = label_encoder.fit_transform(data['status_label'])
#With only two categories, it may be simpler and more appropriate to use label encoding as it maps the categories to 0 and 1, suitable for use in tree-based models. 
#If use one-hot encoding, a new column will be generated

print(data.head())




  company_name status_label Division  MajorGroup  last_year  X1_last1year  \
0          C_1        alive        D          37     2017.0         942.7   
1          C_2        alive        D          36     2010.0        1107.7   
2          C_3        alive        D          38     2008.0       12686.0   
3          C_4        alive        D          28     2007.0      581502.0   
4          C_5        alive        D          35     1999.0       28957.0   

   X2_last1year  X3_last1year  X4_last1year  X5_last1year  ...  \
0       1524.70       1413.20         177.2          40.5  ...   
1       1474.50        677.20         650.8          61.5  ...   
2      21401.00      19334.00          23.0        1686.0  ...   
3    1288165.00        267.81         300.0       46338.0  ...   
4         42.21      79567.00         591.0        2024.0  ...   

   X15_last1year_ycr  X16_last1year_ycr  X17_last1year_ycr  X18_last1year_ycr  \
0        1291.527514          -0.002543          -0.010919 

In [5]:
data.shape

(8971, 47)

In [6]:
missing_rows_count = data.isnull().any(axis=1).sum()
print("Number of rows with missing values:", missing_rows_count)


Number of rows with missing values: 893


In [7]:
# Delete rows with missing values
data_cleaned = data.dropna()
# Delete non-numeric columns that are not encoded
# Delete specified column
data_cleaned = data_cleaned.drop(['company_name', 'status_label', 'Division', 'MajorGroup', 'last_year', 'company_name_encoded'], axis=1)

data_cleaned.shape

(8078, 41)

In [8]:
data_cleaned.head

<bound method NDFrame.head of       X1_last1year  X2_last1year  X3_last1year  X4_last1year  X5_last1year  \
0            942.7       1524.70       1413.20         177.2        40.500   
1           1107.7       1474.50        677.20         650.8        61.500   
2          12686.0      21401.00      19334.00          23.0      1686.000   
3         581502.0    1288165.00        267.81         300.0     46338.000   
5           6838.0      25088.00      18138.00        9253.0       995.000   
...            ...           ...           ...           ...           ...   
8966       10566.0      28278.00      31288.00        8497.0      1200.000   
8967        3369.0       3466.00        208.00           0.0        57.000   
8968        2482.2       9401.50        966.70        5350.7       156.600   
8969         931.6       2810.20       1475.90        1409.5        82.500   
8970       82589.0       1625.37      68817.00      632122.0        65.201   

      X6_last1year  X7_last1year 

In [9]:
status_counts = data_cleaned['status_label_encoded'].value_counts()
print(status_counts)


0    7484
1     594
Name: status_label_encoded, dtype: int64


In [10]:
#the impact of imbalanced datasets

### 1.1 imbalance dataset

In [11]:

# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 40)
Shape of y: (8078,)
Shape of X_scaled: (8078, 40)
Shape of X_reshaped: (8078, 1, 40)
Shape of X_train: (4846, 1, 40)
Shape of X_val: (1616, 1, 40)
Shape of X_test: (1616, 1, 40)
Epoch 1/100
303/303 - 3s - 11ms/step - accuracy: 0.9059 - loss: 0.3618 - val_accuracy: 0.9282 - val_loss: 0.2384
Epoch 2/100
303/303 - 1s - 3ms/step - accuracy: 0.9267 - loss: 0.2332 - val_accuracy: 0.9313 - val_loss: 0.2264
Epoch 3/100
303/303 - 1s - 3ms/step - accuracy: 0.9307 - loss: 0.2203 - val_accuracy: 0.9338 - val_loss: 0.2200
Epoch 4/100
303/303 - 1s - 3ms/step - accuracy: 0.9309 - loss: 0.2133 - val_accuracy: 0.9325 - val_loss: 0.2177
Epoch 5/100
303/303 - 1s - 3ms/step - accuracy: 0.9315 - loss: 0.2080 - val_accuracy: 0.9325 - val_loss: 0.2146
Epoch 6/100
303/303 - 1s - 3ms/step - accuracy: 0.9323 - loss: 0.2041 - val_accuracy: 0.9313 - val_loss: 0.2155
Epoch 7/100
303/303 - 1s - 3ms/step - accuracy: 0.9323 - loss: 0.2009 - val_accuracy: 0.9313 - val_loss: 0.2151
Epoch 8/100
30

In [12]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Accuracy: 0.9164603960396039
Recall: 0.11290322580645161
Precision: 0.358974358974359
F1 Score: 0.17177914110429449
Micro F1 Score: 0.9164603960396039
Macro F1 Score: 0.5638954356547865
ROC AUC: 0.7409571477990142
Confusion Matrix:
[[1467   25]
 [ 110   14]]


### 1.2 SMOTE

In [13]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.over_sampling import SMOTE
# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 40)
Shape of y: (8078,)
Shape of X_scaled: (8078, 40)
Shape of X_reshaped: (8078, 1, 40)
Shape of X_train: (4846, 1, 40)
Shape of X_val: (1616, 1, 40)
Shape of X_test: (1616, 1, 40)
Shape of X_train_resampled: (8980, 40)
Shape of X_train_resampled: (8980, 1, 40)
Epoch 1/100
562/562 - 3s - 6ms/step - accuracy: 0.6983 - loss: 0.5726 - val_accuracy: 0.6132 - val_loss: 0.5785
Epoch 2/100
562/562 - 1s - 2ms/step - accuracy: 0.7581 - loss: 0.4934 - val_accuracy: 0.7085 - val_loss: 0.5104
Epoch 3/100
562/562 - 1s - 2ms/step - accuracy: 0.7718 - loss: 0.4631 - val_accuracy: 0.6980 - val_loss: 0.5353
Epoch 4/100
562/562 - 1s - 2ms/step - accuracy: 0.7845 - loss: 0.4472 - val_accuracy: 0.7407 - val_loss: 0.4967
Epoch 5/100
562/562 - 1s - 2ms/step - accuracy: 0.7884 - loss: 0.4340 - val_accuracy: 0.7469 - val_loss: 0.4885
Epoch 6/100
562/562 - 1s - 2ms/step - accuracy: 0.7998 - loss: 0.4240 - val_accuracy: 0.7370 - val_loss: 0.4825
Epoch 7/100
562/562 - 1s - 2ms/step - accuracy

In [14]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Accuracy: 0.8508663366336634
Recall: 0.3548387096774194
Precision: 0.2146341463414634
F1 Score: 0.2674772036474164
Micro F1 Score: 0.8508663366336634
Macro F1 Score: 0.5922298178071735
ROC AUC: 0.7034182305630028
Confusion Matrix:
[[1331  161]
 [  80   44]]


### 1.3  Undersampling

In [15]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)



from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)




from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam

# Define LSTM model
inputs = Input(shape=(1, X_train_resampled.shape[2]))  # (2, number of features) Accepts data with two time steps as input, and the first time step is the data of the past two years and the second time step is the data of the past year.
lstm_layer = LSTM(100)(inputs)
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=inputs, outputs=output_layer)

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)

Shape of X: (8078, 40)
Shape of y: (8078,)
Shape of X_scaled: (8078, 40)
Shape of X_reshaped: (8078, 1, 40)
Shape of X_train: (4846, 1, 40)
Shape of X_val: (1616, 1, 40)
Shape of X_test: (1616, 1, 40)
Shape of X_train_resampled: (712, 40)
Shape of X_train_resampled: (712, 1, 40)
Epoch 1/100
45/45 - 3s - 57ms/step - accuracy: 0.6180 - loss: 0.6776 - val_accuracy: 0.5019 - val_loss: 0.6922
Epoch 2/100
45/45 - 0s - 6ms/step - accuracy: 0.6770 - loss: 0.6291 - val_accuracy: 0.5266 - val_loss: 0.6798
Epoch 3/100
45/45 - 0s - 6ms/step - accuracy: 0.6910 - loss: 0.5985 - val_accuracy: 0.5600 - val_loss: 0.6614
Epoch 4/100
45/45 - 0s - 6ms/step - accuracy: 0.6966 - loss: 0.5739 - val_accuracy: 0.5712 - val_loss: 0.6440
Epoch 5/100
45/45 - 0s - 6ms/step - accuracy: 0.7051 - loss: 0.5582 - val_accuracy: 0.5724 - val_loss: 0.6351
Epoch 6/100
45/45 - 0s - 6ms/step - accuracy: 0.7191 - loss: 0.5441 - val_accuracy: 0.5786 - val_loss: 0.6367
Epoch 7/100
45/45 - 0s - 6ms/step - accuracy: 0.7177 - loss

In [16]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Accuracy: 0.6961633663366337
Recall: 0.7096774193548387
Precision: 0.16206261510128914
F1 Score: 0.26386806596701645
Micro F1 Score: 0.6961633663366337
Macro F1 Score: 0.5362225320088493
ROC AUC: 0.7792635994119174
Confusion Matrix:
[[1037  455]
 [  36   88]]


## undersampling, CNN-LSTM

In [17]:
# Extract features and target
X = data_cleaned.drop(columns=['status_label_encoded']).values
y = data_cleaned['status_label_encoded'].values
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)



# Data Standardization    #There is not much difference between normalization and Z-score normalization here because the dataset is not balanced
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

import numpy as np
X_reshaped = np.reshape(X_scaled, (X_scaled.shape[0], 1, X_scaled.shape[1]))

print("Shape of X_scaled:", X_scaled.shape)
print("Shape of X_reshaped:", X_reshaped.shape)



# Split into train, validation, and test sets
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(X_reshaped, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.8 * 0.25 = 0.2

print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


from imblearn.under_sampling import RandomUnderSampler

# Flatten X_train
X_train_flat = X_train.reshape(X_train.shape[0], -1)
# Apply undersampling only on the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_flat, y_train)
print("Shape of X_train_resampled:", X_train_resampled.shape)
# Reshape resampled training data to match LSTM input shape
X_train_resampled = X_train_resampled.reshape(-1, 1, X_train.shape[2])
# Check the shape of resampled data and target variable
print("Shape of X_train_resampled:", X_train_resampled.shape)



from keras.layers import LSTM, Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Reshape, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D



#Define input layer
inputs = Input(shape=(1, X_train_resampled.shape[2]))
# Reshape input to fit Conv1D layer
inputs_reshaped = Reshape((X_train_resampled.shape[2], 1))(inputs)  
# Define CNN-LSTM model
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs_reshaped)
# Add Padding1D layer
padded_layer = ZeroPadding1D(padding=2)(cnn_layer)  #Consider flatten being divisible by the LSTM time step, so add padding
pooling_layer = MaxPooling1D(pool_size=2)(padded_layer)
flattened_layer = Flatten()(pooling_layer)
print("Shape of flattened_layer:", flattened_layer.shape)
reshaped_layer = Reshape((1, -1))(flattened_layer)  # Reshape the output to fit LSTM input  #The input shape expected by the LSTM layer is (timesteps, features)
lstm_layer = LSTM(100)(reshaped_layer)   
dropout_layer = Dropout(0.2)(lstm_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
# Define the model
model = Model(inputs=inputs, outputs=output_layer)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=16, validation_data=(X_val, y_val), verbose=2)

## Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("test loss:", loss)
print("Test accuracy:", accuracy)



Shape of X: (8078, 40)
Shape of y: (8078,)
Shape of X_scaled: (8078, 40)
Shape of X_reshaped: (8078, 1, 40)
Shape of X_train: (4846, 1, 40)
Shape of X_val: (1616, 1, 40)
Shape of X_test: (1616, 1, 40)
Shape of X_train_resampled: (712, 40)
Shape of X_train_resampled: (712, 1, 40)
Shape of flattened_layer: (None, 1344)
Epoch 1/100
45/45 - 3s - 77ms/step - accuracy: 0.6531 - loss: 0.6386 - val_accuracy: 0.6046 - val_loss: 0.5889
Epoch 2/100
45/45 - 1s - 12ms/step - accuracy: 0.6896 - loss: 0.5686 - val_accuracy: 0.5619 - val_loss: 0.6463
Epoch 3/100
45/45 - 1s - 12ms/step - accuracy: 0.7022 - loss: 0.5495 - val_accuracy: 0.6083 - val_loss: 0.6225
Epoch 4/100
45/45 - 1s - 12ms/step - accuracy: 0.7149 - loss: 0.5300 - val_accuracy: 0.7426 - val_loss: 0.4544
Epoch 5/100
45/45 - 1s - 11ms/step - accuracy: 0.7177 - loss: 0.5350 - val_accuracy: 0.7265 - val_loss: 0.4856
Epoch 6/100
45/45 - 1s - 12ms/step - accuracy: 0.7388 - loss: 0.5112 - val_accuracy: 0.8014 - val_loss: 0.4423
Epoch 7/100
45/

In [18]:
# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred)

# Calculate recall
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

# Calculate micro F1 score
micro_f1 = f1_score(y_test, y_pred, average='micro')

# Calculate macro F1 score
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)
print("Micro F1 Score:", micro_f1)
print("Macro F1 Score:", macro_f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Accuracy: 0.718440594059406
Recall: 0.6612903225806451
Precision: 0.16565656565656567
F1 Score: 0.2649434571890145
Micro F1 Score: 0.718440594059406
Macro F1 Score: 0.5454070519775919
ROC AUC: 0.7637183256940241
Confusion Matrix:
[[1079  413]
 [  42   82]]
