In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load the penguin dataset
df = pd.read_csv('penguins_size.csv')

# Preprocessing
# Convert categorical features to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['species', 'island', 'sex'], drop_first=True)

# Handle missing values by dropping rows or imputing
df.dropna(inplace=True)

# Scale numerical features
scaler = MinMaxScaler()
numerical_cols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Prepare data for LSTM
# Example: Predicting the 'sex_MALE' column
X = df.drop(columns=['sex_MALE'])  # Drop target column
y = df['sex_MALE']  # Target variable

# Ensure data types are float32 for TensorFlow compatibility
X = X.astype(np.float32)
y = y.astype(np.float32)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape data for LSTM (samples, timesteps, features)
X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(units=50),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

# Feature importance (based on LSTM weights)
# Access weights of the first LSTM layer
weights = model.layers[0].get_weights()[0]

# Analyze weights to determine feature importance
feature_importance = np.mean(np.abs(weights), axis=0)

# Ensure feature_importance matches the number of features in X
feature_importance = feature_importance[:X.shape[1]]

# Print feature importance
print("Feature Importance (based on LSTM weights):")
for i, importance in enumerate(feature_importance):
    print(f"{X.columns[i]}: {importance}")

# Evaluate the model
_, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")


Epoch 1/200


  super().__init__(**kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - accuracy: 0.7245 - loss: 0.6884 - val_accuracy: 0.6429 - val_loss: 0.6856
Epoch 2/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7269 - loss: 0.6814 - val_accuracy: 0.6429 - val_loss: 0.6794
Epoch 3/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7446 - loss: 0.6724 - val_accuracy: 0.6786 - val_loss: 0.6708
Epoch 4/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7530 - loss: 0.6622 - val_accuracy: 0.8571 - val_loss: 0.6577
Epoch 5/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8954 - loss: 0.6481 - val_accuracy: 0.9643 - val_loss: 0.6366
Epoch 6/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9671 - loss: 0.6286 - val_accuracy: 1.0000 - val_loss: 0.6082
Epoch 7/200
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0