In [1]:
import pandas as pd

# Load the .data file
file_path = "plants/plants.data"

# Read and parse the file
with open(file_path, "r", encoding="ISO-8859-1") as f:
    raw_lines = f.readlines()

# Split into plant name and states
data = []
for line in raw_lines:
    parts = line.strip().split(",")
    plant_name = parts[0]
    states = parts[1:]  # list of state abbreviations
    data.append((plant_name, states))

# Create a DataFrame from the parsed data
df = pd.DataFrame(data, columns=["plant_name", "states"])

# Convert state list into binary columns (one-hot encoding)
# Explode the list of states into rows, then pivot to binary columns
df_exploded = df.explode("states")
df_encoded = pd.crosstab(df_exploded.index, df_exploded["states"])

# Combine with the plant name (optional)
df_final = pd.concat([df["plant_name"], df_encoded], axis=1)

# Show the final DataFrame
df_final.head()


Unnamed: 0,plant_name,ab,ak,al,ar,az,bc,ca,co,ct,...,tx,ut,va,vi,vt,wa,wi,wv,wy,yt
0,abelia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,abelia x grandiflora,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,abelmoschus,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
3,abelmoschus esculentus,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
4,abelmoschus moschatus,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Drop plant name, get feature matrix (X)
X = df_final.drop(columns=["plant_name"]).values

# For learning purpose, let's try to predict ONE of the states (e.g., 'fl') just as binary classification
# You can change 'fl' to any state that appears often
y = df_final["fl"].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the MLP model
model = Sequential([
    Dense(128, input_shape=(X.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary output
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.9345 - loss: 0.1773 - val_accuracy: 1.0000 - val_loss: 2.7461e-04
Epoch 2/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9999 - loss: 0.0012 - val_accuracy: 0.9998 - val_loss: 2.2901e-04
Epoch 3/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 1.0000 - loss: 5.1448e-04 - val_accuracy: 1.0000 - val_loss: 7.2322e-05
Epoch 4/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9986 - loss: 0.0048 - val_accuracy: 1.0000 - val_loss: 5.4364e-06
Epoch 5/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 1.0000 - loss: 1.6273e-04 - val_accuracy: 1.0000 - val_loss: 2.9093e-06
Epoch 6/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9989 - loss: 0.0030 - val_accuracy: 1.0000 - val_loss: 7.9542e

In [5]:
# Prepare the full label matrix (all states, excluding plant name)
y_multi = df_final.drop(columns=["plant_name"]).values

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y_multi, test_size=0.2, random_state=42)

# Build multi-label MLP
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout

input_layer = Input(shape=(X.shape[1],))
x = Dense(128, activation='relu')(input_layer)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
output_layer = Dense(y_multi.shape[1], activation='sigmoid')(x)  # One sigmoid per label

model = Model(inputs=input_layer, outputs=output_layer)

# Compile with binary crossentropy for multi-label
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Multi-label Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.2413 - loss: 0.2904 - val_accuracy: 0.4952 - val_loss: 0.0749
Epoch 2/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4742 - loss: 0.0886 - val_accuracy: 0.4996 - val_loss: 0.0576
Epoch 3/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.4936 - loss: 0.0747 - val_accuracy: 0.4988 - val_loss: 0.0504
Epoch 4/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4929 - loss: 0.0689 - val_accuracy: 0.5157 - val_loss: 0.0460
Epoch 5/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4919 - loss: 0.0653 - val_accuracy: 0.5039 - val_loss: 0.0433
Epoch 6/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5006 - loss: 0.0617 - val_accuracy: 0.5157 - val_loss: 0.0421
Epoch 7/10
[1m696/696[0m 

In [6]:
from sklearn.metrics import f1_score, accuracy_score, hamming_loss

# Predict on test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluation metrics
print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("F1 Score (micro):", f1_score(y_test, y_pred, average='micro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))


[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step  
Hamming Loss: 0.012782603338877594
F1 Score (micro): 0.9480856316039663
F1 Score (macro): 0.9341247300056909


In [7]:
model.save("mlp_environmental_model.h5")

