In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [3]:
df = pd.read_csv("data_distanceFilter_best_before_selection.csv")
df = df.drop(columns=["Unnamed: 0"])


In [4]:
le = LabelEncoder()
df["farming_practice_enc"] = le.fit_transform(df["farming_practice"])

y = df["farming_practice_enc"]
X = df.drop(columns=["farming_practice", "farming_practice_enc"])


In [5]:
rain_cols = [c for c in X.columns if c.startswith("yearly_rain_") and c[-2:].isdigit()]
min_cols  = [c for c in X.columns if c.startswith("yearly_min_temp_") and c[-2:].isdigit()]
max_cols  = [c for c in X.columns if c.startswith("yearly_max_temp_") and c[-2:].isdigit()]
mean_cols = [c for c in X.columns if c.startswith("yearly_avg_mean_temp_") and c[-2:].isdigit()]

rain_cols.sort()
min_cols.sort()
max_cols.sort()
mean_cols.sort()


In [6]:
timesteps = len(rain_cols)
features = 4

X_seq = []

for i in range(len(X)):
    seq = []
    for t in range(timesteps):
        seq.append([
            X.loc[i, rain_cols[t]],
            X.loc[i, min_cols[t]],
            X.loc[i, max_cols[t]],
            X.loc[i, mean_cols[t]],
        ])
    X_seq.append(seq)

X_seq = np.array(X_seq)


In [7]:
print(X_seq.shape)  
# (samples, timesteps, features)


(3007, 15, 4)


In [8]:
scaler = MinMaxScaler()
ns, ts, nf = X_seq.shape
X_seq = X_seq.reshape(ns, ts*nf)
X_seq = scaler.fit_transform(X_seq)
X_seq = X_seq.reshape(ns, ts, nf)


In [9]:
y_cat = to_categorical(y)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y_cat, test_size=0.2, random_state=42
)


In [11]:
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(timesteps, features)),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(y_cat.shape[1], activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


  super().__init__(**kwargs)


In [12]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32
)


Epoch 1/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6931 - loss: 1.1582 - val_accuracy: 0.6761 - val_loss: 1.0126
Epoch 2/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7173 - loss: 0.8290 - val_accuracy: 0.7442 - val_loss: 0.7306
Epoch 3/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7451 - loss: 0.6535 - val_accuracy: 0.6744 - val_loss: 0.7386
Epoch 4/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7663 - loss: 0.6233 - val_accuracy: 0.7492 - val_loss: 0.6119
Epoch 5/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7609 - loss: 0.6511 - val_accuracy: 0.7475 - val_loss: 0.6173
Epoch 6/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7651 - loss: 0.5819 - val_accuracy: 0.7508 - val_loss: 0.5927
Epoch 7/30
[1m76/76[0m [32m━━━━━━━━━━

In [13]:
loss, acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", acc)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8023 - loss: 0.4653 
Test Accuracy: 0.8023256063461304


In [None]:
sample = X_seq[0].reshape(1, timesteps, features)
pred = model.predict(sample)[0]   # shape: (num_classes,)

# Get class names back from LabelEncoder
class_names = le.inverse_transform(range(len(pred)))

# Combine and sort by probability 
results = list(zip(class_names, pred))
results.sort(key=lambda x: x[1], reverse=True)

print("Farming Practice Probabilities:\n")
for name, prob in results:
    print(f"{name:25s} : {prob*100:.2f}%")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
Farming Practice Probabilities:

aerated irrigation (AI) (ph <7) : 60.75%
Soil Mulching (>13 Celcious) : 29.94%
Film-mulching drip irrigation (Spain) : 6.36%
Soil Mulching (Yearly rainfall <400) : 2.20%
Soil Mulching (<13 Celcious) : 0.50%
biochar amendment (lat <35) : 0.20%
Soil Mulching (Clay soil) : 0.05%


In [15]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Get predictions on test set
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# With class names
class_names = le.inverse_transform(range(len(cm)))

print("\nConfusion Matrix with Labels:")
for i, row in enumerate(cm):
    print(f"{class_names[i]:20s} -> {row}")

# Detailed report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=class_names))


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
Confusion Matrix:
 [[ 34  17   8   0  26]
 [  0 400   0   0   7]
 [  0   2  11   0  40]
 [  0   7   0   0   1]
 [  0   2   9   0  38]]

Confusion Matrix with Labels:
Film-mulching drip irrigation (Spain) -> [34 17  8  0 26]
Soil Mulching (<13 Celcious) -> [  0 400   0   0   7]
Soil Mulching (>13 Celcious) -> [ 0  2 11  0 40]
Soil Mulching (Clay soil) -> [0 7 0 0 1]
Soil Mulching (Yearly rainfall <400) -> [ 0  2  9  0 38]

Classification Report:

                                       precision    recall  f1-score   support

Film-mulching drip irrigation (Spain)       1.00      0.40      0.57        85
         Soil Mulching (<13 Celcious)       0.93      0.98      0.96       407
         Soil Mulching (>13 Celcious)       0.39      0.21      0.27        53
            Soil Mulching (Clay soil)       0.00      0.00      0.00         8
 Soil Mulching (Yearly rainfall <400)       0.34      0.78      0.47        49


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
soil_cols = [c for c in X.columns if any(
    k in c for k in ["phh2o", "nitrogen", "sand", "clay", "silt", "soc", "cec", "bdod", "ocs", "ocd", "cfvo"]
)]
X_soil = X[soil_cols].values


In [18]:
soil_scaler = MinMaxScaler()
X_soil = soil_scaler.fit_transform(X_soil)


In [19]:
Xc_train, Xc_test, Xs_train, Xs_test, y_train, y_test = train_test_split(
    X_seq, X_soil, y_cat, test_size=0.2, random_state=42
)


In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Concatenate


In [21]:
# Climate branch
climate_input = Input(shape=(timesteps, features))
x1 = LSTM(64, return_sequences=True)(climate_input)
x1 = Dropout(0.3)(x1)
x1 = LSTM(32)(x1)

# Soil branch
soil_input = Input(shape=(Xs_train.shape[1],))
x2 = Dense(32, activation="relu")(soil_input)
x2 = Dense(16, activation="relu")(x2)

# Merge
merged = Concatenate()([x1, x2])
merged = Dense(32, activation="relu")(merged)
output = Dense(y_cat.shape[1], activation="softmax")(merged)

model = Model(inputs=[climate_input, soil_input], outputs=output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [22]:
history = model.fit(
    [Xc_train, Xs_train], y_train,
    validation_data=([Xc_test, Xs_test], y_test),
    epochs=30,
    batch_size=32
)


Epoch 1/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6894 - loss: 1.1550 - val_accuracy: 0.6761 - val_loss: 1.0292
Epoch 2/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7177 - loss: 0.8168 - val_accuracy: 0.7276 - val_loss: 0.7339
Epoch 3/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7617 - loss: 0.6447 - val_accuracy: 0.6960 - val_loss: 0.7329
Epoch 4/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7642 - loss: 0.6105 - val_accuracy: 0.7641 - val_loss: 0.6089
Epoch 5/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7796 - loss: 0.5704 - val_accuracy: 0.7492 - val_loss: 0.6175
Epoch 6/30
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7788 - loss: 0.5671 - val_accuracy: 0.7458 - val_loss: 0.6458
Epoch 7/30
[1m76/76[0m [32m━━━━━━━━━━

In [23]:
loss, acc = model.evaluate([Xc_test, Xs_test], y_test)
print("Test Accuracy:", acc)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8355 - loss: 0.3989 
Test Accuracy: 0.8355481624603271


In [24]:
sample_climate = X_seq[0].reshape(1, timesteps, features)
sample_soil = X_soil[0].reshape(1, -1)

pred = model.predict([sample_climate, sample_soil])[0]
class_names = le.inverse_transform(range(len(pred)))

results = list(zip(class_names, pred))
results.sort(key=lambda x: x[1], reverse=True)

for name, prob in results:
    print(f"{name:35s} : {prob*100:.2f}%")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
Soil Mulching (>13 Celcious)        : 46.37%
aerated irrigation (AI) (ph <7)     : 44.10%
Film-mulching drip irrigation (Spain) : 7.91%
Soil Mulching (Yearly rainfall <400) : 0.96%
Soil Mulching (<13 Celcious)        : 0.54%
Soil Mulching (Clay soil)           : 0.08%
biochar amendment (lat <35)         : 0.04%


In [25]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Predict on test set
y_pred_probs = model.predict([Xc_test, Xs_test])
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# Get class names
class_names = le.inverse_transform(range(len(cm)))

# Print matrix with labels
print("\nConfusion Matrix with Labels:")
for i, row in enumerate(cm):
    print(f"{class_names[i]:35s} -> {row}")

# Detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=class_names))


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
Confusion Matrix:
 [[ 44  18  13   0   1   9]
 [  0 404   0   1   0   2]
 [  7   2  38   1   0   5]
 [  0   6   0   2   0   0]
 [  0   0   0   0   0   0]
 [  3   4  25   2   0  15]]

Confusion Matrix with Labels:
Film-mulching drip irrigation (Spain) -> [44 18 13  0  1  9]
Soil Mulching (<13 Celcious)        -> [  0 404   0   1   0   2]
Soil Mulching (>13 Celcious)        -> [ 7  2 38  1  0  5]
Soil Mulching (Clay soil)           -> [0 6 0 2 0 0]
Soil Mulching (Yearly rainfall <400) -> [0 0 0 0 0 0]
aerated irrigation (AI) (ph <7)     -> [ 3  4 25  2  0 15]

Classification Report:

                                       precision    recall  f1-score   support

Film-mulching drip irrigation (Spain)       0.81      0.52      0.63        85
         Soil Mulching (<13 Celcious)       0.93      0.99      0.96       407
         Soil Mulching (>13 Celcious)       0.50      0.72      0.59        53
            Soil Mu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Predict on test set (dual-input model)
y_pred_probs = model.predict([Xc_test, Xs_test])
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Metrics
accuracy = accuracy_score(y_true, y_pred)
precision_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
recall_macro = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
f1_weighted = f1_score(y_true, y_pred, average="weighted", zero_division=0)

print(f"Accuracy        : {accuracy:.4f}")
print(f"Precision (Macro): {precision_macro:.4f}")
print(f"Recall (Macro)   : {recall_macro:.4f}")
print(f"F1 Score (Macro) : {f1_macro:.4f}")
print(f"F1 Score (Weighted): {f1_weighted:.4f}")


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Accuracy        : 0.8355
Precision (Macro): 0.5105
Recall (Macro)   : 0.4639
F1 Score (Macro) : 0.4740
F1 Score (Weighted): 0.8251
