In [1]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os
import joblib # We use joblib to save our preprocessor

print("TensorFlow Version:", tf.__version__)
print("Libraries imported.")

TensorFlow Version: 2.19.0
Libraries imported.


In [2]:
# Cell 2: Load the Dataset
print("Loading soil and microbial dataset...")

try:
    soil_df = pd.read_csv('../data/simulated/soil_microbe_data.csv')
except FileNotFoundError:
    print("ERROR: soil_microbe_data.csv not found. Please run the '1_data_simulation.ipynb' notebook first.")
    raise

print("Dataset loaded successfully.")
print("Shape of the data:", soil_df.shape)
print(soil_df.head())

Loading soil and microbial dataset...
Dataset loaded successfully.
Shape of the data: (15, 10)
   plot_id  primary_crop                 crop_mix  microbe_trichoderma_cfu_g  \
0        0  Black Pepper  Black Pepper + Arecanut                   237940.0   
1        1        Banana        Banana + Arecanut                    38712.0   
2        2      Arecanut  Arecanut + Black Pepper                   166993.0   
3        3  Black Pepper    Black Pepper + Banana                   164362.0   
4        4  Black Pepper  Black Pepper + Arecanut                   240067.0   

   microbe_pseudomonas_cfu_g  organic_carbon_percent  nitrogen_kg_ha  \
0                   128595.0                    2.30          471.73   
1                    38878.0                    1.09          355.07   
2                    74996.0                    1.64          325.03   
3                   115056.0                    1.24          430.80   
4                   147384.0                    2.21          39

In [3]:
# Cell 3: Data Preprocessing
print("\nPreprocessing data...")

# We will drop 'crop_mix' as 'primary_crop' is our main categorical feature
soil_df = soil_df.drop(columns=['plot_id', 'crop_mix'])

# Define which columns are numerical and which are categorical
numerical_features = [
    'microbe_trichoderma_cfu_g', 'microbe_pseudomonas_cfu_g',
    'organic_carbon_percent', 'nitrogen_kg_ha', 'phosphorus_kg_ha', 'potassium_kg_ha'
]
categorical_features = ['primary_crop']

# Separate features (X) from the target (y)
X = soil_df.drop('yield_kg_ha', axis=1)
y = soil_df['yield_kg_ha']

# Create a preprocessing pipeline. This is a robust way to handle different data types.
# 1. For numerical features, we scale them to be between 0 and 1.
# 2. For categorical features, we convert them into a numerical format using One-Hot Encoding.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

print("Preprocessing pipeline created.")


Preprocessing data...
Preprocessing pipeline created.


In [4]:
# Cell 4: Split Data and Apply Preprocessing
print("\nSplitting data and applying preprocessing...")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the preprocessor on the training data and transform it
X_train_processed = preprocessor.fit_transform(X_train)

# Only transform the test data using the already-fitted preprocessor
X_test_processed = preprocessor.transform(X_test)

print("Training data shape after processing:", X_train_processed.shape)
print("Testing data shape after processing:", X_test_processed.shape)


Splitting data and applying preprocessing...
Training data shape after processing: (12, 9)
Testing data shape after processing: (3, 9)


In [5]:
# Cell 5: Build the Dense Neural Network (DNN) Model
print("\nBuilding the DNN model...")

model = Sequential([
    # Input layer: The shape must match the number of features after preprocessing
    Dense(128, activation='relu', input_shape=[X_train_processed.shape[1]]),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    # Output layer: A single neuron for the single regression value (yield)
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

model.summary()


Building the DNN model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Cell 6: Train the Model
print("\nTraining the DNN model...")

history = model.fit(
    X_train_processed,
    y_train,
    epochs=50,  # DNNs often train faster, so we can use more epochs
    batch_size=4, # Use a smaller batch size for smaller datasets
    validation_split=0.2,
    verbose=1
)

print("Model training complete.")


Training the DNN model...
Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 188ms/step - loss: 1270133.8750 - mean_absolute_error: 1010.9336 - val_loss: 3317288.2500 - val_mean_absolute_error: 1707.0941
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 1270001.5000 - mean_absolute_error: 1010.8730 - val_loss: 3316815.7500 - val_mean_absolute_error: 1706.9603
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 1269863.6250 - mean_absolute_error: 1010.7821 - val_loss: 3316363.2500 - val_mean_absolute_error: 1706.8252
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 1269393.3750 - mean_absolute_error: 1010.6172 - val_loss: 3315803.0000 - val_mean_absolute_error: 1706.6578
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - loss: 1269142.2500 - mean_absolute_error: 1010.4904 - val_loss: 3315146.7500 - val_mean_ab

In [7]:
# Cell 7: Evaluate the Model
print("\nEvaluating the DNN model on the test set...")

loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test Set Mean Absolute Error (MAE): {mae:.2f} kg/ha")
print(f"This DNN model's yield predictions are, on average, off by {mae:.2f} kg/ha.")


Evaluating the DNN model on the test set...
Test Set Mean Absolute Error (MAE): 337.51 kg/ha
This DNN model's yield predictions are, on average, off by 337.51 kg/ha.


In [8]:
# Cell 8: Save the Trained Model and the Preprocessor
print("\nSaving the trained model and the preprocessor...")

# Define the path to save the model and preprocessor
model_dir = '../backend/models/'
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'soil_microbe_yield_model.h5')
preprocessor_path = os.path.join(model_dir, 'soil_data_preprocessor.joblib')

# Save the model
model.save(model_path)

# Save the preprocessor object for later use in the backend API
joblib.dump(preprocessor, preprocessor_path)

print(f"Model saved successfully to: {model_path}")
print(f"Preprocessor saved successfully to: {preprocessor_path}")




Saving the trained model and the preprocessor...
Model saved successfully to: ../backend/models/soil_microbe_yield_model.h5
Preprocessor saved successfully to: ../backend/models/soil_data_preprocessor.joblib
