In [22]:
import os
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import keras
from sklearn.preprocessing import StandardScaler

In [23]:
data = pd.read_csv("/kaggle/input/soil-health/AP Primary Sector Farmerwise Soil Health Data of 13 districts.csv")
data.head()

Unnamed: 0,Sl no,Date,Farmer No,Macro/ Micro nutrient,Farmer Name,District,Mandal,Village,Latitude,Longitude,...,Exch-K,Avail-Ca,Avail-Mg,Avail-S,Avail-Zn,Avail-B,Avail-Fe,Avail-Cu,Avail-Mn,Time
0,1,1/1/2015,1910,RK2276,P.Krishna Naik,Anantapur,Penukonda,Gonipeta,14.08,77.69,...,41,587,101,5.16,0.3,0.17,8.89,0.51,15.24,1/1/2015
1,2,1/1/2015,1911,RK2277,Kallu Thippe Naik,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,102,811,261,9.91,0.36,0.57,3.24,0.44,6.9,1/1/2015
2,3,1/1/2015,1912,RK2278,P.Duble Bai,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,46,582,48,3.77,0.37,0.19,5.54,0.42,8.34,1/1/2015
3,4,1/1/2015,1913,RK2279,H.Marekka (Kamma),Anantapur,Penukonda,Gonipeta,14.1,77.7,...,35,3048,52,4.14,0.23,0.21,1.79,0.67,4.17,1/1/2015
4,5,1/1/2015,1914,RK2280,M.Alevelamma,Anantapur,Penukonda,Gonipeta,14.09,77.69,...,76,511,84,1.45,0.36,0.22,22.26,0.45,9.2,1/1/2015


# ** Data Preprocessing**

# Removing the unnecessary columns

In [24]:
keep = ["pH", "EC", "OC", "Avail-P", "Exch-K", "Avail-S", 
        "Avail-B", "Avail-Zn", "Avail-Fe", "Avail-Cu", "Avail-Mn"]

data = data.drop(columns=[col for col in data.columns if col not in keep])

# Save the changes
data.to_csv("cleaned_data.csv", index=False)
data.head()

Unnamed: 0,pH,EC,OC,Avail-P,Exch-K,Avail-S,Avail-Zn,Avail-B,Avail-Fe,Avail-Cu,Avail-Mn
0,6.19,0.07,0.18,7.13,41,5.16,0.3,0.17,8.89,0.51,15.24
1,8.4,0.33,0.31,10.34,102,9.91,0.36,0.57,3.24,0.44,6.9
2,7.1,0.11,0.17,8.46,46,3.77,0.37,0.19,5.54,0.42,8.34
3,8.3,0.21,0.2,2.31,35,4.14,0.23,0.21,1.79,0.67,4.17
4,6.4,0.06,0.22,6.08,76,1.45,0.36,0.22,22.26,0.45,9.2


In [25]:
data.isnull().sum()

pH          0
EC          0
OC          0
Avail-P     0
Exch-K      0
Avail-S     0
Avail-Zn    0
Avail-B     0
Avail-Fe    0
Avail-Cu    0
Avail-Mn    0
dtype: int64

In [26]:
data = data.apply(pd.to_numeric, errors='coerce')

# We'll classify soil into two categories:

* Good Soil (1)
* Bad Soil (0)

Parameter > Ideal Range (Good Soil)
* pH	6.0 - 7.5
* EC	< 2 dS/m
* OC	> 0.5%
* Avail-P	> 10 kg/ha
* Exch-K	> 100 kg/ha
* Avail-S	> 10 mg/kg
* Avail-B	> 0.5 mg/kg
* Avail-Zn	> 0.6 mg/kg
* Avail-Fe	> 4.5 mg/kg
* Avail-Cu	> 0.2 mg/kg
* Avail-Mn	> 2 mg/kg* 

In [27]:
def classify_soil(row):
    if (
        6.0 <= row["pH"] <= 7.5 and
        row["EC"] < 2 and
        row["OC"] > 0.5 and
        row["Avail-P"] > 10 and
        row["Exch-K"] > 100 and
        row["Avail-S"] > 10 and
        row["Avail-B"] > 0.5 and
        row["Avail-Zn"] > 0.6 and
        row["Avail-Fe"] > 4.5 and
        row["Avail-Cu"] > 0.2 and
        row["Avail-Mn"] > 2
    ):
        return 1  
    else:
        return 0  

data["health"] = data.apply(classify_soil, axis=1)

# Save the changes
data.to_csv("updated_health.csv", index=False)

In [28]:
data.head()

Unnamed: 0,pH,EC,OC,Avail-P,Exch-K,Avail-S,Avail-Zn,Avail-B,Avail-Fe,Avail-Cu,Avail-Mn,health
0,6.19,0.07,0.18,7.13,41,5.16,0.3,0.17,8.89,0.51,15.24,0
1,8.4,0.33,0.31,10.34,102,9.91,0.36,0.57,3.24,0.44,6.9,0
2,7.1,0.11,0.17,8.46,46,3.77,0.37,0.19,5.54,0.42,8.34,0
3,8.3,0.21,0.2,2.31,35,4.14,0.23,0.21,1.79,0.67,4.17,0
4,6.4,0.06,0.22,6.08,76,1.45,0.36,0.22,22.26,0.45,9.2,0


# Model Training Part

In [29]:
X = data.drop(columns=["health"])
y = data["health"]

In [30]:
X.isnull().sum()
y.isnull().sum()

0

In [31]:
X.fillna(X.mean(), inplace=True)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
healthy_count = (data['health'] == 1).sum()
unhealthy_count = (data['health'] == 0).sum()

print(f"Healthy: {healthy_count}")
print(f"Unhealthy: {unhealthy_count}")


Healthy: 206
Unhealthy: 4609


In [36]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the new distribution
import pandas as pd
print(pd.Series(y_resampled).value_counts())

health
0    4609
1    4609
Name: count, dtype: int64


In [37]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Making model using ANN

In [38]:
## Normalizing 
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train_resampled)
X_test_scaled = sc.transform(X_test)

In [39]:
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(0.6),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [40]:
model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=16, verbose=1)

Epoch 1/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5710 - loss: 5.4631
Epoch 2/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6400 - loss: 0.6102
Epoch 3/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6905 - loss: 0.5706
Epoch 4/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7276 - loss: 0.4952
Epoch 5/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7228 - loss: 0.4555
Epoch 6/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7231 - loss: 0.4369
Epoch 7/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7673 - loss: 0.3919
Epoch 8/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8297 - loss: 0.3701
Epoch 9/50
[1m462/462[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x785cb297d7b0>

# Evaluating Performance for both models

In [41]:
y_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))

test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print("ANN Test Accuracy:", test_acc)

Random Forest Accuracy: 0.9979231568016614
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9511 - loss: 0.1301  
ANN Test Accuracy: 0.9439252614974976


# Taking the user input for predictions 

In [42]:
def get_user_input():
    print("Enter the values for the following soil parameters:")
    pH = float(input("pH: "))
    EC = float(input("EC: "))
    OC = float(input("OC: "))
    Avail_P = float(input("Avail-P: "))
    Exch_K = float(input("Exch-K: "))
    Avail_S = float(input("Avail-S: "))
    Avail_Zn = float(input("Avail-Zn: "))
    Avail_B = float(input("Avail-B: "))
    Avail_Fe = float(input("Avail-Fe: "))
    Avail_Cu = float(input("Avail-Cu: "))
    Avail_Mn = float(input("Avail-Mn: "))

    # Create a NumPy array of the input values
    user_input = np.array([[pH, EC, OC, Avail_P, Exch_K, Avail_S, Avail_Zn, Avail_B, Avail_Fe, Avail_Cu, Avail_Mn]])
    
    return user_input

In [48]:
user_data = get_user_input()

warnings.filterwarnings("ignore")

def interpret_prediction(prediction):
    return "Unealthy" if prediction >= 0.7 else "healthy"

## Random Forest Prediction
rf_prediction = rf_model.predict(user_data)
rf_result = interpret_prediction(rf_prediction[0])
print(f"🌱 Soil Health (Random Forest): {rf_result}")

## ANN Prediction
ann_prediction = model.predict(user_data)
ann_prediction_label = np.argmax(ann_prediction, axis=1)  # Convert softmax output to label
ann_result = interpret_prediction(ann_prediction_label[0])
print(f"🌾 Soil Health (ANN): {ann_result}")

Enter the values for the following soil parameters:


pH:  6.5
EC:  0.8
OC:  0.5
Avail-P:  20
Exch-K:  100
Avail-S:  15
Avail-Zn:  0.4
Avail-B:  0.5
Avail-Fe:  10
Avail-Cu:  0.3
Avail-Mn:  12


🌱 Soil Health (Random Forest): healthy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
🌾 Soil Health (ANN): healthy
