In [13]:
# =========================
# Crop Recommender: NumPy-only, End-to-End
# =========================
import pandas as pd
import numpy as np
import ast
from collections import defaultdict

print("[Step 0] Imports ready.")

# -------------------------
# CONFIG
# -------------------------
CSV_PATH = "apcrop_dataset_realistic.csv"
MODEL_PATH = "croprecommender.npz"
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Irrigation guidance (from your list)
IRRIGATION_TIPS = {
    "Paddy": [
        "Daily flooding, maintain 5–10 cm standing water.",
        "Ensure water at tillering & flowering stages.",
        "Drain completely 7–10 days before harvest."
    ],
    "Maize": [
        "Weekly irrigation; adjust for soil & rainfall.",
        "Critical at tasseling, silking, grain filling.",
        "Avoid stress during reproductive phase."
    ],
    "Groundnut": [
        "Irrigate ~every 10 days.",
        "Critical: flowering & pegging.",
        "Maintain moisture during pod development."
    ],
    "Wheat": [
        "Irrigate at crown root initiation, jointing, flowering.",
        "Most critical: ~21 DAS (CRI).",
        "Avoid waterlogging at tillering."
    ],
    "Bengal Gram": [
        "Minimal irrigation; drought-tolerant.",
        "One irrigation at flowering if very dry.",
        "Avoid heavy irrigation to prevent excess foliage."
    ],
    "Sunflower": [
        "Irrigate ~every 12 days.",
        "Critical: bud, flowering, seed filling.",
        "Drip irrigation is effective."
    ],
    "Castor": [
        "Irrigate ~every 15 days; hardy crop.",
        "Critical: branching & spike initiation.",
        "Too much water → foliage > seeds."
    ],
    "Bajra": [
        "Prefer rainfed; drought-tolerant.",
        "Irrigate at flowering/grain fill if scarce rain.",
        "Avoid over-irrigation to prevent lodging."
    ],
    "Linseed": [
        "Light irrigation at branching & flowering.",
        "Avoid waterlogging (root diseases).",
        "1–2 irrigations usually sufficient."
    ],
    "Mustard": [
        "Irrigate at branching & pod filling.",
        "First irrigation ~30–35 DAS.",
        "Avoid during flowering to prevent flower drop."
    ],
    "Watermelon": [
        "Irrigate ~every 7 days, keep soil moisture.",
        "Critical: flowering & fruit set.",
        "Reduce irrigation at maturity for sweetness."
    ],
    "Muskmelon": [
        "Irrigate ~every 7 days; avoid waterlogging.",
        "Consistent water for fruit growth & quality.",
        "Drip helps reduce fungal disease."
    ],
    "Cowpea": [
        "Irrigate ~every 10 days (avoid wilting).",
        "Critical: flowering & pod development.",
        "Drought-tolerant but timely irrigation boosts yield."
    ],
    "Cotton": [
        "Irrigate ~every 15 days; adjust by weather.",
        "Critical: squaring, flowering, boll formation.",
        "Avoid waterlogging (boll rot)."
    ],
    "Sugarcane": [
        "Frequent irrigation—especially in hot, dry periods.",
        "Maintain moisture up to 120 days (formative).",
        "Reduce 1–2 months pre-harvest to raise sugar."
    ],
    "Barley": [
        "Light irrigations as needed.",
        "Critical: crown root initiation.",
        "Sensitive to waterlogging—ensure drainage."
    ],
    "Lentil": [
        "Primarily rainfed; minimal irrigation.",
        "One light irrigation pre-flowering can help.",
        "Excess water → vegetative growth over seeds."
    ],
    "Soybean": [
        "Prefer rainfed; moderate drought tolerance.",
        "Irrigate at pod filling during dry spells.",
        "Avoid irrigation at flowering (flower drop)."
    ],
    "Pea": [
        "Irrigate at flowering & pod filling.",
        "Initial irrigation post-sowing aids germination.",
        "Avoid overwatering to prevent root rot."
    ],
    "Vegetables": [
        "Irrigate every 5–7 days (crop-dependent).",
        "Drip to target root zone & conserve water.",
        "Consistent watering prevents cracking/bitterness."
    ],
    "Jute": [
        "Keep soil moist throughout growth.",
        "Frequent light irrigations in hot season.",
        "Ensure drainage to prevent root decay."
    ],
    "Oats": [
        "Irrigate ~every 12 days as needed.",
        "Critical: tillering & flowering.",
        "Relatively drought-tolerant but responds to moisture."
    ],
    "Cucumber": [
        "Irrigate ~every 7 days; keep moisture uniform.",
        "Critical during fruit set/development.",
        "Low water → bitter fruits."
    ],
    "Sugar Beet": [
        "Irrigate ~every 10 days; steady moisture for roots.",
        "Critical: canopy establishment & root bulking.",
        "Avoid waterlogging (low sugar, root rot)."
    ],
    "Pearl Millet": [
        "Prefer rainfed; very drought-tolerant.",
        "If needed, irrigate at flowering.",
        "Water stress at grain fill reduces yield."
    ],
    "Cluster Bean": [
        "Prefer rainfed; suited for arid conditions.",
        "1–2 light irrigations during long dry spells.",
        "Over-watering reduces pod formation."
    ],
    "Sesame": [
        "Prefer rainfed; high drought tolerance.",
        "One irrigation at flowering if soil is dry.",
        "Highly sensitive to waterlogging (root rot)."
    ],
    "Green Gram": [
        "Prefer rainfed; irrigation increases yield.",
        "One irrigation at flowering is critical.",
        "Avoid heavy irrigation (root diseases)."
    ],
    "Millets": [
        "Generally rainfed; very drought-tolerant.",
        "Irrigate only at critical stages if very dry.",
        "Avoid excess water."
    ],
    "Sorghum": [
        "Drought-hardy; mostly rainfed.",
        "Irrigate at booting/flowering if dry.",
        "Avoid waterlogging."
    ]
}

# -------------------------
# STEP 1: Load
# -------------------------
df = pd.read_csv(CSV_PATH)
print("[Step 1] Dataset loaded:", df.shape)
print("Columns:", list(df.columns))

# -------------------------
# STEP 2: Parse labels (multi-label from Suitable_Crops)
# -------------------------
def parse_crops(cell):
    if pd.isna(cell):
        return []
    s = str(cell).strip()
    try:
        # handle strings like ["Paddy","Maize"]
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list):
            return [str(x).strip().strip('"').strip("'") for x in parsed]
    except Exception:
        pass
    # fallback: comma-separated
    return [c.strip().strip('"').strip("'") for c in s.split(",") if c.strip()]

df["Suitable_Crops_List"] = df["Suitable_Crops"].apply(parse_crops)
all_crops = sorted({c for lst in df["Suitable_Crops_List"] for c in lst})
crop_to_idx = {c:i for i,c in enumerate(all_crops)}
idx_to_crop = {i:c for c,i in crop_to_idx.items()}
print(f"[Step 2] Parsed multi-labels. #Unique crops: {len(all_crops)} → {all_crops}")

def multilabel_to_matrix(lists, mapping):
    m = len(lists)
    k = len(mapping)
    Y = np.zeros((m, k), dtype=np.float32)
    for i, lst in enumerate(lists):
        for c in lst:
            if c in mapping:
                Y[i, mapping[c]] = 1.0
    return Y

Y = multilabel_to_matrix(df["Suitable_Crops_List"], crop_to_idx)
print("[Step 2] Built label matrix:", Y.shape, "(samples, num_crops)")

# -------------------------
# STEP 3: Build features
# -------------------------
# columns to drop (labels + plans + text extras not used as raw features)
drop_cols = [
    "Suitable_Crops", "Suitable_Crops_List", "Fertilizer_Plan", "Irrigation_Plan",
    "Primary_Crop", "Secondary_Crop"
]
Xdf = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").copy()

# Identify categorical vs numeric
cat_cols = Xdf.select_dtypes(include=["object"]).columns.tolist()
num_cols = Xdf.select_dtypes(include=[np.number]).columns.tolist()

# One-hot encode categoricals
Xdf = pd.get_dummies(Xdf, columns=cat_cols, drop_first=False)

# Fill missing numerics (if any) with column means
Xdf = Xdf.fillna(Xdf.mean(numeric_only=True))

# Keep feature names
feature_names = list(Xdf.columns)
X = Xdf.values.astype(np.float32)

print(f"[Step 3] Features prepared: X={X.shape}, with {len(feature_names)} columns.")

# -------------------------
# STEP 4: Scale numerics (zero-mean, unit-variance)
# -------------------------
# Compute stats per feature (works for all since one-hots are 0/1—scaling is fine)
f_mean = X.mean(axis=0, keepdims=True)
f_std = X.std(axis=0, keepdims=True) + 1e-8
X = (X - f_mean) / f_std
print("[Step 4] Feature scaling complete.")

# -------------------------
# STEP 5: Train/Test split
# -------------------------
m = X.shape[0]
indices = np.arange(m)
np.random.shuffle(indices)
split = int(0.8 * m)
train_idx, test_idx = indices[:split], indices[split:]
X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]
print(f"[Step 5] Train/Test split: train={X_train.shape}, test={X_test.shape}")

# -------------------------
# STEP 6: NumPy MLP (from scratch)
# -------------------------
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def bce_loss(y_true, y_pred, eps=1e-8):
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

class NumpyMLP:
    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.01, l2=1e-4, clip=1.0):
        self.lr = lr
        self.l2 = l2
        self.clip = clip
        # Xavier init
        self.W1 = np.random.randn(input_dim, hidden_dim) / np.sqrt(input_dim)
        self.b1 = np.zeros((1, hidden_dim))
        self.W2 = np.random.randn(hidden_dim, output_dim) / np.sqrt(hidden_dim)
        self.b2 = np.zeros((1, output_dim))

    def forward(self, X):
        self.Z1 = X @ self.W1 + self.b1
        self.A1 = np.tanh(self.Z1)
        self.Z2 = self.A1 @ self.W2 + self.b2
        self.A2 = sigmoid(self.Z2)  # multi-label probs
        return self.A2

    def backward(self, X, y_true, y_pred):
        m = X.shape[0]
        # dL/dZ2 for BCE with sigmoid
        dZ2 = (y_pred - y_true) / m
        dW2 = self.A1.T @ dZ2 + self.l2 * self.W2
        db2 = np.sum(dZ2, axis=0, keepdims=True)

        dA1 = dZ2 @ self.W2.T
        dZ1 = dA1 * (1 - np.tanh(self.Z1) ** 2)
        dW1 = X.T @ dZ1 + self.l2 * self.W1
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        # Gradient clipping
        for g in [dW1, db1, dW2, db2]:
            np.clip(g, -self.clip, self.clip, out=g)

        # Update
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1

    def fit(self, X, y, epochs=80, batch_size=256, verbose=True):
        n = X.shape[0]
        for epoch in range(epochs):
            # shuffle
            idx = np.random.permutation(n)
            X_shuf, y_shuf = X[idx], y[idx]
            # mini-batch
            for start in range(0, n, batch_size):
                end = start + batch_size
                xb = X_shuf[start:end]
                yb = y_shuf[start:end]
                probs = self.forward(xb)
                self.backward(xb, yb, probs)
            # report
            if verbose and (epoch % 10 == 0 or epoch == epochs - 1):
                p = self.forward(X)
                loss = bce_loss(y, p)
                print(f"[Step 6] Epoch {epoch:3d} | BCE: {loss:.4f}")

    def predict_proba(self, X):
        return self.forward(X)

print("[Step 6] Model defined.")

# -------------------------
# STEP 7: Train
# -------------------------
input_dim  = X_train.shape[1]
hidden_dim = 64
output_dim = Y_train.shape[1]

model = NumpyMLP(input_dim, hidden_dim, output_dim, lr=0.01, l2=1e-4, clip=1.0)
print("[Step 7] Training started...")
model.fit(X_train, Y_train, epochs=80, batch_size=256, verbose=True)
print("[Step 7] Training completed.")

# -------------------------
# STEP 8: Evaluate
# -------------------------
def metrics(y_true, y_prob, thresh=0.5):
    y_pred = (y_prob >= thresh).astype(np.float32)
    # Subset accuracy (exact match)
    subset_acc = np.mean(np.all(y_true == y_pred, axis=1))
    # Micro precision/recall/F1
    tp = (y_true * y_pred).sum()
    fp = ((1 - y_true) * y_pred).sum()
    fn = (y_true * (1 - y_pred)).sum()
    prec = tp / (tp + fp + 1e-8)
    rec  = tp / (tp + fn + 1e-8)
    f1   = 2 * prec * rec / (prec + rec + 1e-8)
    return subset_acc, prec, rec, f1

probs_test = model.predict_proba(X_test)
subset_acc, prec, rec, f1 = metrics(Y_test, probs_test, thresh=0.5)
print(f"[Step 8] Subset Accuracy: {subset_acc*100:.2f}% | Micro-P: {prec:.3f}  R: {rec:.3f}  F1: {f1:.3f}")

# Top-3 hit rate: whether any true label is in top-3 predictions
def topk_hit_rate(y_true, y_prob, k=3):
    topk = np.argsort(-y_prob, axis=1)[:, :k]
    hits = 0
    for i in range(y_true.shape[0]):
        true_labels = np.where(y_true[i] == 1)[0]
        if len(set(true_labels).intersection(set(topk[i]))) > 0:
            hits += 1
    return hits / y_true.shape[0]

hit3 = topk_hit_rate(Y_test, probs_test, k=3)
print(f"[Step 8] Top-3 Hit Rate: {hit3*100:.2f}%")

# -------------------------
# STEP 9: Test on 10 samples
# -------------------------
sample_idx = np.random.choice(X_test.shape[0], min(10, X_test.shape[0]), replace=False)
print("\n[Step 9] 10-sample predictions (Top-3):")
for i, idx in enumerate(sample_idx, 1):
    p = probs_test[idx]
    top3 = np.argsort(-p)[:3]
    recs = [(idx_to_crop[j], float(p[j])) for j in top3]
    true = [idx_to_crop[j] for j in np.where(Y_test[idx]==1)[0]]
    print(f"  #{i:02d} True={true} | Pred={[(c, round(s,3)) for c,s in recs]}")

# -------------------------
# STEP 10: Save model
# -------------------------
np.savez(
    MODEL_PATH,
    W1=model.W1, b1=model.b1, W2=model.W2, b2=model.b2,
    f_mean=f_mean, f_std=f_std,
    feature_names=np.array(feature_names, dtype=object),
    classes=np.array(all_crops, dtype=object),
    random_seed=np.array([RANDOM_SEED])
)
print(f"\n[Step 10] Model saved → {MODEL_PATH}")

# -------------------------
# STEP 11: Deployment helpers
# -------------------------
# Build district-season averages for imputing when soil test is missing
use_cols = ["Soil_pH","Organic_Carbon_pct","Soil_N_kg_ha","Soil_P_kg_ha","Soil_K_kg_ha",
            "Avg_Temp_C","Seasonal_Rainfall_mm","Avg_Humidity_pct"]
for c in use_cols:
    if c not in df.columns:
        df[c] = np.nan

group_means = df.groupby(["District","Season"])[use_cols].mean().reset_index()
print("[Step 11] Built District-Season imputation table.")

# One-hot templates for categorical fields from training
# (We will rebuild a single-row DataFrame and align columns to training `feature_names`.)
def build_feature_row(district, mandal, season, soil_type, water_source,
                      prev_crop, year,
                      soil_vals=None):
    """
    soil_vals: dict or None
        Keys can be any of use_cols above. Missing keys will be imputed from group mean.
    """
    row = {
        "Year": year,
        "Soil_pH": np.nan,
        "Organic_Carbon_pct": np.nan,
        "Soil_N_kg_ha": np.nan,
        "Soil_P_kg_ha": np.nan,
        "Soil_K_kg_ha": np.nan,
        "Avg_Temp_C": np.nan,
        "Seasonal_Rainfall_mm": np.nan,
        "Avg_Humidity_pct": np.nan,
        "District": district,
        "Mandal": mandal,
        "Season": season,
        "Soil_Type": soil_type,
        "Water_Source": water_source,
        "Previous_Crop": prev_crop
    }
    # fill soil vals if provided
    if soil_vals:
        for k,v in soil_vals.items():
            if k in row:
                row[k] = v

    # Impute missing numeric using District-Season means
    mask = (group_means["District"] == district) & (group_means["Season"] == season)
    if mask.any():
        g = group_means.loc[mask].iloc[0]
        for k in use_cols:
            if pd.isna(row.get(k, np.nan)):
                row[k] = g.get(k, np.nan)
    # Fallback: global mean
    for k in use_cols:
        if pd.isna(row.get(k, np.nan)):
            row[k] = float(df[k].mean())

    # Build DataFrame
    row_df = pd.DataFrame([row])

    # One-hot like training
    row_df = pd.get_dummies(row_df, columns=["District","Mandal","Season","Soil_Type","Water_Source","Previous_Crop"], drop_first=False)

    # Align to training columns (missing cols -> 0)
    for col in feature_names:
        if col not in row_df.columns:
            row_df[col] = 0.0
    # Extra columns in row_df but not in training are dropped
    row_df = row_df[feature_names]

    # Scale
    X_row = row_df.values.astype(np.float32)
    X_row = (X_row - f_mean) / f_std
    return X_row

def load_model(path=MODEL_PATH):
    data = np.load(path, allow_pickle=True)
    mdl = {
        "W1": data["W1"], "b1": data["b1"],
        "W2": data["W2"], "b2": data["b2"],
        "f_mean": data["f_mean"], "f_std": data["f_std"],
        "feature_names": list(data["feature_names"]),
        "classes": list(data["classes"])
    }
    return mdl

def predict_with_loaded(X_row, mdl, top_n=3):
    # forward pass
    Z1 = X_row @ mdl["W1"] + mdl["b1"]
    A1 = np.tanh(Z1)
    Z2 = A1 @ mdl["W2"] + mdl["b2"]
    probs = 1.0/(1.0 + np.exp(-Z2))
    p = probs.ravel()
    top_idx = np.argsort(-p)[:top_n]
    return [(mdl["classes"][i], float(p[i])) for i in top_idx]

print("[Step 11] Deployment helpers ready.")

# -------------------------
# STEP 12: Quick deployment demo (no soil test → impute)
# -------------------------
mdl = load_model(MODEL_PATH)
print("[Step 12] Model reloaded for deployment.")

# Example usage:
example_X = build_feature_row(
    district="Srikakulam",
    mandal="Srikakulam_Mandal_1",
    season="Kharif",
    soil_type="Mixed",
    water_source="Tank",
    prev_crop="Paddy",
    year=2019,
    soil_vals=None  # None => no soil test provided, will impute by (District, Season)
)
recs = predict_with_loaded(example_X, mdl, top_n=3)
print("[Step 12] Prediction (Top-3):", [(c, round(s,3)) for c,s in recs])

# Attach irrigation tips
print("\n[Deployment Output]")
for crop, conf in recs:
    tips = IRRIGATION_TIPS.get(crop, ["Follow crop-specific recommended irrigation schedule."])
    print(f"- {crop} (confidence {conf:.2f})")
    for t in tips[:3]:
        print(f"    • {t}")

print("\n✅ ALL DONE: preprocessing → training → evaluation → saving → reload & deploy demo.")


[Step 0] Imports ready.
[Step 1] Dataset loaded: (18240, 21)
Columns: ['Year', 'District', 'Mandal', 'Season', 'Soil_Type', 'Soil_pH', 'Organic_Carbon_pct', 'Soil_N_kg_ha', 'Soil_P_kg_ha', 'Soil_K_kg_ha', 'Avg_Temp_C', 'Seasonal_Rainfall_mm', 'Avg_Humidity_pct', 'Water_Source', 'Previous_Crop', 'Primary_Crop', 'Secondary_Crop', 'Suitable_Crops', 'Fertilizer_Plan', 'Irrigation_Plan', 'Market_Price_Index']
[Step 2] Parsed multi-labels. #Unique crops: 14 → ['Bengal Gram', 'Chillies', 'Cotton', 'Green Gram', 'Groundnut', 'Maize', 'Millets', 'Paddy', 'Pearl Millet', 'Sesame', 'Sorghum', 'Sunflower', 'Vegetables', 'Watermelon']
[Step 2] Built label matrix: (18240, 14) (samples, num_crops)
[Step 3] Features prepared: X=(18240, 670), with 670 columns.
[Step 4] Feature scaling complete.
[Step 5] Train/Test split: train=(14592, 670), test=(3648, 670)
[Step 6] Model defined.
[Step 7] Training started...
[Step 6] Epoch   0 | BCE: nan
[Step 6] Epoch  10 | BCE: nan
[Step 6] Epoch  20 | BCE: nan
[Ste

  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0


[Step 12] Prediction (Top-3): [('Bengal Gram', nan), ('Chillies', nan), ('Cotton', nan)]

[Deployment Output]
- Bengal Gram (confidence nan)
    • Minimal irrigation; drought-tolerant.
    • One irrigation at flowering if very dry.
    • Avoid heavy irrigation to prevent excess foliage.
- Chillies (confidence nan)
    • Follow crop-specific recommended irrigation schedule.
- Cotton (confidence nan)
    • Irrigate ~every 15 days; adjust by weather.
    • Critical: squaring, flowering, boll formation.
    • Avoid waterlogging (boll rot).

✅ ALL DONE: preprocessing → training → evaluation → saving → reload & deploy demo.


  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0
  row_df[col] = 0.0


In [1]:
# =========================
# Your-Own Crop Recommender (No sklearn, No torch)
# End-to-end: preprocessing -> training -> evaluation -> save -> deploy
# =========================
import json, ast, math, os, random, pickle
from collections import defaultdict, Counter
from statistics import mean
import numpy as np
import pandas as pd

np.random.seed(42)
random.seed(42)

CSV_PATH = "apcrop_dataset_realistic.csv"   # <-- your file

print("[Step 0] Imports ready.")

# ------------------------------------------------------------
# small helpers
# ------------------------------------------------------------
def try_parse_list(x):
    """Parse '["Paddy","Groundnut"]' safely into list[str]."""
    if pd.isna(x): return []
    if isinstance(x, list): return [str(i).strip() for i in x]
    s = str(x).strip()
    if not s: return []
    try:
        v = ast.literal_eval(s)
        if isinstance(v, list): 
            return [str(i).strip() for i in v]
    except Exception:
        pass
    # fallback: split by comma
    s = s.replace("[","").replace("]","").replace('"','').replace("'","")
    return [t.strip() for t in s.split(",") if t.strip()]

def try_parse_json(x):
    if pd.isna(x): return {}
    s = str(x).strip()
    if not s: return {}
    try:
        return json.loads(s)
    except Exception:
        try:
            return ast.literal_eval(s)
        except Exception:
            return {}

def nanmean_safe(series, fallback=None):
    v = pd.to_numeric(series, errors="coerce")
    m = v.mean()
    if pd.isna(m):
        return fallback
    return m

# ------------------------------------------------------------
# 1) LOAD
# ------------------------------------------------------------
df = pd.read_csv(CSV_PATH)
print(f"[Step 1] Dataset loaded: shape={df.shape}")
print("Columns:", list(df.columns))

# ------------------------------------------------------------
# 2) BASIC NORMALIZATION / PARSE TARGETS
# ------------------------------------------------------------
expected_cols = [
    "Year","District","Mandal","Season","Soil_Type",
    "Soil_pH","Organic_Carbon_pct","Soil_N_kg_ha","Soil_P_kg_ha","Soil_K_kg_ha",
    "Avg_Temp_C","Seasonal_Rainfall_mm","Avg_Humidity_pct",
    "Water_Source","Previous_Crop","Primary_Crop","Secondary_Crop",
    "Suitable_Crops","Fertilizer_Plan","Irrigation_Plan","Market_Price_Index"
]
for col in expected_cols:
    if col not in df.columns:
        df[col] = np.nan  # create missing columns

# ensure text columns are strings
for c in ["District","Mandal","Season","Soil_Type","Water_Source","Previous_Crop","Primary_Crop","Secondary_Crop"]:
    df[c] = df[c].astype(str).str.strip()

# parse multi-label and plans
df["Suitable_Crops_List"] = df["Suitable_Crops"].apply(try_parse_list)
df["FertJSON"] = df["Fertilizer_Plan"].apply(try_parse_json)
df["IrrJSON"]  = df["Irrigation_Plan"].apply(try_parse_json)

# collect all unique crop names visible in dataset
crop_set = set()
for col in ["Primary_Crop","Secondary_Crop"]:
    crop_set.update(df[col].dropna().astype(str).str.strip())
for L in df["Suitable_Crops_List"]:
    crop_set.update(L)
# clean empties
crop_set = {c for c in crop_set if c and c.lower()!="nan"}
crop_list = sorted(crop_set)
print(f"[Step 2] Crops detected: {len(crop_list)} -> {crop_list}")

# ------------------------------------------------------------
# 3) IMPUTATION VALUES (District+Season aware)
# ------------------------------------------------------------
num_cols = ["Soil_pH","Organic_Carbon_pct","Soil_N_kg_ha","Soil_P_kg_ha","Soil_K_kg_ha",
            "Avg_Temp_C","Seasonal_Rainfall_mm","Avg_Humidity_pct","Market_Price_Index"]

# seasonal defaults if whole column empty
season_defaults = {
    "Kharif": {"Avg_Temp_C": 29.0, "Seasonal_Rainfall_mm": 600.0, "Avg_Humidity_pct": 78.0},
    "Rabi":   {"Avg_Temp_C": 23.0, "Seasonal_Rainfall_mm": 200.0, "Avg_Humidity_pct": 65.0},
    "Zaid":   {"Avg_Temp_C": 31.0, "Seasonal_Rainfall_mm": 120.0, "Avg_Humidity_pct": 70.0},
}

# compute district-season means
group_means = df.groupby(["District","Season"])[num_cols].agg(lambda s: pd.to_numeric(s, errors="coerce").mean()).reset_index()

def impute_row(row):
    d, s = row["District"], row["Season"]
    # lookup district-season row
    g = group_means[(group_means["District"]==d) & (group_means["Season"]==s)]
    for col in num_cols:
        if pd.isna(row[col]) or str(row[col]).strip()=="":
            # district-season mean -> season default -> global mean
            val = None
            if not g.empty and not pd.isna(g.iloc[0][col]):
                val = g.iloc[0][col]
            if val is None or pd.isna(val):
                if s in season_defaults and col in season_defaults[s]:
                    val = season_defaults[s][col]
            if val is None or pd.isna(val):
                val = nanmean_safe(df[col], 0.0)
            row[col] = val
    return row

df = df.apply(impute_row, axis=1)
print("[Step 3] Missing numeric values imputed (district-season aware).")

# ------------------------------------------------------------
# 4) FEATURE ENGINEERING
#     - encode categoricals with a custom OneHot encoder
#     - keep numeric as float
# ------------------------------------------------------------
cat_cols = ["District","Mandal","Season","Soil_Type","Water_Source","Previous_Crop"]
target_col = "Primary_Crop"  # single-label target for training

class OneHotEncoderLite:
    def __init__(self):
        self.cat_maps = {}   # col -> {category: index}
        self.col_index_ranges = {}  # col -> (start,end)
        self.n_features_ = 0
    
    def fit(self, frame, cat_cols):
        start = 0
        for col in cat_cols:
            uniq = sorted({str(x) for x in frame[col].fillna("NA_VALUE").astype(str)})
            mapping = {u:i for i,u in enumerate(uniq)}
            self.cat_maps[col] = mapping
            end = start + len(mapping)
            self.col_index_ranges[col] = (start, end)
            start = end
        self.n_features_ = start
        return self
    
    def transform(self, frame):
        # build empty array: (n_samples, total_cat_dim)
        out = np.zeros((len(frame), self.n_features_), dtype=np.float32)
        for col, mapping in self.cat_maps.items():
            start, end = self.col_index_ranges[col]
            idxs = frame[col].fillna("NA_VALUE").astype(str).map(lambda v: mapping.get(v, None))
            for i, idx in enumerate(idxs):
                if idx is not None:
                    out[i, start+idx] = 1.0
        return out

# prepare numeric matrix
X_num = df[num_cols].astype(float).values.astype(np.float32)

# fit onehot
ohe = OneHotEncoderLite().fit(df, cat_cols)
X_cat = ohe.transform(df)

# final features = [numeric | onehot]
X = np.concatenate([X_num, X_cat], axis=1)

# target encode to integers
class LabelEncoderLite:
    def __init__(self):
        self.class_to_id = {}
        self.id_to_class = []
    def fit(self, y):
        uniq = sorted(set(y))
        self.class_to_id = {c:i for i,c in enumerate(uniq)}
        self.id_to_class = uniq
        return self
    def transform(self, y):
        return np.array([self.class_to_id[v] for v in y], dtype=np.int64)
    def inverse_transform(self, ids):
        return [self.id_to_class[i] for i in ids]

le_y = LabelEncoderLite().fit(df[target_col].astype(str))
y = le_y.transform(df[target_col].astype(str))

print(f"[Step 4] Features prepared: X={X.shape}, y={y.shape}, classes={len(le_y.id_to_class)}")

# ------------------------------------------------------------
# 5) MODEL (from scratch Hybrid Naive Bayes)
#    - Gaussian NB on numeric features
#    - Categorical NB on one-hot (simple Bernoulli with Laplace smoothing)
# ------------------------------------------------------------
class HybridNaiveBayes:
    def __init__(self, n_num, n_cat, alpha=1.0, var_smoothing=1e-9):
        self.n_num = n_num
        self.n_cat = n_cat
        self.alpha = alpha
        self.var_smoothing = var_smoothing
        # learned:
        self.class_priors = None                  # [K]
        self.num_means = None                     # [K, n_num]
        self.num_vars = None                      # [K, n_num]
        self.cat_feature_pos = None               # [K, n_cat] P(x=1|class)
    
    def fit(self, X, y):
        K = len(np.unique(y))
        n = X.shape[1]
        assert n == self.n_num + self.n_cat
        X_num = X[:, :self.n_num]
        X_cat = X[:, self.n_num:]
        self.class_priors = np.zeros(K, dtype=np.float64)
        self.num_means = np.zeros((K, self.n_num), dtype=np.float64)
        self.num_vars  = np.zeros((K, self.n_num), dtype=np.float64)
        self.cat_feature_pos = np.zeros((K, self.n_cat), dtype=np.float64)

        for k in range(K):
            mask = (y == k)
            Xk_num = X_num[mask]
            Xk_cat = X_cat[mask]

            # priors
            self.class_priors[k] = (np.sum(mask) + self.alpha) / (len(y) + K*self.alpha)

            # gaussian stats
            if Xk_num.shape[0] > 0:
                self.num_means[k] = np.mean(Xk_num, axis=0)
                # variance with smoothing to avoid zero
                self.num_vars[k]  = np.var(Xk_num, axis=0) + self.var_smoothing
            else:
                self.num_means[k] = 0.0
                self.num_vars[k]  = 1.0

            # categorical: Bernoulli likelihood with Laplace smoothing
            # count ones per feature in class
            pos = Xk_cat.sum(axis=0)  # how many ones
            total = Xk_cat.shape[0]
            self.cat_feature_pos[k] = (pos + self.alpha) / (total + 2*self.alpha) if total>0 else (0.5*np.ones(self.n_cat))
        return self

    def _log_gaussian_prob(self, X_num, k):
        # log N(x|mean,var) across dimensions (sum)
        mean_k = self.num_means[k]
        var_k  = self.num_vars[k]
        return -0.5*(np.log(2*np.pi*var_k).sum() + ((X_num - mean_k)**2/var_k).sum(axis=1))
    
    def _log_bernoulli_prob(self, X_cat, k):
        p = np.clip(self.cat_feature_pos[k], 1e-12, 1-1e-12)
        # X in {0,1}, log p^X (1-p)^(1-X) = X*log p + (1-X)*log(1-p)
        return (X_cat*np.log(p) + (1-X_cat)*np.log(1-p)).sum(axis=1)

    def predict_proba(self, X):
        X_num = X[:, :self.n_num]
        X_cat = X[:, self.n_num:]
        K = len(self.class_priors)
        log_post = []
        for k in range(K):
            lp = math.log(self.class_priors[k]) \
                 + self._log_gaussian_prob(X_num, k) \
                 + self._log_bernoulli_prob(X_cat, k)
            log_post.append(lp)
        log_post = np.vstack(log_post).T  # [N,K]
        # normalize to probabilities
        m = log_post.max(axis=1, keepdims=True)
        post = np.exp(log_post - m)
        post = post / post.sum(axis=1, keepdims=True)
        return post

    def predict(self, X):
        return self.predict_proba(X).argmax(axis=1)

# split (holdout) — we’ll also do 5-fold CV later
idx = np.arange(len(df))
np.random.shuffle(idx)
split = int(0.8*len(idx))
train_idx, test_idx = idx[:split], idx[split:]
X_train, y_train = X[train_idx], y[train_idx]
X_test,  y_test  = X[test_idx],  y[test_idx]

model = HybridNaiveBayes(n_num=X_num.shape[1], n_cat=X_cat.shape[1], alpha=1.0, var_smoothing=1e-6)
model.fit(X_train, y_train)
print("[Step 5] Model trained (Hybrid Naive Bayes from scratch) ✅")

# ------------------------------------------------------------
# 6) EVALUATION
# ------------------------------------------------------------
def metrics_basic(y_true, y_pred, proba=None, top_k=3):
    acc = (y_true == y_pred).mean()
    # macro precision/recall/f1
    K = len(set(y_true) | set(y_pred))
    precs, recs, f1s = [], [], []
    for k in range(K):
        tp = np.sum((y_true==k) & (y_pred==k))
        fp = np.sum((y_true!=k) & (y_pred==k))
        fn = np.sum((y_true==k) & (y_pred!=k))
        precision = tp / (tp+fp) if (tp+fp)>0 else 0.0
        recall    = tp / (tp+fn) if (tp+fn)>0 else 0.0
        f1 = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
        precs.append(precision); recs.append(recall); f1s.append(f1)
    macro_p, macro_r, macro_f1 = np.mean(precs), np.mean(recs), np.mean(f1s)

    topk = None
    if proba is not None:
        topk_preds = np.argsort(-proba, axis=1)[:, :top_k]
        hits = [(y_true[i] in topk_preds[i]) for i in range(len(y_true))]
        topk = np.mean(hits)
    return acc, macro_p, macro_r, macro_f1, topk

y_pred_test = model.predict(X_test)
proba_test  = model.predict_proba(X_test)
acc, mp, mr, mf1, top3 = metrics_basic(y_test, y_pred_test, proba_test, top_k=3)
print(f"[Step 6] Holdout 20% → Acc={acc:.3f} | Macro-F1={mf1:.3f} | Top-3 Hit={top3:.3f}")

# 5-fold CV for robustness
def kfold_eval(X, y, k=5):
    N = len(y)
    idx = np.arange(N)
    np.random.shuffle(idx)
    folds = np.array_split(idx, k)
    accs, f1s, top3s = [], [], []
    for i in range(k):
        test_i = folds[i]
        train_i = np.hstack([folds[j] for j in range(k) if j!=i])
        m = HybridNaiveBayes(n_num=X_num.shape[1], n_cat=X_cat.shape[1], alpha=1.0, var_smoothing=1e-6)
        m.fit(X[train_i], y[train_i])
        yp = m.predict(X[test_i])
        pp = m.predict_proba(X[test_i])
        a,_,_,f1,t3 = metrics_basic(y[test_i], yp, pp, top_k=3)
        accs.append(a); f1s.append(f1); top3s.append(t3)
    return np.mean(accs), np.mean(f1s), np.mean(top3s)

cv_acc, cv_f1, cv_top3 = kfold_eval(X, y, k=5)
print(f"[Step 6] 5-fold CV → Acc={cv_acc:.3f} | Macro-F1={cv_f1:.3f} | Top-3 Hit={cv_top3:.3f}")

# ------------------------------------------------------------
# 7) SAVE ARTIFACTS
# ------------------------------------------------------------
ART_DIR = "."
np.savez(os.path.join(ART_DIR,"croprecommender.npz"),
         class_priors=model.class_priors,
         num_means=model.num_means,
         num_vars=model.num_vars,
         cat_feature_pos=model.cat_feature_pos,
         n_num=model.n_num, n_cat=model.n_cat)

with open(os.path.join(ART_DIR,"encoders.pkl"), "wb") as f:
    pickle.dump({"ohe":ohe, "le_y":le_y, "num_cols":num_cols, "cat_cols":cat_cols}, f)

with open(os.path.join(ART_DIR,"feature_cols.json"), "w") as f:
    json.dump({"num_cols":num_cols, "cat_cols":cat_cols}, f, indent=2)

print("[Step 7] Saved → croprecommender.npz, encoders.pkl, feature_cols.json ✅")

# ------------------------------------------------------------
# 8) IRRIGATION SUGGESTIONS (your mapping)
# ------------------------------------------------------------
IRRIGATION_TIPS = {
    "Paddy": [
        "Daily flooding, maintain 5–10 cm standing water.",
        "Ensure water at tillering and flowering.",
        "Drain completely 7–10 days before harvest."
    ],
    "Maize": [
        "Irrigate weekly; adjust to rainfall & soil.",
        "Critical: tasseling, silking, grain filling.",
        "Avoid stress in reproductive phase."
    ],
    "Groundnut": [
        "Irrigate ~every 10 days.",
        "Critical: flowering & pegging stages.",
        "Keep moisture during pod development."
    ],
    "Wheat": [
        "Irrigate at CRI, jointing, flowering.",
        "CRI (≈21 DAS) is most critical.",
        "Avoid waterlogging."
    ],
    "Bengal Gram": [
        "Minimal irrigation; drought-tolerant.",
        "One light irrigation at flowering if very dry.",
        "Avoid heavy irrigation."
    ],
    "Sunflower": [
        "Irrigate ~every 12 days.",
        "Critical: bud, flowering, seed filling.",
        "Drip works very well."
    ],
    "Castor": [
        "Irrigate ~every 15 days.",
        "Provide at branching & spike initiation.",
        "Too much water → foliage, fewer seeds."
    ],
    "Bajra": [
        "Prefer rainfed; drought-tolerant.",
        "If needed, irrigate at flowering & grain filling.",
        "Avoid over-irrigation (lodging risk)."
    ],
    "Linseed": [
        "Light irrigation at branching & flowering.",
        "Never waterlog; highly susceptible.",
        "1–2 irrigations often enough."
    ],
    "Mustard": [
        "Irrigate at branching & pod filling.",
        "First irrigation 30–35 DAS.",
        "Avoid watering during flowering."
    ],
    "Watermelon": [
        "Irrigate ~every 7 days; keep moisture even.",
        "Avoid stress at flowering & fruit set.",
        "Reduce water near maturity for sweetness."
    ],
    "Muskmelon": [
        "Irrigate ~every 7 days, avoid waterlogging.",
        "Consistent water for fruit growth & quality.",
        "Drip reduces fungal risk."
    ],
    "Cowpea": [
        "Irrigate ~every 10 days.",
        "Critical: flowering & pod development.",
        "Drought tolerant but timely water boosts yield."
    ],
    "Cotton": [
        "Irrigate ~every 15 days; adjust by weather.",
        "Critical: squaring, flowering, boll formation.",
        "Avoid waterlogging (boll rot)."
    ],
    "Sugarcane": [
        "Frequent water in hot/dry months.",
        "Formative stage (to ~120 DAS) needs moisture.",
        "Reduce 1–2 months pre-harvest."
    ],
    "Barley": [
        "Light irrigations as needed.",
        "CRI is most crucial.",
        "Sensitive to waterlogging."
    ],
    "Lentil": [
        "Mostly rainfed.",
        "One light irrigation pre-flowering may help.",
        "Too much water → vegetative growth."
    ],
    "Soybean": [
        "Prefer rainfed/moderate drought tolerance.",
        "Supplement at pod filling if dry.",
        "Avoid irrigation during flowering."
    ],
    "Pea": [
        "Irrigate at flowering & pod filling.",
        "Initial irrigation helps germination.",
        "Avoid overwatering (root rot risk)."
    ],
    "Vegetables": [
        "Irrigate every 5–7 days depending on crop.",
        "Drip to root-zone conserves water.",
        "Consistency prevents cracking/bitterness."
    ],
    "Jute": [
        "Keep soil moist throughout.",
        "Frequent light irrigation in hot season.",
        "Ensure drainage to avoid root decay."
    ],
    "Oats": [
        "Irrigate ~every 12 days.",
        "Critical: tillering & flowering.",
        "Relatively drought-tolerant."
    ],
    "Cucumber": [
        "Irrigate ~every 7 days; uniform moisture.",
        "Consistent water at fruit set & growth.",
        "Low water → bitter fruits."
    ],
    "Sugar Beet": [
        "Irrigate ~every 10 days.",
        "Critical: canopy establishment, root bulking.",
        "Avoid waterlogging (low sugar, root rot)."
    ],
    "Pearl Millet": [
        "Prefer rainfed; very drought-tolerant.",
        "If needed, irrigate at flowering.",
        "Water stress at grain filling cuts yield."
    ],
    "Cluster Bean": [
        "Prefer rainfed; arid-suited.",
        "1–2 light irrigations in long dry spells.",
        "Overwatering reduces pod set."
    ],
    "Sesame": [
        "Prefer rainfed; drought-hardy.",
        "One irrigation at flowering if dry.",
        "Avoid waterlogging (root rot)."
    ],
    "Green Gram": [
        "Rainfed OK; one irrigation at flowering.",
        "Avoid heavy irrigation (root disease risk).",
        "Irrigation can boost yield modestly."
    ],
    "Millets": [
        "Mostly rainfed; high drought tolerance.",
        "Irrigate only at critical stages if very dry.",
        "Avoid excessive water."
    ],
    "Sorghum": [
        "Mostly rainfed; irrigate if prolonged dry.",
        "Critical: booting & flowering.",
        "Avoid waterlogging."
    ]
}

# ------------------------------------------------------------
# 9) DEPLOYMENT HELPERS
# ------------------------------------------------------------
def build_feature_row(input_dict, encoders, ohe, num_cols, cat_cols):
    """
    input_dict keys may include numeric and categorical fields.
    Missing numeric are imputed with district-season means (computed earlier) or season defaults.
    """
    # Start with a template row
    row = {}
    # numeric
    for c in num_cols:
        row[c] = input_dict.get(c, None)
    # categorical
    for c in cat_cols:
        row[c] = input_dict.get(c, "NA_VALUE")

    # impute numeric using same logic as training
    tmp = pd.DataFrame([row])
    # attach District & Season so imputer can work
    tmp["District"] = row.get("District", "NA_VALUE")
    tmp["Season"]   = row.get("Season",   "Kharif")

    # district-season means lookup
    g = group_means[(group_means["District"]==tmp.at[0,"District"]) & (group_means["Season"]==tmp.at[0,"Season"])]

    for col in num_cols:
        val = tmp.at[0, col]
        if val is None or (isinstance(val,str) and not val.strip()):
            # fill
            dv = None
            if not g.empty and not pd.isna(g.iloc[0][col]):
                dv = g.iloc[0][col]
            if (dv is None or pd.isna(dv)) and tmp.at[0,"Season"] in season_defaults and col in season_defaults[tmp.at[0,"Season"]]:
                dv = season_defaults[tmp.at[0,"Season"]][col]
            if dv is None or pd.isna(dv):
                dv = nanmean_safe(df[col], 0.0)
            tmp.at[0, col] = dv

    # numeric block
    x_num = tmp[num_cols].astype(float).values.astype(np.float32)
    # categorical block via ohe
    x_cat = ohe.transform(tmp[cat_cols])
    # final vector
    x = np.concatenate([x_num, x_cat], axis=1)
    return x

def predict_topN(input_dict, N=3):
    x = build_feature_row(input_dict, encoders=None, ohe=ohe, num_cols=num_cols, cat_cols=cat_cols)
    probs = model.predict_proba(x)[0]
    top_idx = np.argsort(-probs)[:N]
    crops = [le_y.id_to_class[i] for i in top_idx]
    confs = [float(probs[i]) for i in top_idx]
    tips  = {c: IRRIGATION_TIPS.get(c, ["Follow crop-specific schedule."]) for c in crops}
    return list(zip(crops, confs)), tips

# ------------------------------------------------------------
# 10) QUICK DEMO PREDICTION (with/without soil test)
#     “Soil test? No” → only district, season, water source are enough.
# ------------------------------------------------------------
demo_no_soil = {
    "District": "Srikakulam",
    "Season": "Kharif",
    "Water_Source": "Tank",     # choose among: Tank/Canal/Borewell/...
    # no soil fields given
}
demo_with_soil = {
    "District": "Srikakulam",
    "Season": "Rabi",
    "Water_Source": "Tank",
    "Soil_pH": 6.6,
    "Organic_Carbon_pct": 0.7,
    "Soil_N_kg_ha": 200,
    "Soil_P_kg_ha": 18,
    "Soil_K_kg_ha": 180,
    "Avg_Temp_C": 23.0,
    "Seasonal_Rainfall_mm": 650,
    "Avg_Humidity_pct": 68,
    "Market_Price_Index": 0.62
}

print("\n[Step 8] Demo: NO soil test input →")
preds, tips = predict_topN(demo_no_soil, N=3)
for c, p in preds:
    print(f"  - {c}: {p:.3f}")
    for t in tips[c][:2]:
        print("     •", t)

print("\n[Step 8] Demo: WITH soil test input →")
preds2, tips2 = predict_topN(demo_with_soil, N=3)
for c, p in preds2:
    print(f"  - {c}: {p:.3f}")
    for t in tips2[c][:2]:
        print("     •", t)

print("\n✅ DONE: training → evaluation → saved model → deploy helpers ready.")


[Step 0] Imports ready.
[Step 1] Dataset loaded: shape=(18240, 21)
Columns: ['Year', 'District', 'Mandal', 'Season', 'Soil_Type', 'Soil_pH', 'Organic_Carbon_pct', 'Soil_N_kg_ha', 'Soil_P_kg_ha', 'Soil_K_kg_ha', 'Avg_Temp_C', 'Seasonal_Rainfall_mm', 'Avg_Humidity_pct', 'Water_Source', 'Previous_Crop', 'Primary_Crop', 'Secondary_Crop', 'Suitable_Crops', 'Fertilizer_Plan', 'Irrigation_Plan', 'Market_Price_Index']
[Step 2] Crops detected: 14 -> ['Bengal Gram', 'Chillies', 'Cotton', 'Green Gram', 'Groundnut', 'Maize', 'Millets', 'Paddy', 'Pearl Millet', 'Sesame', 'Sorghum', 'Sunflower', 'Vegetables', 'Watermelon']
[Step 3] Missing numeric values imputed (district-season aware).
[Step 4] Features prepared: X=(18240, 669), y=(18240,), classes=8
[Step 5] Model trained (Hybrid Naive Bayes from scratch) ✅
[Step 6] Holdout 20% → Acc=1.000 | Macro-F1=1.000 | Top-3 Hit=1.000
[Step 6] 5-fold CV → Acc=1.000 | Macro-F1=1.000 | Top-3 Hit=1.000
[Step 7] Saved → croprecommender.npz, encoders.pkl, feature

In [2]:
import random

# Pick 10 random rows from dataset
sample_indices = random.sample(range(len(X_train)), 10)
sample_features = X_train[sample_indices]
sample_labels = y_train[sample_indices]

# Make predictions
predictions = model.predict(sample_features)

print("\n=== RANDOM SAMPLE TESTING ===")
for i in range(len(sample_indices)):
    print(f"Sample {i+1}:")
    print(f"  Features: {sample_features[i]}")
    print(f"  Actual Crop: {sample_labels[i]}")
    print(f"  Predicted Crop: {predictions[i]}")
    print("-" * 40)

# Accuracy on these 10 random samples
sample_accuracy = sum(predictions == sample_labels) / len(sample_labels)
print(f"Accuracy on random 10 samples: {sample_accuracy*100:.2f}% ✅")



=== RANDOM SAMPLE TESTING ===
Sample 1:
  Features: [  6.8   0.9 280.   25.  220.   31.  120.   70.    0.9   0.    0.    0.
   0.    0.    0.    1.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.

In [15]:
import json, ast, math, os, random, pickle
from collections import defaultdict, Counter
from statistics import mean
import numpy as np
import pandas as pd

np.random.seed(42)
random.seed(42)

CSV_PATH = "apcrop_dataset_realistic.csv" # <-- your file

print("[Step 0] Imports ready.")

# ------------------------------------------------------------
# small helpers
# ------------------------------------------------------------
def try_parse_list(x):
    """Parse '["Paddy","Groundnut"]' safely into list[str]."""
    if pd.isna(x): return []
    if isinstance(x, list): return [str(i).strip() for i in x]
    s = str(x).strip()
    if not s: return []
    try:
        v = ast.literal_eval(s)
        if isinstance(v, list): 
            return [str(i).strip() for i in v]
    except Exception:
        pass
    # fallback: split by comma
    s = s.replace("[","").replace("]","").replace('"','').replace("'","")
    return [t.strip() for t in s.split(",") if t.strip()]

def try_parse_json(x):
    if pd.isna(x): return {}
    s = str(x).strip()
    if not s: return {}
    try:
        return json.loads(s)
    except Exception:
        try:
            return ast.literal_eval(s)
        except Exception:
            return {}

def nanmean_safe(series, fallback=None):
    v = pd.to_numeric(series, errors="coerce")
    m = v.mean()
    if pd.isna(m):
        return fallback
    return m

# ------------------------------------------------------------
# 1) LOAD
# ------------------------------------------------------------
df = pd.read_csv(CSV_PATH)
print(f"[Step 1] Dataset loaded: shape={df.shape}")
print("Columns:", list(df.columns))

# ------------------------------------------------------------
# 2) BASIC NORMALIZATION / PARSE TARGETS
# ------------------------------------------------------------
expected_cols = [
    "Year","District","Mandal","Season","Soil_Type",
    "Soil_pH","Organic_Carbon_pct","Soil_N_kg_ha","Soil_P_kg_ha","Soil_K_kg_ha",
    "Avg_Temp_C","Seasonal_Rainfall_mm","Avg_Humidity_pct",
    "Water_Source","Previous_Crop","Primary_Crop","Secondary_Crop",
    "Suitable_Crops","Fertilizer_Plan","Irrigation_Plan","Market_Price_Index"
]
for col in expected_cols:
    if col not in df.columns:
        df[col] = np.nan # create missing columns

# ensure text columns are strings
for c in ["District","Mandal","Season","Soil_Type","Water_Source","Previous_Crop","Primary_Crop","Secondary_Crop"]:
    df[c] = df[c].astype(str).str.strip()

# parse multi-label and plans
df["Suitable_Crops_List"] = df["Suitable_Crops"].apply(try_parse_list)
df["FertJSON"] = df["Fertilizer_Plan"].apply(try_parse_json)
df["IrrJSON"]  = df["Irrigation_Plan"].apply(try_parse_json)

# collect all unique crop names visible in dataset
crop_set = set()
for col in ["Primary_Crop","Secondary_Crop"]:
    crop_set.update(df[col].dropna().astype(str).str.strip())
for L in df["Suitable_Crops_List"]:
    crop_set.update(L)
# clean empties
crop_set = {c for c in crop_set if c and c.lower()!="nan"}
crop_list = sorted(crop_set)
print(f"[Step 2] Crops detected: {len(crop_list)} -> {crop_list}")

# ------------------------------------------------------------
# 3) IMPUTATION VALUES (District+Season aware)
# ------------------------------------------------------------
num_cols = ["Soil_pH","Organic_Carbon_pct","Soil_N_kg_ha","Soil_P_kg_ha","Soil_K_kg_ha",
            "Avg_Temp_C","Seasonal_Rainfall_mm","Avg_Humidity_pct","Market_Price_Index"]

# seasonal defaults if whole column empty
season_defaults = {
    "Kharif": {"Avg_Temp_C": 29.0, "Seasonal_Rainfall_mm": 600.0, "Avg_Humidity_pct": 78.0},
    "Rabi":   {"Avg_Temp_C": 23.0, "Seasonal_Rainfall_mm": 200.0, "Avg_Humidity_pct": 65.0},
    "Zaid":   {"Avg_Temp_C": 31.0, "Seasonal_Rainfall_mm": 120.0, "Avg_Humidity_pct": 70.0},
}

# compute district-season means
group_means = df.groupby(["District","Season"])[num_cols].agg(lambda s: pd.to_numeric(s, errors="coerce").mean()).reset_index()

def impute_row(row):
    d, s = row["District"], row["Season"]
    # lookup district-season row
    g = group_means[(group_means["District"]==d) & (group_means["Season"]==s)]
    for col in num_cols:
        if pd.isna(row[col]) or str(row[col]).strip()=="":
            # district-season mean -> season default -> global mean
            val = None
            if not g.empty and not pd.isna(g.iloc[0][col]):
                val = g.iloc[0][col]
            if val is None or pd.isna(val):
                if s in season_defaults and col in season_defaults[s]:
                    val = season_defaults[s][col]
            if val is None or pd.isna(val):
                val = nanmean_safe(df[col], 0.0)
            row[col] = val
    return row

df = df.apply(impute_row, axis=1)
print("[Step 3] Missing numeric values imputed (district-season aware).")

# ------------------------------------------------------------
# 4) FEATURE ENGINEERING
#     - encode categoricals with a custom OneHot encoder
#     - scale numeric features with a custom StandardScaler
# ------------------------------------------------------------
cat_cols = ["District","Mandal","Season","Soil_Type","Water_Source","Previous_Crop"]
target_col = "Primary_Crop" # single-label target for training

# Custom One-Hot Encoder
class OneHotEncoderLite:
    def __init__(self):
        self.cat_maps = {}  # col -> {category: index}
        self.col_index_ranges = {}  # col -> (start,end)
        self.n_features_ = 0
    
    def fit(self, frame, cat_cols):
        start = 0
        for col in cat_cols:
            uniq = sorted({str(x) for x in frame[col].fillna("NA_VALUE").astype(str)})
            mapping = {u:i for i,u in enumerate(uniq)}
            self.cat_maps[col] = mapping
            end = start + len(mapping)
            self.col_index_ranges[col] = (start, end)
            start = end
        self.n_features_ = start
        return self
    
    def transform(self, frame):
        # build empty array: (n_samples, total_cat_dim)
        out = np.zeros((len(frame), self.n_features_), dtype=np.float32)
        for col, mapping in self.cat_maps.items():
            start, end = self.col_index_ranges[col]
            idxs = frame[col].fillna("NA_VALUE").astype(str).map(lambda v: mapping.get(v, None))
            for i, idx in enumerate(idxs):
                if idx is not None:
                    out[i, start+idx] = 1.0
        return out

# Custom Standard Scaler for numeric features
class StandardScalerLite:
    def __init__(self):
        self.mean_ = None
        self.scale_ = None
    
    def fit(self, X):
        X = X.astype(float)
        self.mean_ = np.mean(X, axis=0)
        self.scale_ = np.std(X, axis=0)
        # Avoid division by zero for constant features
        self.scale_[self.scale_ == 0] = 1.0
        return self
    
    def transform(self, X):
        X = X.astype(float)
        return (X - self.mean_) / self.scale_

# prepare numeric matrix
X_num_raw = df[num_cols].astype(float).values.astype(np.float32)

# fit and transform numeric data
scaler = StandardScalerLite().fit(X_num_raw)
X_num = scaler.transform(X_num_raw)

# fit and transform categorical data
ohe = OneHotEncoderLite().fit(df, cat_cols)
X_cat = ohe.transform(df)

# final features = [scaled numeric | onehot]
X = np.concatenate([X_num, X_cat], axis=1)

# target encode to integers
class LabelEncoderLite:
    def __init__(self):
        self.class_to_id = {}
        self.id_to_class = []
    def fit(self, y):
        uniq = sorted(set(y))
        self.class_to_id = {c:i for i,c in enumerate(uniq)}
        self.id_to_class = uniq
        return self
    def transform(self, y):
        return np.array([self.class_to_id[v] for v in y], dtype=np.int64)
    def inverse_transform(self, ids):
        return [self.id_to_class[i] for i in ids]

le_y = LabelEncoderLite().fit(df[target_col].astype(str))
y = le_y.transform(df[target_col].astype(str))

print(f"[Step 4] Features prepared: X={X.shape}, y={y.shape}, classes={len(le_y.id_to_class)}")

# ------------------------------------------------------------
# 5) MODEL (from scratch SimpleMLP)
# ------------------------------------------------------------
# Activation functions
def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0) * 1

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# Loss function
def cross_entropy_loss(y_true_onehot, y_pred_proba):
    m = y_true_onehot.shape[0]
    log_probs = -np.log(np.clip(y_pred_proba, 1e-12, 1.0))
    loss = np.sum(y_true_onehot * log_probs) / m
    return loss

class SimpleMLP:
    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.001, epochs=200, batch_size=64, log_interval=20):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.log_interval = log_interval
        
        # Initialize weights and biases
        self.W1 = np.random.randn(self.input_dim, self.hidden_dim) * 0.01
        self.b1 = np.zeros((1, self.hidden_dim))
        self.W2 = np.random.randn(self.hidden_dim, self.output_dim) * 0.01
        self.b2 = np.zeros((1, self.output_dim))

    def _to_one_hot(self, y):
        one_hot = np.zeros((len(y), self.output_dim))
        one_hot[np.arange(len(y)), y] = 1
        return one_hot

    def _forward(self, X):
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = relu(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = softmax(self.z2)
        return self.a2

    def fit(self, X_train, y_train):
        m = len(X_train)
        y_one_hot = self._to_one_hot(y_train)

        for epoch in range(1, self.epochs + 1):
            # Shuffle data for each epoch
            indices = np.random.permutation(m)
            X_shuffled = X_train[indices]
            y_one_hot_shuffled = y_one_hot[indices]
            
            for i in range(0, m, self.batch_size):
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_one_hot_shuffled[i:i + self.batch_size]
                
                if len(X_batch) == 0:
                    continue

                # Forward pass
                y_pred_proba = self._forward(X_batch)
                
                # Backpropagation
                # Output layer
                dz2 = y_pred_proba - y_batch
                dW2 = np.dot(self.a1.T, dz2) / len(X_batch)
                db2 = np.sum(dz2, axis=0, keepdims=True) / len(X_batch)

                # Hidden layer
                dz1 = np.dot(dz2, self.W2.T) * relu_derivative(self.z1)
                dW1 = np.dot(X_batch.T, dz1) / len(X_batch)
                db1 = np.sum(dz1, axis=0, keepdims=True) / len(X_batch)
                
                # Update weights
                self.W1 -= self.lr * dW1
                self.b1 -= self.lr * db1
                self.W2 -= self.lr * dW2
                self.b2 -= self.lr * db2

            if epoch % self.log_interval == 0:
                y_pred_proba_full = self._forward(X_train)
                loss = cross_entropy_loss(y_one_hot, y_pred_proba_full)
                print(f"Epoch {epoch}/{self.epochs} | Loss: {loss:.4f}")

    def predict_proba(self, X):
        return self._forward(X)

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

# split (holdout) — we’ll also do 5-fold CV later
idx = np.arange(len(df))
np.random.shuffle(idx)
split = int(0.8*len(idx))
train_idx, test_idx = idx[:split], idx[split:]
X_train, y_train = X[train_idx], y[train_idx]
X_test,  y_test  = X[test_idx],  y[test_idx]

model = SimpleMLP(input_dim=X.shape[1], hidden_dim=128, output_dim=len(le_y.id_to_class), lr=0.001, epochs=200, batch_size=64, log_interval=20)
model.fit(X_train, y_train)
print("[Step 5] Model trained (Simple MLP from scratch) ✅")

# ------------------------------------------------------------
# 6) EVALUATION
# ------------------------------------------------------------
def metrics_basic(y_true, y_pred, proba=None, top_k=3):
    acc = (y_true == y_pred).mean()
    # macro precision/recall/f1
    K = len(set(y_true) | set(y_pred))
    precs, recs, f1s = [], [], []
    for k in range(K):
        tp = np.sum((y_true==k) & (y_pred==k))
        fp = np.sum((y_true!=k) & (y_pred==k))
        fn = np.sum((y_true==k) & (y_pred!=k))
        precision = tp / (tp+fp) if (tp+fp)>0 else 0.0
        recall    = tp / (tp+fn) if (tp+fn)>0 else 0.0
        f1 = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0.0
        precs.append(precision); recs.append(recall); f1s.append(f1)
    macro_p, macro_r, macro_f1 = np.mean(precs), np.mean(recs), np.mean(f1s)

    topk = None
    if proba is not None:
        topk_preds = np.argsort(-proba, axis=1)[:, :top_k]
        hits = [(y_true[i] in topk_preds[i]) for i in range(len(y_true))]
        topk = np.mean(hits)
    return acc, macro_p, macro_r, macro_f1, topk

y_pred_test = model.predict(X_test)
proba_test  = model.predict_proba(X_test)
acc, mp, mr, mf1, top3 = metrics_basic(y_test, y_pred_test, proba_test, top_k=3)
print(f"[Step 6] Holdout 20% → Acc={acc:.3f} | Macro-F1={mf1:.3f} | Top-3 Hit={top3:.3f}")

# 5-fold CV for robustness
def kfold_eval(X, y, k=5):
    N = len(y)
    idx = np.arange(N)
    np.random.shuffle(idx)
    folds = np.array_split(idx, k)
    accs, f1s, top3s = [], [], []
    for i in range(k):
        test_i = folds[i]
        train_i = np.hstack([folds[j] for j in range(k) if j!=i])
        m = SimpleMLP(input_dim=X.shape[1], hidden_dim=128, output_dim=len(le_y.id_to_class), lr=0.001, epochs=50, batch_size=64, log_interval=10)
        m.fit(X[train_i], y[train_i])
        yp = m.predict(X[test_i])
        pp = m.predict_proba(X[test_i])
        a,_,_,f1,t3 = metrics_basic(y[test_i], yp, pp, top_k=3)
        accs.append(a); f1s.append(f1); top3s.append(t3)
    return np.mean(accs), np.mean(f1s), np.mean(top3s)

cv_acc, cv_f1, cv_top3 = kfold_eval(X, y, k=5)
print(f"[Step 6] 5-fold CV → Acc={cv_acc:.3f} | Macro-F1={cv_f1:.3f} | Top-3 Hit={cv_top3:.3f}")

# ------------------------------------------------------------
# 7) SAVE ARTIFACTS
# ------------------------------------------------------------
ART_DIR = "."
np.savez(os.path.join(ART_DIR,"croprecommender_mlp.npz"),
         W1=model.W1, b1=model.b1, W2=model.W2, b2=model.b2)

with open(os.path.join(ART_DIR,"encoders.pkl"), "wb") as f:
    pickle.dump({"ohe":ohe, "le_y":le_y, "num_cols":num_cols, "cat_cols":cat_cols, "scaler":scaler}, f)

with open(os.path.join(ART_DIR,"feature_cols.json"), "w") as f:
    json.dump({"num_cols":num_cols, "cat_cols":cat_cols}, f, indent=2)

print("[Step 7] Saved → croprecommender_mlp.npz, encoders.pkl, feature_cols.json ✅")

# ------------------------------------------------------------
# 8) IRRIGATION SUGGESTIONS (your mapping)
# ------------------------------------------------------------
IRRIGATION_TIPS = {
    "Paddy": [
        "Daily flooding, maintain 5–10 cm standing water.",
        "Ensure water at tillering and flowering.",
        "Drain completely 7–10 days before harvest."
    ],
    "Maize": [
        "Irrigate weekly; adjust to rainfall & soil.",
        "Critical: tasseling, silking, grain filling.",
        "Avoid stress in reproductive phase."
    ],
    "Groundnut": [
        "Irrigate ~every 10 days.",
        "Critical: flowering & pegging stages.",
        "Keep moisture during pod development."
    ],
    "Wheat": [
        "Irrigate at CRI, jointing, flowering.",
        "CRI (≈21 DAS) is most critical.",
        "Avoid waterlogging."
    ],
    "Bengal Gram": [
        "Minimal irrigation; drought-tolerant.",
        "One light irrigation at flowering if very dry.",
        "Avoid heavy irrigation."
    ],
    "Sunflower": [
        "Irrigate ~every 12 days.",
        "Critical: bud, flowering, seed filling.",
        "Drip works very well."
    ],
    "Castor": [
        "Irrigate ~every 15 days.",
        "Provide at branching & spike initiation.",
        "Too much water → foliage, fewer seeds."
    ],
    "Bajra": [
        "Prefer rainfed; drought-tolerant.",
        "If needed, irrigate at flowering & grain filling.",
        "Avoid over-irrigation (lodging risk)."
    ],
    "Linseed": [
        "Light irrigation at branching & flowering.",
        "Never waterlog; highly susceptible.",
        "1–2 irrigations often enough."
    ],
    "Mustard": [
        "Irrigate at branching & pod filling.",
        "First irrigation 30–35 DAS.",
        "Avoid watering during flowering."
    ],
    "Watermelon": [
        "Irrigate ~every 7 days; keep moisture even.",
        "Avoid stress at flowering & fruit set.",
        "Reduce water near maturity for sweetness."
    ],
    "Muskmelon": [
        "Irrigate ~every 7 days, avoid waterlogging.",
        "Consistent water for fruit growth & quality.",
        "Drip reduces fungal risk."
    ],
    "Cowpea": [
        "Irrigate ~every 10 days.",
        "Critical: flowering & pod development.",
        "Drought tolerant but timely water boosts yield."
    ],
    "Cotton": [
        "Irrigate ~every 15 days; adjust by weather.",
        "Critical: squaring, flowering, boll formation.",
        "Avoid waterlogging (boll rot)."
    ],
    "Sugarcane": [
        "Frequent water in hot/dry months.",
        "Formative stage (to ~120 DAS) needs moisture.",
        "Reduce 1–2 months pre-harvest."
    ],
    "Barley": [
        "Light irrigations as needed.",
        "CRI is most crucial.",
        "Sensitive to waterlogging."
    ],
    "Lentil": [
        "Mostly rainfed.",
        "One light irrigation pre-flowering may help.",
        "Too much water → vegetative growth."
    ],
    "Soybean": [
        "Prefer rainfed/moderate drought tolerance.",
        "Supplement at pod filling if dry.",
        "Avoid irrigation during flowering."
    ],
    "Pea": [
        "Irrigate at flowering & pod filling.",
        "Initial irrigation helps germination.",
        "Avoid overwatering (root rot risk)."
    ],
    "Vegetables": [
        "Irrigate every 5–7 days depending on crop.",
        "Drip to root-zone conserves water.",
        "Consistency prevents cracking/bitterness."
    ],
    "Jute": [
        "Keep soil moist throughout.",
        "Frequent light irrigation in hot season.",
        "Ensure drainage to avoid root decay."
    ],
    "Oats": [
        "Irrigate ~every 12 days.",
        "Critical: tillering & flowering.",
        "Relatively drought-tolerant."
    ],
    "Cucumber": [
        "Irrigate ~every 7 days; uniform moisture.",
        "Consistent water at fruit set & growth.",
        "Low water → bitter fruits."
    ],
    "Sugar Beet": [
        "Irrigate ~every 10 days.",
        "Critical: canopy establishment, root bulking.",
        "Avoid waterlogging (low sugar, root rot)."
    ],
    "Pearl Millet": [
        "Prefer rainfed; very drought-tolerant.",
        "If needed, irrigate at flowering.",
        "Water stress at grain filling cuts yield."
    ],
    "Cluster Bean": [
        "Prefer rainfed; arid-suited.",
        "1–2 light irrigations in long dry spells.",
        "Overwatering reduces pod set."
    ],
    "Sesame": [
        "Prefer rainfed; drought-hardy.",
        "One irrigation at flowering if dry.",
        "Avoid waterlogging (root rot)."
    ],
    "Green Gram": [
        "Rainfed OK; one irrigation at flowering.",
        "Avoid heavy irrigation (root disease risk).",
        "Irrigation can boost yield modestly."
    ],
    "Millets": [
        "Mostly rainfed; high drought tolerance.",
        "Irrigate only at critical stages if very dry.",
        "Avoid excessive water."
    ],
    "Sorghum": [
        "Mostly rainfed; irrigate if prolonged dry.",
        "Critical: booting & flowering.",
        "Avoid waterlogging."
    ]
}

# ------------------------------------------------------------
# 9) DEPLOYMENT HELPERS
# ------------------------------------------------------------
def build_feature_row(input_dict, ohe, num_cols, cat_cols, scaler):
    """
    input_dict keys may include numeric and categorical fields.
    Missing numeric are imputed with district-season means (computed earlier) or season defaults.
    """
    # Start with a template row
    row = {}
    # numeric
    for c in num_cols:
        row[c] = input_dict.get(c, None)
    # categorical
    for c in cat_cols:
        row[c] = input_dict.get(c, "NA_VALUE")

    # impute numeric using same logic as training
    tmp = pd.DataFrame([row])
    # attach District & Season so imputer can work
    tmp["District"] = row.get("District", "NA_VALUE")
    tmp["Season"]   = row.get("Season",   "Kharif")

    # district-season means lookup
    g = group_means[(group_means["District"]==tmp.at[0,"District"]) & (group_means["Season"]==tmp.at[0,"Season"])]

    for col in num_cols:
        val = tmp.at[0, col]
        if val is None or (isinstance(val,str) and not val.strip()):
            # fill
            dv = None
            if not g.empty and not pd.isna(g.iloc[0][col]):
                dv = g.iloc[0][col]
            if (dv is None or pd.isna(dv)) and tmp.at[0,"Season"] in season_defaults and col in season_defaults[tmp.at[0,"Season"]]:
                dv = season_defaults[tmp.at[0,"Season"]][col]
            if dv is None or pd.isna(dv):
                dv = nanmean_safe(df[col], 0.0)
            tmp.at[0, col] = dv

    # numeric block
    x_num_raw = tmp[num_cols].astype(float).values.astype(np.float32)
    x_num = scaler.transform(x_num_raw)
    # categorical block via ohe
    x_cat = ohe.transform(tmp[cat_cols])
    # final vector
    x = np.concatenate([x_num, x_cat], axis=1)
    return x

def predict_topN(input_dict, N=3):
    x = build_feature_row(input_dict, ohe=ohe, num_cols=num_cols, cat_cols=cat_cols, scaler=scaler)
    probs = model.predict_proba(x)[0]
    top_idx = np.argsort(-probs)[:N]
    crops = [le_y.id_to_class[i] for i in top_idx]
    confs = [float(probs[i]) for i in top_idx]
    tips  = {c: IRRIGATION_TIPS.get(c, ["Follow crop-specific schedule."]) for c in crops}
    return list(zip(crops, confs)), tips

# ------------------------------------------------------------
# 10) QUICK DEMO PREDICTION (with/without soil test)
#     “Soil test? No” → only district, season, water source are enough.
# ------------------------------------------------------------
demo_no_soil = {
    "District": "Srikakulam",
    "Season": "Kharif",
    "Water_Source": "Tank",     # choose among: Tank/Canal/Borewell/...
    # no soil fields given
}
demo_with_soil = {
    "District": "Srikakulam",
    "Season": "Rabi",
    "Water_Source": "Tank",
    "Soil_pH": 6.6,
    "Organic_Carbon_pct": 0.7,
    "Soil_N_kg_ha": 200,
    "Soil_P_kg_ha": 18,
    "Soil_K_kg_ha": 180,
    "Avg_Temp_C": 23.0,
    "Seasonal_Rainfall_mm": 650,
    "Avg_Humidity_pct": 68,
    "Market_Price_Index": 0.62
}

print("\n[Step 10] Demo: NO soil test input →")
preds, tips = predict_topN(demo_no_soil, N=3)
for c, p in preds:
    print(f"  - {c}: {p:.3f}")
    for t in tips[c][:2]:
        print("    •", t)

print("\n[Step 10] Demo: WITH soil test input →")
preds2, tips2 = predict_topN(demo_with_soil, N=3)
for c, p in preds2:
    print(f"  - {c}: {p:.3f}")
    for t in tips2[c][:2]:
        print("    •", t)

# ------------------------------------------------------------
# 11) TEST ON RANDOM SAMPLES
# ------------------------------------------------------------
def test_random_samples(model, X_test, y_test, le_y, num_samples=10):
    print(f"\n[Step 11] Testing on {num_samples} random samples from the test set:")
    
    # Get random indices from the test set
    test_indices = np.random.choice(len(y_test), num_samples, replace=False)
    X_sample = X_test[test_indices]
    y_true_sample = y_test[test_indices]
    
    # Predict on the random samples
    y_pred_sample_idx = model.predict(X_sample)
    
    # Inverse transform to get crop names
    y_true_crops = le_y.inverse_transform(y_true_sample)
    y_pred_crops = le_y.inverse_transform(y_pred_sample_idx)
    
    correct_predictions = 0
    for i in range(num_samples):
        is_correct = (y_true_crops[i] == y_pred_crops[i])
        status = "✅ Correct" if is_correct else "❌ Incorrect"
        print(f"  Sample {i+1}: Predicted '{y_pred_crops[i]}', Actual '{y_true_crops[i]}' ({status})")
        if is_correct:
            correct_predictions += 1
            
    accuracy = correct_predictions / num_samples
    print(f"\nAccuracy for this random batch of {num_samples} samples: {accuracy:.2%}")

test_random_samples(model, X_test, y_test, le_y, num_samples=10)

print("\n✅ DONE: training → evaluation → saved model → deploy helpers ready.")


[Step 0] Imports ready.
[Step 1] Dataset loaded: shape=(18240, 21)
Columns: ['Year', 'District', 'Mandal', 'Season', 'Soil_Type', 'Soil_pH', 'Organic_Carbon_pct', 'Soil_N_kg_ha', 'Soil_P_kg_ha', 'Soil_K_kg_ha', 'Avg_Temp_C', 'Seasonal_Rainfall_mm', 'Avg_Humidity_pct', 'Water_Source', 'Previous_Crop', 'Primary_Crop', 'Secondary_Crop', 'Suitable_Crops', 'Fertilizer_Plan', 'Irrigation_Plan', 'Market_Price_Index']
[Step 2] Crops detected: 14 -> ['Bengal Gram', 'Chillies', 'Cotton', 'Green Gram', 'Groundnut', 'Maize', 'Millets', 'Paddy', 'Pearl Millet', 'Sesame', 'Sorghum', 'Sunflower', 'Vegetables', 'Watermelon']
[Step 3] Missing numeric values imputed (district-season aware).
[Step 4] Features prepared: X=(18240, 669), y=(18240,), classes=8
Epoch 20/200 | Loss: 1.4867
Epoch 40/200 | Loss: 0.6507
Epoch 60/200 | Loss: 0.3071
Epoch 80/200 | Loss: 0.1237
Epoch 100/200 | Loss: 0.0630
Epoch 120/200 | Loss: 0.0392
Epoch 140/200 | Loss: 0.0274
Epoch 160/200 | Loss: 0.0207
Epoch 180/200 | Loss: 0.

In [None]:
# train_model.py
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score, accuracy_score
import logging
import os
import random

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def load_and_preprocess_data(file_path):
    """Loads and preprocesses the dataset robustly."""
    logging.info("[Step 1] Loading dataset...")
    df = pd.read_csv(file_path)

    # Remove columns not needed
    exclude_cols = ['Year', 'Suitable_Crops', 'Fertilizer_Plan', 'Irrigation_Plan', 
                    'Market_Price_Index', 'Previous_Crop']
    df = df.drop(columns=exclude_cols, errors='ignore')

    # Check target column
    if 'Primary_Crop' not in df.columns:
        raise ValueError("❌ 'Primary_Crop' column is missing in the dataset!")

    X = df.drop(columns='Primary_Crop')
    y = df['Primary_Crop']

    # Numeric and categorical separation
    numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

    # Handle numeric missing values
    X_num = X[numerical_cols].copy()

    if not X_num.empty:
        num_cols_with_na = X_num.columns[X_num.isnull().any()].tolist()

        if num_cols_with_na:
            # Drop fully empty numeric columns
            fully_empty = [col for col in num_cols_with_na if X_num[col].isnull().all()]
            if fully_empty:
                logging.warning(f"Dropping fully empty numeric columns: {fully_empty}")
                X_num.drop(columns=fully_empty, inplace=True)
                num_cols_with_na = [c for c in num_cols_with_na if c not in fully_empty]

            if num_cols_with_na:
                logging.info(f"[Step 2] Imputing numeric columns {num_cols_with_na} using KNNImputer...")
                imputer = KNNImputer(n_neighbors=5)
                imputed_values = imputer.fit_transform(X_num[num_cols_with_na])
                for i, col in enumerate(num_cols_with_na):
                    X_num[col] = imputed_values[:, i]
        else:
            logging.info("[Step 2] No missing numeric values to impute.")
    else:
        logging.warning("No numeric columns found in dataset!")

    # Handle categorical missing values
    if categorical_cols:
        X_cat = X[categorical_cols].copy()
        for col in categorical_cols:
            X_cat[col] = X_cat[col].fillna("Unknown")
        logging.info("[Step 3] One-hot encoding categorical features...")
        X_cat_encoded = pd.get_dummies(X_cat, columns=categorical_cols, drop_first=True)
    else:
        X_cat_encoded = pd.DataFrame()
        logging.warning("No categorical columns found in dataset!")

    # Combine numeric + categorical
    X_processed = pd.concat([X_num, X_cat_encoded], axis=1)
    X_processed = X_processed.fillna(0)  # Final safety net for any stray NaNs

    logging.info(f"[Step 4] Final features: X={X_processed.shape}, y={y.shape}")
    return X_processed, y

def filter_and_label_data(X, y, min_samples=100):
    """Removes underrepresented crops and encodes labels."""
    logging.info("[Step 5] Filtering crops by minimum sample count...")
    crop_counts = y.value_counts()
    crops_to_keep = crop_counts[crop_counts >= min_samples].index

    X_filtered = X[y.isin(crops_to_keep)]
    y_filtered = y[y.isin(crops_to_keep)]

    le = LabelEncoder()
    y_encoded = le.fit_transform(y_filtered)

    logging.info(f"Crops kept: {list(crops_to_keep)}")
    logging.info(f"Filtered dataset shape: X={X_filtered.shape}, y={len(y_encoded)}, classes={len(le.classes_)}")
    return X_filtered, y_encoded, le.classes_, list(X_filtered.columns)

def train_model(X, y, num_classes, feature_cols):
    """Builds and trains the MLP model."""
    logging.info("[Step 6] Building MLP model...")
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X.shape[1],)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    logging.info("[Step 7] Training model...")
    model.fit(X, y, epochs=100, batch_size=32, verbose=1)
    return model

def evaluate_model(model, X, y, classes):
    """Evaluates model performance."""
    logging.info("[Step 8] Evaluating model...")
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(ss.split(X, y))

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    predictions = model.predict(X_test)
    y_pred = np.argmax(predictions, axis=1)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    top3_hits = sum(1 for i in range(len(y_test)) 
                    if y_test[i] in predictions[i].argsort()[-3:][::-1])
    top3_acc = top3_hits / len(y_test)

    logging.info(f"Accuracy: {acc:.3f} | Macro-F1: {f1:.3f} | Top-3 Accuracy: {top3_acc:.3f}")

def save_model(model, classes, feature_cols):
    """Saves model and metadata."""
    logging.info("[Step 9] Saving model...")
    model.save('croprecommender_mlp.h5')
    np.savez('croprecommender_mlp.npz', classes=classes, feature_cols=feature_cols)

if __name__ == '__main__':
    X_initial, y_initial = load_and_preprocess_data('apcrop_dataset_realistic.csv')
    X_filtered, y_encoded, classes, feature_cols = filter_and_label_data(X_initial, y_initial)
    model = train_model(X_filtered, y_encoded, len(classes), feature_cols)
    evaluate_model(model, X_filtered, y_encoded, classes)
    save_model(model, classes, feature_cols)


2025-08-13 20:20:40,653 - INFO - [Step 7] Training model...
Epoch 1/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.4036 - loss: 1.9554     
Epoch 2/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8442 - loss: 0.4235  
Epoch 3/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9619 - loss: 0.1219  
Epoch 4/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9832 - loss: 0.0597  
Epoch 5/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9917 - loss: 0.0348  
Epoch 6/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9855 - loss: 0.0464  
Epoch 7/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9925 - loss: 0.0284  
Epoch 8/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9911 - loss: 0.0306  
Epoch 9/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9929 - loss: 0.0231  
Epoch 10/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9925 - loss: 0.0246  
Epoch 11/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9931 - loss: 0.0207  
Epoch 12/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9923 - loss: 0.0240  
Epoch 13/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9895 - loss: 0.0310  
Epoch 14/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9892 - loss: 0.0328  
Epoch 15/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0266  
Epoch 16/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9885 - loss: 0.0342  
Epoch 17/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9889 - loss: 0.0338  
Epoch 18/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9865 - loss: 0.0396  
Epoch 19/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9880 - loss: 0.0346  
Epoch 20/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9867 - loss: 0.0404  
Epoch 21/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9878 - loss: 0.0375  
Epoch 22/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9871 - loss: 0.0389  
Epoch 23/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9856 - loss: 0.0413  
Epoch 24/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9842 - loss: 0.0464  
Epoch 25/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9873 - loss: 0.0365  
Epoch 26/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9862 - loss: 0.0435      
Epoch 27/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9856 - loss: 0.0435  
Epoch 28/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9895 - loss: 0.0329  
Epoch 29/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9910 - loss: 0.0281  
Epoch 30/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9891 - loss: 0.0308  
Epoch 31/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0299  
Epoch 32/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9914 - loss: 0.0283  
Epoch 33/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9907 - loss: 0.0293  
Epoch 34/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9894 - loss: 0.0331  
Epoch 35/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9884 - loss: 0.0364  
Epoch 36/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9883 - loss: 0.0396  
Epoch 37/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9912 - loss: 0.0286  
Epoch 38/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9913 - loss: 0.0253  
Epoch 39/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0327  
Epoch 40/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9894 - loss: 0.0360  
Epoch 41/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9913 - loss: 0.0287  
Epoch 42/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9931 - loss: 0.0228  
Epoch 43/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9942 - loss: 0.0185      
Epoch 44/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9894 - loss: 0.0338  
Epoch 45/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9885 - loss: 0.0362  
Epoch 46/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9877 - loss: 0.0380  
Epoch 47/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9912 - loss: 0.0300  
Epoch 48/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9933 - loss: 0.0216  
Epoch 49/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9832 - loss: 0.0514  
Epoch 50/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9884 - loss: 0.0371  
Epoch 51/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9893 - loss: 0.0332  
Epoch 52/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9842 - loss: 0.0484  
Epoch 53/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9884 - loss: 0.0349  
Epoch 54/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9882 - loss: 0.0354  
Epoch 55/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9884 - loss: 0.0360  
Epoch 56/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9834 - loss: 0.0474      
Epoch 57/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9904 - loss: 0.0309  
Epoch 58/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9917 - loss: 0.0269  
Epoch 59/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9878 - loss: 0.0364  
Epoch 60/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9885 - loss: 0.0372  
Epoch 61/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0308  
Epoch 62/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9928 - loss: 0.0248  
Epoch 63/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9871 - loss: 0.0404      
Epoch 64/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9884 - loss: 0.0348  
Epoch 65/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9896 - loss: 0.0332    
Epoch 66/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9935 - loss: 0.0215  
Epoch 67/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9897 - loss: 0.0335  
Epoch 68/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9870 - loss: 0.0411      
Epoch 69/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9833 - loss: 0.0506  
Epoch 70/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9877 - loss: 0.0375  
Epoch 71/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9862 - loss: 0.0417  
Epoch 72/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9912 - loss: 0.0279  
Epoch 73/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9858 - loss: 0.0423  
Epoch 74/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9813 - loss: 0.0601  
Epoch 75/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9857 - loss: 0.0436  
Epoch 76/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9885 - loss: 0.0362  
Epoch 77/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9899 - loss: 0.0342  
Epoch 78/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9928 - loss: 0.0234  
Epoch 79/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9860 - loss: 0.0428  
Epoch 80/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9887 - loss: 0.0347  
Epoch 81/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9908 - loss: 0.0296  
Epoch 82/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9894 - loss: 0.0348  
Epoch 83/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9910 - loss: 0.0297  
Epoch 84/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9839 - loss: 0.0475  
Epoch 85/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9877 - loss: 0.0380      
Epoch 86/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9898 - loss: 0.0349  
Epoch 87/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9866 - loss: 0.0426  
Epoch 88/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9917 - loss: 0.0275  
Epoch 89/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9917 - loss: 0.0270  
Epoch 90/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9888 - loss: 0.0376  
Epoch 91/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9839 - loss: 0.0474  
Epoch 92/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9891 - loss: 0.0347  
Epoch 93/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9867 - loss: 0.0411  
Epoch 94/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0289  
Epoch 95/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9906 - loss: 0.0292  
Epoch 96/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9874 - loss: 0.0422  
Epoch 97/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9905 - loss: 0.0315  
Epoch 98/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9936 - loss: 0.0207  
Epoch 99/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9870 - loss: 0.0430  
Epoch 100/100
570/570 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.9935 - loss: 0.0221      
2025-08-13 20:22:23,212 - INFO - [Step 8] Evaluating model...
114/114 ━━━━━━━━━━━━━━━━━━━━ 0s 864us/step
2025-08-13 20:22:23,490 - INFO - Accuracy: 1.000 | Macro-F1: 1.000 | Top-3 Accuracy: 1.000
2025-08-13 20:22:23,492 - INFO - [Step 9] Saving model...
2025-08-13 20:22:23,492 - WARNING - You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`.  