In [10]:
# üì¶ Step 1: Setup - Optimized for 16-core CPU
import os
import sys
import shutil
import random
import warnings
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
import subprocess

warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# ‚ö° CPU Optimization: Enable all cores for TensorFlow (must be set BEFORE importing TF)
NUM_WORKERS = multiprocessing.cpu_count()
os.environ['TF_NUM_INTEROP_THREADS'] = str(NUM_WORKERS)
os.environ['TF_NUM_INTRAOP_THREADS'] = str(NUM_WORKERS)

# Install required packages
def install_if_missing(package, import_name=None):
    try:
        __import__(import_name or package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])

install_if_missing("tqdm")
install_if_missing("scikit-learn", "sklearn")

from tqdm.auto import tqdm
import tensorflow as tf
import numpy as np

print(f"üìä TensorFlow: {tf.__version__}")
print(f"üßµ CPU cores: {NUM_WORKERS} (all will be used)")

# Check GPU (won't find any for AMD on Windows)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"‚úÖ GPU: {gpus}")
    BATCH_SIZE = 32
else:
    print("‚ö†Ô∏è AMD GPU not supported by TF on Windows - using optimized CPU mode")
    BATCH_SIZE = 32  # Larger batch OK with 16 cores

print(f"üì¶ Batch size: {BATCH_SIZE}")

üìä TensorFlow: 2.15.0
üßµ CPU cores: 16 (all will be used)
‚ö†Ô∏è AMD GPU not supported by TF on Windows - using optimized CPU mode
üì¶ Batch size: 32


In [None]:
# üìÅ Step 2: Paths & Configuration
BASE_DIR = Path(r"E:\FasalVaidya\backend\ml\models")
LOCAL_CACHE = BASE_DIR / "combined_balanced_dataset"
OUTPUT_DIR = BASE_DIR

DATA_SOURCES = [
    BASE_DIR / "Bigger CoLeaf DATASET" / "CoLeaf DATASET",
    BASE_DIR / "Propossed_Data" / "Contrast_Stretching",
    BASE_DIR / "Propossed_Data" / "Histogram_Equalization",
    BASE_DIR / "Propossed_Data" / "Log_Transformation",
    BASE_DIR / "Nitrogen deficiency",
    BASE_DIR / "ThorCam_semiFiltered",
    BASE_DIR / "POTASSIUM DEFICIENCY",
]

CLASS_MAPPING = {
    "healthy": "healthy", "control": "healthy", "-C": "healthy",
    "nitrogen-N": "nitrogen-N", "deficiency": "nitrogen-N", "N": "nitrogen-N",
    "phosphorus-P": "phosphorus-P", "-P": "phosphorus-P", "-P50": "phosphorus-P",
    "potasium-K": "potasium-K", "K": "potasium-K",
    "boron-B": "boron-B", "calcium-Ca": "calcium-Ca", "iron-Fe": "iron-Fe",
    "magnesium-Mg": "magnesium-Mg", "manganese-Mn": "manganese-Mn",
}

ALL_CLASSES = ["healthy", "nitrogen-N", "phosphorus-P", "potasium-K",
               "boron-B", "calcium-Ca", "iron-Fe", "magnesium-Mg", "manganese-Mn"]

# Hyperparameters - BATCH_SIZE set in Step 1 based on GPU availability
IMG_SIZE = 224
# BATCH_SIZE already set above (32 for GPU, 16 for CPU)
EPOCHS_PHASE1 = 25
EPOCHS_PHASE2 = 20
EPOCHS_PHASE3 = 15
TARGET_SAMPLES_PER_CLASS = 400

print(f"üìÅ Base: {BASE_DIR}")
print(f"‚öôÔ∏è Batch size: {BATCH_SIZE}")
print(f"üîç Checking datasets...")
for src in DATA_SOURCES:
    if src.exists():
        print(f"   ‚úÖ {src.name}")
    else:
        print(f"   ‚ùå {src.name}")

üìÅ Base: E:\FasalVaidya\backend\ml\models
üîç Checking datasets...
   ‚úÖ CoLeaf DATASET
   ‚úÖ Contrast_Stretching
   ‚úÖ Histogram_Equalization
   ‚úÖ Log_Transformation
   ‚úÖ Nitrogen deficiency
   ‚úÖ ThorCam_semiFiltered
   ‚úÖ POTASSIUM DEFICIENCY


In [3]:
# üìÇ Step 3: Combine & Balance Datasets
from sklearn.model_selection import train_test_split

def collect_images():
    class_images = {cls: [] for cls in ALL_CLASSES}
    
    for dataset_path in DATA_SOURCES:
        if not dataset_path.exists():
            continue
        name = dataset_path.name
        print(f"   üìÇ {name}")
        
        if "Nitrogen" in name or "nitrogen" in name:
            for split in ["train", "val", "test"]:
                split_dir = dataset_path / split
                if split_dir.exists():
                    for class_dir in split_dir.iterdir():
                        if class_dir.is_dir():
                            std = CLASS_MAPPING.get(class_dir.name)
                            if std in class_images:
                                for img in class_dir.glob("*"):
                                    if img.suffix.lower() in [".jpg", ".jpeg", ".png"]:
                                        class_images[std].append(img)
        elif "ThorCam" in name:
            for class_dir in dataset_path.iterdir():
                if class_dir.is_dir():
                    std = CLASS_MAPPING.get(class_dir.name)
                    if std in class_images:
                        for img in class_dir.glob("*"):
                            if img.suffix.lower() in [".jpg", ".jpeg", ".png", ".tif", ".tiff"]:
                                class_images[std].append(img)
        elif "POTASSIUM" in name.upper():
            for img in dataset_path.glob("*"):
                if img.suffix.lower() in [".jpg", ".jpeg", ".png"]:
                    class_images["potasium-K"].append(img)
        else:
            for class_dir in dataset_path.iterdir():
                if class_dir.is_dir():
                    std = CLASS_MAPPING.get(class_dir.name, class_dir.name)
                    if std in class_images:
                        for img in class_dir.glob("*"):
                            if img.suffix.lower() in [".jpg", ".jpeg", ".png"]:
                                class_images[std].append(img)
    return class_images

print("üìÅ Scanning datasets...")
class_images = collect_images()

print("\nüìä Dataset stats:")
for cls in ALL_CLASSES:
    print(f"   {cls:15s}: {len(class_images[cls])}")

# Balance
print(f"\n‚öñÔ∏è Balancing to {TARGET_SAMPLES_PER_CLASS}/class...")
for cls in ALL_CLASSES:
    n = len(class_images[cls])
    if n == 0:
        print(f"   ‚ö†Ô∏è {cls}: No samples!")
    elif n < TARGET_SAMPLES_PER_CLASS:
        class_images[cls].extend(random.choices(class_images[cls], k=TARGET_SAMPLES_PER_CLASS-n))
    elif n > TARGET_SAMPLES_PER_CLASS * 2:
        class_images[cls] = random.sample(class_images[cls], TARGET_SAMPLES_PER_CLASS)

# Copy to cache
if LOCAL_CACHE.exists():
    shutil.rmtree(LOCAL_CACHE)

def copy_file(args):
    src, dst = args
    try:
        shutil.copy2(src, dst)
        return 1
    except:
        return 0

tasks = []
for cls in ALL_CLASSES:
    (LOCAL_CACHE / cls).mkdir(parents=True, exist_ok=True)
    for i, src in enumerate(class_images[cls]):
        dst = LOCAL_CACHE / cls / f"{cls}_{i:05d}{src.suffix.lower()}"
        tasks.append((src, dst))

print(f"\nüìÅ Copying {len(tasks)} files...")
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as ex:
    results = list(tqdm(ex.map(copy_file, tasks), total=len(tasks)))
print(f"‚úÖ Done!")

üìÅ Scanning datasets...
   üìÇ CoLeaf DATASET
   üìÇ Contrast_Stretching
   üìÇ Histogram_Equalization
   üìÇ Log_Transformation
   üìÇ Nitrogen deficiency
   üìÇ ThorCam_semiFiltered
   üìÇ POTASSIUM DEFICIENCY

üìä Dataset stats:
   healthy        : 1395
   nitrogen-N     : 523
   phosphorus-P   : 3508
   potasium-K     : 1815
   boron-B        : 401
   calcium-Ca     : 351
   iron-Fe        : 140
   magnesium-Mg   : 316
   manganese-Mn   : 266

‚öñÔ∏è Balancing to 400/class...

üìÅ Copying 3724 files...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3724/3724 [00:37<00:00, 100.50it/s]

‚úÖ Done!





In [4]:
# üîÄ Step 4: Create Splits
all_images, all_labels = [], []
class_to_idx = {c: i for i, c in enumerate(ALL_CLASSES)}

for cls in ALL_CLASSES:
    for img in (LOCAL_CACHE / cls).glob("*"):
        all_images.append(str(img))
        all_labels.append(class_to_idx[cls])

all_images, all_labels = np.array(all_images), np.array(all_labels)

train_imgs, test_imgs, train_labels, test_labels = train_test_split(
    all_images, all_labels, test_size=0.1, stratify=all_labels, random_state=42)
train_imgs, val_imgs, train_labels, val_labels = train_test_split(
    train_imgs, train_labels, test_size=0.15, stratify=train_labels, random_state=42)

NUM_CLASSES = len(ALL_CLASSES)
print(f"‚úÖ Train: {len(train_imgs)} | Val: {len(val_imgs)} | Test: {len(test_imgs)}")

‚úÖ Train: 2848 | Val: 503 | Test: 373


In [5]:
# ‚ö° Step 5: Data Pipeline
AUTOTUNE = tf.data.AUTOTUNE

def parse_image(path, label, training=True):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
    img = tf.cast(img, tf.float32) / 255.0
    if training:
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_brightness(img, 0.2)
        img = tf.image.random_contrast(img, 0.8, 1.2)
    return img, tf.one_hot(label, NUM_CLASSES)

def make_ds(imgs, labels, training=True):
    ds = tf.data.Dataset.from_tensor_slices((imgs, labels))
    if training:
        ds = ds.shuffle(len(imgs))
    ds = ds.map(lambda x, y: parse_image(x, y, training), num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_ds(train_imgs, train_labels, True)
val_ds = make_ds(val_imgs, val_labels, False)
test_ds = make_ds(test_imgs, test_labels, False)
print(f"‚úÖ Data pipeline ready")

‚úÖ Data pipeline ready


In [6]:
# üèóÔ∏è Step 6: Build Model
from sklearn.utils.class_weight import compute_class_weight

base = tf.keras.applications.EfficientNetV2S(
    input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False, weights='imagenet')
base.trainable = False

inputs = tf.keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = base(inputs, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dropout(0.4)(x)
x = tf.keras.layers.Dense(512, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)
print(f"‚úÖ Model built: {model.count_params():,} params")

# Class weights
weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = dict(enumerate(weights))



‚úÖ Model built: 21,125,993 params


In [7]:
# üéØ Step 7: Callbacks
CHECKPOINT_DIR = OUTPUT_DIR / "checkpoints"
CHECKPOINT_DIR.mkdir(exist_ok=True)

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=8, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-7),
    tf.keras.callbacks.ModelCheckpoint(str(CHECKPOINT_DIR / "best.keras"), monitor='val_accuracy', save_best_only=True),
]
print("‚úÖ Callbacks ready")

‚úÖ Callbacks ready


In [11]:
# üöÄ Step 8: Phase 1 - Train Head
print("="*50)
print("üöÄ PHASE 1: Training Head (backbone frozen)")
print("="*50)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
    metrics=['accuracy']
)

h1 = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_PHASE1,
               callbacks=callbacks, class_weight=class_weights)
print(f"‚úÖ Phase 1 done! Val acc: {max(h1.history['val_accuracy']):.2%}")

üöÄ PHASE 1: Training Head (backbone frozen)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
‚úÖ Phase 1 done! Val acc: 55.07%


In [12]:
# üî• Step 9: Phase 2 - Unfreeze Top 30%
print("="*50)
print("üî• PHASE 2: Fine-tuning top 30%")
print("="*50)
      
base.trainable = True
for layer in base.layers[:int(len(base.layers) * 0.7)]:
    layer.trainable = False

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
    metrics=['accuracy']
)

h2 = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_PHASE2,
               callbacks=callbacks, class_weight=class_weights)
print(f"‚úÖ Phase 2 done! Val acc: {max(h2.history['val_accuracy']):.2%}")

üî• PHASE 2: Fine-tuning top 30%
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
‚úÖ Phase 2 done! Val acc: 63.22%


In [13]:
# üéì Step 10: Phase 3 - Full Fine-tuning
print("="*50)
print("üéì PHASE 3: Full fine-tuning")
print("="*50)

for layer in base.layers:
    layer.trainable = True

model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
    metrics=['accuracy']
)
    
h3 = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_PHASE3,
               callbacks=callbacks, class_weight=class_weights)
print(f"‚úÖ Phase 3 done! Val acc: {max(h3.history['val_accuracy']):.2%}")

üéì PHASE 3: Full fine-tuning
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
‚úÖ Phase 3 done! Val acc: 79.92%


In [14]:
# üß™ Step 11: Evaluate
print("="*50)
print("üß™ EVALUATION")
print("="*50)

loss, acc = model.evaluate(test_ds)
print(f"\nüìä Test Accuracy: {acc:.2%}")

# Quick classification report
from sklearn.metrics import classification_report
y_true, y_pred = [], []
for imgs, labels in test_ds:
    preds = model.predict(imgs, verbose=0)
    y_true.extend(np.argmax(labels.numpy(), axis=1))
    y_pred.extend(np.argmax(preds, axis=1))

print("\nüìã Classification Report:")
print(classification_report(y_true, y_pred, target_names=ALL_CLASSES, digits=3))

üß™ EVALUATION

üìä Test Accuracy: 83.38%

üìã Classification Report:
              precision    recall  f1-score   support

     healthy      0.811     0.750     0.779        40
  nitrogen-N      0.833     0.755     0.792        53
phosphorus-P      0.857     0.750     0.800        40
  potasium-K      0.949     0.925     0.937        40
     boron-B      0.921     0.875     0.897        40
  calcium-Ca      0.818     0.900     0.857        40
     iron-Fe      0.921     0.875     0.897        40
magnesium-Mg      0.714     0.875     0.787        40
manganese-Mn      0.733     0.825     0.776        40

    accuracy                          0.834       373
   macro avg      0.840     0.837     0.836       373
weighted avg      0.840     0.834     0.834       373



In [15]:
# üíæ Step 12: Save Model
import json

model_path = OUTPUT_DIR / "plantvillage-npk-v3.h5"
model.save(str(model_path))
print(f"‚úÖ Model saved: {model_path}")

with open(OUTPUT_DIR / "class_names.txt", 'w') as f:
    f.write('\n'.join(ALL_CLASSES))

config = {'model': 'EfficientNetV2-S', 'img_size': IMG_SIZE,
          'classes': ALL_CLASSES, 'test_accuracy': float(acc)}
with open(OUTPUT_DIR / "model_config.json", 'w') as f:
    json.dump(config, f, indent=2)

print(f"\nüéâ TRAINING COMPLETE!")
print(f"   Model: {model_path}")
print(f"   Accuracy: {acc:.2%}")
print(f"\nüìã Next: Update MODEL_PATH in tasks.json and restart backend")

‚úÖ Model saved: E:\FasalVaidya\backend\ml\models\plantvillage-npk-v3.h5

üéâ TRAINING COMPLETE!
   Model: E:\FasalVaidya\backend\ml\models\plantvillage-npk-v3.h5
   Accuracy: 83.38%

üìã Next: Update MODEL_PATH in tasks.json and restart backend
