In [None]:
# Cell 1 — verify GPU
import tensorflow as tf
print("TensorFlow", tf.__version__)
device_name = tf.test.gpu_device_name()
print("GPU device:", device_name)


TensorFlow 2.19.0
GPU device: /device:GPU:0


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# -------------------------------------------------
# 2. Search the whole Drive for the zip file
# -------------------------------------------------
import os
from pathlib import Path

zip_name = "plant_disease.zip"

# Walk through the mounted Drive
found_path = None
for root, dirs, files in os.walk("/content/drive/MyDrive"):
    if zip_name in files:
        found_path = Path(root) / zip_name
        break

if found_path is None:
    print(f"File '{zip_name}' not found in My Drive.")
else:
    print(f"Found at: {found_path}")

Found at: /content/drive/MyDrive/Buildable-ML-DL-Fellowship/plant_disease.zip


In [None]:
# Colab cell — robust Drive dataset setup (copy & run)
from google.colab import drive
import os, shutil, sys, textwrap

# 1) mount drive (if not already)
drive.mount('/content/drive', force_remount=False)

# 2) base folder in your Drive (change only if you used a different folder)
DRIVE_BASE = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship'
print("DRIVE_BASE =", DRIVE_BASE)

# 3) candidate paths to search for the dataset
candidates = [
    os.path.join(DRIVE_BASE, 'plant_disease.zip'),              # MyDrive/.../plant_disease.zip
    os.path.join(DRIVE_BASE, 'data', 'plant_disease.zip'),     # MyDrive/.../data/plant_disease.zip
    os.path.join(DRIVE_BASE, 'data', 'plant_disease'),         # MyDrive/.../data/plant_disease (folder)
    os.path.join(DRIVE_BASE, 'plant_disease'),                 # MyDrive/.../plant_disease (folder)
]

print("\nListing DRIVE_BASE contents (first 200 entries):")
try:
    for i,entry in enumerate(os.listdir(DRIVE_BASE)[:200]):
        print(" ", entry)
except FileNotFoundError:
    print("Drive base folder not found. Maybe you used a different Drive path.")
    print("Please upload your dataset to your Drive under MyDrive/Buildable-ML-DL-Fellowship and re-run.")
    raise

# 4) set where we will put the dataset in the Colab filesystem
DATA_DIR = '/content/plant_disease'
MODELS_DIR = os.path.join(DRIVE_BASE, 'models')
os.makedirs(MODELS_DIR, exist_ok=True)

# 5) find the dataset and extract/copy it
found = False
for cand in candidates:
    if os.path.exists(cand):
        found = True
        if cand.endswith('.zip'):
            print(f"\nFound zip at: {cand}\nUnzipping to {DATA_DIR} ...")
            # remove old DATA_DIR and unzip
            if os.path.exists(DATA_DIR):
                shutil.rmtree(DATA_DIR)
            !unzip -q "{cand}" -d "{DATA_DIR}"
            print("Unzip completed.")
        else:
            # it's a folder, copy it to /content
            print(f"\nFound folder at: {cand}\nCopying to {DATA_DIR} ...")
            if os.path.exists(DATA_DIR):
                shutil.rmtree(DATA_DIR)
            shutil.copytree(cand, DATA_DIR)
            print("Copy completed.")
        break

if not found:
    print("\nNo dataset found in expected locations.")
    print(textwrap.dedent(f'''
        Please check these locations in your Google Drive:
          {DRIVE_BASE}/plant_disease.zip
          {DRIVE_BASE}/data/plant_disease.zip
          {DRIVE_BASE}/data/plant_disease
          {DRIVE_BASE}/plant_disease

        Use Google Drive (https://drive.google.com) to confirm where you uploaded the zip/folder,
        then either:
         - Move the zip into: MyDrive/Buildable-ML-DL-Fellowship/plant_disease.zip
         - OR create a folder: MyDrive/Buildable-ML-DL-Fellowship/plant_disease/ and put train/val/test inside it.

        Alternatively, upload directly from your local machine into this Colab session (not Drive) with:
          from google.colab import files
          uploaded = files.upload()
        (but then you'll need to handle placement/structure manually)
    '''))
    raise FileNotFoundError("Dataset not found in Drive.")

# 6) final check: print structure preview
print("\nDATA_DIR set to:", DATA_DIR)
for split in ['train','val','test']:
    p = os.path.join(DATA_DIR, split)
    if os.path.exists(p):
        classes = [d for d in os.listdir(p) if os.path.isdir(os.path.join(p,d))]
        print(f"  {split}: {len(classes)} classes, e.g. {classes[:10]}")
    else:
        print(f"  WARNING: {split} directory not found under {DATA_DIR} — check your dataset structure.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DRIVE_BASE = /content/drive/MyDrive/Buildable-ML-DL-Fellowship

Listing DRIVE_BASE contents (first 200 entries):
  plant_disease.zip
  models

Found zip at: /content/drive/MyDrive/Buildable-ML-DL-Fellowship/plant_disease.zip
Unzipping to /content/plant_disease ...
Unzip completed.

DATA_DIR set to: /content/plant_disease


In [None]:
# Colab cell — inspect and create train/val/test splits if missing
import os, shutil, random
from pathlib import Path
from sklearn.model_selection import train_test_split

BASE = Path('/content/plant_disease')   # where we unzipped
print("BASE exists:", BASE.exists())
print("BASE listing (top level):")
print(sorted([p.name for p in BASE.iterdir()]))

# helper to check if a directory contains image files
IMG_EXTS = {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}
def contains_images(p: Path):
    if not p.exists() or not p.is_dir(): return False
    for ff in p.iterdir():
        if ff.is_file() and ff.suffix.lower() in IMG_EXTS:
            return True
    return False

# 1) Search for existing train/val/test anywhere and copy if found
def find_and_copy_existing_splits(base: Path):
    for root, dirs, files in os.walk(base):
        # check if this root has train/val/test subdirs
        subdirs = set(dirs)
        if {'train','val','test'}.issubset(subdirs):
            src = Path(root)
            print("Found existing split structure at:", src)
            # copy each to top-level /content/plant_disease/train etc
            dest_base = base
            # remove old if exists
            for s in ['train','val','test']:
                d = dest_base / s
                if d.exists():
                    shutil.rmtree(d)
                shutil.copytree(src / s, d)
            return True
    return False

# 2) If not found, try to detect class-root folder(s)
def find_class_root(base: Path):
    # Option A: top-level subdir that itself contains class subdirs with images
    for child in base.iterdir():
        if child.is_dir():
            # check if child contains many subdirs each with images
            child_subdirs = [d for d in child.iterdir() if d.is_dir()]
            if child_subdirs and any(contains_images(d) for d in child_subdirs):
                return child
    # Option B: base itself may have class subdirs (each containing images)
    if any(contains_images(base / d) for d in os.listdir(base) if (base/d).is_dir()):
        return base
    # Option C: deeper search: find first folder containing multiple class subfolders with images
    for root, dirs, files in os.walk(base):
        candidate = Path(root)
        subdirs = [candidate / d for d in dirs]
        if len(subdirs) >= 2 and any(contains_images(sd) for sd in subdirs):
            return candidate
    return None

# 3) Create splits from detected class root
def create_splits_from_classroot(class_root: Path, dest_base: Path, seed=42, ratios=(0.7,0.15,0.15)):
    print("Creating splits from class root:", class_root)
    dest_train = dest_base / 'train'
    dest_val   = dest_base / 'val'
    dest_test  = dest_base / 'test'
    # remove existing dest folders if present
    for d in [dest_train, dest_val, dest_test]:
        if d.exists():
            shutil.rmtree(d)
        d.mkdir(parents=True, exist_ok=True)

    classes = [d for d in sorted(class_root.iterdir()) if d.is_dir()]
    print("Detected class folders (first 30):", [c.name for c in classes[:30]])
    for class_dir in classes:
        imgs = [p for p in class_dir.rglob('*') if p.is_file() and p.suffix.lower() in IMG_EXTS]
        imgs = sorted(imgs)
        if not imgs:
            print("  (skip) no images for class", class_dir.name)
            continue
        # split
        train_and_temp, test_files = train_test_split(imgs, test_size=ratios[2], random_state=seed)
        val_size_rel = ratios[1] / (ratios[0] + ratios[1])
        train_files, val_files = train_test_split(train_and_temp, test_size=val_size_rel, random_state=seed)
        # copy
        (dest_train / class_dir.name).mkdir(parents=True, exist_ok=True)
        (dest_val / class_dir.name).mkdir(parents=True, exist_ok=True)
        (dest_test / class_dir.name).mkdir(parents=True, exist_ok=True)
        for src in train_files:
            dst = dest_train / class_dir.name / src.name
            shutil.copy2(src, dst)
        for src in val_files:
            dst = dest_val / class_dir.name / src.name
            shutil.copy2(src, dst)
        for src in test_files:
            dst = dest_test / class_dir.name / src.name
            shutil.copy2(src, dst)
        print(f"  {class_dir.name}: train={len(train_files)} val={len(val_files)} test={len(test_files)}")
    print("Done creating splits at", dest_base)

# Run the detection & splitting logic
base = BASE
if not base.exists():
    raise FileNotFoundError(f"{base} not found in Colab filesystem. Please check unzip earlier.")

# step A: try to find existing splits and copy to top-level
found_splits = find_and_copy_existing_splits(base)
if found_splits:
    print("Copied existing splits into top-level train/val/test.")
else:
    # step B: find class root
    class_root = find_class_root(base)
    if class_root is None:
        print("No class root found automatically. Directory tree (two levels):")
        for root, dirs, files in os.walk(base):
            print(root, "->", len(dirs), "dirs,", len(files), "files")
        raise RuntimeError("Could not automatically find class folders. Please check the zip structure in Drive.")
    # create splits from class_root
    create_splits_from_classroot(class_root, base, seed=42, ratios=(0.7,0.15,0.15))

# Finally, print summary counts per split
for split in ['train','val','test']:
    sdir = base / split
    if not sdir.exists():
        print(f"WARNING: {split} does not exist!")
        continue
    classes = [d for d in sdir.iterdir() if d.is_dir()]
    print(f"\nSplit {split}: {len(classes)} classes")
    for c in classes[:40]:
        cnt = len(list(c.glob('*')))
        print(f"  {c.name}: {cnt}")
print("\nSplitting complete. You can now run the image_dataset_from_directory cell.")


BASE exists: True
BASE listing (top level):
['plant_disease']
Found existing split structure at: /content/plant_disease/plant_disease
Copied existing splits into top-level train/val/test.

Split train: 1 classes
  PlantVillage: 18785

Split val: 1 classes
  PlantVillage: 5734

Split test: 1 classes
  PlantVillage: 5758

Splitting complete. You can now run the image_dataset_from_directory cell.


In [None]:
# Cell 1 — inspect nested structure
from pathlib import Path
BASE = Path('/content/plant_disease')
for split in ['train','val','test']:
    p = BASE / split
    print(f"\n=== {p} === (exists={p.exists()})")
    if not p.exists():
        continue
    level1 = sorted([x for x in p.iterdir() if x.is_dir()])
    print(" Level-1 dirs:", [d.name for d in level1][:40])
    # If there's exactly one folder at level1, inspect its subfolders
    if len(level1) == 1:
        child = level1[0]
        sub = sorted([x for x in child.iterdir() if x.is_dir()])
        print("  Found single wrapper folder:", child.name)
        print("  Its subdirs (first 40):", [d.name for d in sub][:40])
        # count images in first few subdirs
        for d in sub[:10]:
            cnt = len([f for f in d.rglob('*') if f.suffix.lower() in {'.jpg','.jpeg','.png','.bmp'}])
            print(f"    {d.name}: {cnt} images")
    else:
        # show per-level1 counts
        for d in level1[:40]:
            cnt = len([f for f in d.rglob('*') if f.suffix.lower() in {'.jpg','.jpeg','.png','.bmp'}])
            print(f"  {d.name}: {cnt} images")



=== /content/plant_disease/train === (exists=True)
 Level-1 dirs: ['PlantVillage']
  Found single wrapper folder: PlantVillage
  Its subdirs (first 40): []

=== /content/plant_disease/val === (exists=True)
 Level-1 dirs: ['PlantVillage']
  Found single wrapper folder: PlantVillage
  Its subdirs (first 40): []

=== /content/plant_disease/test === (exists=True)
 Level-1 dirs: ['PlantVillage']
  Found single wrapper folder: PlantVillage
  Its subdirs (first 40): []


In [None]:
# Cell 2 — flatten wrapper folder if each split contains a single wrapper folder
import shutil
from pathlib import Path

BASE = Path('/content/plant_disease')
for split in ['train','val','test']:
    p = BASE / split
    if not p.exists():
        print(f"Skip {split}: not present")
        continue
    children = [d for d in p.iterdir() if d.is_dir()]
    if len(children) == 1:
        wrapper = children[0]
        print(f"Flattening {split}: wrapper {wrapper.name} -> moving its subfolders up to {p}")
        subs = [d for d in wrapper.iterdir() if d.is_dir()]
        for s in subs:
            dest = p / s.name
            if dest.exists():
                print(f"  Destination exists, skipping move of {s} -> {dest}")
            else:
                shutil.move(str(s), str(dest))
                print(f"  Moved {s.name}")
        # If wrapper is now empty of subdirs, remove it (but keep files if any)
        try:
            # remove wrapper if empty
            if not any(wrapper.iterdir()):
                wrapper.rmdir()
                print(f"  Removed empty wrapper folder {wrapper}")
            else:
                print(f"  Wrapper {wrapper} still contains files (not removed).")
        except Exception as e:
            print("  Could not remove wrapper:", e)
    else:
        print(f"No single wrapper for {split} (has {len(children)} top-level folders).")

# print final top-level per-split directories
print("\nAfter flattening, top-level contents:")
for split in ['train','val','test']:
    p = BASE / split
    if p.exists():
        print(split, "->", sorted([d.name for d in p.iterdir() if d.is_dir()])[:40])
    else:
        print(split, "missing")


Flattening train: wrapper PlantVillage -> moving its subfolders up to /content/plant_disease/train
  Wrapper /content/plant_disease/train/PlantVillage still contains files (not removed).
Flattening val: wrapper PlantVillage -> moving its subfolders up to /content/plant_disease/val
  Wrapper /content/plant_disease/val/PlantVillage still contains files (not removed).
Flattening test: wrapper PlantVillage -> moving its subfolders up to /content/plant_disease/test
  Wrapper /content/plant_disease/test/PlantVillage still contains files (not removed).

After flattening, top-level contents:
train -> ['PlantVillage']
val -> ['PlantVillage']
test -> ['PlantVillage']


In [None]:
# Cell 3 — create datasets using image_dataset_from_directory
import tensorflow as tf, json, os
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path

DATA_DIR = '/content/plant_disease'
IMG_SIZE = (224,224)
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

train_dir = os.path.join(DATA_DIR, 'train')
val_dir = os.path.join(DATA_DIR, 'val')
test_dir = os.path.join(DATA_DIR, 'test')

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=True)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    val_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

class_names = train_ds.class_names
print("Detected classes:", len(class_names), class_names[:40])
# Save class mapping
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)
with open(os.path.join(MODELS_DIR,'classes.json'),'w') as f:
    json.dump(class_names, f)

train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

# basic augmentation layer
data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.05),
    layers.RandomZoom(0.05),
], name='data_augmentation')
print("Datasets ready. train batches:", tf.data.experimental.cardinality(train_ds).numpy())


Found 18785 files belonging to 1 classes.
Found 5734 files belonging to 1 classes.
Found 5758 files belonging to 1 classes.
Detected classes: 1 ['PlantVillage']
Datasets ready. train batches: 588


In [None]:
# Cell 4 — baseline CNN model definition & compile
from tensorflow import keras
from tensorflow.keras import layers
def make_baseline_cnn(input_shape=IMG_SIZE+(3,), num_classes=len(class_names)):
    inputs = keras.Input(shape=input_shape)
    x = data_augmentation(inputs)
    x = layers.Rescaling(1./255)(x)
    x = layers.Conv2D(32,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(64,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(128,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='baseline_cnn')
    return model

baseline = make_baseline_cnn()
baseline.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
baseline.summary()


In [None]:
# Cell 5 — baseline training
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
baseline_ckpt = os.path.join(MODELS_DIR, 'disease_baseline_best.h5')
callbacks = [
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    ModelCheckpoint(baseline_ckpt, monitor='val_loss', save_best_only=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]
history_baseline = baseline.fit(train_ds, validation_data=val_ds, epochs=20, callbacks=callbacks)
baseline.save(os.path.join(MODELS_DIR,'disease_baseline.h5'))

# save plots
import matplotlib.pyplot as plt
def plot_history(h, name):
    plt.figure(); plt.plot(h.history['loss'], label='train_loss'); plt.plot(h.history['val_loss'], label='val_loss'); plt.legend(); plt.title(name+' loss'); plt.savefig(os.path.join(MODELS_DIR,name+'_loss.png')); plt.close()
    if 'accuracy' in h.history:
        plt.figure(); plt.plot(h.history['accuracy'], label='train_acc'); plt.plot(h.history['val_accuracy'], label='val_acc'); plt.legend(); plt.title(name+' acc'); plt.savefig(os.path.join(MODELS_DIR,name+'_acc.png')); plt.close()
plot_history(history_baseline, 'baseline')
print("Saved baseline model and plots to", MODELS_DIR)


Epoch 1/20


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 1.0000 - loss: 0.0000e+00



[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 56ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 2/20
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 59ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 3/20
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 55ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 4/20
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 57ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 5/20
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 58ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.000



Saved baseline model and plots to /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models


In [None]:
# Cell 6 — ResNet50 transfer learning model creation
from tensorflow.keras.applications import ResNet50
def make_resnet_model(input_shape=IMG_SIZE+(3,), num_classes=len(class_names), base_trainable=False):
    base = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape, pooling='avg')
    base.trainable = base_trainable
    inputs = keras.Input(shape=input_shape)
    x = data_augmentation(inputs)
    x = layers.Rescaling(1./255)(x)
    x = base(x, training=False)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='resnet50_transfer')
    return model

resnet_model = make_resnet_model(base_trainable=False)
resnet_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
resnet_model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Cell 7 — train top layers then fine-tune
resnet_ckpt = os.path.join(MODELS_DIR, 'disease_resnet50_top_best.h5')
callbacks = [EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
             ModelCheckpoint(resnet_ckpt, monitor='val_loss', save_best_only=True),
             ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)]

history_resnet_top = resnet_model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=callbacks)
resnet_model.save(os.path.join(MODELS_DIR,'disease_resnet50_top.h5'))
plot_history(history_resnet_top, 'resnet_top')

# find base and unfreeze last N layers
base_model = None
for layer in resnet_model.layers:
    if 'resnet50' in layer.name:
        base_model = layer
        break
if base_model is None:
    base_model = resnet_model.layers[3]

for layer in base_model.layers[:-20]:
    layer.trainable = False
for layer in base_model.layers[-20:]:
    layer.trainable = True

resnet_model.compile(optimizer=keras.optimizers.Adam(1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
history_resnet_ft = resnet_model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=[
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    ModelCheckpoint(os.path.join(MODELS_DIR,'disease_resnet50_ft_best.h5'), monitor='val_loss', save_best_only=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
])
resnet_model.save(os.path.join(MODELS_DIR,'disease_resnet50.h5'))
plot_history(history_resnet_ft, 'resnet_finetune')
print("Saved ResNet models + curves to", MODELS_DIR)


Epoch 1/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 1.0000 - loss: 0.0000e+00



[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 173ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 2/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 166ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 3/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 198ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 4/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 167ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 0.0010
Epoch 5/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 166ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accurac



Epoch 1/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 1.0000 - loss: 0.0000e+00



[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 232ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 1.0000e-05
Epoch 2/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 194ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 1.0000e-05
Epoch 3/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 193ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 1.0000e-05
Epoch 4/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 193ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00 - learning_rate: 1.0000e-05
Epoch 5/10
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 194ms/step - accuracy: 1.0000 - loss: 0.000



Saved ResNet models + curves to /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models


In [None]:
# Cell 8 — evaluation & inference time
import time, json
from tensorflow.keras.models import load_model
resnet_path = os.path.join(MODELS_DIR,'disease_resnet50.h5')
if os.path.exists(resnet_path):
    model = load_model(resnet_path)
else:
    model = resnet_model

loss, acc = model.evaluate(test_ds)
print("Test loss, acc:", loss, acc)

# inference timing on one batch
batch = next(iter(test_ds))
images, labels = batch
n_runs = 50
t0 = time.perf_counter()
for _ in range(n_runs):
    _ = model.predict(images, verbose=0)
t1 = time.perf_counter()
avg_ms_per_image = ((t1 - t0) / n_runs) * 1000.0 / images.shape[0]
metrics = {'test_loss': float(loss), 'test_acc': float(acc), 'avg_inference_ms_per_image': float(avg_ms_per_image)}
with open(os.path.join(MODELS_DIR,'disease_metrics.json'),'w') as f:
    json.dump(metrics, f)
print("Avg ms per image:", avg_ms_per_image, "Saved metrics to", os.path.join(MODELS_DIR,'disease_metrics.json'))




[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 128ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Test loss, acc: 0.0 1.0




Avg ms per image: 6.756160526250028 Saved metrics to /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models/disease_metrics.json


In [None]:
# Inspect sample files / folders
from pathlib import Path
BASE = Path('/content/plant_disease')
for split in ['train','val','test']:
    p = BASE / split
    print(f"\n== {split} ==")
    if not p.exists():
        print("  MISSING", p); continue
    children = [d for d in p.iterdir() if d.is_dir()]
    print("  top-level dirs:", [d.name for d in children][:10])
    # if a single wrapper folder exists, list inside it
    if len(children) == 1:
        wrapper = children[0]
        print("  wrapper folder:", wrapper.name)
        wrapper_children = [x for x in wrapper.iterdir()]
        # show 20 entries (files or folders)
        for e in wrapper_children[:20]:
            print("   ", ("DIR " if e.is_dir() else "FILE"), e.name)
        # sample file names (if files exist directly)
        sample_files = [f for f in wrapper.rglob('*') if f.is_file()]
        print("  sample filenames (first 20):")
        for s in sample_files[:20]:
            print("   ", s.relative_to(BASE))
    else:
        # show sample files at this level
        sample_files = [f for f in p.rglob('*') if f.is_file()]
        print("  sample filenames (first 20):")
        for s in sample_files[:20]:
            print("   ", s.relative_to(BASE))



== train ==
  top-level dirs: ['PlantVillage']
  wrapper folder: PlantVillage
    FILE 829d0a5e-326c-4ae6-a3c6-c65297cb5d2f___RS_Erly.B 8338.JPG
    FILE dbb6495b-0765-4d94-afad-6020819aa893___JR_HL 8233.JPG
    FILE f536e055-666e-41cc-8ee2-c1af2fbf754a___RS_Early.B 8104.JPG
    FILE 6173dbe2-1379-4241-a43e-7cbad448b6cc___RS_Early.B 6993.JPG
    FILE ffcfa32d-4506-49ed-9e35-972d5cbad0b8___Com.G_SpM_FL 8450.JPG
    FILE 08f0fd0e-d4b1-400d-9df1-2d61d24c95f0___Com.G_SpM_FL 9397.JPG
    FILE 7c9ac47e-ec96-457c-96d7-8aefdace978f___Crnl_L.Mold 6850.JPG
    FILE 7cf516ee-327e-40a6-810b-7c08ea57f825___UF.GRC_YLCV_Lab 02062.JPG
    FILE fc6a6a03-757b-4549-8c2e-194141456f60___GCREC_Bact.Sp 6252.JPG
    FILE 0ce74db6-be9b-4c43-a104-6a3f9bcd2de2___NREC_B.Spot 1827.JPG
    FILE 9ea77bf9-c078-4adf-9173-ca0dca15dabd___GCREC_Bact.Sp 6333.JPG
    FILE 71048e39-6d6e-4381-9113-f2b4bd71ab93___Com.G_SpM_FL 8749.JPG
    FILE b7fb98ad-68d6-46b9-9f17-9d3f111b20fb___Com.G_SpM_FL 1676.JPG
    FILE 503c6a57-2af

In [None]:
# Auto-fix dataset structure: flatten wrapper OR create class folders from filenames
import os, shutil, re
from pathlib import Path
from collections import Counter

BASE = Path('/content/plant_disease')
IMG_EXTS = {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}

def list_dirs(path):
    return sorted([p for p in Path(path).iterdir() if p.is_dir()])

def contains_class_subfolders(wrapper):
    subs = [d for d in wrapper.iterdir() if d.is_dir()]
    # if many subdirs and they contain images -> it's class-root
    return any(any(f.suffix.lower() in IMG_EXTS for f in sub.rglob('*') if f.is_file()) for sub in subs)

# Try two strategies for each split
for split in ['train','val','test']:
    split_dir = BASE / split
    if not split_dir.exists():
        print("Skip (missing):", split_dir); continue
    top_dirs = list_dirs(split_dir)
    print("\nProcessing split:", split, "top_dirs:", [d.name for d in top_dirs][:10])
    # If there is a single wrapper folder (like PlantVillage) and inside it are class folders -> move them up
    if len(top_dirs) == 1 and contains_class_subfolders(top_dirs[0]):
        wrapper = top_dirs[0]
        print("  Found wrapper with class subfolders:", wrapper)
        for sub in sorted([d for d in wrapper.iterdir() if d.is_dir()]):
            dest = split_dir / sub.name
            if dest.exists():
                print(f"   dest exists, skipping move: {dest}")
            else:
                shutil.move(str(sub), str(dest))
                print(f"   moved {sub.name} -> {dest}")
        # attempt to remove wrapper if empty (leave files if any)
        try:
            if not any(wrapper.iterdir()):
                wrapper.rmdir()
                print("   removed empty wrapper", wrapper)
            else:
                print("   wrapper still contains files; not removed.")
        except Exception as e:
            print("   could not remove wrapper:", e)
    else:
        # If there are no class subfolders (only files), try parse labels from filenames
        # Collect files directly under split_dir (or deeper first level)
        file_list = [f for f in split_dir.rglob('*') if f.is_file() and f.suffix.lower() in IMG_EXTS]
        # Heuristic: many filenames include '___' (PlantVillage style)
        pattern_counts = Counter()
        sample = file_list[:200]
        for f in sample:
            name = f.name
            if '___' in name:
                label = name.split('___')[0]
                pattern_counts['triple_underscore'] += 1
            elif '_' in name:
                # maybe label before first underscore
                label = name.split('_')[0]
                pattern_counts['single_underscore'] += 1
            else:
                pattern_counts['no_sep'] += 1
        print("  sample pattern counts:", pattern_counts)
        # If triple_underscore majority, build class folders from that
        if pattern_counts['triple_underscore'] > max(pattern_counts['single_underscore'], pattern_counts['no_sep']):
            print("  Using '___' split from filename to create class folders.")
            for f in file_list:
                parts = f.name.split('___')
                if len(parts) >= 2:
                    cls = parts[0]
                else:
                    cls = 'unknown'
                dest_dir = split_dir / cls
                dest_dir.mkdir(exist_ok=True)
                shutil.copy2(str(f), str(dest_dir / f.name))
        elif pattern_counts['single_underscore'] > pattern_counts['no_sep']:
            print("  Using '_' split from filename to create class folders (prefix before '_' will be class).")
            for f in file_list:
                cls = f.name.split('_')[0]
                dest_dir = split_dir / cls
                dest_dir.mkdir(exist_ok=True)
                shutil.copy2(str(f), str(dest_dir / f.name))
        else:
            print("  No clear filename pattern detected; please examine sample filenames manually.")
            print("  Sample files:")
            for s in sample[:30]:
                print("   ", s.name)
            # do not attempt destructive moves
            continue

# After organizing, print summary counts
print("\nFinal per-split summary:")
for split in ['train','val','test']:
    sdir = BASE / split
    if not sdir.exists():
        print(split, "missing")
        continue
    classes = [d for d in sdir.iterdir() if d.is_dir()]
    print(f"\n{split}: {len(classes)} classes")
    for c in classes[:40]:
        cnt = len([f for f in c.rglob('*') if f.is_file() and f.suffix.lower() in IMG_EXTS])
        print(" ", c.name, cnt)



Processing split: train top_dirs: ['PlantVillage']
  sample pattern counts: Counter({'triple_underscore': 200})
  Using '___' split from filename to create class folders.

Processing split: val top_dirs: ['PlantVillage']
  sample pattern counts: Counter({'triple_underscore': 200})
  Using '___' split from filename to create class folders.

Processing split: test top_dirs: ['PlantVillage']
  sample pattern counts: Counter({'triple_underscore': 200})
  Using '___' split from filename to create class folders.

Final per-split summary:

train: 18786 classes
  89cff955-c142-4874-aa13-a0b9ca6c11e1 1
  d648ce42-6742-44b2-95b9-a01b24e3054a 1
  5f96c793-8015-458b-b3fe-7e718b673b1b 1
  e0d2d9f6-b29e-4cd8-8c2b-7d462271ceb3 1
  a7c1d137-6609-4659-94ce-81c62f64665e 1
  cec3cfff-3d00-46cf-8a8e-b7438ec7901e 1
  ac205c4a-58c5-430f-8b4c-f853652df749 1
  04c8e6b9-7710-4cdd-b259-2d78b15d1036 1
  707da75e-4924-4033-b141-e8a391f39741 1
  58d19fbc-49ee-4738-b0e0-88afd097cefb 1
  072c957f-2ed9-4026-a36c-7d9

In [None]:
# Cell A — build fixed dataset by parsing label after '___' in filenames
import os, shutil, re
from pathlib import Path
from collections import Counter

SRC_BASE = Path('/content/plant_disease')   # current unzipped folder
FIXED_BASE = Path('/content/plant_disease_fixed')  # new clean dataset location
IMG_EXTS = {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}

print("SRC_BASE exists:", SRC_BASE.exists())
# remove old fixed dir if present (optional) — comment out if you want to keep previous run
if FIXED_BASE.exists():
    print("Removing old fixed dir:", FIXED_BASE)
    shutil.rmtree(FIXED_BASE)
FIXED_BASE.mkdir(parents=True, exist_ok=True)

def sanitize_label(s):
    # keep letters, numbers, underscore, hyphen, dot; replace other chars with underscore
    s = s.strip()
    s = re.sub(r'\s+', '_', s)                 # spaces -> underscore
    s = re.sub(r'[^\w\-\.\u00C0-\u017F]', '_', s)  # allow unicode letters too
    return s

for split in ['train','val','test']:
    src_split = SRC_BASE / split
    dest_split = FIXED_BASE / split
    dest_split.mkdir(parents=True, exist_ok=True)
    if not src_split.exists():
        print("  WARNING: source split missing:", src_split)
        continue

    # collect all image files under this split (search recursively)
    all_files = [p for p in src_split.rglob('*') if p.is_file() and p.suffix.lower() in IMG_EXTS]
    print(f"\nProcessing split={split}, found {len(all_files)} image files")

    created = Counter()
    for f in all_files:
        name = f.name
        # primary case: '...___LABEL <number>.ext' (PlantVillage)
        if '___' in name:
            after = name.split('___',1)[1]                # e.g. "RS_Erly.B 8338.JPG"
            # remove extension
            after_noext = os.path.splitext(after)[0]     # "RS_Erly.B 8338"
            # remove trailing numeric id tokens (e.g. " 8338")
            after_noid = re.sub(r'\s+\d+$', '', after_noext)
            label_raw = after_noid.strip()
            label = sanitize_label(label_raw)
        else:
            # fallback: try prefix before first underscore or put into unknown
            if '_' in name:
                label = sanitize_label(name.split('_')[0])
            else:
                label = 'unknown'
        # create destination and copy
        dest_dir = dest_split / label
        dest_dir.mkdir(parents=True, exist_ok=True)
        dest_path = dest_dir / name
        # if file already exists (duplicate names), add suffix
        if dest_path.exists():
            base, ext = os.path.splitext(name)
            i = 1
            while True:
                newname = f"{base}_{i}{ext}"
                if not (dest_dir / newname).exists():
                    dest_path = dest_dir / newname
                    break
                i += 1
        shutil.copy2(str(f), str(dest_path))
        created[label] += 1

    print(f"  Created {len(created)} class folders for split {split}. Sample classes (first 40):")
    for cls, cnt in list(created.items())[:40]:
        print("   ", cls, cnt)

print("\nDone. Fixed dataset created at:", FIXED_BASE)
print("You may now run the image_dataset_from_directory cell pointed at /content/plant_disease_fixed")


SRC_BASE exists: True

Processing split=train, found 37570 image files
  Created 541 class folders for split train. Sample classes (first 40):
    YLCV_NREC 1376
    RS_Late.B 1654
    RS_Erly.B 1840
    RS_HL 2140
    JR_HL 2670
    GHLB_PS_Leaf_30_Day 4
    RS_Early.B 1846
    Com.G_SpM_FL 3088
    UF.GRC_YLCV_Lab 2888
    GHLB2_Leaf 1090
    NREC_B.Spot 748
    UF.GRC_BS_Lab_Leaf 674
    YLCV_GCREC 1566
    Com.G_TgS_FL 2536
    Crnl_L.Mold 1752
    GH_HL_Leaf 574
    GCREC_Bact.Sp 3156
    JR_Sept.L.S 914
    RS_LB 1796
    JR_B.Spot 1066
    GH_HL_Leaf_310.1 2
    Keller.St_CG 518
    Matt.S_CG 1836
    PSU_CG 660
    GHLB_PS_Leaf_1_Day 10
    GH_HL_Leaf_495.1 2
    GH_HL_Leaf_226.6 2
    GHLB_Leaf_2.1_Day 4
    GHLB_PS_Leaf_17.1_Day 4
    GHLB_Leaf_10_Day 2
    GH_HL_Leaf_469.1 2
    GH_HL_Leaf_466.1 4
    GHLB2ES_Leaf_62.1 2
    GH_HL_Leaf_264.1 2
    GH_HL_Leaf_317.1 2
    GHLB_PS_Leaf_23.7_Day 4
    GHLB_PS_Leaf_35.1_Day 2
    GHLB2_Leaf_125.2 2
    GHLB2_Leaf_154.4 2
    GH_H

In [None]:
# Cell B — create datasets from fixed path and show detected classes
import tensorflow as tf, json, os
from tensorflow import keras
from tensorflow.keras import layers

DATA_DIR = '/content/plant_disease_fixed'
IMG_SIZE = (224,224)
BATCH_SIZE = 32   # lower to 16/8 if OOM
AUTOTUNE = tf.data.AUTOTUNE

train_dir = os.path.join(DATA_DIR, 'train')
val_dir = os.path.join(DATA_DIR, 'val')
test_dir = os.path.join(DATA_DIR, 'test')

print("Checking folders exist:", os.path.exists(train_dir), os.path.exists(val_dir), os.path.exists(test_dir))

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=True)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    val_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

class_names = train_ds.class_names
print("Detected classes:", len(class_names))
print("Sample class names (first 50):", class_names[:50])

# save classes.json to Drive models folder (if needed)
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)
with open(os.path.join(MODELS_DIR,'classes.json'),'w') as f:
    json.dump(class_names, f)

# Prefetch
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

# print number of batches
try:
    print("Train batches:", tf.data.experimental.cardinality(train_ds).numpy())
except Exception:
    pass


Checking folders exist: True True True
Found 37570 files belonging to 541 classes.
Found 11468 files belonging to 198 classes.
Found 11516 files belonging to 181 classes.
Detected classes: 541
Sample class names (first 50): ['2700323949_95aa2eaa01_o', 'CG1', 'Com.G_SpM_FL', 'Com.G_TgS_FL', 'Crnl_L.Mold', 'GCREC_Bact.Sp', 'GHLB2ES_Leaf', 'GHLB2ES_Leaf_119.1', 'GHLB2ES_Leaf_119.2', 'GHLB2ES_Leaf_136.1', 'GHLB2ES_Leaf_138.1', 'GHLB2ES_Leaf_139.1', 'GHLB2ES_Leaf_141.1', 'GHLB2ES_Leaf_62.1', 'GHLB2ES_Leaf_63.1', 'GHLB2ES_Leaf_65.1', 'GHLB2ES_Leaf_66.1', 'GHLB2ES_Leaf_69.1', 'GHLB2ES_Leaf_69.3', 'GHLB2_Leaf', 'GHLB2_Leaf_100.1', 'GHLB2_Leaf_101.1', 'GHLB2_Leaf_101.2', 'GHLB2_Leaf_101.3', 'GHLB2_Leaf_105.1', 'GHLB2_Leaf_106.1', 'GHLB2_Leaf_107.1', 'GHLB2_Leaf_107.3', 'GHLB2_Leaf_108.1', 'GHLB2_Leaf_109.1', 'GHLB2_Leaf_111.1', 'GHLB2_Leaf_111.2', 'GHLB2_Leaf_112.1', 'GHLB2_Leaf_113.2', 'GHLB2_Leaf_113.3', 'GHLB2_Leaf_113.4', 'GHLB2_Leaf_114.1', 'GHLB2_Leaf_114.3', 'GHLB2_Leaf_115.1', 'GHLB2_Le

In [None]:
# Cell 1 — recreate tf.data from fixed dataset
import tensorflow as tf, json, os
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path

DATA_DIR = '/content/plant_disease_fixed'   # <- fixed dataset
IMG_SIZE = (224,224)
BATCH_SIZE = 32   # if OOM, set to 16 or 8
AUTOTUNE = tf.data.AUTOTUNE

train_dir = os.path.join(DATA_DIR, 'train')
val_dir = os.path.join(DATA_DIR, 'val')
test_dir = os.path.join(DATA_DIR, 'test')

train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=True)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    val_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir, labels='inferred', label_mode='categorical',
    image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False)

class_names = train_ds.class_names
num_classes = len(class_names)
print("Detected classes:", num_classes)
print("Sample classes (first 40):", class_names[:40])

# save classes mapping to Drive
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)
with open(os.path.join(MODELS_DIR,'classes.json'),'w') as f:
    json.dump(class_names, f)

# prefetch
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)


Found 37570 files belonging to 541 classes.
Found 11468 files belonging to 198 classes.
Found 11516 files belonging to 181 classes.
Detected classes: 541
Sample classes (first 40): ['2700323949_95aa2eaa01_o', 'CG1', 'Com.G_SpM_FL', 'Com.G_TgS_FL', 'Crnl_L.Mold', 'GCREC_Bact.Sp', 'GHLB2ES_Leaf', 'GHLB2ES_Leaf_119.1', 'GHLB2ES_Leaf_119.2', 'GHLB2ES_Leaf_136.1', 'GHLB2ES_Leaf_138.1', 'GHLB2ES_Leaf_139.1', 'GHLB2ES_Leaf_141.1', 'GHLB2ES_Leaf_62.1', 'GHLB2ES_Leaf_63.1', 'GHLB2ES_Leaf_65.1', 'GHLB2ES_Leaf_66.1', 'GHLB2ES_Leaf_69.1', 'GHLB2ES_Leaf_69.3', 'GHLB2_Leaf', 'GHLB2_Leaf_100.1', 'GHLB2_Leaf_101.1', 'GHLB2_Leaf_101.2', 'GHLB2_Leaf_101.3', 'GHLB2_Leaf_105.1', 'GHLB2_Leaf_106.1', 'GHLB2_Leaf_107.1', 'GHLB2_Leaf_107.3', 'GHLB2_Leaf_108.1', 'GHLB2_Leaf_109.1', 'GHLB2_Leaf_111.1', 'GHLB2_Leaf_111.2', 'GHLB2_Leaf_112.1', 'GHLB2_Leaf_113.2', 'GHLB2_Leaf_113.3', 'GHLB2_Leaf_113.4', 'GHLB2_Leaf_114.1', 'GHLB2_Leaf_114.3', 'GHLB2_Leaf_115.1', 'GHLB2_Leaf_115.2']


In [None]:
# Cell 2 — inspect class counts and (optionally) filter small classes
from collections import Counter
import glob

MIN_SAMPLES = 20   # adjust: keep classes with at least this many images total (train+val+test)
BASE = Path('/content/plant_disease_fixed')
classes = sorted([d.name for d in (BASE/'train').iterdir() if d.is_dir()])

counts = {}
for c in classes:
    total = sum(len(list((BASE/s).glob(f'{c}/*'))) for s in ['train','val','test'])
    counts[c] = total

# print summary
sorted_counts = sorted(counts.items(), key=lambda x: -x[1])
print("Top 40 classes by total image count:")
for name, cnt in sorted_counts[:40]:
    print(" ", name, cnt)
print("\nClasses with < {} images: {}".format(MIN_SAMPLES, sum(1 for _,v in counts.items() if v < MIN_SAMPLES)))

# If you want to keep only classes >= MIN_SAMPLES, set FILTER=True
FILTER = False   # set to True if you want to prune small classes
if FILTER:
    keep = [name for name,c in counts.items() if c >= MIN_SAMPLES]
    print("Keeping", len(keep), "classes.")
    # build new reduced folder at /content/plant_disease_pruned
    import shutil
    PRUNED = Path('/content/plant_disease_pruned')
    if PRUNED.exists(): shutil.rmtree(PRUNED)
    PRUNED.mkdir(parents=True, exist_ok=True)
    for split in ['train','val','test']:
        (PRUNED/split).mkdir(parents=True, exist_ok=True)
        for c in keep:
            src = BASE/ split / c
            if src.exists():
                shutil.copytree(src, PRUNED/split/c)
    print("Pruned dataset created at", PRUNED)
    # If you pruned, re-create datasets from PRUNED path (update DATA_DIR)


Top 40 classes by total image count:
  GCREC_Bact.Sp 5152
  Com.G_SpM_FL 4822
  UF.GRC_YLCV_Lab 4572
  JR_HL 4370
  Com.G_TgS_FL 4122
  RS_HL 3344
  RS_LB 2962
  RS_Early.B 2960
  Matt.S_CG 2956
  RS_Erly.B 2904
  Crnl_L.Mold 2818
  RS_Late.B 2686
  YLCV_GCREC 2608
  YLCV_NREC 2202
  GHLB2_Leaf 1804
  JR_B.Spot 1786
  JR_Sept.L.S 1452
  NREC_B.Spot 1188
  PSU_CG 1110
  UF.GRC_BS_Lab_Leaf 1076
  GH_HL_Leaf 938
  Keller.St_CG 824
  GHLB2ES_Leaf 48
  GHLB_PS_Leaf_1_Day 20
  GHLB_Leaf_2_Day 16
  GHLB_PS_Leaf_24_Day 16
  GHLB_PS_Leaf_8.1_Day 14
  GHLB_PS_Leaf_1.2_Day 12
  GHLB_Leaf_1_Day 10
  GHLB_Leaf_2.1_Day 10
  GHLB_Leaf_23_Day 10
  GHLB_PS_Leaf_1.5_Day 10
  GHLB_PS_Leaf_2.1_Day 10
  GHLB_PS_Leaf_2_Day 10
  GHLB_PS_Leaf_8_Day 10
  GHLB_Leaf_1.2_Day 8
  GHLB_Leaf_23.1_Day 8
  GHLB_Leaf_23.4_Day 8
  GHLB_PS_Leaf_2.2_Day 8
  GHLB_PS_Leaf_2.3_Day 8

Classes with < 20 images: 517


In [None]:
# Cell 3 — imports and augmentation layer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os

IMG_SIZE = (224,224)
data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.05),
    layers.RandomZoom(0.05),
], name='data_augmentation')


In [None]:
# Cell 4 — baseline CNN (num_classes from dataset)
def make_baseline_cnn(input_shape=IMG_SIZE+(3,), num_classes=num_classes):
    inputs = keras.Input(shape=input_shape)
    x = data_augmentation(inputs)
    x = layers.Rescaling(1./255)(x)
    x = layers.Conv2D(32,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(64,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(128,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='baseline_cnn')
    return model

baseline = make_baseline_cnn()
baseline.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
baseline.summary()


In [None]:
# Rebuild train/val/test datasets with a unified class mapping (fixes label dimension mismatch)
import os, json
from pathlib import Path
import tensorflow as tf
from tensorflow import keras

BASE = Path('/content/plant_disease_fixed')
IMG_SIZE = (224,224)
BATCH_SIZE = 32   # reduce to 16 or 8 if OOM
AUTOTUNE = tf.data.AUTOTUNE
IMG_EXTS = {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}

# 1) find union of class folders across train/val/test
splits = {'train': BASE/'train', 'val': BASE/'val', 'test': BASE/'test'}
class_set = set()
for sname, sdir in splits.items():
    if sdir.exists():
        for d in sdir.iterdir():
            if d.is_dir():
                class_set.add(d.name)
class_names = sorted(list(class_set))
num_classes = len(class_names)
print("Unified num_classes:", num_classes)

# mapping
class_to_idx = {c:i for i,c in enumerate(class_names)}

# 2) helper: gather files and numeric labels for a split
def gather_files_and_labels(split_dir):
    files = []
    labels = []
    sd = Path(split_dir)
    if not sd.exists():
        return files, labels
    for cls in class_names:
        folder = sd / cls
        if folder.exists():
            for f in folder.rglob('*'):
                if f.is_file() and f.suffix.lower() in IMG_EXTS:
                    files.append(str(f))
                    labels.append(class_to_idx[cls])
    return files, labels

train_files, train_labels = gather_files_and_labels(splits['train'])
val_files, val_labels     = gather_files_and_labels(splits['val'])
test_files, test_labels   = gather_files_and_labels(splits['test'])

print("Counts -> train:", len(train_files), "val:", len(val_files), "test:", len(test_files))

# quick failure check
if len(train_files) == 0:
    raise RuntimeError("No training files found. Check /content/plant_disease_fixed/train")

# 3) build tf.data pipelines from file paths + labels with consistent one-hot labels
def make_dataset(files, labels, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((files, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(10000, max(1024, len(files))))
    def _load(path, label):
        img = tf.io.read_file(path)
        img = tf.io.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, IMG_SIZE)
        img = tf.cast(img, tf.float32) / 255.0
        label = tf.one_hot(label, depth=num_classes)
        return img, label
    ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_dataset(train_files, train_labels, shuffle=True)
val_ds   = make_dataset(val_files, val_labels, shuffle=False)
test_ds  = make_dataset(test_files, test_labels, shuffle=False)

# 4) save unified class list to Drive for later
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)
with open(os.path.join(MODELS_DIR,'classes_unified.json'),'w') as f:
    json.dump(class_names, f)

# 5) sanity checks
print("train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("val batches:  ", tf.data.experimental.cardinality(val_ds).numpy())
print("test batches: ", tf.data.experimental.cardinality(test_ds).numpy())
# print first batch shapes
for imgs, labs in train_ds.take(1):
    print("sample batch imgs shape:", imgs.shape, "labels shape:", labs.shape)
    break

print("Unified mapping saved to:", os.path.join(MODELS_DIR,'classes_unified.json'))


Unified num_classes: 587
Counts -> train: 37570 val: 11468 test: 11516
train batches: 1175
val batches:   359
test batches:  360
sample batch imgs shape: (32, 224, 224, 3) labels shape: (32, 587)
Unified mapping saved to: /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models/classes_unified.json


In [None]:
# Cell A — load unified classes and confirm num_classes
import json, os
from pathlib import Path
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
cls_file = os.path.join(MODELS_DIR, 'classes_unified.json')

if os.path.exists(cls_file):
    with open(cls_file, 'r') as f:
        class_names = json.load(f)
    print("Loaded classes_unified.json with", len(class_names), "classes")
else:
    # fallback: infer from train folders
    BASE = Path('/content/plant_disease_fixed')
    train_dir = BASE/'train'
    class_names = sorted([d.name for d in train_dir.iterdir() if d.is_dir()])
    print("Inferred class_names from train folder:", len(class_names), "classes")

num_classes = len(class_names)
print("num_classes =", num_classes)


Loaded classes_unified.json with 587 classes
num_classes = 587


In [None]:
# Cell B — ensure train_ds/val_ds/test_ds are the unified ones (rebuild if needed)
import tensorflow as tf, os
from pathlib import Path

BASE = Path('/content/plant_disease_fixed')
IMG_SIZE = (224,224)
BATCH_SIZE = 32   # lower to 16/8 if OOM
AUTOTUNE = tf.data.AUTOTUNE
IMG_EXTS = {'.jpg','.jpeg','.png','.bmp','.tif','.tiff'}

# build mapping (class_names must be present from previous cell)
class_to_idx = {c:i for i,c in enumerate(class_names)}
def gather_files_and_labels(split_dir):
    files, labels = [], []
    p = Path(split_dir)
    if not p.exists(): return files, labels
    for cls in class_names:
        folder = p/cls
        if folder.exists():
            for f in folder.rglob('*'):
                if f.is_file() and f.suffix.lower() in IMG_EXTS:
                    files.append(str(f))
                    labels.append(class_to_idx[cls])
    return files, labels

train_files, train_labels = gather_files_and_labels(BASE/'train')
val_files, val_labels     = gather_files_and_labels(BASE/'val')
test_files, test_labels   = gather_files_and_labels(BASE/'test')

print("Counts -> train:", len(train_files), "val:", len(val_files), "test:", len(test_files))

def make_dataset(files, labels, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((files, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(10000, max(1024, len(files))))
    def _load(path, label):
        img = tf.io.read_file(path)
        img = tf.io.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, IMG_SIZE)
        img = tf.cast(img, tf.float32) / 255.0
        label = tf.one_hot(label, depth=num_classes)
        return img, label
    ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_dataset(train_files, train_labels, shuffle=True)
val_ds   = make_dataset(val_files, val_labels, shuffle=False)
test_ds  = make_dataset(test_files, test_labels, shuffle=False)

# quick sanity
for imgs, labs in train_ds.take(1):
    print("batch imgs", imgs.shape, "labels", labs.shape)


Counts -> train: 37570 val: 11468 test: 11516
batch imgs (32, 224, 224, 3) labels (32, 587)


In [None]:
# Cell C — (re)create baseline using the correct num_classes
from tensorflow import keras
from tensorflow.keras import layers
IMG_SIZE = (224,224)

def make_baseline_cnn(input_shape=IMG_SIZE+(3,), num_classes=num_classes):
    inputs = keras.Input(shape=input_shape)
    x = layers.RandomFlip("horizontal")(inputs)
    x = layers.RandomRotation(0.05)(x)
    x = layers.RandomZoom(0.05)(x)
    x = layers.Rescaling(1./255)(x)
    x = layers.Conv2D(32,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(64,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.Conv2D(128,3,activation='relu',padding='same')(x)
    x = layers.MaxPool2D()(x)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='baseline_cnn')
    return model

baseline = make_baseline_cnn()
baseline.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
baseline.summary()


In [None]:
# Cell D — train baseline (same callbacks as before)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)

baseline_ckpt = os.path.join(MODELS_DIR, 'disease_baseline_best.h5')
callbacks = [
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    ModelCheckpoint(baseline_ckpt, monitor='val_loss', save_best_only=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

EPOCHS = 20
history_baseline = baseline.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=callbacks)
baseline.save(os.path.join(MODELS_DIR,'disease_baseline.h5'))
print("Saved baseline model to", MODELS_DIR)


Epoch 1/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.1953 - loss: 3.1000



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 59ms/step - accuracy: 0.1953 - loss: 3.0999 - val_accuracy: 0.0778 - val_loss: 3.6617 - learning_rate: 0.0010
Epoch 2/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.1848 - loss: 2.6573



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 58ms/step - accuracy: 0.1848 - loss: 2.6573 - val_accuracy: 0.0991 - val_loss: 3.4346 - learning_rate: 0.0010
Epoch 3/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.2730 - loss: 2.4922



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.2729 - loss: 2.4922 - val_accuracy: 0.1071 - val_loss: 3.3060 - learning_rate: 0.0010
Epoch 4/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.2933 - loss: 2.4154



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 58ms/step - accuracy: 0.2933 - loss: 2.4155 - val_accuracy: 0.1317 - val_loss: 3.2322 - learning_rate: 0.0010
Epoch 5/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.3157 - loss: 2.3348



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 58ms/step - accuracy: 0.3157 - loss: 2.3349 - val_accuracy: 0.1828 - val_loss: 3.0932 - learning_rate: 0.0010
Epoch 6/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.3614 - loss: 2.2032



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 56ms/step - accuracy: 0.3614 - loss: 2.2032 - val_accuracy: 0.1765 - val_loss: 3.0397 - learning_rate: 0.0010
Epoch 7/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.3712 - loss: 2.1060



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 59ms/step - accuracy: 0.3711 - loss: 2.1062 - val_accuracy: 0.1918 - val_loss: 2.9746 - learning_rate: 0.0010
Epoch 8/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.3722 - loss: 2.0811



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.3722 - loss: 2.0811 - val_accuracy: 0.1981 - val_loss: 2.9161 - learning_rate: 0.0010
Epoch 9/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 57ms/step - accuracy: 0.3862 - loss: 2.0531 - val_accuracy: 0.1972 - val_loss: 2.9280 - learning_rate: 0.0010
Epoch 10/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.3940 - loss: 2.0015



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.3940 - loss: 2.0015 - val_accuracy: 0.2039 - val_loss: 2.8680 - learning_rate: 0.0010
Epoch 11/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.4009 - loss: 1.9315



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.4009 - loss: 1.9316 - val_accuracy: 0.2122 - val_loss: 2.8106 - learning_rate: 0.0010
Epoch 12/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.3993 - loss: 1.9374 - val_accuracy: 0.2107 - val_loss: 2.8473 - learning_rate: 0.0010
Epoch 13/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4106 - loss: 1.9105



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.4106 - loss: 1.9106 - val_accuracy: 0.2349 - val_loss: 2.7738 - learning_rate: 0.0010
Epoch 14/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.4165 - loss: 1.8749



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 57ms/step - accuracy: 0.4165 - loss: 1.8751 - val_accuracy: 0.2417 - val_loss: 2.7507 - learning_rate: 0.0010
Epoch 15/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.4259 - loss: 1.8538



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 57ms/step - accuracy: 0.4259 - loss: 1.8539 - val_accuracy: 0.2450 - val_loss: 2.7286 - learning_rate: 0.0010
Epoch 16/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.4257 - loss: 1.8281



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 59ms/step - accuracy: 0.4257 - loss: 1.8283 - val_accuracy: 0.2360 - val_loss: 2.7124 - learning_rate: 0.0010
Epoch 17/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4328 - loss: 1.7967



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 58ms/step - accuracy: 0.4328 - loss: 1.7968 - val_accuracy: 0.2513 - val_loss: 2.6967 - learning_rate: 0.0010
Epoch 18/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4462 - loss: 1.7680



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.4462 - loss: 1.7681 - val_accuracy: 0.2478 - val_loss: 2.6520 - learning_rate: 0.0010
Epoch 19/20
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.4529 - loss: 1.7308



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 57ms/step - accuracy: 0.4529 - loss: 1.7309 - val_accuracy: 0.2769 - val_loss: 2.5106 - learning_rate: 0.0010
Epoch 20/20
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.4613 - loss: 1.7006



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 58ms/step - accuracy: 0.4613 - loss: 1.7007 - val_accuracy: 0.2865 - val_loss: 2.4972 - learning_rate: 0.0010




Saved baseline model to /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models


In [None]:
# Cell E — build ResNet model using same num_classes (train top, then fine-tune)
from tensorflow.keras.applications import ResNet50
from tensorflow import keras
from tensorflow.keras import layers
import os

def make_resnet_model(input_shape=IMG_SIZE+(3,), num_classes=num_classes, base_trainable=False):
    base = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape, pooling='avg')
    base.trainable = base_trainable
    inputs = keras.Input(shape=input_shape)
    x = layers.RandomFlip("horizontal")(inputs)
    x = layers.RandomRotation(0.05)(x)
    x = layers.RandomZoom(0.05)(x)
    x = layers.Rescaling(1./255)(x)
    x = base(x, training=False)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='resnet50_transfer')
    return model

resnet_model = make_resnet_model(base_trainable=False)
resnet_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])
resnet_model.summary()

# Train top layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
resnet_ckpt = os.path.join(MODELS_DIR, 'disease_resnet50_top_best.h5')
callbacks = [EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
             ModelCheckpoint(resnet_ckpt, monitor='val_loss', save_best_only=True),
             ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)]
history_resnet_top = resnet_model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=callbacks)
resnet_model.save(os.path.join(MODELS_DIR,'disease_resnet50_top.h5'))


Epoch 1/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.1965 - loss: 2.7374



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 172ms/step - accuracy: 0.1965 - loss: 2.7375 - val_accuracy: 0.0778 - val_loss: 3.8467 - learning_rate: 0.0010
Epoch 2/10
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 130ms/step - accuracy: 0.1831 - loss: 2.6440



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 170ms/step - accuracy: 0.1831 - loss: 2.6442 - val_accuracy: 0.0778 - val_loss: 3.5847 - learning_rate: 0.0010
Epoch 3/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 167ms/step - accuracy: 0.1861 - loss: 2.6840 - val_accuracy: 0.0778 - val_loss: 3.6888 - learning_rate: 0.0010
Epoch 4/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 167ms/step - accuracy: 0.1791 - loss: 2.7483 - val_accuracy: 0.0778 - val_loss: 3.7202 - learning_rate: 0.0010
Epoch 5/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 167ms/step - accuracy: 0.1754 - loss: 2.7577 - val_accuracy: 0.0778 - val_loss: 3.6885 - learning_rate: 0.0010
Epoch 6/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 167ms/step - accuracy: 0.1421 - loss: 3.0354 - val_accuracy: 0.0778 - val_loss: 3.7



In [None]:
# Cell F — unfreeze last N layers and fine-tune
# find base by name
base_model = None
for layer in resnet_model.layers:
    if 'resnet50' in layer.name:
        base_model = layer
        break
if base_model is None:
    base_model = resnet_model.layers[3]

N = 20
for layer in base_model.layers[:-N]:
    layer.trainable = False
for layer in base_model.layers[-N:]:
    layer.trainable = True

resnet_model.compile(optimizer=keras.optimizers.Adam(1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
history_resnet_ft = resnet_model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=[
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    ModelCheckpoint(os.path.join(MODELS_DIR,'disease_resnet50_ft_best.h5'), monitor='val_loss', save_best_only=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
])
resnet_model.save(os.path.join(MODELS_DIR,'disease_resnet50.h5'))


Epoch 1/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 0.1249 - loss: 2.7929



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 201ms/step - accuracy: 0.1249 - loss: 2.7929 - val_accuracy: 0.0778 - val_loss: 3.7034 - learning_rate: 1.0000e-05
Epoch 2/10
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 157ms/step - accuracy: 0.1608 - loss: 2.8320



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 198ms/step - accuracy: 0.1608 - loss: 2.8319 - val_accuracy: 0.0778 - val_loss: 3.6992 - learning_rate: 1.0000e-05
Epoch 3/10
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 157ms/step - accuracy: 0.1797 - loss: 2.6868



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 197ms/step - accuracy: 0.1797 - loss: 2.6870 - val_accuracy: 0.0778 - val_loss: 3.6526 - learning_rate: 1.0000e-05
Epoch 4/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 194ms/step - accuracy: 0.1733 - loss: 2.7219 - val_accuracy: 0.0778 - val_loss: 3.7027 - learning_rate: 1.0000e-05
Epoch 5/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 194ms/step - accuracy: 0.1819 - loss: 2.6875 - val_accuracy: 0.0778 - val_loss: 3.7469 - learning_rate: 1.0000e-05
Epoch 6/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 227ms/step - accuracy: 0.1801 - loss: 2.6573 - val_accuracy: 0.0778 - val_loss: 3.7552 - learning_rate: 1.0000e-05
Epoch 7/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 193ms/step - accuracy: 0.1657 - loss: 2.7664 - val_accuracy: 0.0778



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 199ms/step - accuracy: 0.1734 - loss: 2.7192 - val_accuracy: 0.0778 - val_loss: 3.6515 - learning_rate: 5.0000e-06
Epoch 10/10
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 194ms/step - accuracy: 0.1769 - loss: 2.7158 - val_accuracy: 0.0778 - val_loss: 3.6642 - learning_rate: 5.0000e-06




In [None]:
# EfficientNetB0 + Focal Loss + Strong Augmentation training cell
import tensorflow as tf, os, json, time
from tensorflow import keras
from tensorflow.keras import layers
MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
os.makedirs(MODELS_DIR, exist_ok=True)

IMG_SIZE = (224,224)
BATCH_SIZE = 24   # try 24; if OOM, drop to 16 or 12
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# load unified classes (must exist)
with open(os.path.join(MODELS_DIR,'classes_unified.json'),'r') as f:
    class_names = json.load(f)
num_classes = len(class_names)
print("num_classes:", num_classes)

num_classes: 587


In [None]:
# --- strong augmentation (training-only)
data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.08),
    layers.RandomZoom(0.08),
    layers.RandomContrast(0.15),
], name='strong_augmentation')

In [None]:
# --- focal loss (categorical)
def categorical_focal_loss(gamma=2.0, alpha=0.25):
    # expects probabilities (after softmax) and one-hot ground truth
    def loss_fn(y_true, y_pred):
        # clip to avoid NaNs
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0 - 1e-7)
        cross_entropy = -y_true * tf.math.log(y_pred)
        weight = alpha * tf.pow(1 - y_pred, gamma)
        loss = weight * cross_entropy
        return tf.reduce_sum(loss, axis=-1)
    return loss_fn

In [None]:
# --- Build model
def make_efficientnet(input_shape=IMG_SIZE+(3,), num_classes=num_classes, base_trainable=False):
    base = tf.keras.applications.EfficientNetB0(
        include_top=False, weights='imagenet', input_shape=input_shape, pooling='avg'
    )
    base.trainable = base_trainable
    inputs = keras.Input(shape=input_shape)
    x = data_augmentation(inputs)
    x = layers.Rescaling(1./255)(x)
    x = base(x, training=False)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(512, activation='swish')(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    model = keras.Model(inputs, outputs, name='efficientnetb0_transfer')
    return model


In [None]:
# make model (head training)
model = make_efficientnet(base_trainable=False)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=categorical_focal_loss(gamma=2.0, alpha=0.25),
    metrics=['accuracy']
)
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Callbacks
callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7),
    keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR,'disease_efficientnet_head_best.h5'), monitor='val_loss', save_best_only=True)
]

# train head
EPOCHS_HEAD = 8
history_head = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_HEAD, callbacks=callbacks)

# Save head
model.save(os.path.join(MODELS_DIR,'disease_efficientnet_head.h5'))

Epoch 1/8
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 78ms/step - accuracy: 0.1839 - loss: 0.5706



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 116ms/step - accuracy: 0.1839 - loss: 0.5707 - val_accuracy: 0.0778 - val_loss: 0.8796 - learning_rate: 0.0010
Epoch 2/8
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 80ms/step - accuracy: 0.1919 - loss: 0.5420



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 103ms/step - accuracy: 0.1919 - loss: 0.5421 - val_accuracy: 0.0778 - val_loss: 0.8120 - learning_rate: 0.0010
Epoch 3/8
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 115ms/step - accuracy: 0.1932 - loss: 0.5290 - val_accuracy: 0.0778 - val_loss: 0.8819 - learning_rate: 0.0010
Epoch 4/8
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 101ms/step - accuracy: 0.1861 - loss: 0.5411 - val_accuracy: 0.0778 - val_loss: 0.8693 - learning_rate: 0.0010
Epoch 5/8
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 102ms/step - accuracy: 0.1896 - loss: 0.5419 - val_accuracy: 0.0778 - val_loss: 0.8891 - learning_rate: 0.0010
Epoch 6/8
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 115ms/step - accuracy: 0.1851 - loss: 0.5552 - val_accuracy: 0.0778 - val_loss: 0.8685 -



In [None]:
# --- Fine-tune: unfreeze last blocks of EfficientNet base
# get base
base = None
for layer in model.layers:
    if 'efficientnet' in layer.name or 'efficientnetb0' in layer.name.lower():
        base = layer
        break
# fallback: find first functional layer with many layers
if base is None:
    base = model.layers[3]

# Unfreeze last N layers of base; choose N depending on available GPU/memory
N = 60   # number of layers from the end of the base to unfreeze; lower if OOM
print("Total layers in base:", len(base.layers))
for layer in base.layers[:-N]:
    layer.trainable = False
for layer in base.layers[-N:]:
    layer.trainable = True

# recompile with lower LR
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=categorical_focal_loss(gamma=2.0, alpha=0.25),
    metrics=['accuracy']
)

# fine-tune
EPOCHS_FINE = 12
callbacks_ft = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7),
    keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR,'disease_efficientnet_ft_best.h5'), monitor='val_loss', save_best_only=True)
]

history_ft = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS_FINE, callbacks=callbacks_ft)

# Save final model
model.save(os.path.join(MODELS_DIR,'disease_efficientnet_final.h5'))

Total layers in base: 239
Epoch 1/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 111ms/step - accuracy: 0.0611 - loss: 2.2432



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 135ms/step - accuracy: 0.0612 - loss: 2.2423 - val_accuracy: 0.0778 - val_loss: 0.8649 - learning_rate: 1.0000e-05
Epoch 2/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 110ms/step - accuracy: 0.0786 - loss: 1.2703



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 133ms/step - accuracy: 0.0787 - loss: 1.2697 - val_accuracy: 0.0778 - val_loss: 0.8506 - learning_rate: 1.0000e-05
Epoch 3/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 109ms/step - accuracy: 0.0941 - loss: 0.7587



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 132ms/step - accuracy: 0.0941 - loss: 0.7586 - val_accuracy: 0.0778 - val_loss: 0.8489 - learning_rate: 1.0000e-05
Epoch 4/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 109ms/step - accuracy: 0.0979 - loss: 0.7464



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 145ms/step - accuracy: 0.0980 - loss: 0.7463 - val_accuracy: 0.0778 - val_loss: 0.8479 - learning_rate: 1.0000e-05
Epoch 5/12
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 144ms/step - accuracy: 0.1178 - loss: 0.7219 - val_accuracy: 0.0778 - val_loss: 0.8532 - learning_rate: 1.0000e-05
Epoch 6/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 108ms/step - accuracy: 0.1076 - loss: 0.7287



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 131ms/step - accuracy: 0.1076 - loss: 0.7286 - val_accuracy: 0.0778 - val_loss: 0.8423 - learning_rate: 1.0000e-05
Epoch 7/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 109ms/step - accuracy: 0.1105 - loss: 0.7097



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 132ms/step - accuracy: 0.1106 - loss: 0.7097 - val_accuracy: 0.0778 - val_loss: 0.8322 - learning_rate: 1.0000e-05
Epoch 8/12
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 130ms/step - accuracy: 0.1247 - loss: 0.6940 - val_accuracy: 0.0778 - val_loss: 0.8363 - learning_rate: 1.0000e-05
Epoch 9/12
[1m1174/1175[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 108ms/step - accuracy: 0.1273 - loss: 0.6838



[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 145ms/step - accuracy: 0.1273 - loss: 0.6837 - val_accuracy: 0.0778 - val_loss: 0.8147 - learning_rate: 1.0000e-05
Epoch 10/12
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 131ms/step - accuracy: 0.1392 - loss: 0.6622 - val_accuracy: 0.0778 - val_loss: 0.8333 - learning_rate: 1.0000e-05
Epoch 11/12
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 131ms/step - accuracy: 0.1349 - loss: 0.6733 - val_accuracy: 0.0778 - val_loss: 0.8261 - learning_rate: 1.0000e-05
Epoch 12/12
[1m1175/1175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 143ms/step - accuracy: 0.1444 - loss: 0.6496 - val_accuracy: 0.0778 - val_loss: 0.8381 - learning_rate: 1.0000e-05




In [None]:
# Save combined training curves
import matplotlib.pyplot as plt
def save_history_plots(h, name):
    plt.figure(figsize=(8,4)); plt.plot(h.history['loss'], label='train_loss'); plt.plot(h.history['val_loss'], label='val_loss'); plt.legend(); plt.title(name+'_loss'); plt.savefig(os.path.join(MODELS_DIR,name+'_loss.png')); plt.close()
    if 'accuracy' in h.history:
        plt.figure(figsize=(8,4)); plt.plot(h.history['accuracy'], label='train_acc'); plt.plot(h.history['val_accuracy'], label='val_acc'); plt.legend(); plt.title(name+'_acc'); plt.savefig(os.path.join(MODELS_DIR,name+'_acc.png')); plt.close()

save_history_plots(history_head, 'efficientnet_head')
save_history_plots(history_ft,   'efficientnet_finetune')

In [None]:
# Save metrics: evaluate on test
loss, acc = model.evaluate(test_ds)
metrics = {'test_loss': float(loss), 'test_acc': float(acc)}
import json
with open(os.path.join(MODELS_DIR,'disease_efficientnet_metrics.json'),'w') as f:
    json.dump(metrics, f)
print("Final test loss, acc:", loss, acc)
print("Saved final model and metrics to", MODELS_DIR)

[1m360/360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 74ms/step - accuracy: 0.0098 - loss: 0.9084
Final test loss, acc: 1.1525895595550537 0.04567558318376541
Saved final model and metrics to /content/drive/MyDrive/Buildable-ML-DL-Fellowship/models


In [None]:
# Fixed Eval Cell — classification report + confusion matrix (robust to missing classes)
import os, json, numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
MODEL_PATH = os.path.join(MODELS_DIR, "disease_efficientnet_final.h5")
CLASSES_PATH = os.path.join(MODELS_DIR, "classes_unified.json")

# load model & classes
model = tf.keras.models.load_model(MODEL_PATH, compile=False)
with open(CLASSES_PATH, "r") as f:
    class_names = json.load(f)
num_classes = len(class_names)
print("Loaded model and", num_classes, "classes")

# collect predictions and true labels from test_ds (assumes test_ds yields (images, one-hot labels))
y_true = []
y_pred = []
for images, labels in test_ds:
    preds = model.predict(images, verbose=0)
    y_pred.extend(np.argmax(preds, axis=1).tolist())
    y_true.extend(np.argmax(labels.numpy(), axis=1).tolist())

y_true = np.array(y_true)
y_pred = np.array(y_pred)
print("Total test samples:", len(y_true))
unique_true = np.unique(y_true)
print("Unique class indices in test set:", len(unique_true))

# Full classification report (labels = full range so target_names length matches)
labels_all = list(range(num_classes))
report_full = classification_report(y_true, y_pred, labels=labels_all, target_names=class_names, zero_division=0)
with open(os.path.join(MODELS_DIR,'disease_classification_report_full.txt'),'w') as f:
    f.write(report_full)
print("Saved full classification report to disease_classification_report_full.txt")

# Also save a reduced report for only classes present in test (more readable)
labels_present = sorted(list(unique_true.tolist()))
target_names_present = [class_names[i] for i in labels_present]
report_present = classification_report(y_true, y_pred, labels=labels_present, target_names=target_names_present, zero_division=0)
with open(os.path.join(MODELS_DIR,'disease_classification_report_test_present.txt'),'w') as f:
    f.write(report_present)
print("Saved condensed report (only classes present in test) to disease_classification_report_test_present.txt")
print("\n--- Sample of condensed report ---\n")
print("\n".join(report_present.splitlines()[:40]))

# Confusion matrix for top-K frequent classes in test set
from collections import Counter
cnt = Counter(y_true)
topk = [c for c,_ in cnt.most_common(30)]  # change 30 -> smaller if you prefer
cm = confusion_matrix(y_true, y_pred, labels=topk)
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(cm, annot=False, fmt='d', ax=ax, cmap='viridis')
ax.set_title("Confusion matrix for top-{} classes (by frequency)".format(len(topk)))
ax.set_xticks(np.arange(len(topk))+0.5); ax.set_yticks(np.arange(len(topk))+0.5)
ax.set_xticklabels([class_names[i] for i in topk], rotation=90, fontsize=8)
ax.set_yticklabels([class_names[i] for i in topk], rotation=0, fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join(MODELS_DIR,'confusion_matrix_top{}_test.png'.format(len(topk))))
plt.close()
print("Saved confusion matrix for top-{} classes.".format(len(topk)))


Loaded model and 587 classes
Total test samples: 11516
Unique class indices in test set: 181
Saved full classification report to disease_classification_report_full.txt
Saved condensed report (only classes present in test) to disease_classification_report_test_present.txt

--- Sample of condensed report ---

                          precision    recall  f1-score   support

 2700323949_95aa2eaa01_o       0.00      0.00      0.00         2
            Com.G_SpM_FL       0.00      0.00      0.00       864
            Com.G_TgS_FL       0.00      0.00      0.00       806
             Crnl_L.Mold       0.00      0.00      0.00       530
           GCREC_Bact.Sp       0.00      0.00      0.00      1002
            GHLB2ES_Leaf       0.00      0.00      0.00         8
      GHLB2ES_Leaf_136.1       0.00      0.00      0.00         2
      GHLB2ES_Leaf_141.1       0.00      0.00      0.00         2
       GHLB2ES_Leaf_69.1       0.00      0.00      0.00         2
              GHLB2_Leaf      

In [None]:
# CELL 2 — Inference speed benchmark
import time, json

# take one batch
images, labels = next(iter(test_ds))

n_runs = 30
t0 = time.perf_counter()

for _ in range(n_runs):
    _ = model.predict(images, verbose=0)

t1 = time.perf_counter()

avg_ms_per_image = ((t1 - t0) / n_runs) * 1000 / images.shape[0]
print("Avg ms per image =", avg_ms_per_image)

# save metrics together
metrics = {
    "avg_inference_ms_per_image": float(avg_ms_per_image)
}

with open(os.path.join(MODELS_DIR, "disease_inference_metrics.json"), "w") as f:
    json.dump(metrics, f)

print("Saved inference metrics.")


Avg ms per image = 8.93150402291667
Saved inference metrics.


In [None]:
# Corrected CELL 3 — Create disease_tool.py (safe; avoids .format braces problem)
import os, json, textwrap

MODELS_DIR = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models'
MODEL_FILE = os.path.join(MODELS_DIR, "disease_efficientnet_final.h5")
CLASSES_FILE = os.path.join(MODELS_DIR, "classes_unified.json")
OUT_PATH = "/content/disease_tool.py"

code = r'''
import tensorflow as tf
import numpy as np
import json
from tensorflow.keras.preprocessing import image
from pathlib import Path

MODEL_PATH = r"{MODEL_PATH}"
CLASSES_PATH = r"{CLASSES_PATH}"

# load model and classes once
_model = None
_classes = None

def _load():
    global _model, _classes
    if _model is None:
        _model = tf.keras.models.load_model(MODEL_PATH, compile=False)
    if _classes is None:
        with open(CLASSES_PATH, "r", encoding="utf8") as f:
            _classes = json.load(f)
    return _model, _classes

IMG_SIZE = (224, 224)

def preprocess(img_path):
    img = image.load_img(img_path, target_size=IMG_SIZE)
    arr = image.img_to_array(img) / 255.0
    return np.expand_dims(arr, axis=0)

def predict_disease(img_path, top_k=3):
    model, classes = _load()
    arr = preprocess(img_path)
    preds = model.predict(arr, verbose=0)[0]
    idxs = preds.argsort()[::-1][:top_k]
    results = []
    for i in idxs:
        label = classes[i] if i < len(classes) else str(i)
        results.append({"label": label, "confidence": float(preds[i])})
    return {"predicted_label": results[0]["label"], "top_k": results, "inference_time_ms": None}

if __name__ == "__main__":
    import sys, time
    if len(sys.argv) < 2:
        print("Usage: python disease_tool.py /path/to/image.jpg")
        sys.exit(1)
    img = sys.argv[1]
    t0 = time.perf_counter()
    out = predict_disease(img, top_k=5)
    t1 = time.perf_counter()
    out["inference_time_ms"] = (t1-t0)*1000.0
    print(json.dumps(out, indent=2))
'''.strip()

# fill in paths safely
code = code.replace("{MODEL_PATH}", MODEL_FILE.replace("\\", "\\\\"))
code = code.replace("{CLASSES_PATH}", CLASSES_FILE.replace("\\", "\\\\"))

with open(OUT_PATH, "w", encoding="utf8") as f:
    f.write(code)

print("Wrote", OUT_PATH)
print("You can download this file and put it into your repo's src/ folder.")


Wrote /content/disease_tool.py
You can download this file and put it into your repo's src/ folder.


In [None]:
import os
import zipfile
from google.colab import files

MODELS_DIR = "/content/drive/MyDrive/Buildable-ML-DL-Fellowship/models"

# Files we need
files_to_collect = [
    "disease_efficientnet_final.h5",
    "disease_efficientnet_head_best.h5",
    "disease_efficientnet_ft_best.h5",
    "classes_unified.json",
    "disease_inference_metrics.json",
    "disease_classification_report_full.txt",
    "disease_classification_report_test_present.txt",
]

# Confusion matrices (all PNGs)
pngs = [f for f in os.listdir(MODELS_DIR) if f.endswith(".png")]
files_to_collect.extend(pngs)

# Add your disease_tool.py inside Colab if created
if os.path.exists("/content/disease_tool.py"):
    files_to_collect.append("/content/disease_tool.py")

# ZIP path
zip_path = "/content/final_artifacts.zip"
with zipfile.ZipFile(zip_path, "w") as zipf:
    for f in files_to_collect:
        full_path = f if f.startswith("/") else os.path.join(MODELS_DIR, f)
        if os.path.exists(full_path):
            zipf.write(full_path, os.path.basename(full_path))

# Download ZIP
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Robust Cell 0 — mount Drive, search for dataset (zip or folder), unzip if needed
from google.colab import drive
import os, shutil, zipfile, sys, glob, textwrap

# 1) mount Drive (safe even if already mounted)
drive.mount('/content/drive', force_remount=False)

DRIVE_BASE = '/content/drive/MyDrive/Buildable-ML-DL-Fellowship'
FALLBACKS = [
    '/content/plant_disease_fixed',
    '/content/plant_disease',
    DRIVE_BASE + '/plant_disease',
    DRIVE_BASE + '/plant_disease.zip',
    DRIVE_BASE + '/data/processed/plant_disease',
    DRIVE_BASE + '/data/plant_disease',
    DRIVE_BASE + '/data/plant_disease.zip',
    DRIVE_BASE + '/plant_disease.zip',
    DRIVE_BASE + '/data',
    '/content'
]

def find_dataset():
    # look for unzipped folders first
    candidates = []
    for p in FALLBACKS:
        if p.endswith('.zip'):
            if os.path.exists(p):
                candidates.append(('zip', p))
        else:
            if os.path.isdir(p):
                # check if it contains train/val/test or a folder of classes
                subdirs = os.listdir(p)
                # quick heuristic
                if any(x in subdirs for x in ['train','val','test']) or len(subdirs) > 0:
                    candidates.append(('dir', p))
    # additional: search Drive_BASE recursively for any folder named plant_disease or plant_disease_fixed
    if os.path.isdir(DRIVE_BASE):
        for name in ['plant_disease', 'plant_disease_fixed', 'data/processed/plant_disease', 'data/plant_disease']:
            full = os.path.join(DRIVE_BASE, name)
            if os.path.isdir(full):
                candidates.append(('dir', full))
        # find zip files at top level matching plant_disease*
        zips = glob.glob(os.path.join(DRIVE_BASE, 'plant_disease*.zip')) + glob.glob(os.path.join(DRIVE_BASE, 'data','plant_disease*.zip'))
        for z in zips:
            candidates.append(('zip', z))

    # dedupe preserving order
    seen = set()
    out = []
    for t,p in candidates:
        if p not in seen:
            out.append((t,p)); seen.add(p)
    return out

found = find_dataset()

if found:
    print("Found candidate datasets:")
    for t,p in found:
        print(f" - {t}: {p}")
    # Prefer a directory that already contains train/val/test
    chosen = None
    for t,p in found:
        if t == 'dir':
            # prefer one that contains train/val/test
            subs = os.listdir(p)
            if any(s in subs for s in ['train','val','test']):
                chosen = p; break
    if chosen is None:
        # if no candidate dir with splits, pick the first dir candidate
        for t,p in found:
            if t == 'dir':
                chosen = p; break
    if chosen is None:
        # if only zips, pick first zip
        for t,p in found:
            if t == 'zip':
                chosen = p; break

    # If the chosen item is a zip -> unzip into /content/plant_disease
    if chosen.endswith('.zip'):
        dest = '/content/plant_disease'
        print("\nChosen dataset zip:", chosen)
        print("Unzipping to", dest)
        # remove existing dest to avoid mixing
        if os.path.exists(dest):
            print("Removing existing", dest)
            shutil.rmtree(dest)
        os.makedirs(dest, exist_ok=True)
        with zipfile.ZipFile(chosen, 'r') as z:
            z.extractall(dest)
        # after unzipping, look for likely inner folder
        # often zip contains a folder named 'plant_disease' inside dest
        inner = os.listdir(dest)
        if len(inner) == 1 and os.path.isdir(os.path.join(dest, inner[0])):
            # flatten: use inner folder as DATA_ROOT
            DATA_ROOT = os.path.join(dest, inner[0])
        else:
            DATA_ROOT = dest
        print("Unzip complete. DATA_ROOT set to", DATA_ROOT)
    else:
        DATA_ROOT = chosen
        print("\nDATA_ROOT set to", DATA_ROOT)
else:
    print("No dataset found automatically in expected places.")
    print("\nList of files/folders under DRIVE_BASE (please upload the dataset to Drive at this path):")
    if os.path.isdir(DRIVE_BASE):
        for item in sorted(os.listdir(DRIVE_BASE))[:200]:
            print(" -", item)
    else:
        print(" DRIVE_BASE not found:", DRIVE_BASE)
    print(textwrap.dedent("""
    \nWhat to do next:
    1) If you have the dataset as a zip on your laptop, upload it to Google Drive at:
         My Drive/Buildable-ML-DL-Fellowship/plant_disease.zip
       Use Colab left-side Files -> Upload, or upload via drive.google.com.

    2) Or upload the extracted folder (plant_disease or data/processed/plant_disease) into:
         My Drive/Buildable-ML-DL-Fellowship/

    3) After upload, re-run this cell. The code will detect the zip/folder and set DATA_ROOT automatically.

    4) If you prefer to upload directly into Colab, you can use:
         from google.colab import files
         files.upload()
       and then unzip the uploaded file into /content (but Drive is recommended so you keep files persistent).
    """))
    raise FileNotFoundError("Dataset not found. Please upload dataset to Drive or Colab and re-run.")
# show a few sample paths for confirmation
print("\nSample structure under DATA_ROOT (first 40 entries):")
for root, dirs, files in os.walk(DATA_ROOT):
    print(root)
    for i,d in enumerate(dirs[:10]): print("  dir:", d)
    for i,f in enumerate(files[:10]): print("  file:", f)
    break

# expose variables for later cells
print("\nFinal DATA_ROOT =", DATA_ROOT)
PRUNED_DIR = '/content/pruned_plant_disease'
MODELS_DIR = DRIVE_BASE + '/models'
os.makedirs(MODELS_DIR, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found candidate datasets:
 - zip: /content/drive/MyDrive/Buildable-ML-DL-Fellowship/plant_disease.zip
 - dir: /content

DATA_ROOT set to /content

Sample structure under DATA_ROOT (first 40 entries):
/content
  dir: .config
  dir: drive
  dir: sample_data

Final DATA_ROOT = /content
