In [4]:
# 1. Setup and Data Loading
import kagglehub
import os
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetV2L
from tensorflow.keras.applications.efficientnet_v2 import preprocess_input
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Download dataset
print("📥 Downloading agricultural pest dataset...")
DATASET_PATH = kagglehub.dataset_download("vencerlanz09/agricultural-pests-image-dataset")
print(f"✅ Dataset downloaded to: {DATASET_PATH}")

# Configuration
IMG_SIZE = 224
BATCH_SIZE = 8  # CPU optimized
np.random.seed(42)
tf.random.set_seed(42)

print(f"🖥️ TensorFlow: {tf.__version__}")
print(f"🖥️ Running on {'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'}")
print(f"DONE")


📥 Downloading agricultural pest dataset...
✅ Dataset downloaded to: C:\Users\weiji\.cache\kagglehub\datasets\vencerlanz09\agricultural-pests-image-dataset\versions\1
🖥️ TensorFlow: 2.19.0
🖥️ Running on CPU
DONE


In [5]:
# 2. Data Preparation with Proper 70/15/15 Split 
print("🔍 Setting up data generators with correct 70/15/15 split...") 

from sklearn.model_selection import train_test_split
import glob

# Explore dataset
pest_classes = [d for d in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, d))]
NUM_CLASSES = len(pest_classes)
print(f"📊 Found {NUM_CLASSES} pest classes: {pest_classes}")

# Collect all image paths and labels
all_images = []
all_labels = []

for i, class_name in enumerate(pest_classes):
    class_path = os.path.join(DATASET_PATH, class_name)
    images = glob.glob(os.path.join(class_path, "*.jpg"))
    all_images.extend(images)
    all_labels.extend([i] * len(images))

print(f"📊 Total images found: {len(all_images)}")

# First split: 70% train, 30% temp (for val+test)
train_images, temp_images, train_labels, temp_labels = train_test_split(
    all_images, all_labels, test_size=0.3, random_state=42, stratify=all_labels
)

# Second split: Split the 30% into 15% validation and 15% test
val_images, test_images, val_labels, test_labels = train_test_split(
    temp_images, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

print(f"✅ Training samples: {len(train_images)} ({len(train_images)/len(all_images)*100:.1f}%)")
print(f"✅ Validation samples: {len(val_images)} ({len(val_images)/len(all_images)*100:.1f}%)")
print(f"✅ Test samples: {len(test_images)} ({len(test_images)/len(all_images)*100:.1f}%)")
print(f"📊 Total samples: {len(all_images)}")

# Create temporary directories for each split
import shutil
import tempfile

# Create temporary directory structure
temp_dir = tempfile.mkdtemp()
train_dir = os.path.join(temp_dir, 'train')
val_dir = os.path.join(temp_dir, 'val') 
test_dir = os.path.join(temp_dir, 'test')

# Create class subdirectories
for split_dir in [train_dir, val_dir, test_dir]:
    for class_name in pest_classes:
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

# Copy images to appropriate directories
def copy_images_to_split(image_paths, labels, target_dir):
    for img_path, label in zip(image_paths, labels):
        class_name = pest_classes[label]
        target_path = os.path.join(target_dir, class_name, os.path.basename(img_path))
        shutil.copy2(img_path, target_path)

print("📁 Creating split directories...")
copy_images_to_split(train_images, train_labels, train_dir)
copy_images_to_split(val_images, val_labels, val_dir)
copy_images_to_split(test_images, test_labels, test_dir)

# Create data generators
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    # Augmentation parameters for training only
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

# No augmentation for validation and test
val_test_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input
)

# Create generators
train_gen = train_datagen.flow_from_directory(
    train_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True,
    seed=42
)

val_gen = val_test_datagen.flow_from_directory(
    val_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False,
    seed=42
)

test_gen = val_test_datagen.flow_from_directory(
    test_dir,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False,
    seed=42
)

# Verify the split
total_samples = train_gen.samples + val_gen.samples + test_gen.samples
train_pct = train_gen.samples / total_samples * 100
val_pct = val_gen.samples / total_samples * 100
test_pct = test_gen.samples / total_samples * 100

print(f"\n📈 FINAL SPLIT VERIFICATION:")
print(f"   Training: {train_gen.samples} samples ({train_pct:.1f}%) - Target: 70%")
print(f"   Validation: {val_gen.samples} samples ({val_pct:.1f}%) - Target: 15%")
print(f"   Test: {test_gen.samples} samples ({test_pct:.1f}%) - Target: 15%")

# Verify class distribution
print(f"\n📋 Class mapping: {train_gen.class_indices}")

# Store temp directory path for cleanup later
TEMP_DIR = temp_dir
print(f"\n✅ Data split completed successfully!")

🔍 Setting up data generators with correct 70/15/15 split...
📊 Found 12 pest classes: ['ants', 'bees', 'beetle', 'catterpillar', 'earthworms', 'earwig', 'grasshopper', 'moth', 'slug', 'snail', 'wasp', 'weevil']
📊 Total images found: 5485
✅ Training samples: 3839 (70.0%)
✅ Validation samples: 823 (15.0%)
✅ Test samples: 823 (15.0%)
📊 Total samples: 5485
📁 Creating split directories...
Found 3839 images belonging to 12 classes.
Found 823 images belonging to 12 classes.
Found 823 images belonging to 12 classes.

📈 FINAL SPLIT VERIFICATION:
   Training: 3839 samples (70.0%) - Target: 70%
   Validation: 823 samples (15.0%) - Target: 15%
   Test: 823 samples (15.0%) - Target: 15%

📋 Class mapping: {'ants': 0, 'bees': 1, 'beetle': 2, 'catterpillar': 3, 'earthworms': 4, 'earwig': 5, 'grasshopper': 6, 'moth': 7, 'slug': 8, 'snail': 9, 'wasp': 10, 'weevil': 11}

✅ Data split completed successfully!


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam

model = Sequential([
    # 🔹 First Conv Block
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3), padding='valid'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # 🔹 Second Conv Block
    Conv2D(64, (3, 3), activation='relu', padding='valid'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),

    # 🔹 Third Conv Block
    Conv2D(128, (3, 3), activation='relu', padding='valid'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),

    # 🔹 Flatten + FC Layers
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),

    # 🔹 Output Layer
    Dense(12, activation='softmax')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


In [None]:
EPOCHS = 20
BATCH_SIZE = 32  # You likely defined this earlier
IMG_SIZE = 224   # Also likely defined
NUM_CLASSES = 12 # You already extracted this


# ModelCheckpoint, EarlyStopping, and ReduceLROnPlateau
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

callbacks = [
    ModelCheckpoint("best_cnn_model.h5", save_best_only=True, monitor="val_accuracy", mode="max"),
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1)
]

# Train the model
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    callbacks=callbacks
)


Epoch 1/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 421ms/step - accuracy: 0.0748 - loss: 2.6144



[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 472ms/step - accuracy: 0.0748 - loss: 2.6143 - val_accuracy: 0.0802 - val_loss: 3.1518 - learning_rate: 0.0010
Epoch 2/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 317ms/step - accuracy: 0.0899 - loss: 2.5146



[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 330ms/step - accuracy: 0.0899 - loss: 2.5149 - val_accuracy: 0.1069 - val_loss: 3.3992 - learning_rate: 0.0010
Epoch 3/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305ms/step - accuracy: 0.0920 - loss: 2.5519
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 317ms/step - accuracy: 0.0919 - loss: 2.5519 - val_accuracy: 0.0923 - val_loss: 3.2772 - learning_rate: 0.0010
Epoch 4/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 494ms/step - accuracy: 0.0870 - loss: 2.5051 - val_accuracy: 0.1021 - val_loss: 3.2648 - learning_rate: 5.0000e-04
Epoch 5/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 464ms/step - accuracy: 0.0857 - loss: 2.4782 - val_accuracy: 0.1021 - val_loss: 3.0316 - learning_rate: 5.0000e-04
Epoch 6/20
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m



[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 390ms/step - accuracy: 0.0957 - loss: 2.4865 - val_accuracy: 0.1142 - val_loss: 4.2181 - learning_rate: 5.0000e-04
Epoch 7/20
[1m392/480[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m31s[0m 361ms/step - accuracy: 0.0794 - loss: 2.5298

In [None]:
import matplotlib.pyplot as plt

# Accuracy
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Val')
plt.title('📈 Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Val')
plt.title('📉 Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Evaluate the trained model on the test set
test_loss, test_accuracy = model.evaluate(test_gen)
print(f"\n🧪 Test Accuracy: {test_accuracy:.4f}")
print(f"🧪 Test Loss: {test_loss:.4f}")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Predict class probabilities
pred_probs = model.predict(test_gen)
y_pred = np.argmax(pred_probs, axis=1)
y_true = test_gen.classes
class_labels = list(test_gen.class_indices.keys())

# Classification report
print("\n📋 Classification Report:")
print(classification_report(y_true, y_pred, target_names=class_labels))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title("🔍 Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()
