In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.applications import EfficientNetB0  # Added for transfer learning
import os
import cv2
import numpy as np
import shutil
from PIL import Image
from collections import defaultdict
import hashlib
from collections import Counter

In [None]:
# Sanity check of the runtime: print the GPU devices TensorFlow can see.
# tensorflow is imported again here so the cell can be run on its own.
import tensorflow as tf
print("GPU Available:", tf.config.list_physical_devices('GPU'))

**IMAGE PREPROCESSING**
*1.1 Crop distribution*

In [None]:
# Dataset analysis: count the image files inside every class sub-directory of
# the training set, then print the per-class counts and a short summary.
DATASET_DIR = "/kaggle/input/crop-disease-dataset/disease/train"
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')  # compared against the lower-cased file name

class_counts = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the printed
# report (and the dict order used by later cells) reproducible between runs.
for class_name in sorted(os.listdir(DATASET_DIR)):
    class_path = os.path.join(DATASET_DIR, class_name)
    if os.path.isdir(class_path):
        images = [f for f in os.listdir(class_path) if f.lower().endswith(IMAGE_EXTENSIONS)]
        class_counts[class_name] = len(images)

print("----------------------Class Distribution:--------------------------\n")
for class_name, count in class_counts.items():
    print(f"{class_name}: {count} images")

# Total images and classes calculation
total_images = sum(class_counts.values())
n_classes = len(class_counts)
print(f"\n----------------------------------Dataset Summary:-----------------------")
print(f"Total images: {total_images}")
print(f"Number of classes: {n_classes}")
# A directory without any class folder would make the average divide by zero.
if n_classes:
    print(f"Average images per class: {total_images/n_classes:.1f}")
else:
    print("Average images per class: n/a (no class folders found)")


In [None]:
# Smallest and largest class by image count. When several classes tie, the
# first one in class_counts order is reported.
min_class_name, min_images = min(class_counts.items(), key=lambda entry: entry[1])
max_class_name, max_images = max(class_counts.items(), key=lambda entry: entry[1])

print(f"Smallest class \n{min_class_name}=> {min_images} images\n")
print(f"Largest class \n{max_class_name}=> {max_images} images\n")


*Check for duplicates (TODO: not implemented yet — the next cell only recounts the images per class)*

In [None]:
# Recount the image files of every class folder and refresh class_counts in place.
print("-------------------Counting images in each class----------------------------------")
image_extensions = ('.jpg', '.jpeg', '.png')
for class_name in os.listdir(DATASET_DIR):
    class_path = os.path.join(DATASET_DIR, class_name)
    if not os.path.isdir(class_path):
        # Plain files sitting next to the class folders are ignored.
        continue
    class_counts[class_name] = sum(
        1 for entry in os.listdir(class_path)
        if entry.lower().endswith(image_extensions)
    )
print(class_counts)

In [None]:
# Dataset balance analysis: summary statistics over the per-class image counts
# plus a coarse imbalance label derived from the largest/smallest class ratio.
if class_counts:
    counts = list(class_counts.values())
    total_images = sum(counts)
    n_classes = len(class_counts)

    analysis_results = {
        'total_images': total_images,
        'n_classes': n_classes,
        'average_per_class': total_images / n_classes,
        'min_images': min(counts),
        'max_images': max(counts),
        'std_dev': np.std(counts),
        'median': np.median(counts)
    }

    # On ties the first class in class_counts order is reported.
    min_class = min(class_counts, key=class_counts.get)
    max_class = max(class_counts, key=class_counts.get)

    analysis_results['min_class'] = min_class
    analysis_results['max_class'] = max_class

    # A class folder without any image gives min_images == 0. Report that as an
    # infinite ratio (i.e. "Severely Imbalanced") instead of raising
    # ZeroDivisionError.
    if analysis_results['min_images'] > 0:
        imbalance_ratio = analysis_results['max_images'] / analysis_results['min_images']
    else:
        imbalance_ratio = float('inf')
    analysis_results['imbalance_ratio'] = imbalance_ratio

    # Ratio buckets: <2, <5, <10, otherwise severe.
    if imbalance_ratio < 2:
        balance_status = "Well Balanced"
    elif imbalance_ratio < 5:
        balance_status = "Slightly Imbalanced"
    elif imbalance_ratio < 10:
        balance_status = "Moderately Imbalanced"
    else:
        balance_status = "Severely Imbalanced"

    analysis_results['balance_status'] = balance_status

    print("--------------Analysis results------------------")
    print(analysis_results)
else:
    print("No classes found in the dataset!")


**Visualisations**

In [None]:
# Canvas for the 2x3 dashboard of dataset plots.
fig = plt.figure(figsize=(25, 15))

# Rank the classes by image count (largest first) and keep only the 15 biggest.
ranked_items = sorted(class_counts.items(), key=lambda item: item[1], reverse=True)
sorted_classes = dict(ranked_items[:15])
class_names = list(sorted_classes)
class_values = [sorted_classes[label] for label in class_names]

# Readable tick labels: underscores become spaces and words are title-cased.
clean_class_names = [raw.replace('_', ' ').title() for raw in class_names]

# One colour from the Set3 colormap per displayed class.
colors = plt.cm.Set3(np.linspace(0, 1, len(class_names)))

# Panel 1: vertical bars for the displayed (top 15) classes, largest first.
ax1 = plt.subplot(2, 3, 1)
bar_slots = range(len(class_names))
bars1 = ax1.bar(bar_slots, class_values, color=colors)
ax1.set_title('Top 15 Classes by Count', fontsize=14, fontweight='bold')
ax1.set_xlabel('Classes', fontsize=12)
ax1.set_ylabel('Number of Images', fontsize=12)
ax1.set_xticks(bar_slots)
ax1.set_xticklabels(clean_class_names, rotation=45, ha='right', fontsize=10)

# Write each count just above its bar (lifted by 1% of the bar height).
for bar in bars1:
    bar_top = bar.get_height()
    ax1.text(
        bar.get_x() + bar.get_width()/2.,
        bar_top + bar_top*0.01,
        f'{int(bar_top)}',
        ha='center', va='bottom', fontweight='bold', fontsize=9,
    )

# Dashed reference line at the average images-per-class from analysis_results.
mean_per_class = analysis_results['average_per_class']
ax1.axhline(
    y=mean_per_class, color='red', linestyle='--', linewidth=2,
    label=f'Average: {mean_per_class:.1f}',
)
ax1.legend()

# Panel 2: horizontal bars for the ten largest classes.
ax2 = plt.subplot(2, 3, 2)
leader_values = class_values[:10]
leader_labels = clean_class_names[:10]
row_slots = range(len(class_names[:10]))

bars2 = ax2.barh(row_slots, leader_values, color=colors[:10])
ax2.set_title('Top 10 Classes (Horizontal)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Images', fontsize=12)
ax2.set_ylabel('Classes', fontsize=12)
ax2.set_yticks(row_slots)
ax2.set_yticklabels(leader_labels, fontsize=11)
# Flip the y axis so the largest class is drawn at the top.
ax2.invert_yaxis()

# Write each count just right of its bar (shifted by 1% of the bar length).
for bar in bars2:
    bar_length = bar.get_width()
    ax2.text(
        bar_length + bar_length*0.01,
        bar.get_y() + bar.get_height()/2.,
        f'{int(bar_length)}',
        ha='left', va='center', fontweight='bold', fontsize=10,
    )

# Panel 3: pie chart of the 8 largest classes. The remaining displayed classes
# (class_values beyond the first 8) are merged into a single 'Others' slice.
ax3 = plt.subplot(2, 3, 3)
pie_names = clean_class_names[:8]
pie_values = class_values[:8]
remainder = sum(class_values[8:])
if remainder > 0:
    # Only add the catch-all slice when there is something to put in it.
    pie_names = pie_names + ['Others']
    pie_values = pie_values + [remainder]

slices, label_texts, pct_texts = ax3.pie(
    pie_values,
    labels=pie_names,
    autopct='%1.1f%%',
    colors=colors[:len(pie_values)],
    startangle=90,
)
ax3.set_title('Top 8 Classes Distribution', fontsize=14, fontweight='bold')

# Percentages inside the slices: bold, size 10.
for pct in pct_texts:
    pct.set_fontweight('bold')
    pct.set_fontsize(10)

# Slice labels around the pie: size 9.
for slice_label in label_texts:
    slice_label.set_fontsize(9)

# Panel 4: table of headline statistics (the axes frame itself is hidden).
ax4 = plt.subplot(2, 3, 4)
ax4.axis('off')

# Rows are [metric, value]; counts cover ALL classes, not only the displayed ones.
per_class_totals = list(class_counts.values())
stats_data = [
    ['Total Classes', len(class_counts)],
    ['Showing Top', len(class_names)],
    ['Total Images', sum(per_class_totals)],
    ['Average per Class', f'{analysis_results["average_per_class"]:.1f}'],
    ['Highest Count', max(per_class_totals)],
    ['Lowest Count', min(per_class_totals)],
    ['Standard Dev', f'{analysis_results.get("std_dev", 0):.1f}']
]

table = ax4.table(
    cellText=stats_data,
    colLabels=['Metric', 'Value'],
    cellLoc='center',
    loc='center',
    colWidths=[0.6, 0.4],
)
# Fixed font size, then stretch the cells so the rows are comfortably tall.
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1.2, 2)

ax4.set_title('Dataset Statistics', fontsize=14, fontweight='bold', pad=20)

# Panel 5: image totals grouped by plant, inferred from keywords found in the
# class folder name.
ax5 = plt.subplot(2, 3, 5)
plant_keywords = (
    ('apple', 'Apple'),
    ('tomato', 'Tomato'),
    ('corn', 'Corn'),
    ('potato', 'Potato'),
    ('pepper', 'Pepper'),
)
plant_groups = {}
for class_name, count in class_counts.items():
    lowered = class_name.lower()
    # The first matching keyword wins; names matching none go to 'Other'.
    plant_type = next(
        (label for keyword, label in plant_keywords if keyword in lowered),
        'Other',
    )
    plant_groups[plant_type] = plant_groups.get(plant_type, 0) + count

if len(plant_groups) <= 1:
    # Nothing to compare: show a placeholder message instead of a pie.
    ax5.text(0.5, 0.5, 'Single Plant Type\nDataset', ha='center', va='center',
             fontsize=14, fontweight='bold', transform=ax5.transAxes)
    ax5.set_title('Plant Type Analysis', fontsize=14, fontweight='bold')
else:
    group_palette = plt.cm.Set2(np.linspace(0, 1, len(plant_groups)))
    wedges2, texts2, autotexts2 = ax5.pie(
        plant_groups.values(),
        labels=plant_groups.keys(),
        autopct='%1.1f%%',
        startangle=90,
        colors=group_palette,
    )
    ax5.set_title('Distribution by Plant Type', fontsize=14, fontweight='bold')

    for group_label in texts2:
        group_label.set_fontsize(11)
        group_label.set_fontweight('bold')
    for pct in autotexts2:
        pct.set_fontsize(10)
        pct.set_fontweight('bold')

# Panel 6: total images in classes whose name contains 'healthy' versus all
# other classes.
ax6 = plt.subplot(2, 3, 6)
healthy_count = 0
disease_count = 0
for class_name, count in class_counts.items():
    if 'healthy' in class_name.lower():
        healthy_count += count
    else:
        disease_count += count

if healthy_count <= 0 or disease_count <= 0:
    # One of the two groups is empty: show a placeholder instead of bars.
    ax6.text(0.5, 0.5, 'No Healthy/Disease\nClassification Available',
             ha='center', va='center', fontsize=12, fontweight='bold',
             transform=ax6.transAxes)
    ax6.set_title('Class Balance Analysis', fontsize=14, fontweight='bold')
else:
    categories = ['Healthy', 'Disease']
    counts = [healthy_count, disease_count]
    colors_balance = ['lightgreen', 'lightcoral']

    bars6 = ax6.bar(categories, counts, color=colors_balance)
    ax6.set_title('Healthy vs Disease Classes', fontsize=14, fontweight='bold')
    ax6.set_ylabel('Total Images', fontsize=12)

    # Write each total just above its bar (lifted by 1% of the bar height).
    for bar in bars6:
        bar_top = bar.get_height()
        ax6.text(
            bar.get_x() + bar.get_width()/2.,
            bar_top + bar_top*0.01,
            f'{int(bar_top)}',
            ha='center', va='bottom', fontweight='bold',
        )

# Let matplotlib space the six panels, then render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Training hyper-parameters referenced by the cells below.
EPOCHS = 40  # upper bound on epochs passed to model.fit
BATCH_SIZE = 32  # batch size given to both data generators
IMG_SIZE = 224  # images are resized to IMG_SIZE x IMG_SIZE
VALIDATION_SPLIT = 0.2  # passed as ImageDataGenerator(validation_split=...)
LEARNING_RATE = 0.001  # NOTE(review): the visible compile call uses a literal 0.0005 instead — confirm which is intended


In [None]:
# Inspect the validation split: print every entry found in its directory,
# numbered from 1.
valid_dir = "/kaggle/input/crop-disease-dataset/disease/valid"
all_valid_classes = os.listdir(valid_dir)
print(f"All validation classes ({len(all_valid_classes)}):")
for position, entry in enumerate(all_valid_classes, start=1):
    print(f"{position}. {entry}")

# Class folders to keep; later cells reference this list by name.
classes_to_keep = [
    'Tomato___Late_blight',
    'Corn_(maize)___healthy',
    'Pepper,_bell___Bacterial_spot',
    'Tomato___Spider_mites Two-spotted_spider_mite',
    'Tomato___Leaf_Mold',
    'Corn_(maize)___Common_rust_',
    'Potato___Early_blight',
    'Apple___healthy',
    'Tomato___Tomato_mosaic_virus',
    'Potato___Late_blight',
    'Pepper,_bell___healthy',
    'Tomato___Target_Spot',
    'Apple___Cedar_apple_rust',
    'Apple___Black_rot',
    'Tomato___Tomato_Yellow_Leaf_Curl_Virus',
    'Tomato___Bacterial_spot',
    'Apple___Apple_scab',
    'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot',
    'Tomato___Septoria_leaf_spot',
    'Tomato___Early_blight',
    'Corn_(maize)___Northern_Leaf_Blight',
    'Potato___healthy',
    'Tomato___healthy',
]

print(len(classes_to_keep))

In [None]:
# Number of classes implied by the hand-picked classes_to_keep list.
# NOTE(review): crop_classes is assigned again from train_generator.class_indices
# in a later cell, so this value is only used for the message below.
crop_classes = len(classes_to_keep)
print(f"Number of classes to train on: {crop_classes}")

In [None]:
# Input pipeline: an augmenting generator for training and a rescale-only
# generator for validation.
#
# Both generators receive the SAME explicit `classes` list. flow_from_directory
# numbers classes in the order of that list when it is given, and in sorted
# folder order when it is not. Passing it to only one generator therefore gives
# the two generators different label indices (and possibly a different number
# of classes), which makes validation metrics meaningless.
#
# NOTE(review): validation_split/subset are applied to both directories, so the
# 'training' subset leaves out VALIDATION_SPLIT of the train folder and the
# 'validation' subset keeps only VALIDATION_SPLIT of the valid folder. Kept as
# in the original; confirm this is intended given that a separate valid
# directory exists.

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    height_shift_range=0.1,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
    validation_split=VALIDATION_SPLIT
)

# No augmentation for validation: only the same 1/255 rescaling as training.
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

# training generator
train_generator = train_datagen.flow_from_directory(
    "/kaggle/input/crop-disease-dataset/disease/train",
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=classes_to_keep,  # same list (and therefore same indices) as validation
    subset='training',
    shuffle=True
)

# validation generator; shuffle=False keeps predictions aligned with .classes
validation_generator = validation_datagen.flow_from_directory(
    "/kaggle/input/crop-disease-dataset/disease/valid",
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=classes_to_keep,
    subset='validation',
    shuffle=False
)

In [None]:
# Model building with distributed strategy
# Creates a MirroredStrategy and reports how many replicas it will keep in sync.
# NOTE(review): no cell visible in this notebook enters strategy.scope(), so the
# model below does not appear to be built under this strategy — confirm.
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")


In [None]:
# Get number of classes
# Taken from the training generator's class_indices (class name -> integer
# index); this value sizes the softmax output layer of the model.
crop_classes = len(train_generator.class_indices)
print(f"Number of classes: {crop_classes}")
print(f"Class indices: {train_generator.class_indices}")


In [None]:
# CNN classifier: four Conv -> BatchNorm -> MaxPool -> Dropout stages with
# 32/64/128/256 filters, global average pooling, two 128-unit dense stages and
# a softmax layer with one unit per class.
model = Sequential([
    # Stage 1 (also declares the input shape: IMG_SIZE x IMG_SIZE RGB)
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.2),

    # Stage 2
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.2),

    # Stage 3
    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.3),

    # Stage 4
    Conv2D(256, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(2, 2),
    Dropout(0.3),

    # Classification head: pooling replaces Flatten before the dense layers
    GlobalAveragePooling2D(),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(crop_classes, activation='softmax'),
])



In [None]:
# Compile the model: Adam optimiser, categorical cross-entropy loss (labels are
# one-hot because the generators use class_mode='categorical'), accuracy metric.
# NOTE(review): the learning rate is the literal 0.0005, not the LEARNING_RATE
# constant defined earlier — confirm which value is intended.
optimizer = Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print a banner followed by the layer-by-layer summary of the model.
banner = "=" * 50
print("\n" + banner)
print("MODEL SUMMARY")
print(banner)
model.summary()

In [None]:
# Callbacks
# Stop once val_accuracy has not improved for 5 epochs and roll the model back
# to its best weights.
earlystopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1
)
# Keep only the best epoch (highest val_accuracy) on disk. The file is named
# 'best_crop_disease_model.h5' so that it matches the later
# model.load_weights('best_crop_disease_model.h5') call, and so that it is not
# overwritten by the final model.save(...) to crop_disease_model.h5.
model_checkpoint = ModelCheckpoint(
    'best_crop_disease_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)

In [None]:
# Calculate steps (batches per pass over each generator).
# Ceiling division is used so the last, partially filled batch is counted too.
# With floor division, evaluate(..., steps=validation_steps) would silently
# skip up to BATCH_SIZE - 1 validation images.
train_steps = (train_generator.samples + BATCH_SIZE - 1) // BATCH_SIZE
validation_steps = (validation_generator.samples + BATCH_SIZE - 1) // BATCH_SIZE

print('-------------------------------------------')
print(f"Training samples: {train_generator.samples}")
print(f"Validation samples: {validation_generator.samples}")
print(f"Training steps per epoch: {train_steps}")
print(f"Validation steps per epoch: {validation_steps}")

In [None]:
#train
# Fit for at most EPOCHS epochs on train_generator, validating on
# validation_generator, with the early-stopping and checkpoint callbacks.
# No steps_per_epoch/validation_steps are passed here. The returned History
# object is used by later cells.
history = model.fit(
    train_generator,
    epochs=EPOCHS,
    validation_data=validation_generator,
    callbacks=[earlystopping, model_checkpoint],
    verbose=1
)

In [None]:
# After training
# Save the model as it stands after fit() to the Kaggle working directory.
# NOTE(review): the same path is saved again by a later cell.
model.save('/kaggle/working/crop_disease_model.h5')

In [None]:

# Evaluate the model
# The output says "Test", but the data comes from validation_generator;
# evaluation runs for validation_steps batches.
test_loss, test_accuracy = model.evaluate(validation_generator, steps=validation_steps)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

In [None]:
# After training: persist the model together with a small metadata file and
# the raw training history.
import json
import pickle
from datetime import date

model.save('/kaggle/working/crop_disease_model.h5')

# Class names ordered by their integer index, so position i in the list is the
# label of output unit i.
class_indices = train_generator.class_indices
ordered_class_names = sorted(class_indices, key=class_indices.get)

# Save model info
model_info = {
    # best validation accuracy seen during training
    'accuracy': float(max(history.history['val_accuracy'])),
    'input_shape': model.input_shape,
    'output_shape': model.output_shape,
    'classes': ordered_class_names,
    # date of this run (was a hard-coded literal, which went stale on re-runs)
    'training_date': date.today().isoformat()
}

with open('/kaggle/working/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

# Save training history (plain dict of per-epoch metric lists)
with open('/kaggle/working/training_history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

print("Files saved to /kaggle/working/")

In [None]:
def plot_training_history(history):
    """Show training vs. validation curves for accuracy and loss side by side.

    Args:
        history: object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch sequences
            (for example the value returned by ``model.fit``).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    metrics = history.history

    # (axis, training key, validation key, training label, validation label,
    #  panel title, y-axis label)
    panels = (
        (ax1, 'accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
        (ax2, 'loss', 'val_loss',
         'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    )
    for axis, train_key, val_key, train_label, val_label, title, y_label in panels:
        axis.plot(metrics[train_key], label=train_label, linewidth=2)
        axis.plot(metrics[val_key], label=val_label, linewidth=2)
        axis.set_title(title, fontsize=14, fontweight='bold')
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.legend()
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot training history
plot_training_history(history)

# Load best model for evaluation
# NOTE(review): this expects a file named 'best_crop_disease_model.h5' in the
# current working directory; confirm it matches the filepath given to
# ModelCheckpoint, otherwise this call fails with a missing-file error.
model.load_weights('best_crop_disease_model.h5')

In [None]:
# Final evaluation on validation_generator, followed by a check against the
# 96% accuracy target.
banner = "=" * 50
print("\n" + banner)
print("FINAL EVALUATION")
print(banner)

test_loss, test_accuracy = model.evaluate(validation_generator, steps=validation_steps)
print(f"Final Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Final Test Loss: {test_loss:.4f}")

# Check if target accuracy is reached
target_reached = test_accuracy >= 0.96
if not target_reached:
    print(f"Target accuracy not yet reached. Current: {test_accuracy*100:.2f}%")
    print("Consider: More epochs, different augmentation, or model architecture adjustments")
else:
    print("🎉 TARGET ACCURACY ACHIEVED! Model performs at 96%+ accuracy!")


In [None]:
def predict_image(image_path, model, class_indices):
    """Classify a single image file and return the most likely class.

    The image is loaded at IMG_SIZE x IMG_SIZE, scaled by 1/255 (the same
    rescaling the data generators apply) and passed to ``model.predict`` as a
    batch of one.

    Args:
        image_path: path of the image file to classify.
        model: object with a Keras-style ``predict(batch, verbose=0)`` method
            returning one row of class scores per input image.
        class_indices: mapping of class name -> integer index, e.g.
            ``train_generator.class_indices``.

    Returns:
        ``(predicted_class, confidence)`` where ``predicted_class`` is the
        class name and ``confidence`` is the winning score as a plain Python
        ``float``. On any failure the error is printed and ``(None, 0.0)`` is
        returned (deliberate best-effort behaviour).
    """
    try:
        # Load and preprocess image
        img = keras.preprocessing.image.load_img(image_path, target_size=(IMG_SIZE, IMG_SIZE))
        img_array = keras.preprocessing.image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0) / 255.0

        # Make prediction
        predictions = model.predict(img_array, verbose=0)
        # Convert numpy scalars to built-in types so both return paths yield
        # the same types and the result can be JSON-serialised directly.
        predicted_class_index = int(np.argmax(predictions[0]))
        confidence = float(predictions[0][predicted_class_index])

        # Invert name -> index into index -> name to look up the label
        class_names = {v: k for k, v in class_indices.items()}
        predicted_class = class_names[predicted_class_index]

        return predicted_class, confidence
    except Exception as e:
        print(f"Error predicting image: {e}")
        return None, 0.0

# Usage hint for predict_image (a plain string: there is nothing to interpolate).
print('\nTo use prediction: predict_image("/kaggle/input/crop-disease-dataset/disease/test/AppleCedarRust1.JPG", model, train_generator.class_indices)')

In [None]:
# Add this for better evaluation
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

def evaluate_model_thoroughly(model, validation_generator):
    """Print a per-class classification report and the confusion matrix.

    Args:
        model: object whose ``predict(validation_generator, verbose=1)``
            returns one row of class scores per sample.
        validation_generator: generator exposing ``classes`` (true integer
            labels) and ``class_indices`` (class name -> integer index). It
            must yield samples in a fixed order (``shuffle=False``), otherwise
            predictions do not line up with ``classes``.

    Returns:
        None. Both reports are printed.
    """
    import warnings

    if getattr(validation_generator, 'shuffle', False):
        warnings.warn(
            "validation_generator has shuffle=True; predictions will not line "
            "up with validation_generator.classes",
            stacklevel=2,
        )

    # Get predictions
    predictions = model.predict(validation_generator, verbose=1)
    predicted_classes = np.argmax(predictions, axis=1)

    # Get true labels
    true_classes = validation_generator.classes

    # Class names ordered by index, with the matching explicit label list.
    # Passing `labels` keeps the report well-defined even when some class is
    # absent from the evaluated data; target_names alone then fails with a
    # size mismatch.
    class_indices = validation_generator.class_indices
    class_names = sorted(class_indices, key=class_indices.get)
    labels = [class_indices[name] for name in class_names]

    print("Classification Report:")
    print(classification_report(true_classes, predicted_classes,
                                labels=labels, target_names=class_names))

    print("\nConfusion Matrix:")
    print(confusion_matrix(true_classes, predicted_classes, labels=labels))

In [None]:
# Upload the saved model file to Google Drive via PyDrive.
# NOTE(review): this cell imports google.colab, which is presumably only
# available inside Google Colab, while the model path points at /kaggle/working
# — confirm which environment this cell is meant to run in.

# Install required library
# NOTE(review): unpinned install; consider pinning a version.
!pip install pydrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and upload
# Interactive user authentication, then reuse the application-default
# credentials for the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Upload model
# Creates a Drive file titled 'crop_disease_model.h5' and fills it with the
# model file saved earlier in this notebook.
uploaded = drive.CreateFile({'title': 'crop_disease_model.h5'})
uploaded.SetContentFile('/kaggle/working/crop_disease_model.h5')
uploaded.Upload()

print(f"Model uploaded to Google Drive with ID: {uploaded['id']}")