# In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import json
import warnings
import traceback
import kagglehub

# --- Library Imports ---
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import RobustScaler
from skimage.filters import gabor

# Silence every warning category for the rest of the process so the console
# output stays limited to the pipeline's own progress messages.
warnings.simplefilter('ignore')

# Folder that receives every report and figure written by the pipeline.
# It is created at import time; an already existing folder is reused.
OUTPUT_DIR = "plant_disease_results_ML"
os.makedirs(OUTPUT_DIR, exist_ok=True)

class PlantDiseaseML:
    """Classical-ML baseline for a folder-per-class image dataset.

    The pipeline loads every image found under ``base_path`` (one
    sub-directory per class), converts each image into a small hand-crafted
    feature vector (Gabor texture statistics plus RGB/HSV colour statistics)
    and trains/evaluates several scikit-learn classifiers on those features.

    Attributes:
        base_path: Root directory containing one sub-directory per class.
        img_size: Target size handed unchanged to ``cv2.resize`` (OpenCV
            interprets it as ``(width, height)``).
        class_names: Sorted class directory names; filled by ``load_images``.
        label_map: Class name -> integer label; filled by ``load_images``.
        results: Model name -> test accuracy; filled by ``run_ml_analysis``.
    """

    # Lower-case file extensions that ``load_images`` treats as images.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
    # Maximum number of thumbnails in the sample plot (it is a 3 x 5 grid).
    _MAX_SAMPLE_IMAGES = 15

    def __init__(self, base_path, img_size=(380, 380)):
        """Store the dataset location and resize target; nothing is loaded yet."""
        self.base_path = base_path
        self.img_size = img_size
        self.class_names = []
        self.label_map = {}
        self.results = {}

    def load_images(self):
        """Read, convert to RGB and resize every image below ``base_path``.

        Each immediate sub-directory of ``base_path`` is one class; its integer
        label is the index of the directory name in sorted order. Only files
        whose name ends in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive)
        are considered. As a side effect ``class_names`` and ``label_map`` are
        (re)populated.

        Returns:
            ``(images, labels)`` as NumPy arrays of equal length. All images
            are held in memory at once.

        Raises:
            FileNotFoundError: If ``base_path`` does not exist.
        """
        if not os.path.exists(self.base_path):
            raise FileNotFoundError(f"Dataset path '{self.base_path}' not found!")

        self.class_names = sorted(
            entry for entry in os.listdir(self.base_path)
            if os.path.isdir(os.path.join(self.base_path, entry))
        )
        self.label_map = {name: idx for idx, name in enumerate(self.class_names)}
        print(f"Found {len(self.class_names)} classes.")

        images, labels = [], []
        unreadable = 0
        for class_name in self.class_names:
            class_path = os.path.join(self.base_path, class_name)
            label = self.label_map[class_name]
            # os.listdir order is filesystem dependent. Sorting keeps the
            # sample order, and therefore the seeded train/test split,
            # reproducible from one machine to the next.
            for filename in sorted(os.listdir(class_path)):
                if not filename.lower().endswith(self._IMAGE_EXTENSIONS):
                    continue
                img_path = os.path.join(class_path, filename)
                try:
                    img = cv2.imread(img_path)
                    if img is None:
                        # cv2.imread signals an unreadable file by returning
                        # None instead of raising; count it so it is reported.
                        unreadable += 1
                        continue
                    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                    img = cv2.resize(img, self.img_size)
                except Exception as e:
                    # Best effort: one bad file must not abort the whole load.
                    print(f"Error loading {img_path}: {e}")
                    continue
                images.append(img)
                labels.append(label)

        if unreadable:
            print(f"Skipped {unreadable} unreadable image file(s).")
        print(f"Loaded {len(images)} images.")
        return np.array(images), np.array(labels)

    def visualize_loaded_images(self, images, labels):
        """Save a grid of randomly chosen images with their class names.

        Up to 15 distinct images are drawn on a 3 x 5 grid and written to
        ``sample_loaded_images.png`` inside ``OUTPUT_DIR``. When fewer than 15
        images are available all of them are shown; with no images at all the
        plot is skipped.

        Args:
            images: Sequence of images that ``plt.imshow`` can display.
            labels: Integer labels indexing into ``class_names``, aligned with
                ``images``.
        """
        print("Visualizing a sample of loaded images...")
        # Sampling without replacement cannot return more items than exist.
        sample_size = min(self._MAX_SAMPLE_IMAGES, len(images))
        if sample_size == 0:
            print("No images available to visualize; skipping sample plot.")
            return

        plt.figure(figsize=(15, 10))
        random_indices = np.random.choice(len(images), sample_size, replace=False)
        for i, idx in enumerate(random_indices):
            plt.subplot(3, 5, i + 1)
            plt.imshow(images[idx])
            plt.title(self.class_names[labels[idx]], fontsize=10)
            plt.axis('off')
        plt.tight_layout()
        save_path = os.path.join(OUTPUT_DIR, 'sample_loaded_images.png')
        plt.savefig(save_path, dpi=100)
        plt.close()
        print(f"‚úÖ Sample image plot saved to {save_path}")

    def extract_features_for_ml(self, image):
        """Turn one RGB image into a 28-element feature vector.

        Layout of the returned vector:
          * 16 texture values: mean and standard deviation of the real Gabor
            response for 2 frequencies (0.1, 0.5) x 4 orientations
            (0, 45, 90, 135 degrees), computed on the grayscale image.
          * 12 colour values: for each of the three channel indices, the RGB
            mean and std followed by the HSV mean and std.

        Args:
            image: RGB image array accepted by ``cv2.cvtColor``.

        Returns:
            1-D NumPy array; NaN/inf entries are replaced by finite numbers.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
        features = []
        for freq in [0.1, 0.5]:
            for theta in [0, np.pi/4, np.pi/2, 3*np.pi/4]:
                # Only the real part of the filter response is used.
                real, _ = gabor(gray, frequency=freq, theta=theta)
                features.extend([np.mean(real), np.std(real)])
        for ch in range(3):
            features.extend([np.mean(image[:, :, ch]), np.std(image[:, :, ch])])
            features.extend([np.mean(hsv[:, :, ch]), np.std(hsv[:, :, ch])])
        return np.nan_to_num(features)

    def generate_evaluation_report(self, y_true, y_pred, model_name):
        """Write the classification report and confusion matrix for one model.

        Two files are created in ``OUTPUT_DIR``:
        ``<model_name>_classification_report.txt`` and
        ``<model_name>_confusion_matrix.png``.

        Args:
            y_true: Ground-truth integer labels.
            y_pred: Predicted integer labels, aligned with ``y_true``.
            model_name: Used in the plot title and as the file-name prefix.
        """
        # Pass the full label list explicitly. Without it scikit-learn derives
        # the labels from the data, so a class missing from this split would
        # make the report raise on the target_names length mismatch and would
        # shift the confusion-matrix rows/columns against their tick labels.
        class_ids = list(range(len(self.class_names)))

        report = classification_report(
            y_true, y_pred, labels=class_ids, target_names=self.class_names
        )
        report_path = os.path.join(OUTPUT_DIR, f'{model_name}_classification_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"‚úÖ Classification report saved to {report_path}")

        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        plt.figure(figsize=(24, 20))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.class_names,
            yticklabels=self.class_names,
            annot_kws={"size": 8},
        )
        plt.title(f'Confusion Matrix: {model_name}', fontsize=20)
        plt.ylabel('True Label', fontsize=15)
        plt.xlabel('Predicted Label', fontsize=15)
        plt.tight_layout()
        cm_path = os.path.join(OUTPUT_DIR, f'{model_name}_confusion_matrix.png')
        plt.savefig(cm_path, dpi=600)
        plt.close()
        print(f"‚úÖ Confusion matrix saved to {cm_path}")

    def run_ml_analysis(self):
        """Run the whole pipeline: load, featurise, train, evaluate, summarise.

        Steps:
          1. Load all images and save a sample plot.
          2. Extract one feature vector per image.
          3. Make a stratified 80/20 train/test split (seed 42) and scale the
             features with a ``RobustScaler`` fitted on the training part only.
          4. Train five classifiers, write a report for each, and store the
             test accuracy in ``results``.

        Returns ``None``. When no image could be loaded the method returns
        early without training anything.
        """
        print("\n" + "="*80 + "\nüöÄ STARTING MACHINE LEARNING PIPELINE\n" + "="*80)

        print("\n[STEP 1/2] Loading Images and Extracting Features...")
        images, labels = self.load_images()
        if len(images) == 0:
            print("No images were loaded; nothing to train on.")
            return
        self.visualize_loaded_images(images, labels)

        features = np.array([self.extract_features_for_ml(img) for img in images])
        del images  # The raw pixels are no longer needed once features exist.

        print("\n[STEP 2/2] Training and Evaluating 5 ML Models...")
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42, stratify=labels
        )
        # Fit the scaler on the training split only so that no statistics of
        # the test split leak into training.
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        models = {
            'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
            'GradientBoosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
            'SVM': SVC(C=10, kernel='rbf', random_state=42),
            'KNeighbors': KNeighborsClassifier(n_neighbors=5),
            'GaussianNB': GaussianNB(),
        }

        for name, model in models.items():
            print(f"\n--- Training {name} ---")
            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)
            acc = accuracy_score(y_test, pred)
            print(f"{name} Accuracy: {acc:.4f}")
            self.generate_evaluation_report(y_test, pred, name)
            self.results[name] = acc

        print("\n" + "="*80 + "\nüèÜ FINAL ML MODEL ACCURACY\n" + "="*80)
        df = pd.DataFrame(list(self.results.items()), columns=['Model', 'Accuracy'])
        print(df.to_string(index=False))

# Script entry point: fetch the dataset, then run the full ML pipeline.
if __name__ == "__main__":
    try:
        print("Downloading dataset from Kaggle Hub...")
        # kagglehub returns the local folder holding the downloaded files;
        # the class folders are expected in its 'PlantVillage' sub-directory.
        download_root_path = kagglehub.dataset_download("emmarex/plantdisease")
        dataset_path = os.path.join(download_root_path, "PlantVillage")

        classifier = PlantDiseaseML(base_path=dataset_path)
        classifier.run_ml_analysis()
    except Exception as err:
        # Top-level boundary: report the failure with a full traceback
        # instead of letting the exception propagate.
        print(f"‚ùå AN UNEXPECTED ERROR OCCURRED: {err}")
        traceback.print_exc()