<a href="https://colab.research.google.com/github/AkarshBhatia/LeafC/blob/main/Leaf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn



In [None]:
import os
import cv2
import numpy as np
from skimage.feature import graycomatrix, graycoprops
from sklearn.svm import OneClassSVM, SVC
from sklearn.preprocessing import StandardScaler
import warnings

# Silence UserWarnings raised from skimage modules for a cleaner notebook output.
warnings.filterwarnings("ignore", category=UserWarning, module='skimage')
# NOTE: this filter has no module restriction, so every RuntimeWarning in the
# process (e.g. NumPy invalid-value / divide-by-zero warnings) is hidden too.
warnings.filterwarnings("ignore", category=RuntimeWarning)


def segment_leaf(image_path):
    """Load an image and build a filled mask of its largest dark region.

    The image is converted to grayscale, blurred with a 5x5 Gaussian kernel
    and binarised with an inverted Otsu threshold (pixels darker than the
    threshold become foreground). The external contour with the greatest area
    is then drawn, filled, onto an empty mask.

    Args:
        image_path: Path of the image file to read with OpenCV.

    Returns:
        A ``(leaf_mask, image)`` tuple. ``leaf_mask`` is a uint8 array shaped
        like the grayscale image (255 inside the chosen contour, all zeros
        when no contour was found); ``image`` is the array returned by
        ``cv2.imread``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"FATAL: Image not found at {image_path}. Please check the path.")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    inverted_otsu = cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    _, foreground = cv2.threshold(smoothed, 0, 255, inverted_otsu)

    outlines, _ = cv2.findContours(foreground, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Start from an empty mask; it is returned untouched when nothing was found.
    leaf_mask = np.zeros_like(gray, dtype=np.uint8)
    if not outlines:
        return leaf_mask, image

    biggest_outline = max(outlines, key=cv2.contourArea)
    cv2.drawContours(leaf_mask, [biggest_outline], -1, 255, cv2.FILLED)
    return leaf_mask, image

def calculate_morphological_features(leaf_mask):
    """Measure basic shape descriptors of the largest contour in a mask.

    Args:
        leaf_mask: Single-channel mask image passed to ``cv2.findContours``.

    Returns:
        A dict with the keys ``area``, ``perimeter``, ``aspect_ratio``
        (bounding-box width / height) and ``circularity``
        (``4 * pi * area / perimeter ** 2``). All values are 0 when the mask
        contains no contour; a ratio is 0 when its denominator is 0.
    """
    outlines, _ = cv2.findContours(leaf_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return dict(area=0, perimeter=0, aspect_ratio=0, circularity=0)

    outline = max(outlines, key=cv2.contourArea)
    area = cv2.contourArea(outline)
    perimeter = cv2.arcLength(outline, True)
    _, _, box_w, box_h = cv2.boundingRect(outline)

    if box_h != 0:
        aspect_ratio = float(box_w) / box_h
    else:
        aspect_ratio = 0

    if perimeter != 0:
        circularity = (4 * np.pi * area) / (perimeter ** 2)
    else:
        circularity = 0

    return dict(area=area, perimeter=perimeter, aspect_ratio=aspect_ratio, circularity=circularity)

def calculate_texture_features(image, leaf_mask):
    """Compute GLCM contrast, correlation and energy of the masked leaf.

    The BGR image is converted to grayscale, everything outside ``leaf_mask``
    is zeroed, and the result is min-max normalised to 0-255 before a
    grey-level co-occurrence matrix (distance 1, angle 0, 256 levels,
    symmetric, normalised) is built.

    Args:
        image: BGR image array.
        leaf_mask: Mask selecting the pixels to keep.

    Returns:
        A dict with the keys ``contrast``, ``correlation`` and ``energy``;
        all three are 0 when the GLCM computation raises ``ValueError``.
    """
    property_names = ('contrast', 'correlation', 'energy')

    gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    leaf_only = cv2.bitwise_and(gray_img, gray_img, mask=leaf_mask)
    stretched = cv2.normalize(leaf_only, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)

    try:
        glcm = graycomatrix(stretched, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
        values = [graycoprops(glcm, prop)[0, 0] for prop in property_names]
    except ValueError:
        return dict.fromkeys(property_names, 0)

    return dict(zip(property_names, values))

def extract_all_features(image_path):
    """Run segmentation, shape and texture extraction for one image.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A list of seven values in the order area, perimeter, aspect ratio,
        circularity, contrast, correlation, energy. A list of seven zeros is
        returned (after printing a message) when the mask is empty or when
        any step raises an exception.
    """
    zero_vector = [0] * 7
    morph_keys = ('area', 'perimeter', 'aspect_ratio', 'circularity')
    texture_keys = ('contrast', 'correlation', 'energy')

    try:
        leaf_mask, image = segment_leaf(image_path)
        if np.sum(leaf_mask) == 0:
            print(f"Warning: No leaf detected in {image_path}. Returning zero features.")
            return zero_vector

        morph_features = calculate_morphological_features(leaf_mask)
        texture_features = calculate_texture_features(image, leaf_mask)

        vector = [morph_features[key] for key in morph_keys]
        vector.extend(texture_features[key] for key in texture_keys)
        return vector
    except Exception as e:
        # Best-effort: any failure is reported and mapped to the zero vector.
        print(f"Error extracting features from {image_path}: {e}")
        return zero_vector


def process_training_data(folder_path):
    """Extract feature vectors for every image, grouped by class folder.

    Each immediate sub-directory of ``folder_path`` is treated as one class.
    Only files ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive)
    are processed, and only feature vectors whose sum is positive are kept.

    Args:
        folder_path: Directory containing one sub-directory per class.

    Returns:
        A ``(class_data, class_names)`` tuple: ``class_data`` maps each class
        name to its list of feature vectors and ``class_names`` is the sorted
        list of class folder names.

    Raises:
        NotADirectoryError: If ``folder_path`` has no sub-directories.
    """
    class_names = sorted(
        entry for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )
    if not class_names:
        raise NotADirectoryError(f"FATAL: No class subdirectories found in '{folder_path}'. Please check your folder structure.")

    image_suffixes = ('.png', '.jpg', '.jpeg')
    class_data = {}

    print("Processing training data...")
    for class_name in class_names:
        print(f"  - Processing class: {class_name}")
        class_dir = os.path.join(folder_path, class_name)
        kept_vectors = []

        for img_file in os.listdir(class_dir):
            if not img_file.lower().endswith(image_suffixes):
                continue
            features = extract_all_features(os.path.join(class_dir, img_file))
            # An all-zero vector marks a failed extraction and is dropped.
            if sum(features) > 0:
                kept_vectors.append(features)

        class_data[class_name] = kept_vectors

    print("Data processing complete.")
    return class_data, class_names

def train_models(class_data, class_names):
    """Train one One-Class SVM per class plus a single multi-class SVM.

    For every class with at least one feature vector, a ``StandardScaler`` is
    fitted on that class's vectors and an RBF ``OneClassSVM`` (``gamma='auto'``,
    ``nu=0.1``) is trained on the scaled data. All vectors are then pooled,
    scaled with a separate ``StandardScaler`` and used to fit an RBF ``SVC``
    with ``probability=True``. The multi-class labels are the indices of the
    classes in ``class_names``.

    Args:
        class_data: Mapping of class name to a list of feature vectors.
        class_names: Ordered class names; every name must be a key of
            ``class_data``.

    Returns:
        ``(one_class_svms, scalers, multi_class_svm, multi_class_scaler)``.
        The first two are dicts keyed by class name and contain no entry for
        classes that were skipped because they had no feature vectors.

    Raises:
        ValueError: If no class has any feature vector.
    """
    one_class_svms = {}
    scalers = {}

    print("\nTraining One-Class SVMs...")
    for class_name in class_names:
        samples = class_data[class_name]
        if len(samples) == 0:
            # Neither the scaler nor the SVM can be fitted on zero samples;
            # skip the class instead of failing with an opaque sklearn error.
            print(f"  - Skipping class: {class_name} (no usable feature vectors)")
            continue

        print(f"  - Training for class: {class_name}")
        features = np.array(samples)

        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features)

        oc_svm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
        oc_svm.fit(scaled_features)

        one_class_svms[class_name] = oc_svm
        scalers[class_name] = scaler

    # Prepare data for Multi-Class SVM; labels are positions in class_names.
    X_multi, y_multi = [], []
    for i, class_name in enumerate(class_names):
        for features in class_data[class_name]:
            X_multi.append(features)
            y_multi.append(i)

    if not X_multi:
        raise ValueError(
            "No training features available: every class is empty. "
            "Check that the class folders contain readable leaf images."
        )

    multi_class_scaler = StandardScaler()
    X_multi_scaled = multi_class_scaler.fit_transform(np.array(X_multi))

    print("\nTraining Multi-Class SVM...")
    multi_class_svm = SVC(kernel='rbf', gamma='auto', probability=True)
    multi_class_svm.fit(X_multi_scaled, np.array(y_multi))
    print("All models trained successfully.")

    return one_class_svms, scalers, multi_class_svm, multi_class_scaler


def classify_leaf(image_path, one_class_svms, scalers, multi_class_svm, multi_class_scaler, class_names):
    """Classify one leaf image with a novelty gate followed by a multi-class SVM.

    Step 1 scales the feature vector with each class's scaler and asks that
    class's One-Class SVM whether the sample is an inlier; the search stops at
    the first class that accepts it. Step 2 runs only if some class accepted
    the sample and lets the multi-class SVM pick the final label. Progress and
    the outcome are printed.

    Args:
        image_path: Path of the image to classify.
        one_class_svms: Mapping of class name to a fitted One-Class SVM.
        scalers: Mapping of class name to the scaler fitted for that class.
        multi_class_svm: Fitted multi-class classifier predicting an index
            into ``class_names``.
        multi_class_scaler: Scaler fitted for the multi-class classifier.
        class_names: Ordered class names used to decode the predicted index.

    Returns:
        The predicted class name, or ``None`` when features could not be
        extracted or no One-Class SVM accepted the sample. (Previously the
        result was only printed and ``None`` was always returned.)
    """
    print(f"\n--- Classifying {os.path.basename(image_path)} ---")

    features = extract_all_features(image_path)
    if sum(features) == 0:
        print("Result: Could not extract features from the image.")
        return None

    features_array = np.array(features).reshape(1, -1)

    print("Step 1: Checking against One-Class SVMs...")
    is_known = False
    for class_name, oc_svm in one_class_svms.items():
        scaled_features = scalers[class_name].transform(features_array)
        prediction = oc_svm.predict(scaled_features)

        # OneClassSVM.predict yields +1 for inliers and -1 for outliers.
        if prediction[0] == 1:
            print(f"  - Matches profile of '{class_name}'. Proceeding to final classification.")
            is_known = True
            break

    if not is_known:
        print("\nResult: Leaf type is UNKNOWN. It does not fit the profile of any trained class.")
        return None

    print("\nStep 2: Performing Multi-Class SVM for final classification...")
    scaled_features_multi = multi_class_scaler.transform(features_array)
    final_prediction_index = multi_class_svm.predict(scaled_features_multi)[0]
    final_class_name = class_names[final_prediction_index]

    print(f"\n>>> Final Result: The leaf is classified as: {final_class_name}")
    return final_class_name

if __name__ == "__main__":

    # Folder with one sub-directory per class, and the image to classify.
    TRAINING_DATA_PATH = '/content/extracted_files/Training'
    TEST_IMAGE_PATH = '/content/WhatsApp Image 2025-10-27 at 03.30.45.jpeg'

    try:
        # Step 1: extract features and fit every SVM model.
        class_data, class_names = process_training_data(TRAINING_DATA_PATH)
        oc_svms, class_scalers, mc_svm, mc_scaler = train_models(class_data, class_names)

        # Step 2: classify the test image with the trained models.
        classify_leaf(TEST_IMAGE_PATH, oc_svms, class_scalers, mc_svm, mc_scaler, class_names)

    except Exception as e:
        # Path problems get a dedicated hint; anything else is reported as-is.
        if isinstance(e, (FileNotFoundError, NotADirectoryError)):
            print(f"\nAn error occurred: {e}")
            print("Please ensure your paths and folder structure are correct and try again.")
        else:
            print(f"\nAn unexpected error occurred: {e}")

Processing training data...
  - Processing class: Alstonia scholrais
  - Processing class: Chinar
  - Processing class: Jatropha
  - Processing class: Mango
  - Processing class: Pominia Pinata
Data processing complete.

Training One-Class SVMs...
  - Training for class: Alstonia scholrais
  - Training for class: Chinar
  - Training for class: Jatropha
  - Training for class: Mango
  - Training for class: Pominia Pinata

Training Multi-Class SVM...
All models trained successfully.

--- Classifying WhatsApp Image 2025-10-27 at 03.30.45.jpeg ---
Step 1: Checking against One-Class SVMs...

Result: Leaf type is UNKNOWN. It does not fit the profile of any trained class.


In [None]:
import numpy as np
from PIL import Image, ImageFilter
from skimage import measure, filters, transform # Import transform for resizing
import skimage.color # Import color for rgb2gray if needed later
from skimage.feature import graycomatrix, graycoprops

# Define a target size for standardization (you can adjust this)
TARGET_SIZE = (512, 512) # Example size


def extract_features(image_path):
    """Extract seven shape and texture features from a leaf image file.

    The image is resized to ``TARGET_SIZE``, converted to grayscale, smoothed
    with a Gaussian filter (sigma 3) and split with Otsu's threshold. The
    largest connected region above the threshold is taken to be the leaf.

    Args:
        image_path: Path of an image file readable by Pillow.

    Returns:
        A dict with the keys ``area``, ``perimeter``, ``aspect_ratio``,
        ``circularity``, ``contrast``, ``correlation`` and ``energy`` (in that
        insertion order), or ``None`` when the image cannot be loaded or
        resized, or when no region is found.
    """
    try:
        # The context manager closes the underlying file once the pixel data
        # has been copied into the NumPy array. Decoding happens inside
        # np.array(), so truncated/corrupt files are caught here as well.
        with Image.open(image_path) as img_pil:
            # rgb2gray() needs exactly three channels, and palette images
            # would otherwise be read as raw palette indices. Normalise
            # RGBA / CMYK / LA / P images to RGB; leave 1- and 3-band images
            # untouched.
            if img_pil.mode == "P" or len(img_pil.getbands()) not in (1, 3):
                pixel_source = img_pil.convert("RGB")
            else:
                pixel_source = img_pil
            img_np = np.array(pixel_source)
    except IOError:
        print(f"Error: Couldn't load image from {image_path}")
        return None

    # Standardize by resizing; keep the channel axis for color images.
    try:
        if img_np.ndim == 3:
            img_resized = transform.resize(img_np, (*TARGET_SIZE, img_np.shape[-1]), anti_aliasing=True)
        else:  # Grayscale or other single channel
            img_resized = transform.resize(img_np, TARGET_SIZE, anti_aliasing=True)
    except Exception as e:
        print(f"Error resizing image {image_path}: {e}")
        return None

    # Convert to grayscale unless the resized image is already 2-D.
    if img_resized.ndim == 3:
        gray_array = skimage.color.rgb2gray(img_resized)
    else:
        gray_array = img_resized

    # Threshold the blurred image, but measure intensities on the unblurred one.
    blurred_array = filters.gaussian(gray_array, sigma=3)
    thresh = filters.threshold_otsu(blurred_array)
    binary_array = blurred_array > thresh

    labels = measure.label(binary_array)
    props = measure.regionprops(labels, intensity_image=gray_array)

    if not props:
        print("No regions found.")
        return None

    # Largest region is assumed to be the leaf.
    leaf_prop = max(props, key=lambda region: region.area)

    features = {}

    area = leaf_prop.area
    perimeter = leaf_prop.perimeter
    minr, minc, maxr, maxc = leaf_prop.bbox
    h = maxr - minr
    w = maxc - minc

    features['area'] = area
    features['perimeter'] = perimeter
    features['aspect_ratio'] = w / h if h > 0 else 0

    if perimeter > 0:
        features['circularity'] = (4 * np.pi * area) / (perimeter**2)
    else:
        features['circularity'] = 0

    # Bounding-box crop of the leaf region, rescaled to uint8 for the GLCM.
    roi = (leaf_prop.intensity_image * 255).astype(np.uint8)

    glcm = graycomatrix(roi, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
    features['contrast'] = graycoprops(glcm, 'contrast')[0, 0]
    features['correlation'] = graycoprops(glcm, 'correlation')[0, 0]
    features['energy'] = graycoprops(glcm, 'energy')[0, 0]

    return features



In [None]:
!pip install pillow-heif



In [None]:
# Register the pillow-heif plugin with Pillow so Image.open() can read
# HEIF/HEIC files.
from pillow_heif import register_heif_opener
register_heif_opener()

In [None]:
import cv2
import numpy as np
import os
import json
from tqdm import tqdm
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage import exposure

# --- 1. PRE-PROCESSING FUNCTION ---
# This is a modified version of our function from process_image.py
# It no longer saves or shows the image, but returns it for processing.

def preprocess_image_for_features(image_path, size=(500, 500)):
    """Load an image, resize it and attach a "not dark" alpha channel.

    Pixels whose HSV value lies within ``[0, 0, 0]``..``[180, 255, 60]`` are
    treated as background; the alpha channel is the inverse of that selection
    (255 for kept pixels, 0 for background).

    Args:
        image_path: Path of the image file to read with OpenCV.
        size: Target ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A 4-channel BGRA array, or ``None`` (after printing a warning) when
        the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: Could not read image {image_path}. Skipping.")
        return None

    img_resized = cv2.resize(img, size, interpolation=cv2.INTER_AREA)

    # Select the dark background in HSV space (raise the upper V bound if the
    # background is brighter), then invert to obtain the foreground alpha.
    hsv = cv2.cvtColor(img_resized, cv2.COLOR_BGR2HSV)
    dark_pixels = cv2.inRange(hsv, np.array([0, 0, 0]), np.array([180, 255, 60]))
    alpha = cv2.bitwise_not(dark_pixels)

    blue, green, red = cv2.split(img_resized)
    return cv2.merge([blue, green, red, alpha])

# --- 2. FEATURE EXTRACTION FUNCTION ---
# This function takes the processed BGRA image and extracts features.

def extract_features(img_bgra):
    """Extract a flat feature vector from a pre-processed BGRA leaf image.

    The alpha channel is used as the leaf mask. The returned list has 79
    entries, in this order:

    * 5 shape values: aspect ratio, solidity, eccentricity, area, perimeter
    * 7 Hu moments of the mask
    * 26 normalised uniform-LBP histogram bins (24 points, radius 3)
    * 4 GLCM statistics averaged over distances [1, 2] and angles [0, pi/2]:
      contrast, homogeneity, energy, correlation
    * 3 mean H, S, V values of the leaf pixels
    * 18 normalised hue bins and 16 normalised saturation bins

    Args:
        img_bgra: 4-channel BGRA image whose alpha channel is a 0/255 mask.

    Returns:
        The features as a plain list of floats, or ``None`` when no contour
        is found or any step raises an exception (a message is printed).
    """
    try:
        b, g, r, a = cv2.split(img_bgra)

        img_bgr = cv2.merge([b, g, r])
        img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

        # The alpha channel from pre-processing doubles as the binary mask.
        mask = a

        # --- 1. Shape & Morphological Features ---
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            print("Warning: No contours found. Skipping image.")
            return None

        cnt = max(contours, key=cv2.contourArea)

        area = cv2.contourArea(cnt)
        perimeter = cv2.arcLength(cnt, True)

        x, y, w, h = cv2.boundingRect(cnt)
        aspect_ratio = float(w) / h if h != 0 else 0

        # Solidity: contour area relative to its convex hull area.
        hull = cv2.convexHull(cnt)
        hull_area = cv2.contourArea(hull)
        solidity = float(area) / hull_area if hull_area != 0 else 0

        # Eccentricity of the fitted ellipse. The two axis lengths are sorted
        # explicitly: assuming the first one is the larger made
        # sqrt(1 - ratio**2) evaluate to NaN whenever it was not.
        eccentricity = 0
        if len(cnt) >= 5:  # fitEllipse needs at least 5 points
            _, ellipse_axes, _ = cv2.fitEllipse(cnt)
            minor_axis, major_axis = sorted(ellipse_axes)
            if major_axis > 0:
                eccentricity = float(np.sqrt(1 - (minor_axis / major_axis) ** 2))
            if not np.isfinite(eccentricity):
                # Degenerate fits can produce non-finite axes.
                eccentricity = 0

        moments = cv2.moments(mask)
        hu_moments = cv2.HuMoments(moments).flatten()

        # --- 2. Texture & Venation Features ---
        masked_gray = cv2.bitwise_and(img_gray, img_gray, mask=mask)

        # LBP is computed on the masked image; the histogram only counts
        # pixels inside the leaf.
        n_points = 24
        radius = 3
        lbp = local_binary_pattern(masked_gray, n_points, radius, method='uniform')
        lbp_leaf_pixels = lbp[mask == 255]
        # Explicit bin edges 0..n_points+2 give n_points+2 bins.
        lbp_hist, _ = np.histogram(lbp_leaf_pixels, bins=np.arange(0, n_points + 3))
        lbp_hist = lbp_hist.astype("float")
        lbp_hist /= (lbp_hist.sum() + 1e-6)  # 1e-6 avoids division by zero

        # NOTE: the GLCM is built from the whole masked image, so the zeroed
        # background pixels contribute to these statistics.
        glcm = graycomatrix(masked_gray, distances=[1, 2], angles=[0, np.pi/2], symmetric=True, normed=True)

        contrast = graycoprops(glcm, 'contrast').mean()
        homogeneity = graycoprops(glcm, 'homogeneity').mean()
        energy = graycoprops(glcm, 'energy').mean()
        correlation = graycoprops(glcm, 'correlation').mean()

        # --- 3. Color Features ---
        img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
        hsv_leaf_pixels = img_hsv[mask == 255]

        mean_hsv = np.mean(hsv_leaf_pixels, axis=0)

        h_hist, _ = np.histogram(hsv_leaf_pixels[:, 0], bins=18, range=(0, 180))
        s_hist, _ = np.histogram(hsv_leaf_pixels[:, 1], bins=16, range=(0, 256))

        h_hist = h_hist.astype("float")
        h_hist /= (h_hist.sum() + 1e-6)

        s_hist = s_hist.astype("float")
        s_hist /= (s_hist.sum() + 1e-6)

        # --- 4. Combine all features into one flat list ---
        all_features = np.concatenate([
            [aspect_ratio, solidity, eccentricity, area, perimeter],
            hu_moments,
            lbp_hist,
            [contrast, homogeneity, energy, correlation],
            mean_hsv,
            h_hist,
            s_hist
        ])

        return all_features.tolist()

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this image.
        print(f"Error extracting features: {e}")
        return None

# --- 3. MAIN DATASET PROCESSING FUNCTION ---

def process_dataset(dataset_path):
    """Extract features for every image below ``dataset_path``.

    Each immediate sub-directory is one class. Class folders are visited in
    sorted order so the integer label assigned to each class is the same on
    every run and platform (``os.listdir`` order is arbitrary). Files whose
    names do not end in ``.png``, ``.jpg``, ``.jpeg`` or ``.bmp``
    (case-insensitive) are skipped, as are images for which pre-processing
    or feature extraction returns ``None``.

    Args:
        dataset_path: Directory containing one sub-directory per class.

    Returns:
        ``(all_features, all_labels, label_map)``: a list of feature lists,
        the matching list of integer labels, and a dict mapping class name
        to label. ``(None, None, None)`` is returned, after printing a
        message, when ``dataset_path`` does not exist or is not a directory.
    """
    all_features = []
    all_labels = []
    label_map = {}
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')

    try:
        species_folders = sorted(
            f for f in os.listdir(dataset_path)
            if os.path.isdir(os.path.join(dataset_path, f))
        )
    except (FileNotFoundError, NotADirectoryError):
        print(f"Error: Dataset directory not found at {dataset_path}")
        print("Please set the 'DATASET_DIR' variable to the correct path.")
        return None, None, None

    print(f"Found {len(species_folders)} classes: {species_folders}")

    for species_label, species_name in enumerate(species_folders):
        label_map[species_name] = species_label
        species_path = os.path.join(dataset_path, species_name)

        print(f"\nProcessing class: {species_name} (Label: {species_label})")

        image_files = os.listdir(species_path)

        # tqdm shows a progress bar over every directory entry.
        for image_name in tqdm(image_files, desc=species_name):
            if not image_name.lower().endswith(image_suffixes):
                continue

            image_path = os.path.join(species_path, image_name)

            processed_bgra = preprocess_image_for_features(image_path)
            if processed_bgra is None:
                continue  # Skip if pre-processing failed

            features = extract_features(processed_bgra)
            if features is not None:
                all_features.append(features)
                all_labels.append(species_label)

    return all_features, all_labels, label_map

# --- 4. EXECUTION ---

if __name__ == "__main__":
    # --- IMPORTANT ---
    # Point DATASET_DIR at the root folder of your dataset. One sub-folder per
    # species is expected, each holding that species' images, e.g.:
    #   dataset/
    #     species_A/  leaf1.jpg, leaf2.jpg, ...
    #     species_B/  leaf10.jpg, leaf11.jpg, ...

    DATASET_DIR = "dataset" # <-- CHANGE THIS

    X, y, label_map = process_dataset(DATASET_DIR)

    if not X:
        # Covers both a missing directory (X is None) and an empty result.
        print("No features were extracted. Please check your DATASET_DIR path.")
    else:
        print(f"\n{'=' * 30}")
        print("FEATURE EXTRACTION COMPLETE")
        print(f"Total images processed: {len(X)}")
        print(f"Total features per image: {len(X[0])}")
        print(f"Classes found: {label_map}")

        # NumPy arrays are easier to inspect and to persist.
        X_array = np.array(X)
        y_array = np.array(y)

        print(f"Feature matrix shape (X): {X_array.shape}")
        print(f"Label vector shape (y): {y_array.shape}")

        # Persist features, labels and the name -> label mapping.
        np.save('features.npy', X_array)
        np.save('labels.npy', y_array)
        with open('label_map.json', 'w') as f:
            json.dump(label_map, f)

        print(
            "\nSuccessfully saved:",
            "- features.npy (your 'list of lists' for training)",
            "- labels.npy (the corresponding labels)",
            "- label_map.json (shows which number corresponds to which species)",
            sep="\n",
        )

        print("\nYou are now ready to load these files and train your SVMs!")

In [None]:
import os
import pandas as pd

dataset_path = '/content/extracted_files/Training' # Updated path to the extracted directory

if os.path.exists(dataset_path):
    all_features = []
    all_labels = []

    print("⚙️  Starting feature extraction from all images...")

    # Every sub-directory is one class; its name becomes the label.
    for class_folder in os.listdir(dataset_path):
        folder_path = os.path.join(dataset_path, class_folder)
        if not os.path.isdir(folder_path):
            continue

        print(f"   -> Processing folder: {class_folder}")
        for image_name in os.listdir(folder_path):
            image_path = os.path.join(folder_path, image_name)

            features = extract_features(image_path)
            if not features:
                # Extraction failed (None / empty result): skip this file.
                continue

            all_features.append(features)
            all_labels.append(class_folder)

    print("\n✅ Feature extraction complete!")

    # One row per image, feature columns first, label column last.
    df = pd.DataFrame(all_features)
    df['label'] = all_labels

    # Persist the table for the training step.
    df.to_csv('classifier_features.csv', index=False)

    print("\n📊 Dataset successfully created and saved to 'classifier_features.csv'")
    print("\nHere's a summary of your dataset:")
    print(df['label'].value_counts())
else:
    print(f"Error: The directory '{dataset_path}' was not found.")
    print("Please ensure the zip file was extracted correctly and the path is accurate.")

⚙️  Starting feature extraction from all images...
   -> Processing folder: Pominia Pinata
   -> Processing folder: Jatropha
   -> Processing folder: Mango
   -> Processing folder: Chinar
   -> Processing folder: Alstonia scholrais

✅ Feature extraction complete!

📊 Dataset successfully created and saved to 'classifier_features.csv'

Here's a summary of your dataset:
label
Pominia Pinata        200
Jatropha              200
Mango                 200
Chinar                200
Alstonia scholrais    200
Name: count, dtype: int64


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Extract the feature dict for one image; extract_features returns None on
# failure, in which case only the final print (showing None) produces output.
leafdata = extract_features('/content/WhatsApp Image 2025-10-27 at 02.03.38.jpeg')
if leafdata:
    print("Extracted Features:")
    # One line per feature, name capitalised and value shown to 4 decimals.
    for key, value in leafdata.items():
        print(f"- {key.capitalize()}: {value:.4f}")
# Raw result (dict or None) for inspection.
print(leafdata)

Error: Couldn't load image from /content/WhatsApp Image 2025-10-27 at 02.03.38.jpeg
None


In [None]:
leafdata = extract_features('/content/WhatsApp Image 2025-10-27 at 01.45.13 (1).jpeg')
if not leafdata:
    # extract_features returns None when the image cannot be processed. Fail
    # here with a clear message instead of an AttributeError on None below.
    raise ValueError("Feature extraction failed; cannot build 'sample' for classification.")

print("Extracted Features:")
for key, value in leafdata.items():
    print(f"- {key.capitalize()}: {value:.4f}")

# Get a view object of the dictionary's values
values_view = leafdata.values()

# Plain-float feature vector, in the dict's insertion order, for the SVM cells.
sample = [float(value) for value in values_view]

print(sample)

Extracted Features:
- Area: 42903.0000
- Perimeter: 1220.0113
- Aspect_ratio: 0.7325
- Circularity: 0.3622
- Contrast: 69.7272
- Correlation: 0.9865
- Energy: 0.4537
[42903.0, 1220.0113267570428, 0.7325227963525835, 0.36221780952953203, 69.7271909827761, 0.9864824493793735, 0.45366128534374717]


In [None]:
# Convert the feature DataFrame into a list of rows; each row is a plain list
# of that row's column values (the 'label' column is included).
df_list = df.values.tolist()
print(df_list)

[[46929.0, 1171.6305550595491, 2.2486486486486488, 0.4296054534706245, 25.600833604689022, 0.9903593014843362, 0.4039997041766849, 'Pominia Pinata'], [40589.0, 877.0752518939717, 1.2748815165876777, 0.6630472863356806, 37.82192119968876, 0.9913994240870481, 0.2863310520140041, 'Pominia Pinata'], [62254.0, 987.5016587651645, 1.097902097902098, 0.8022347270543013, 43.16820080877589, 0.9931357707908459, 0.30734423571515335, 'Pominia Pinata'], [117765.0, 2056.3382745853737, 1.0899742930591259, 0.34997496777691994, 19.01371644575714, 0.9906889470234757, 0.30260596129857886, 'Pominia Pinata'], [50857.0, 1157.653895656662, 2.005, 0.47687343157582895, 22.105012499999994, 0.9918791844279294, 0.3782723556884973, 'Pominia Pinata'], [35926.0, 1133.2346314604056, 0.825, 0.3515434848276729, 126.3094543147208, 0.9784223753306814, 0.23631002829918973, 'Pominia Pinata'], [18449.0, 539.2691193458119, 0.9181286549707602, 0.79720789623324, 41.830896686159846, 0.9874304009283233, 0.30648968309676883, 'Pomi

In [None]:
# Requires df_list: rows of feature values with the class label as last item.

# Group the feature rows by class label.
class_lists_dict = {}
for row in df_list:
    *features, label = row  # label is the last element, features the rest
    class_lists_dict.setdefault(label, []).append(features)


# Publish each group as a module-level variable named "<label>_list", with
# spaces, hyphens and dots in the label replaced by underscores.
for label, data_list in class_lists_dict.items():
    variable_name = label.translate(str.maketrans(" -.", "___")) + "_list"
    globals()[variable_name] = data_list
    print(f"Created list variable: {variable_name}")



Created list variable: Pominia_Pinata_list
Created list variable: Jatropha_list
Created list variable: Mango_list
Created list variable: Chinar_list
Created list variable: Alstonia_scholrais_list


In [None]:
# Numbered aliases for the per-class feature lists; the number is the class
# index reported by the one-class SVM cell ("leaf_type_<n>").
leaf_type_1 = Pominia_Pinata_list

leaf_type_2 = Jatropha_list

leaf_type_3 = Mango_list

leaf_type_4 = Chinar_list

leaf_type_5 = Alstonia_scholrais_list


In [None]:
from sklearn.svm import OneClassSVM
import numpy as np
lit = []  # "leaf_type_<n>" names of the classes that accept the sample
x = 0     # 1-based index of the class currently being reported


def known(train, sample):
    """Tell whether ``sample`` is an inlier of the class described by ``train``.

    A One-Class SVM (RBF kernel, ``nu=0.6``, ``gamma='scale'``) is fitted on
    ``train`` and the sample is scored with ``decision_function``.

    Args:
        train: Sequence of feature vectors belonging to one class.
        sample: Single feature vector to test.

    Returns:
        ``'known'`` when the decision score is strictly positive, otherwise
        ``'unknown'``.
    """
    X = np.asarray(train, dtype=float)
    model = OneClassSVM(kernel='rbf', nu=0.6, gamma='scale')
    model.fit(X)
    score = model.decision_function([sample])[0]
    return 'known' if score > 0.0 else 'unknown'


# One verdict per class, in leaf_type_1 .. leaf_type_5 order.
classifier = [
    known(train_set, sample)
    for train_set in (leaf_type_1, leaf_type_2, leaf_type_3, leaf_type_4, leaf_type_5)
]
print(classifier)
if 'known' not in classifier:
    print('The sample is Unknown')
else:
    for x, i in enumerate(classifier, start=1):
        # Compare with ==: `i in ('known')` is a substring test on a str,
        # because parentheses alone do not make a tuple.
        if i == 'known':
            lit.append(f'leaf_type_{x}')
    print('The sample is known')
    print(f'sample is preliminarliry classified as {lit}')
    print('Moving on to Supervised Multi-class SVM for final classification')


['known', 'unknown', 'unknown', 'unknown', 'unknown']
The sample is known
sample is preliminarliry classified as ['leaf_type_1']
Moving on to Supervised Multi-class SVM for final classification


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Drop rows with NaN values before training.
df_cleaned = df.dropna()

# Features are every column except 'label'; 'label' is the target.
X = df_cleaned.drop('label', axis=1).values.tolist()
y = df_cleaned['label'].values.tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-class SVM classifier.
# The features span very different magnitudes (area is in the tens of
# thousands, circularity is below 1), which lets the large-valued columns
# dominate an SVM. Standardise inside a pipeline so the scaler is fitted on
# the training split only and is applied automatically by predict().
multi_svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
multi_svm_model.fit(X_train, y_train)

# Make predictions on the test set (raw features; the pipeline scales them).
y_pred = multi_svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Multi-class SVM Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Multi-class SVM Classifier Results:
Accuracy: 0.4500
Classification Report:
                    precision    recall  f1-score   support

Alstonia scholrais       0.21      0.14      0.16        44
            Chinar       0.76      0.74      0.75        38
          Jatropha       0.57      0.27      0.37        48
             Mango       0.34      0.73      0.47        37
    Pominia Pinata       0.50      0.48      0.49        33

          accuracy                           0.45       200
         macro avg       0.47      0.47      0.45       200
      weighted avg       0.47      0.45      0.43       200



In [None]:
import os
import sys
import pandas as pd
import numpy as np
from skimage import io, color, measure, filters
from skimage.feature import graycomatrix, graycoprops
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from concurrent.futures import ProcessPoolExecutor, as_completed

# Directory holding the input images.
# NOTE(review): presumably one sub-folder per class — confirm against the loader.
INPUT_DIR = '/content/extracted_files/Training' # Corrected path

# The name of the CSV file to save features
OUTPUT_CSV = 'leaf_features.csv'

# NOTE(review): not referenced in the visible part of this cell; presumably a
# user-supplied feature CSV — confirm against the code that reads it.
USER_CSV_PATH = 'my_leaf_data.csv'

# New: Resize images to this standard size for MUCH faster processing
TARGET_SIZE = (512, 512)

# New: Use multiple CPU cores to process images in parallel
# NOTE: os.cpu_count() can return None when the CPU count is undetermined.
MAX_WORKERS = os.cpu_count()

# --- End Configuration ---


def calculate_features(image_path):
    """
    Calculates a feature vector for a single image.

    The image is resized to TARGET_SIZE, reduced to grayscale and binarized
    with Otsu's threshold. Shape features are measured on the largest
    connected region of the binary mask; texture features come from a
    gray-level co-occurrence matrix (GLCM) of the whole grayscale image.

    Grayscale (2-D) inputs are used directly and 4-channel inputs have their
    last channel dropped, because rgb2gray() only accepts 3-channel input.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A list [area, perimeter, aspect_ratio, circularity, contrast,
        correlation, energy], or None if no region is found or processing
        fails (the error is printed, not raised).
    """
    try:
        # Load the image
        img = io.imread(image_path)

        # Not every file decodes to a 3-channel array. A 4-channel image is
        # assumed to be RGBA here, so the last (alpha) channel is discarded.
        if img.ndim == 3 and img.shape[-1] == 4:
            img = img[..., :3]

        # Work on a fixed-size copy from here on.
        img_resized = resize(img, TARGET_SIZE, anti_aliasing=True)

        # 1. Convert to grayscale (a 2-D image already is grayscale).
        if img_resized.ndim == 2:
            img_gray = img_resized
        else:
            img_gray = color.rgb2gray(img_resized)

        # 2. Binarize the image to find the leaf.
        # Pixels brighter than the Otsu threshold are treated as foreground.
        # NOTE(review): segment_leaf() earlier in this file uses
        # THRESH_BINARY_INV (dark leaf on a bright background); confirm this
        # polarity matches the dataset.
        thresh = filters.threshold_otsu(img_gray)
        binary = img_gray > thresh

        # 3. Label connected regions and get their properties.
        label_image = measure.label(binary)
        props = measure.regionprops(label_image)

        # Without any foreground region there is nothing to measure.
        if not props:
            return None

        # The largest region is assumed to be the leaf.
        main_region = max(props, key=lambda region: region.area)

        # --- Feature Calculation ---

        area = main_region.area
        perimeter = main_region.perimeter

        # Aspect ratio of the region's fitted ellipse (major / minor axis).
        major_axis = main_region.major_axis_length
        minor_axis = main_region.minor_axis_length
        if minor_axis == 0:
            aspect_ratio = 1  # Avoid division by zero
        else:
            aspect_ratio = major_axis / minor_axis

        # Circularity: 4*pi*area / perimeter^2.
        if perimeter == 0:
            circularity = 0  # Avoid division by zero
        else:
            circularity = (4 * np.pi * area) / (perimeter**2)

        # Texture Features (GLCM)
        # The grayscale image is float in the 0-1 range after resize(), so
        # scale to 0-255 and convert to uint8 for the 256-level GLCM.
        img_gray_uint8 = (img_gray * 255).astype(np.uint8)

        # Single offset: 5 pixels apart at angle 0.
        glcm = graycomatrix(img_gray_uint8,
                            distances=[5],
                            angles=[0],
                            levels=256,
                            symmetric=True,
                            normed=True)

        # One distance and one angle, so each property is a 1x1 array.
        contrast = graycoprops(glcm, 'contrast')[0, 0]
        correlation = graycoprops(glcm, 'correlation')[0, 0]
        energy = graycoprops(glcm, 'energy')[0, 0]

        return [area, perimeter, aspect_ratio, circularity, contrast, correlation, energy]

    except Exception as e:
        # Deliberately broad: this runs inside worker processes, and one
        # unreadable image should be reported and skipped, not abort the run.
        print(f"ERROR: Could not process {image_path}. Reason: {e}")
        return None


def extract_features():
    """
    Walks through the image directory, calculates features for each image
    in parallel, and saves them to a CSV file.

    Every sub-folder of INPUT_DIR is treated as one class and its name is
    used as the label; files directly inside INPUT_DIR are ignored. JPEG
    files are matched case-insensitively ('.jpg' / '.jpeg'), so names such
    as 'IMG_001.JPG' are picked up as well.

    Rows are written in a deterministic order (sorted directory walk, not
    worker completion order), so a seeded train/test split of the resulting
    table is reproducible between runs.

    Returns:
        The feature DataFrame (columns: label + 7 features), or None when no
        image was found or every image failed to process.
    """
    print(f"Starting feature extraction from: {INPUT_DIR}")

    all_image_paths = []
    all_labels = []

    # Walk the directory to get a list of all jobs
    for root, dirs, files in os.walk(INPUT_DIR):
        # Sorting in place makes os.walk visit sub-folders in a fixed order.
        dirs.sort()

        if root == INPUT_DIR:
            continue  # Skip the root folder itself

        # The subfolder name is our label (e.g., 'leaf_type_1')
        label = os.path.basename(root)

        for file in sorted(files):
            # Case-insensitive so '.JPG' / '.JPEG' files are not missed.
            if file.lower().endswith(('.jpg', '.jpeg')):
                all_image_paths.append(os.path.join(root, file))
                all_labels.append(label)

    if not all_image_paths:
        print("No .jpg images found in subdirectories. Please check your INPUT_DIR.")
        return None

    total_count = len(all_image_paths)
    print(f"Found {total_count} images to process.")
    print(f"Starting parallel processing with {MAX_WORKERS} workers...")

    # One slot per submitted image; failed jobs simply leave their slot None.
    results = [None] * total_count

    # Use ProcessPoolExecutor to run 'calculate_features' in parallel
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Map each future to its submission index so results can be stored
        # in input order even though they complete in arbitrary order.
        future_to_index = {
            executor.submit(calculate_features, img_path): index
            for index, img_path in enumerate(all_image_paths)
        }

        for completed_count, future in enumerate(as_completed(future_to_index), start=1):
            try:
                results[future_to_index[future]] = future.result()
            except Exception as e:
                # Handle potential crashes in a worker; the slot stays None.
                print(f"  A job failed with error: {e}")

            # Print a progress update every 50 images or at the end
            if completed_count % 50 == 0 or completed_count == total_count:
                # Use sys.stdout.write and \r to print on the same line
                progress_percent = completed_count / total_count * 100
                sys.stdout.write(f"\r  Processed {completed_count}/{total_count} images ({progress_percent:.1f}%)")
                sys.stdout.flush()

    # Add a newline after the progress bar is done
    print()
    print("Parallel processing complete. Assembling feature table...")

    # Pair each label with its features, skipping images that produced none.
    all_features_data = [
        [label] + features
        for label, features in zip(all_labels, results)
        if features
    ]

    if not all_features_data:
        print("No features were extracted. All images may have failed processing.")
        return None

    # Define column headers
    columns = [
        'label', 'Area', 'Perimeter', 'Aspect_ratio', 'Circularity',
        'Contrast', 'Correlation', 'Energy'
    ]

    # Create a DataFrame and save to CSV
    df = pd.DataFrame(all_features_data, columns=columns)
    df.to_csv(OUTPUT_CSV, index=False)

    print("\n--- Feature extraction complete! ---")
    print(f"Data saved to {OUTPUT_CSV}")
    return df


def train_model(df):
    """
    Trains a multi-class SVM model on the feature DataFrame.

    The data is split 70/30 (stratified by label), the features are
    standardized with statistics from the training part only, and a
    linear-kernel SVC is fitted and evaluated on the held-out part.

    Args:
        df: DataFrame with a 'label' column; every other column is used as
            a feature.

    Returns:
        A (model, scaler) tuple: the fitted SVC and the fitted
        StandardScaler. New samples must be passed through
        scaler.transform() before model.predict() / model.predict_proba().
    """
    print("\n--- Starting Model Training ---")

    # 1. Define Features (X) and Target (y)
    X = df.drop('label', axis=1)
    y = df['label']

    # 2. Split into Training and Testing sets
    # stratify=y keeps the class proportions the same in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Total samples: {len(y)}")
    print(f"Training samples: {len(y_train)}")
    print(f"Testing samples: {len(y_test)}")

    # 3. Scale the features
    # SVMs are sensitive to feature scales. The scaler is fitted on the
    # training split only so no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 4. Initialize and Train the SVM
    # 'ovr' = One-vs-Rest shaped decision function.
    # random_state pins the internal shuffling used for the probability
    # estimates, matching the other SVC models in this notebook.
    print("Training SVM model... (This may take a moment)")
    model = SVC(
        kernel='linear',
        decision_function_shape='ovr',
        probability=True,
        verbose=True,
        random_state=42,
    )
    model.fit(X_train_scaled, y_train)
    print("\nModel training complete.")  # Leading newline separates verbose output

    # 5. Evaluate the Model
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(report)

    # Hand the fitted objects back so the trained model can be reused.
    return model, scaler


if __name__ == "__main__":
    # Entry point: load a precomputed feature CSV when available, otherwise
    # extract features from the images, then train the model.

    feature_df = None

    # --- Check for pre-computed CSV first ---
    if os.path.exists(USER_CSV_PATH):
        print(f"Found precomputed CSV: {USER_CSV_PATH}")
        print("Loading data... Assuming label is the LAST column.")
        try:
            # Load CSV, assuming no header
            df_raw = pd.read_csv(USER_CSV_PATH, header=None)

            # Get last column as label
            label_col_index = df_raw.columns[-1]
            y = df_raw[label_col_index]

            # Get all other columns as features
            X = df_raw.drop(label_col_index, axis=1)

            # Re-create the DataFrame in the format train_model() expects (label first)
            feature_df = pd.concat([y, X], axis=1)

            # Assign the standard column names
            # IMPORTANT: Assumes your CSV has 7 features in the correct order;
            # a different column count makes this assignment raise.
            columns = [
                'label', 'Area', 'Perimeter', 'Aspect_ratio', 'Circularity',
                'Contrast', 'Correlation', 'Energy'
            ]
            feature_df.columns = columns
            print("Data loaded successfully.")

        except Exception as e:
            # Discard any partially built frame. Without this reset a failure
            # after pd.concat() would leave feature_df set (with the wrong
            # column names), skip the fallback announced below and hand
            # train_model() a frame that has no 'label' column.
            feature_df = None
            print(f"ERROR: Could not read {USER_CSV_PATH}. Error: {e}")
            print("Please check the file format. Falling back to image extraction.")

    else:
        print(f"Did not find '{USER_CSV_PATH}'.")
        print("Falling back to extracting features from images...")

    # --- Fallback to image extraction if precomputed data wasn't loaded ---
    if feature_df is None:
        if not os.path.isdir(INPUT_DIR):
            print(f"Error: Input directory '{INPUT_DIR}' not found.")
            print(f"Please run 'augment_leaves.py' first or provide '{USER_CSV_PATH}'.")
            sys.exit(1)

        # Step 1: Extract features and get the DataFrame
        feature_df = extract_features()

    # Step 2: Train the model
    if feature_df is not None and not feature_df.empty:
        train_model(feature_df)
    else:
        print("Model training skipped as no features were loaded or extracted.")

Did not find 'my_leaf_data.csv'.
Falling back to extracting features from images...
Starting feature extraction from: /content/extracted_files/Training
No .jpg images found in subdirectories. Please check your INPUT_DIR.
Model training skipped as no features were loaded or extracted.


In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np


# Per-class feature rows. leaf_type_1 .. leaf_type_5 must already exist when
# this cell runs; presumably each is a list of feature vectors, since the
# `+` below concatenates them. TODO confirm.
class1 = leaf_type_1
class2 = leaf_type_2
class3 = leaf_type_3
class4 = leaf_type_4
class5 = leaf_type_5

# Stack all rows into one feature matrix, class after class.
X = np.array(class1 + class2 + class3 + class4 + class5)

# Integer targets 0..4: one label per row, in the same order as X.
y = np.array([
    class_id
    for class_id, rows in enumerate((class1, class2, class3, class4, class5))
    for _ in rows
])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.2
)

# Standardization and the RBF SVM live in one pipeline, so the scaler is
# fitted on the training data only. fit() returns the fitted pipeline.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', probability=True, random_state=42),
).fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Classify one new feature vector. `sample` must be defined before this cell
# runs; it is wrapped in a list because the model expects a batch of rows.
predicted_class = svm_model.predict([sample])[0]
print(f"\nPredicted class for new sample: {predicted_class}")

# Per-class probability estimates for the same sample.
probabilities = svm_model.predict_proba([sample])[0]
print("Class probabilities:", probabilities)


Accuracy: 0.8

Classification report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        40
           1       0.74      0.65      0.69        40
           2       0.85      0.82      0.84        40
           3       0.84      0.93      0.88        40
           4       0.71      0.75      0.73        40

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200


Predicted class for new sample: 1
Class probabilities: [0.04793604 0.86616968 0.00342867 0.04364838 0.03881724]
