In [34]:
# !pip install scikit-image
# !pip install --upgrade scipy scikit-image

import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb
import os
import mahotas
# from skimage import exposure
from skimage.feature import hog, local_binary_pattern


image_count = {}
default_image_size = tuple((64, 64))  # Reduced image size

data = []
def compute_hog_features(image):
    fd, hog_image = hog(image, orientations=8, pixels_per_cell=(16, 16), cells_per_block=(1, 1), visualize=True)
    hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))
    return fd

# Function to compute LBP features
def compute_lbp_features(image):
    lbp = local_binary_pattern(image, P=8, R=1, method='uniform')
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 10), range=(0, 9))
    return hist

for root, _, files in os.walk('/kaggle/input/plantdisease/PlantVillage'):
    disease = os.path.basename(root)
    image_count[disease] = 0
    
    # Include images based on the starting word of the disease
    if disease.lower().startswith(('tomato')):
        print(disease)
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.JPG') or file.endswith('.PNG') or file.endswith('JPEG') or file.endswith('jpeg'):
                image_path = os.path.join(root, file)

                if image_count[disease] >= 500:
                    continue

                # Read the original image and resize
                original_image = cv2.imread(image_path)
                original_image = cv2.resize(original_image, default_image_size)

                # Perform Canny edge detection
                gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
                edges = cv2.Canny(gray_image, 50, 150)

                # Compute color histograms for each channel
                hist_b = cv2.calcHist([original_image], [0], None, [256], [0, 256]).flatten().astype(int)
                hist_g = cv2.calcHist([original_image], [1], None, [256], [0, 256]).flatten().astype(int)
                hist_r = cv2.calcHist([original_image], [2], None, [256], [0, 256]).flatten().astype(int)

                # Compute GLCM texture features
                textures = mahotas.features.haralick(gray_image)
                mean_texture = textures.mean(axis=0)
                
                # Compute shape features using Hu Moments
                moments = cv2.HuMoments(cv2.moments(gray_image)).flatten()

                hog_features = compute_hog_features(gray_image)

                lbp_features = compute_lbp_features(gray_image)

                # Flatten the images and histograms to 1D arrays and convert to NumPy array
                flattened_original_image = original_image.flatten().astype(int)
                flattened_edges = edges.flatten().astype(int)
                flattened_hist_b = hist_b.astype(int)
                flattened_hist_g = hist_g.astype(int)
                flattened_hist_r = hist_r.astype(int)
                flattened_texture = mean_texture.astype(int)
                flattened_moments = moments.astype(int)

                # Concatenate the flattened pixel values, histograms, texture, moments, and disease name in data
                combined_features = np.concatenate([
                    flattened_original_image, 
                    flattened_edges, 
                    flattened_hist_b, 
                    flattened_hist_g, 
                    flattened_hist_r, 
                    flattened_texture, 
                    flattened_moments,
                    hog_features,
                    lbp_features
                ])                
                data.append([combined_features, disease])

                # Increment the counter for the current disease
                image_count[disease] += 1


df = pd.DataFrame(data, columns=['image_pixels', 'disease'])

# Split the data into training and testing sets
X = np.vstack(df['image_pixels'].to_numpy())
y = df['disease']

# Encode disease labels using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (using RandomForest as an example)
# model = RandomForestClassifier(random_state=42)
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Collecting scipy
  Obtaining dependency information for scipy from https://files.pythonhosted.org/packages/e0/9e/80e2205d138960a49caea391f3710600895dd8292b6868dc9aff7aa593f9/scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-image
  Obtaining dependency information for scikit-image from https://files.pythonhosted.org/packages/f1/6c/49f5a0ce8ddcdbdac5ac69c129654938cc6de0a936303caa6cad495ceb2a/scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy: 0.84
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       107
           1       0.67      0.82      0.74        93
           2       0.82      0.64      0.72        96
           3       0.85      0.84      0.85       112
           4       0.79      0.73      0.76       101
           5       0.86      0.91      0.89        93
           6       0.86      0.87      0.86        95
           7       0.91      0.87      0.89       102
           8       0.88      0.86      0.87        84
           9       0.92      0.97      0.94        92

    accuracy                           0.84       975
   macro avg       0.84      0.84      0.84       975
weighted avg       0.84      0.84      0.84       975

Confusion Matrix:
 [[97  1  4  1  2  0  0  1  0  1]
 [ 4 76  4  3  2  1  0  3  0  0]
 [ 1 19 61  2  4  0  4  3  0  2]
 [ 1  8  0 94  4  3  0  0  2  0]
 [ 4  4  3  6 74  0  3  1  6  0]
 [ 0  0  0  2  1 8

In [None]:
# model = RandomForestClassifier(n_estimators=100,random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the parameter grid for Grid Search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create a RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters found by Grid Search
# print("Best Parameters:", grid_search.best_params_)

# # Get the best model
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)


In [None]:
# param_dist = {
#     'max_depth': [3, 5, 7, 10, None],
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 1, 2],
#     'min_child_weight': [1, 2, 3]
# }

# # Create an XGBClassifier
# xgb_model = xgb.XGBClassifier(random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)

# # Fit the random search to the data
# random_search.fit(X_train, y_train)

# # Print the best parameters found by RandomizedSearchCV
# print("Best Parameters:", random_search.best_params_)

# # Get the best model
# best_xgb = random_search.best_estimator_

# # Make predictions on the test set
# y_pred = best_xgb.predict(X_test)

In [None]:
# import xgboost as xgb
# # Now you can use XGBoost with the binary labels
# model = xgb.XGBClassifier()
# model.fit(X_train, y_train)

In [None]:
# y_pred = model.predict(X_test)



In [None]:
len(X_train[0])

In [None]:
print(image_count)

In [35]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.841025641025641


In [36]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.8400113401554286

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [37]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.84
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       107
           1       0.67      0.82      0.74        93
           2       0.82      0.64      0.72        96
           3       0.85      0.84      0.85       112
           4       0.79      0.73      0.76       101
           5       0.86      0.91      0.89        93
           6       0.86      0.87      0.86        95
           7       0.91      0.87      0.89       102
           8       0.88      0.86      0.87        84
           9       0.92      0.97      0.94        92

    accuracy                           0.84       975
   macro avg       0.84      0.84      0.84       975
weighted avg       0.84      0.84      0.84       975

Confusion Matrix:
 [[97  1  4  1  2  0  0  1  0  1]
 [ 4 76  4  3  2  1  0  3  0  0]
 [ 1 19 61  2  4  0  4  3  0  2]
 [ 1  8  0 94  4  3  0  0  2  0]
 [ 4  4  3  6 74  0  3  1  6  0]
 [ 0  0  0  2  1 8