In [2]:
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb
import os
import mahotas

image_count = {}
default_image_size = tuple((64, 64))  # Reduced image size

data = []

for root, _, files in os.walk('/kaggle/input/plantdisease/PlantVillage'):
    disease = os.path.basename(root)
    image_count[disease] = 0
    
    # Include images based on the starting word of the disease
    if disease.lower().startswith(('tomato')):
        print(disease)
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.JPG') or file.endswith('.PNG') or file.endswith('JPEG') or file.endswith('jpeg'):
                image_path = os.path.join(root, file)

                if image_count[disease] >= 500:
                    continue

                # Read the original image and resize
                original_image = cv2.imread(image_path)
                original_image = cv2.resize(original_image, default_image_size)

                # Perform Canny edge detection
                gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
                edges = cv2.Canny(gray_image, 50, 150)

                # Compute color histograms for each channel
                hist_b = cv2.calcHist([original_image], [0], None, [256], [0, 256]).flatten().astype(int)
                hist_g = cv2.calcHist([original_image], [1], None, [256], [0, 256]).flatten().astype(int)
                hist_r = cv2.calcHist([original_image], [2], None, [256], [0, 256]).flatten().astype(int)

                # Compute GLCM texture features
                textures = mahotas.features.haralick(gray_image)
                mean_texture = textures.mean(axis=0)

                # Compute shape features using Hu Moments
                moments = cv2.HuMoments(cv2.moments(gray_image)).flatten()

                # Flatten the images and histograms to 1D arrays and convert to NumPy array
                flattened_original_image = original_image.flatten().astype(int)
                flattened_edges = edges.flatten().astype(int)
                flattened_hist_b = hist_b.astype(int)
                flattened_hist_g = hist_g.astype(int)
                flattened_hist_r = hist_r.astype(int)
                flattened_texture = mean_texture.astype(int)
                flattened_moments = moments.astype(int)

                # Concatenate the flattened pixel values, histograms, texture, moments, and disease name in data
                combined_features = np.concatenate([flattened_original_image, flattened_edges, flattened_hist_b, flattened_hist_g, flattened_hist_r, flattened_texture, flattened_moments])
                data.append([combined_features, disease])

                # Increment the counter for the current disease
                image_count[disease] += 1


df = pd.DataFrame(data, columns=['image_pixels', 'disease'])

# Split the data into training and testing sets
X = np.vstack(df['image_pixels'].to_numpy())
y = df['disease']

# Encode disease labels using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (using RandomForest as an example)
# model = RandomForestClassifier(random_state=42)
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Tomato_Leaf_Mold
Tomato__Tomato_YellowLeaf__Curl_Virus
Tomato_Bacterial_spot
Tomato_Septoria_leaf_spot
Tomato_healthy
Tomato_Spider_mites_Two_spotted_spider_mite
Tomato_Early_blight
Tomato__Target_Spot
Tomato_Late_blight
Tomato__Tomato_mosaic_virus


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy: 0.83
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88       107
           1       0.68      0.74      0.71        93
           2       0.81      0.62      0.71        96
           3       0.81      0.85      0.83       112
           4       0.82      0.75      0.78       101
           5       0.84      0.87      0.85        93
           6       0.76      0.84      0.80        95
           7       0.85      0.88      0.87       102
           8       0.91      0.83      0.87        84
           9       0.95      0.97      0.96        92

    accuracy                           0.83       975
   macro avg       0.83      0.83      0.83       975
weighted avg       0.83      0.83      0.83       975

Confusion Matrix:
 [[96  2  4  0  0  0  1  4  0  0]
 [ 3 69  4  4  2  1  4  6  0  0]
 [ 2 15 60  5  4  1  4  4  0  1]
 [ 1  5  1 95  3  3  0  1  3  0]
 [ 2  3  4  6 76  0  5  0  3  2]
 [ 1  1  0  1  0 8

In [3]:
# model = RandomForestClassifier(n_estimators=100,random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)


In [4]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the parameter grid for Grid Search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create a RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters found by Grid Search
# print("Best Parameters:", grid_search.best_params_)

# # Get the best model
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)


In [5]:
# param_dist = {
#     'max_depth': [3, 5, 7, 10, None],
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 1, 2],
#     'min_child_weight': [1, 2, 3]
# }

# # Create an XGBClassifier
# xgb_model = xgb.XGBClassifier(random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)

# # Fit the random search to the data
# random_search.fit(X_train, y_train)

# # Print the best parameters found by RandomizedSearchCV
# print("Best Parameters:", random_search.best_params_)

# # Get the best model
# best_xgb = random_search.best_estimator_

# # Make predictions on the test set
# y_pred = best_xgb.predict(X_test)

In [6]:
# import xgboost as xgb
# # Now you can use XGBoost with the binary labels
# model = xgb.XGBClassifier()
# model.fit(X_train, y_train)

In [7]:
# y_pred = model.predict(X_test)



In [8]:
len(X_train[0])

17172

In [9]:
print(image_count)

{'PlantVillage': 0, 'Pepper__bell___Bacterial_spot': 0, 'Potato___healthy': 0, 'Tomato_Leaf_Mold': 500, 'Tomato__Tomato_YellowLeaf__Curl_Virus': 500, 'Tomato_Bacterial_spot': 500, 'Tomato_Septoria_leaf_spot': 500, 'Tomato_healthy': 500, 'Tomato_Spider_mites_Two_spotted_spider_mite': 500, 'Tomato_Early_blight': 500, 'Tomato__Target_Spot': 500, 'Pepper__bell___healthy': 0, 'Potato___Late_blight': 0, 'Tomato_Late_blight': 500, 'Potato___Early_blight': 0, 'Tomato__Tomato_mosaic_virus': 373}


In [10]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8266666666666667


In [13]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.8255367922988757

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))