In [166]:
import os

import cv2
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Feature extraction -------------------------------------------------------
# Walk the PlantVillage tree, keep only the class folders whose name starts with
# "potato", and turn every image into one flat feature vector.

DATASET_ROOT = '/kaggle/input/plantdisease/PlantVillage'
MAX_IMAGES_PER_CLASS = 1000                    # cap per class folder
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')   # compared against the lower-cased file name
CANNY_THRESHOLDS = (50, 150)                   # lower / upper hysteresis thresholds

image_count = {}
default_image_size = (128, 128)  # Reduced image size

data = []


def extract_features(image):
    """Build the flat feature vector for one BGR image.

    The image is resized to ``default_image_size`` and the following parts are
    concatenated, in this order, as integer arrays:

    1. the raw resized pixels (128 * 128 * 3 values),
    2. the Canny edge map of the grayscale image (128 * 128 values),
    3. a 256-bin histogram for each of the B, G and R channels (3 * 256 values).

    With the default size this yields 49152 + 16384 + 768 = 66304 features.

    Parameters
    ----------
    image : numpy.ndarray
        Image as returned by ``cv2.imread`` (must not be ``None``).

    Returns
    -------
    numpy.ndarray
        1-D integer array holding the concatenated features.
    """
    resized = cv2.resize(image, default_image_size)

    # Canny works on a single-channel image
    gray_image = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray_image, *CANNY_THRESHOLDS)

    # One 256-bin histogram per colour channel (OpenCV channel order: B, G, R)
    channel_histograms = [
        cv2.calcHist([resized], [channel], None, [256], [0, 256]).flatten().astype(int)
        for channel in range(3)
    ]

    return np.concatenate([
        resized.flatten().astype(int),
        edges.flatten().astype(int),
        *channel_histograms,
    ])


for root, _, files in os.walk(DATASET_ROOT):
    disease = os.path.basename(root)
    # Every visited directory gets an entry, so skipped classes show up with 0.
    image_count[disease] = 0

    # Include images based on the starting word of the disease
    if not disease.lower().startswith('potato'):
        continue
    print(disease)

    for file in files:
        # Stop scanning this folder as soon as the per-class cap is reached.
        if image_count[disease] >= MAX_IMAGES_PER_CLASS:
            break

        # Case-insensitive extension check; ignore anything that is not an image.
        if not file.lower().endswith(IMAGE_EXTENSIONS):
            continue

        image_path = os.path.join(root, file)

        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising, so guard before handing it to cv2.resize.
        original_image = cv2.imread(image_path)
        if original_image is None:
            print(f"Skipping unreadable image: {image_path}")
            continue

        # Store the feature vector together with its class (folder) name
        combined_features = extract_features(original_image)
        data.append([combined_features, disease])

        # Increment the counter for the current disease
        image_count[disease] += 1

# Collect the (feature vector, class name) pairs in a DataFrame
df = pd.DataFrame(data, columns=['image_pixels', 'disease'])

# Feature matrix: stack the per-image vectors into one 2-D array (one row per image)
X = np.vstack(df['image_pixels'].tolist())

# Targets: map the disease names to integer class ids with LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['disease'])

# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider stratify=y, which would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost classifier with its default hyperparameters on the training split.
# (Earlier alternative, kept for reference: RandomForestClassifier(random_state=42).)
model = xgb.XGBClassifier()
model.fit(X=X_train, y=y_train)

# Predict the class ids of the held-out samples
y_pred = model.predict(X_test)

# Overall accuracy, rounded to two decimals for display
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report_text)

# Rows are true classes, columns are predicted classes
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", confusion)


Potato___healthy
Potato___Late_blight
Potato___Early_blight


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy: 0.94
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       209
           1       0.90      0.98      0.94       194
           2       1.00      0.50      0.67        28

    accuracy                           0.94       431
   macro avg       0.96      0.82      0.86       431
weighted avg       0.95      0.94      0.94       431

Confusion Matrix:
 [[202   7   0]
 [  4 190   0]
 [  1  13  14]]


In [167]:
# model = RandomForestClassifier(n_estimators=100,random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)


In [168]:
# DISABLED EXPERIMENT: grid search over RandomForestClassifier hyperparameters.
# Everything in this cell is commented out and has no effect on the notebook.
# NOTE: GridSearchCV is not imported in the notebook's import cell (only
# train_test_split is taken from sklearn.model_selection); import it before
# re-enabling this cell. Re-enabling also overwrites y_pred with the
# random-forest predictions.

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the parameter grid for Grid Search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create a RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters found by Grid Search
# print("Best Parameters:", grid_search.best_params_)

# # Get the best model
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)


In [169]:
# DISABLED EXPERIMENT: randomized search over XGBClassifier hyperparameters
# (10 sampled configurations, 3-fold CV, accuracy scoring).
# Everything in this cell is commented out and has no effect on the notebook.
# NOTE: RandomizedSearchCV is not imported in the notebook's import cell;
# import it from sklearn.model_selection before re-enabling this cell.
# Re-enabling also overwrites y_pred with the tuned model's predictions.

# param_dist = {
#     'max_depth': [3, 5, 7, 10, None],
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 1, 2],
#     'min_child_weight': [1, 2, 3]
# }

# # Create an XGBClassifier
# xgb_model = xgb.XGBClassifier(random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)

# # Fit the random search to the data
# random_search.fit(X_train, y_train)

# # Print the best parameters found by RandomizedSearchCV
# print("Best Parameters:", random_search.best_params_)

# # Get the best model
# best_xgb = random_search.best_estimator_

# # Make predictions on the test set
# y_pred = best_xgb.predict(X_test)

In [170]:
# import xgboost as xgb
# # Now you can use XGBoost with the integer-encoded (three-class) labels
# model = xgb.XGBClassifier()
# model.fit(X_train, y_train)

In [171]:
# y_pred = model.predict(X_test)



In [172]:
# Number of features per sample (columns of the training matrix)
X_train.shape[1]

66304

In [173]:
# Images loaded per dataset folder; folders that were not selected stay at 0
print(image_count)

{'PlantVillage': 0, 'Pepper__bell___Bacterial_spot': 0, 'Potato___healthy': 152, 'Tomato_Leaf_Mold': 0, 'Tomato__Tomato_YellowLeaf__Curl_Virus': 0, 'Tomato_Bacterial_spot': 0, 'Tomato_Septoria_leaf_spot': 0, 'Tomato_healthy': 0, 'Tomato_Spider_mites_Two_spotted_spider_mite': 0, 'Tomato_Early_blight': 0, 'Tomato__Target_Spot': 0, 'Pepper__bell___healthy': 0, 'Potato___Late_blight': 1000, 'Tomato_Late_blight': 0, 'Potato___Early_blight': 1000, 'Tomato__Tomato_mosaic_virus': 0}


In [174]:
# Unrounded accuracy of the current predictions on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9419953596287703


In [175]:
# Support-weighted F1: per-class F1 scores averaged with each class weighted by
# its number of true samples in y_test (average='weighted').
# f1_score comes from sklearn.metrics and is imported in the notebook's import
# cell, so this cell also runs after Restart Kernel -> Run All.
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.937617327233348

In [176]:
# Confusion matrix of the current predictions: rows = true class, columns = predicted class
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

array([[202,   7,   0],
       [  4, 190,   0],
       [  1,  13,  14]])

In [177]:
# Full evaluation summary for the current y_pred on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 table
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report_text)

# Rows are true classes, columns are predicted classes
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", confusion)

Accuracy: 0.94
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97       209
           1       0.90      0.98      0.94       194
           2       1.00      0.50      0.67        28

    accuracy                           0.94       431
   macro avg       0.96      0.82      0.86       431
weighted avg       0.95      0.94      0.94       431

Confusion Matrix:
 [[202   7   0]
 [  4 190   0]
 [  1  13  14]]
