In [None]:
import pandas as pd
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import os
import mahotas
from skimage.feature import hog, local_binary_pattern
from skimage import exposure
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def compute_hog_features(image):
    """Compute the HOG (Histogram of Oriented Gradients) descriptor of an image.

    Uses 8 orientation bins, 16x16-pixel cells and 1x1-cell blocks.

    Args:
        image: Image array; the caller in this notebook passes the resized
            grayscale image.

    Returns:
        The HOG feature descriptor returned by ``skimage.feature.hog``.
    """
    # visualize=False: the rendered HOG image (and its rescaled copy) was never
    # used by any caller, so skip building it and return only the descriptor.
    return hog(
        image,
        orientations=8,
        pixels_per_cell=(16, 16),
        cells_per_block=(1, 1),
        visualize=False,
    )

def compute_lbp_features(image):
    """Compute a histogram of uniform Local Binary Pattern codes.

    The LBP is computed with 8 sampling points on a circle of radius 1 using
    the 'uniform' method, which produces P + 2 = 10 distinct codes (0..9).

    Args:
        image: Image array; the caller in this notebook passes the resized
            grayscale image.

    Returns:
        1-D integer array of length 10 with the count of each LBP code.
    """
    n_points = 8
    lbp = local_binary_pattern(image, P=n_points, R=1, method='uniform')

    # One bin per code. The previous edges (np.arange(0, 10)) produced only
    # 9 bins, so codes 8 and 9 were silently merged into the last bin, and the
    # `range` argument was ignored because explicit edges were supplied.
    n_bins = n_points + 2
    hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
    return hist

def compute_texture_features(image):
    """Return the Haralick texture features averaged over the first axis.

    ``mahotas.features.haralick`` yields a 2-D array (presumably one row per
    co-occurrence direction -- confirm against the mahotas docs); the rows
    are collapsed into a single mean vector.
    """
    haralick_per_direction = mahotas.features.haralick(image)
    return haralick_per_direction.mean(axis=0)

def compute_moments_features(image):
    """Return the Hu moments of the image as a flat 1-D array."""
    # Image moments first, then the Hu invariants derived from them.
    image_moments = cv2.moments(image)
    hu_moments = cv2.HuMoments(image_moments)
    return hu_moments.flatten()

def preprocess_image(image_path, image_size=(64, 64)):
    """Load an image from disk, resize it and derive a grayscale version.

    Args:
        image_path: Path of the image file to read.
        image_size: Target size passed to ``cv2.resize``; defaults to (64, 64).

    Returns:
        Tuple ``(original_image, gray_image)``: the resized image as loaded by
        OpenCV (BGR channel order) and its grayscale conversion.

    Raises:
        ValueError: If the file is missing or cannot be decoded as an image.
    """
    original_image = cv2.imread(image_path)
    if original_image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of an opaque resize error.
        raise ValueError(f"Could not read image (missing or not decodable): {image_path}")

    original_image = cv2.resize(original_image, image_size)
    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    return original_image, gray_image

# Load the dataset: walk the PlantVillage tree and build one feature vector
# per tomato image, capped at MAX_IMAGES_PER_CLASS images per directory.
MAX_IMAGES_PER_CLASS = 1000
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')  # compared case-insensitively

image_count = {}  # directory name -> number of images actually used
data = []         # rows of [feature_vector, disease_name]

for root, dirs, files in os.walk('/kaggle/input/plantdisease/PlantVillage'):
    # os.walk yields directories/files in arbitrary filesystem order; sorting
    # makes the capped subset of every class identical on each run.
    dirs.sort()

    disease = os.path.basename(root)
    image_count[disease] = 0

    # Include images based on the starting word of the disease
    if not disease.lower().startswith('tomato'):
        continue
    print(disease)

    for file in sorted(files):
        # Stop scanning this directory once the cap is reached instead of
        # iterating over every remaining file.
        if image_count[disease] >= MAX_IMAGES_PER_CLASS:
            break

        if not file.lower().endswith(IMAGE_EXTENSIONS):
            continue

        image_path = os.path.join(root, file)
        original_image, gray_image = preprocess_image(image_path)

        # Compute features
        hog_features = compute_hog_features(gray_image)
        lbp_features = compute_lbp_features(gray_image)
        texture_features = compute_texture_features(gray_image)
        moments_features = compute_moments_features(gray_image)

        # Flatten the features: raw colour pixels, raw grayscale pixels, then
        # the hand-crafted descriptors, all in one 1-D vector.
        flattened_features = np.concatenate([
            original_image.flatten().astype(int),
            gray_image.flatten().astype(int),
            hog_features,
            lbp_features,
            texture_features,
            moments_features
        ])

        data.append([flattened_features, disease])
        image_count[disease] += 1

df = pd.DataFrame(data, columns=['image_features', 'disease'])

# Feature matrix (one row per image) and the raw string labels
X = np.vstack(df['image_features'].to_numpy())
y = df['disease']

# Encode disease labels using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets BEFORE scaling so that the
# scaler statistics never see the held-out rows (avoids train/test leakage).
# Stratify to keep the class proportions equal in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize or scale the features: fit on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Create DMatrix for training and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'multi:softmax',
    # Derived from the data instead of hard-coded, so it stays correct when
    # the set of loaded classes changes.
    'num_class': len(le.classes_),
    'max_depth': 8,
    'learning_rate': 0.05,
    'eval_metric': 'mlogloss',  # Change to 'merror' if you want classification error instead of logloss
    # 'n_estimators' is a scikit-learn wrapper argument and is ignored by
    # xgb.train; the number of trees is controlled by num_boost_round below.
    'min_child_weight': 1,
    'subsample': 0.6,
    'colsample_bytree': 1.0,
    'gamma': 0.5,
    'reg_alpha': 0.1,
    'reg_lambda': 2
}

num_boost_round = 3000

# NOTE: early stopping monitors the same split that is used for the final
# metrics below, so the reported scores are somewhat optimistic.
evals = [(dvalid, 'eval')]
early_stopping_rounds = 10

# Train the model
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds
)

# Make predictions on the validation set.
# xgb.train returns the booster from the LAST round, so restrict prediction to
# the best iteration found by early stopping. multi:softmax returns class ids
# as floats; cast them back to the integer labels produced by the encoder.
y_pred = model.predict(
    dvalid, iteration_range=(0, model.best_iteration + 1)
).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Tomato_Leaf_Mold
Tomato__Tomato_YellowLeaf__Curl_Virus
Tomato_Bacterial_spot
Tomato_Septoria_leaf_spot
Tomato_healthy
Tomato_Spider_mites_Two_spotted_spider_mite
Tomato_Early_blight
Tomato__Target_Spot
Tomato_Late_blight


In [None]:
# model = RandomForestClassifier(n_estimators=100,random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the parameter grid for Grid Search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create a RandomForestClassifier
# rf = RandomForestClassifier(random_state=42)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters found by Grid Search
# print("Best Parameters:", grid_search.best_params_)

# # Get the best model
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)


In [None]:
# param_dist = {
#     'max_depth': [3, 5, 7, 10, None],
#     'learning_rate': [0.01, 0.1, 0.2, 0.3],
#     'n_estimators': [50, 100, 200, 300],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
#     'gamma': [0, 1, 2],
#     'min_child_weight': [1, 2, 3]
# }

# # Create an XGBClassifier
# xgb_model = xgb.XGBClassifier(random_state=42)

# # Initialize RandomizedSearchCV
# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='accuracy', cv=3, n_jobs=-1, random_state=42)

# # Fit the random search to the data
# random_search.fit(X_train, y_train)

# # Print the best parameters found by RandomizedSearchCV
# print("Best Parameters:", random_search.best_params_)

# # Get the best model
# best_xgb = random_search.best_estimator_

# # Make predictions on the test set
# y_pred = best_xgb.predict(X_test)

In [None]:
# import xgboost as xgb
# # Now you can use XGBoost with the binary labels
# model = xgb.XGBClassifier()
# model.fit(X_train, y_train)

In [None]:
# y_pred = model.predict(X_test)



In [None]:
# Number of features per sample (columns of the training matrix)
X_train.shape[1]

In [None]:
# Show how many images were loaded per walked directory (directories that do
# not start with "tomato" stay at 0).
print(image_count)

In [None]:
# Fraction of held-out samples whose predicted class matches the true class
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

In [None]:
from sklearn.metrics import f1_score

# Per-class F1 averaged with weights proportional to each class's support
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1

In [None]:
# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrix

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))