In [None]:
import os
import cv2
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Size handed to cv2.resize for every image before it is flattened into a feature vector.
TARGET_SIZE = (64, 64)
# Root folder scanned by load_data; each immediate sub-directory is treated as one class.
DATASET_DIR = "/kaggle/input/plantvillage-dataset/color"


In [None]:
def load_data(directory, target_size=None):
    """Load every readable image below ``directory`` as a flat pixel vector.

    Each immediate sub-directory of ``directory`` is treated as one class and
    its folder name is used as the label of every image inside it.

    Parameters
    ----------
    directory : str
        Root folder that contains one sub-folder per class.
    target_size : tuple of int, optional
        Size passed to ``cv2.resize`` for every image. Defaults to the
        module-level ``TARGET_SIZE`` when omitted.

    Returns
    -------
    X : numpy.ndarray
        One row per image; each row is the flattened resized image.
        When no image could be read the result is an empty 2-D array so that
        ``X.shape[1]`` is still valid.
    y : numpy.ndarray
        Class label (folder name) for each row of ``X``.
    """
    if target_size is None:
        target_size = TARGET_SIZE

    class_folders = sorted(
        folder for folder in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, folder))
    )
    print("Classes found:", len(class_folders))

    X = []
    y = []
    skipped = 0

    for label in class_folders:
        label_path = os.path.join(directory, label)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split made from it) stable
        # across machines and filesystems.
        for img_name in sorted(os.listdir(label_path)):
            img = cv2.imread(os.path.join(label_path, img_name))
            if img is None:
                # Not a decodable image (or not a file at all): skip it, but
                # keep count so the loss is reported instead of hidden.
                skipped += 1
                continue

            X.append(cv2.resize(img, target_size).flatten())
            y.append(label)

    if skipped:
        print("Skipped unreadable entries:", skipped)

    if not X:
        # np.array([]) would be 1-D; build an explicit (0, n_features) array.
        # cv2.imread with its default flag yields 3-channel images, hence * 3.
        n_features = target_size[0] * target_size[1] * 3
        return np.empty((0, n_features), dtype=np.uint8), np.array(y, dtype=str)

    return np.array(X), np.array(y)


In [None]:
# Read the whole dataset into memory: X holds the flattened pixels, y the folder-name labels.
X, y = load_data(DATASET_DIR)

# Quick sanity summary of what was loaded.
print(f"Total samples: {X.shape[0]}")
print(f"Feature size: {X.shape[1]}")
print(f"Number of classes: {np.unique(y).size}")


In [None]:
# Split first, scale second: the scaler must learn its mean/variance from the
# training fold only. Fitting it on the full dataset before splitting would
# leak test-set statistics into the features the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # keep the class proportions identical in both folds
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics come from the training fold
X_test = scaler.transform(X_test_raw)        # test fold reuses them, never refits

print("Training samples:", X_train.shape)
print("Testing samples:", X_test.shape)


In [None]:
# Baseline that always predicts the most frequent training class; any real
# model should beat this accuracy.
dummy_model = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

dummy_preds = dummy_model.predict(X_test)
dummy_acc = accuracy_score(y_test, dummy_preds)

print(f"Dummy Accuracy: {dummy_acc}")


In [None]:
# Forest settings gathered in one place so they are easy to spot and tweak.
rf_params = {
    "n_estimators": 100,
    "max_depth": None,   # no explicit depth limit on the trees
    "random_state": 42,  # fixed seed for repeatable results
    "n_jobs": -1,        # -1 asks scikit-learn to use every available core
}
rf_model = RandomForestClassifier(**rf_params)

# Kept as the cell's final expression so the notebook shows the fitted estimator.
rf_model.fit(X_train, y_train)


In [None]:
# Predict a class label for every test sample with the fitted random forest.
rf_preds = rf_model.predict(X_test)


In [None]:
rf_accuracy = accuracy_score(y_test, rf_preds)

# Sorted union of true and predicted labels: the same ordering confusion_matrix
# uses by default, made explicit so rows/columns can be labelled reliably.
class_labels = np.union1d(y_test, rf_preds)
cm = confusion_matrix(y_test, rf_preds, labels=class_labels)

# A bare print(cm) is elided with "..." once the matrix exceeds NumPy's
# summarisation threshold, and it carries no class names. Rows are therefore
# labelled "index: class name" and columns with the matching index, and
# to_string() renders every cell without truncation.
cm_df = pd.DataFrame(
    cm,
    index=[f"{idx}: {name}" for idx, name in enumerate(class_labels)],
    columns=range(len(class_labels)),
)

print("Random Forest Accuracy:", rf_accuracy)
print("\nConfusion Matrix (rows = true class, columns = predicted class index):")
print(cm_df.to_string())
