# Feature Extraction Functions

In [1]:
import os
import cv2
import numpy as np
import mahotas as mt

def extract_color_histogram(image):
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist_features = [cv2.calcHist([hsv_image], [i], None, [256], [0, 256]) for i in range(3)]
    hist_features = np.concatenate(hist_features, axis=None)
    return hist_features

def extractHaralick(image):
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    textures = mt.features.haralick(image).mean(axis=0)
    return textures

def extractHuMoments(image):
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    moments = cv2.moments(image)
    huMoments = cv2.HuMoments(moments)
    huMoments = -1 * np.sign(huMoments) * np.log10(np.abs(huMoments))
    return huMoments.flatten()


In [3]:
# Initializing lists for different types of features
color_features, texture_features, shape_features = [], [], []
labels = []

dataset_folders = ['color', 'grayscale', 'segmented']

# Mapping the folders to corresponding functions and lists
feature_extraction_map = {
    'color': (extract_color_histogram, color_features),
    'grayscale': (extractHaralick, texture_features),
    'segmented': (extractHuMoments, shape_features)
}

# Iterating over the image files and apply the appropriate feature extraction
for folder in dataset_folders:
    subfolder_path = os.path.join('plantvillage-dataset', folder)
    for label in os.listdir(subfolder_path):
        if label.lower().startswith('tomato'):
            label_path = os.path.join(subfolder_path, label)
            for image_file in os.listdir(label_path):
                image_path = os.path.join(label_path, image_file)
                image = cv2.imread(image_path)
                if image is not None:
                    # Extracting features based on the folder and append to the correct list
                    feature_func, feature_list = feature_extraction_map[folder]
                    features = feature_func(image)
                    feature_list.append(features)
                    if folder == 'color':  # Assuming 'color' folder images are unique and a basis for labeling
                        labels.append(label)

# Converting lists to numpy arrays
color_features = np.array(color_features)
texture_features = np.array(texture_features)
shape_features = np.array(shape_features)
labels = np.array(labels)

print("Number of color features:", len(color_features))
print("Number of texture features:", len(texture_features))
print("Number of shape features:", len(shape_features))
print("Number of labels:", len(labels))


Number of color features: 18160
Number of texture features: 18160
Number of shape features: 18160
Number of labels: 18160


# Load Images and Extract Features


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

#Defining and train models

def train_model(X, y, model, test_size=0.1, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Models
models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(max_iter=10000)
}

# Evaluating each model on each feature set
for name, model in models.items():
    if name in ["Random Forest", "SVM"]:  # Assuming color features for these models
        print(f"Accuracy of {name} on Color Features: {train_model(color_features, labels, model)}")
    elif name == "Decision Tree":  # Assuming shape features
        print(f"Accuracy of {name} on Shape Features: {train_model(shape_features, labels, model)}")
    elif name == "K-Nearest Neighbors":  # Assuming texture features
        print(f"Accuracy of {name} on Texture Features: {train_model(texture_features, labels, model)}")
    elif name == "Logistic Regression":  # Assuming Haralick features
        # Scaling features before logistic regression
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(texture_features)
        print(f"Accuracy of {name} on Haralick Features: {train_model(scaled_features, labels, model)}")

Accuracy of Random Forest on Color Features: 0.9647577092511013
Accuracy of SVM on Color Features: 0.9487885462555066
Accuracy of Decision Tree on Shape Features: 0.2555066079295154
Accuracy of K-Nearest Neighbors on Texture Features: 0.4856828193832599
Accuracy of Logistic Regression on Haralick Features: 0.5831497797356828
