In [1]:
import cv2 as cv
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

In [2]:
# Set the path to the data folder
data_path = '../data/'

classes = os.listdir(data_path)
data = []
labels = []

for type in classes:
    all_apples = os.listdir(data_path + type)
    for apple in all_apples:
        img = cv.imread(data_path + type + '/' + apple)
        img = cv.resize(img, (512, 512))
        data.append(img)
        labels.append(type)


In [3]:
# Split data and labels into a training set and a test set
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.2, random_state=42)

## Bag of visual words (BoVW) + KNN

In [4]:
# Step 1: Feature extraction
orb = cv.ORB_create()
descriptors = []
for img in train_data:
    kp, des = orb.detectAndCompute(img, None)
    descriptors.extend(des)


In [5]:
# Step 2: Codebook generation
kmeans = KMeans(n_clusters=650)
kmeans.fit(descriptors)
codebook = kmeans.cluster_centers_


  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
# Step 3: Image representation
def get_histogram(targer_images):
    features = []
    for img in targer_images:
        kp, des = orb.detectAndCompute(img, None)
        histogram = np.zeros(len(codebook))
        for d in des:
            idx = kmeans.predict([d])
            histogram[idx] += 1
        features.append(histogram)
    return features

train_features = get_histogram(train_data)
test_features = get_histogram(test_data)


In [7]:
# Normalize the histograms
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)


In [8]:
# Step 4: Classification with hyperparameter tuning
dt = DecisionTreeClassifier()

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 20, 30, 40, 50, 60],
    'min_samples_split': [10, 20, 30, 50],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50, 100],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 5, 10, 20, 30]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, verbose=1)

# Fit the GridSearchCV object to the training data
grid_search.fit(train_features, train_labels)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best parameters and best score
print("Best Parameters:", best_params)
print("Best Score:", best_score)


Fitting 5 folds for each of 20160 candidates, totalling 100800 fits


25200 fits failed out of a total of 100800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25200 fits failed with the following error:
Traceback (most recent call last):
  File "e:\conda\envs\uni\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\conda\envs\uni\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "e:\conda\envs\uni\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "e:\conda\envs\uni\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.uti

Best Parameters: {'criterion': 'entropy', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'best'}
Best Score: 0.4394736842105263




In [9]:
# Fit the dt classifier with the best parameters
dt = DecisionTreeClassifier(**best_params)
dt.fit(train_features, train_labels)

In [10]:
# Predict the labels for the test data
predicted_labels = dt.predict(test_features)

print("\nAccuracy Score:")
print((accuracy_score(test_labels, predicted_labels))*100, "%")


Accuracy Score:
37.5 %
