Random Forests for (Fruits and Vegetables) Image Classification

Import NumPy and the scikit-learn `load_files` helper.

In [None]:
import numpy as np
from sklearn.datasets import load_files

# Show which NumPy version is installed in this kernel.
print(np.__version__)

Clone the dataset from its Git repository.

In [None]:
!git clone https://github.com/Horea94/Fruit-Images-Dataset

Set the directory paths to the cloned training and test data.
`load_dataset()` returns the file names, the integer class labels and the class names found in a data directory.


1.   names_train is a vector that contains the filepath of all images from the training set
2.   names_test is a vector that contains the filepath of all images from the test set
3.   intclass_train is a vector containing the integer class index (0-130, i.e. 131 classes) of every image from the training set
4.   intclass_test is a vector containing the integer class index (0-130, i.e. 131 classes) of every image from the test set
5.   stringclass_train is a vector containing the class names (one entry per class, not per image) found in the training set; intclass_train indexes into it
6.   stringclass_test is a vector containing the class names (one entry per class, not per image) found in the test set; intclass_test indexes into it






In [None]:
import os
# Working directory of the notebook; the dataset paths below are built from it.
base_dir = os.getcwd()
base_dir

In [None]:
# Training and test image folders inside the cloned dataset repository.
train_dir = f'{base_dir}/Fruit-Images-Dataset/Training/'
test_dir = f'{base_dir}/Fruit-Images-Dataset/Test/'

def load_dataset(path):
    """Index an image folder laid out as one sub-directory per class.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-folders are the class names.

    Returns
    -------
    files : numpy.ndarray
        Path of every file found under ``path``.
    targets : numpy.ndarray
        Integer class index of each file (an index into ``target_labels``).
    target_labels : numpy.ndarray
        Class (sub-folder) names, one entry per class.
    """
    # Only paths and labels are used here; load_content=False stops
    # load_files from also reading every image file's bytes into memory.
    data = load_files(path, load_content=False)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels

# Index both splits; each call yields (file paths, integer labels, class names).
names_train, intclass_train, stringclass_train = load_dataset(train_dir)
names_test, intclass_test, stringclass_test = load_dataset(test_dir)

# Quick sanity report on what was found.
print('Loading complete!')
print('Training set size : ',  len(names_train))
print('Testing set size : ', len(names_test))
print(stringclass_train)
print(intclass_train.shape)
print(stringclass_train.shape)

In [None]:
# Number of class names found in each split (one entry per class folder).
print(stringclass_train.shape)
print(stringclass_test.shape)


Show the distribution of images across the different classes.

In [None]:
import matplotlib.pyplot as plt

# Images per class in the full training split; also kept as an
# (n_classes, 2) table of [class index, count] for later comparison.
(intclass, counts) = np.unique(intclass_train, return_counts=True)
plt.bar(intclass, counts, label='training')
plt.title('distribution of full training and test data')
full_train_frequencies = np.asarray((intclass, counts)).T

# Same for the full test split, drawn on the same axes.
(intclass, counts) = np.unique(intclass_test, return_counts=True)
plt.bar(intclass, counts, label='test')
full_test_frequencies = np.asarray((intclass, counts)).T

# Label the axes and the two series so the overlaid bars can be told apart.
plt.xlabel('class index')
plt.ylabel('number of images')
plt.legend()
plt.show()

The datasets can be reduced to shorten the run time: the original training set has 67692 images and the original test set has 22688 images.

In [None]:
# Optional: uncomment the lines below to keep only the first
# train_datasize / test_datasize samples of each split.
#train_datasize = 40000
#test_datasize = 10000
#names_train = names_train[:train_datasize]
#names_test = names_test[:test_datasize]
#intclass_train = intclass_train[:train_datasize]
#intclass_test = intclass_test[:test_datasize]
# Report the sizes that the rest of the notebook will work with.
print('Training set size : ',  names_train.shape[0])
print('Testing set size : ', names_test.shape[0])
print(stringclass_train)
print(intclass_train.shape)
print(intclass_test.shape)

In [None]:
# Images per class in the (possibly reduced) training split; also kept as an
# (n_classes, 2) table of [class index, count].
(intclass, counts) = np.unique(intclass_train, return_counts=True)
plt.bar(intclass, counts, label='training')
plt.title('distribution of reduced training and test data')
reduced_train_frequencies = np.asarray((intclass, counts)).T

# Same for the (possibly reduced) test split, drawn on the same axes.
(intclass, counts) = np.unique(intclass_test, return_counts=True)
plt.bar(intclass, counts, label='test')
reduced_test_frequencies = np.asarray((intclass, counts)).T

# Label the axes and the two series so the overlaid bars can be told apart.
plt.xlabel('class index')
plt.ylabel('number of images')
plt.legend()
plt.show()



In [None]:
# Element-wise difference between the full and reduced frequency tables.
# Rows are [class index, count]; when both tables list the same classes the
# first column is zero and the second is the number of images dropped.
difference_train = np.subtract(full_train_frequencies, reduced_train_frequencies)
print(difference_train)
difference_test = np.subtract(full_test_frequencies, reduced_test_frequencies)
print(difference_test)

In [None]:
# Preview the first 20 training samples: file path and integer class label.
# Slicing keeps this safe when fewer than 20 samples are loaded.
for name, label in zip(names_train[:20], intclass_train[:20]):
    print('Name : ', name)
    print('Intclass : ', label)


In [None]:
# Number of file paths in each split.
print('Training set size : ',  names_train.shape[0])
print('Testing set size : ', names_test.shape[0])

Number of distinct classes in the test set.

In [None]:
# Count the distinct integer labels present in the test split.
n_classes = np.unique(intclass_test).size
n_classes


Convert each image file path into its actual pixel array.
The _images_array variables are the inputs (100x100 pixels with 3 color channels).


In [None]:
#from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.utils import array_to_img, img_to_array, load_img

def convert_image_to_array(files):
    """Load every image in ``files`` and return them as a list of arrays.

    ``files`` is an iterable of image paths; the returned list holds one
    NumPy array per path, in the same order.
    """
    return [img_to_array(load_img(image_path)) for image_path in files]

# Load every training image and stack the per-image arrays into one array.
train_images_array = np.array(convert_image_to_array(names_train))
print('Training set shape : ', train_images_array.shape)

# Same for the test images.
test_images_array = np.array(convert_image_to_array(names_test))
print('Test set shape : ', test_images_array.shape)

# Shape of a single image.
print('1st training image shape ',train_images_array[0].shape)

Pixel arrays of one image (100x100 pixels, 3 color channels).

In [None]:
# Raw pixel values of the first training image (before rescaling).
print('1st training image as array',train_images_array[0])

Rescale pixel values from 0-255 range to 0-1.


1.   train_images_array is an array containing the normalized pixel values of the train images.
2.   test_images_array is an array containing the normalized pixel values of the test images.
3.   valid_images_array is an array containing the normalized pixel values of the validation images (not created in the cells shown here).



In [None]:
# Scale pixel values from the 0-255 range down to 0-1.
# copy=False reuses the array when it is already float32, and the in-place
# division avoids allocating another full-size copy of each image set.
# NOTE: run this cell once per load; re-running it divides by 255 again.
train_images_array = train_images_array.astype('float32', copy=False)
train_images_array /= 255
test_images_array = test_images_array.astype('float32', copy=False)
test_images_array /= 255

The following cells follow the approach from the article linked below:

https://www.analyticsvidhya.com/blog/2022/01/image-classification-using-machine-learning/


In [None]:
# Shape of the training image array before flattening.
train_images_array.shape

In [None]:
# scikit-learn's fit(X, y) expects a 2-D X, so collapse each image
# (height, width, channels) into a single feature row.
nsamples, nx, ny, nrgb = train_images_array.shape
train_flat_images_array = np.reshape(train_images_array, (nsamples, nx * ny * nrgb))

In [None]:
# Shape of the test image array before flattening.
test_images_array.shape

In [None]:
# predict() needs the same 2-D layout as fit(), so flatten the test images
# in exactly the same way.
nsamples, nx, ny, nrgb = test_images_array.shape
test_flat_images_array = np.reshape(test_images_array, (nsamples, nx * ny * nrgb))

In [None]:
# Number of class names (one per class, not one per image).
stringclass_train.shape

In [None]:
# Number of training labels; this array is passed as y to the grid search below.
intclass_train.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

# Hyperparameter grid searched exhaustively by GridSearchCV.
# 'log_loss' is left out of 'criterion': scikit-learn documents it as the
# same Shannon-information-gain criterion as 'entropy', so listing both
# would fit every combination twice for identical results.
param_grid = {
    'n_estimators': [1, 10, 100, 1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 100, 1000],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [0.05, 0.1, 0.5],
    'bootstrap': [False],
    'random_state': [42],
    'ccp_alpha': [0.0, 0.05, 0.1, 0.5],
}

# 10-fold cross-validation selects the hyperparameters by accuracy.
# 864 combinations x 10 folds = 8640 model fits, so expect a long run time.
rf_gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=10)

In [None]:
# Run the grid search on the flattened training images and their integer labels.
rf_gs.fit(train_flat_images_array,intclass_train)

In [None]:
# Parameters of the best estimator selected by the grid search.
best_params = rf_gs.best_estimator_.get_params()
best_params

In [None]:
# Predicted integer class for every flattened test image.
intclass_test_pred=rf_gs.best_estimator_.predict(test_flat_images_array)
intclass_test_pred

In [None]:
# Mean accuracy of the selected estimator on each split, computed with the
# estimator's own score() method.
best_rf = rf_gs.best_estimator_
train_score = best_rf.score(train_flat_images_array, intclass_train)
test_score = best_rf.score(test_flat_images_array, intclass_test)
print(f'The model has {train_score*100}% accuracy in the train set')
print(f'The model has {test_score*100}% accuracy in the test set')


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Test-set accuracy; arguments follow scikit-learn's (y_true, y_pred) order.
acc = accuracy_score(intclass_test, intclass_test_pred)
acc

In [None]:
# Show predicted labels next to the true labels, then the overall accuracy.
print("The predicted Data is :")
print(intclass_test_pred)
print("The actual data is:")
print(intclass_test)
print(f"The model is {acc*100}% accurate (in the test set)")

In [None]:
# Per-class precision/recall/F1. The true labels must come first
# (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(intclass_test, intclass_test_pred))

# Plotting the scores of GridSearchCV
https://stackoverflow.com/questions/37161563/how-to-graph-grid-scores-from-gridsearchcv

# Plotting the Confusion Matrix

In [None]:
# Confusion matrix of the test predictions: rows are true classes and
# columns are predicted classes (scikit-learn's (y_true, y_pred) order).
cm = confusion_matrix(intclass_test, intclass_test_pred)
cm

In [None]:
import pandas as pd

# Wrap the confusion matrix in a DataFrame and turn zero counts into NaN.
df = pd.DataFrame(cm).replace(0, np.nan)

In [None]:
import seaborn as sns

In [None]:
# Draw the confusion matrix as an annotated heatmap on a large canvas.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df, annot=True, fmt='g', cmap="Blues", ax=ax)
plt.show()

In [None]:
# Select the largest off-diagonal values (the strongest confusions between classes)
# and associate each one with its original class and its predicted class.

# For example, confusion between green apples and red apples is less of a problem than confusion between bananas and red apples.