# Random Forests 
## for (Fruits and Vegetables) Image Classification

Install necessary packages

In [None]:
!pip install tensorflow --user

Import necessary packages

In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow
from tensorflow import keras
from keras.utils import array_to_img, img_to_array, load_img
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [None]:
# Report the installed NumPy and pandas versions (helps when reproducing results)
for module in (np, pd):
    print(module.__version__)

Copy dataset from git repo.

In [None]:
# Download the image dataset repository; later cells read it from
# <current working directory>/Fruit-Images-Dataset/{Training,Test}/
!git clone https://github.com/Horea94/Fruit-Images-Dataset

Set the path directory to the cloned test and train data.
load_dataset() returns the filenames, integer classes and string classes that are stored in file directory.


1.   names_train is a vector that contains the filepath of all images from the training set
2.   names_test is a vector that contains the filepath of all images from the test set
3.   intclass_train is a vector containing the int class values (0-130, one index per class) of all images from the training set
4.   intclass_test is a vector containing the int class values (0-130, one index per class) of all images from the test set
5.   stringclass_train is a vector containing the string label of class of all images from the training set
6.   stringclass_test is a vector containing the string label of class of all images from the test set






In [None]:
# Use the kernel's current working directory as the base path for the dataset.
# NOTE(review): this equals the notebook's folder only if the kernel was started there — confirm.
base_dir = os.getcwd()
base_dir

In [None]:
# Training / test folders inside the cloned repository (trailing slash kept)
train_dir = f'{base_dir}/Fruit-Images-Dataset/Training/'
test_dir = f'{base_dir}/Fruit-Images-Dataset/Test/'

def load_dataset(path):
    """Index an image folder laid out as one sub-directory per class.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-folders are the class names.

    Returns
    -------
    files : numpy.ndarray
        Paths of the files found under ``path``.
    targets : numpy.ndarray
        Integer class index of each file (its position in ``target_labels``).
    target_labels : numpy.ndarray
        Class names taken from the sub-folder names.
    """
    # load_content=False: only the file index is used here. The default (True)
    # would additionally read the raw bytes of every image into memory, which
    # is wasted work because the pixels are decoded later with load_img.
    data = load_files(path, load_content=False)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels

# Index both splits: file paths, integer labels and class names
names_train, intclass_train, stringclass_train = load_dataset(train_dir)
names_test, intclass_test, stringclass_test = load_dataset(test_dir)

print('Loading complete!')
print('Training set size : ', len(names_train))
print('Testing set size : ', len(names_test))
# Class names, followed by the shapes of the label and class-name vectors
for summary in (stringclass_train, intclass_train.shape, stringclass_train.shape):
    print(summary)

In [None]:
# Shape of the class-name vector of each split
for class_names in (stringclass_train, stringclass_test):
    print(class_names.shape)

Show distribution of images to the different classes.

In [None]:
# Images per class in the training split
(intclass, counts) = np.unique(intclass_train, return_counts=True)
full_train_frequencies = np.asarray((intclass, counts)).T

# Both splits share one axes; the legend and axis labels make the two
# overlaid series distinguishable.
fig, ax = plt.subplots()
ax.bar(intclass, counts, label='training')

# Images per class in the test split
(intclass, counts) = np.unique(intclass_test, return_counts=True)
full_test_frequencies = np.asarray((intclass, counts)).T
ax.bar(intclass, counts, label='test')

ax.set_title('distribution of full training and test data')
ax.set_xlabel('class index')
ax.set_ylabel('number of images')
ax.legend()
plt.show()

Datasets can be reduced for compiling: original size of the training dataset is 67692 images; original size of the test dataset is 22688 images.

In [None]:
# Preview the first 20 samples; slicing keeps this safe when fewer than 20
# images are available (the counter-based loop raised IndexError in that case).
for name, label in zip(names_train[:20], intclass_train[:20]):
    print('Name : ', name)
    print('Intclass : ', label)


In [None]:
# Number of image paths in each original split
print('Training set size : ', len(names_train))
print('Testing set size : ', len(names_test))

Amount of different classes in the test set.

In [None]:
# Count the distinct integer labels that occur in the test split
n_classes = np.unique(intclass_test).size
n_classes


Change name of image to actual pixel array.
The _images_array are the inputs (100x100 pixels with 3 color channels).


In [None]:
def convert_image_to_array(files):
    """Decode every image path in ``files`` into a pixel array.

    Each path is opened with ``load_img`` and converted with ``img_to_array``.
    Returns a list with one array per input path, in the same order.
    """
    return [img_to_array(load_img(path)) for path in files]

# Decode every image of each split and collect the results in one NumPy array
train_images_array = np.asarray(convert_image_to_array(names_train))
print('Training set shape : ', train_images_array.shape)

test_images_array = np.asarray(convert_image_to_array(names_test))
print('Test set shape : ', test_images_array.shape)

# Shape of a single sample
first_train_image = train_images_array[0]
print('1st training image shape ', first_train_image.shape)

Pixel arrays of one image (100x100 pixels, 3 color channels).

In [None]:
# Pixel values of the first training image (printed before rescaling)
print('1st training image as array',train_images_array[0])

Rescale pixel values from 0-255 range to 0-1.


1.   train_images_array is an array containing the normalized pixel values of the train images.
2.   test_images_array is an array containing the normalized pixel values of the test images.
3.   valid_images_array is an array containing the normalized pixel values of the validation images.



In [None]:
# Rescale pixel intensities by 1/255.
# NOTE: the arrays are overwritten in place, so re-running this cell divides
# by 255 a second time — run it exactly once per kernel session.
train_images_array = train_images_array.astype(np.float32) / 255
test_images_array = test_images_array.astype(np.float32) / 255

In [None]:
# Split the original training set into a "new train" (70%) and a "new test"
# (30%) subset, to reduce the computation needed for model selection.
(new_train_images_array,
 new_test_images_array,
 new_train_intclass,
 new_test_intclass) = train_test_split(
    train_images_array,
    intclass_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Sizes of the original splits and of their label vectors
print('Training set size : ', len(names_train))
print('Testing set size : ', len(names_test))
print('Training set LABELS size : ', intclass_train.shape)
print('Testing set LABELS size : ', intclass_test.shape)

In [None]:
# Number of distinct labels in each original split
n_classes_train = len(np.unique(intclass_train))
print('Number of classes in training: ',n_classes_train)
n_classes_test = len(np.unique(intclass_test))
# Print the class count (the whole label array was printed here by mistake)
print('Number of classes in testing: ',n_classes_test)

In [None]:
# Sizes of the "new train" / "new test" subsets and of their label vectors
print('New Training set size : ', len(new_train_images_array))
print('New Testing set size : ', len(new_test_images_array))
print('New Training set LABELS size : ', new_train_intclass.shape)
print('New Testing set LABELS size : ', new_test_intclass.shape)

In [None]:
# Number of distinct labels that survived the split, per subset
n_classes_train = np.unique(new_train_intclass).size
print('Number of classes in new training: ', n_classes_train)
n_classes_test = np.unique(new_test_intclass).size
print('Number of classes in new testing: ', n_classes_test)

Following the code in the link below

https://www.analyticsvidhya.com/blog/2022/01/image-classification-using-machine-learning/


Flattening the arrays:
each image should become a vector of 30000 features (100 x 100 pixels x 3 color channels)
in order to be understandable by sklearn.

In [None]:
# scikit-learn's fit() takes a 2-D (samples, features) matrix, so every image
# is flattened into a single row of nx*ny*nrgb values.
nsamples, nx, ny, nrgb = new_train_images_array.shape
train_flat_images_array = np.reshape(new_train_images_array, (nsamples, nx * ny * nrgb))

In [None]:
# predict() needs the same 2-D layout, so the "new test" images are flattened too
nsamples, nx, ny, nrgb = new_test_images_array.shape
test_flat_images_array = np.reshape(new_test_images_array, (nsamples, nx * ny * nrgb))

#### 1st gridsearch

In [None]:
# Record the moment the first grid search starts (datetime and POSIX timestamp)
dt1 = datetime.now()
ts1 = dt1.timestamp()

print("Date and time is:", dt1)
print("Timestamp is:", ts1)

In [None]:
# Base estimator; every hyper-parameter (including the seed) comes from the grid
rf = RandomForestClassifier()

# Candidate values explored by the first search
param_grid = dict(
    n_estimators=[1, 10, 20],
    criterion=['gini', 'entropy'],
    max_depth=[10, 50, 100],
    max_features=['sqrt', 'log2'],
    min_impurity_decrease=[0.05, 0.1, 0.5],
    bootstrap=[False],
    random_state=[42],
    ccp_alpha=[0.05, 0.1, 0.5],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation
rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)

In [None]:
# Run the first grid search on the flattened "new training" images and labels
rf_gs.fit(X=train_flat_images_array, y=new_train_intclass)

In [None]:
# Record the moment the first grid search finished (datetime and POSIX timestamp)
dt2 = datetime.now()
ts2 = dt2.timestamp()

print("Date and time is:", dt2)
print("Timestamp is:", ts2)

In [None]:
# Wall-clock time needed to find the best model with this parameter grid.
# total_seconds() covers the whole interval; timedelta.seconds is only the
# seconds component (0-86399) and silently drops whole days on long searches.
delta = dt2 - dt1
elapsed_seconds = delta.total_seconds()
print(f'Difference is {elapsed_seconds} seconds')
print(f'Difference is {elapsed_seconds/60} minutes')
print(f'Difference is {elapsed_seconds/3600} hours')

In [None]:
# Full parameter set of the best estimator found by the first grid search
best_model = rf_gs.best_estimator_
best_params = best_model.get_params()
best_params

In [None]:
# Predict the labels of the "new test" images with the best estimator
best_rf = rf_gs.best_estimator_
intclass_test_pred = best_rf.predict(test_flat_images_array)
intclass_test_pred

In [None]:
# Accuracy on the "new test" dataset.
# Arguments follow the documented order accuracy_score(y_true, y_pred); the
# value is the same either way, but this keeps all metric calls consistent.
acc = accuracy_score(new_test_intclass, intclass_test_pred)
acc

In [None]:
# Show predictions next to the ground truth, followed by the headline accuracy
print("The predicted Data is :")
print(np.asarray(intclass_test_pred))
print("The actual data is:")
print(np.asarray(new_test_intclass))
print(f"The model is {acc*100}% accurate (in the test set)")

In [None]:
# Extra metrics (precision, recall and f1-score) per class.
# classification_report expects (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of the actual number of samples per class.
print(classification_report(new_test_intclass, intclass_test_pred))

Plotting the Confusion Matrix

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. With the arguments swapped the matrix was
# transposed. pandas/seaborn are already imported at the top of the notebook.
cm = confusion_matrix(new_test_intclass, intclass_test_pred)
# Replace zeros by NaN so that empty cells are left blank in the heatmap
df = pd.DataFrame(cm).replace(0, np.nan)
fig, ax = plt.subplots(figsize=[20,20])
sns.heatmap(df, annot=True, fmt='g', cmap="Blues", ax=ax)
ax.set_xlabel('predicted class')
ax.set_ylabel('actual class')
plt.show()

#### 2nd iteration of gridsearch for parameters
repeat the same steps but changing the parameters

In [None]:
# Fresh estimator with default settings, used as the base estimator of the second grid search
rf=RandomForestClassifier()

In [None]:
# Record the moment the second grid search starts (datetime and POSIX timestamp)
dt1 = datetime.now()
ts1 = dt1.timestamp()

print("Date and time is:", dt1)
print("Timestamp is:", ts1)

In [None]:

# Refined grid for the second search.
# 'sqrt' replaces the former 'auto' option of max_features: for classifiers
# 'auto' was an alias of 'sqrt', and recent scikit-learn releases reject it.
param_grid={
    'n_estimators':[10, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth':[5,10,15],
    'max_features':['log2', 'sqrt'],
    'min_impurity_decrease':[0.0, 0.025, 0.05],
    'bootstrap':[False],
    'random_state':[42],
    'ccp_alpha':[0.0, 0.025, 0.05],
}
#applying 5-fold cross validation to choose hyperparameter
rf_gs2=GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2)

In [None]:
# Run the second grid search on the flattened "new training" images and labels
rf_gs2.fit(X=train_flat_images_array, y=new_train_intclass)

In [None]:
# Record the moment the second grid search finished (datetime and POSIX timestamp)
dt2 = datetime.now()
ts2 = dt2.timestamp()

print("Date and time is:", dt2)
print("Timestamp is:", ts2)

In [None]:
# Wall-clock time of the second grid search.
# total_seconds() covers the whole interval; timedelta.seconds is only the
# seconds component (0-86399) and silently drops whole days on long searches.
delta = dt2 - dt1
elapsed_seconds = delta.total_seconds()
print(f'Difference is {elapsed_seconds} seconds')
print(f'Difference is {elapsed_seconds / 60} minutes')
print(f'Difference is {elapsed_seconds / 3600} hours')

In [None]:
# Full parameter set of the best estimator found by the second grid search
best_model2 = rf_gs2.best_estimator_
best_params2 = best_model2.get_params()
best_params2

In [None]:
# Predict the labels of the "new test" images with the second search's best estimator
best_rf2 = rf_gs2.best_estimator_
intclass_test_pred2 = best_rf2.predict(test_flat_images_array)
intclass_test_pred2

In [None]:
# Accuracy on the "new test" dataset.
# Arguments follow the documented order accuracy_score(y_true, y_pred); the
# value is the same either way, but this keeps all metric calls consistent.
acc2 = accuracy_score(new_test_intclass, intclass_test_pred2)
acc2

In [None]:
# Show predictions next to the ground truth, followed by the headline accuracy
print("The predicted Data is :")
print(np.asarray(intclass_test_pred2))
print("The actual data is:")
print(np.asarray(new_test_intclass))
print(f"The model is {acc2*100}% accurate (in the test set)")

In [None]:
# Per-class precision, recall and f1-score.
# classification_report expects (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and "support" counts predictions instead
# of the actual number of samples per class.
print(classification_report(new_test_intclass, intclass_test_pred2))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. With the arguments swapped the matrix was transposed.
cm2 = confusion_matrix(new_test_intclass, intclass_test_pred2)

# Replace zeros by NaN so that empty cells are left blank in the heatmap
df2 = pd.DataFrame(cm2).replace(0, np.nan)

fig, ax = plt.subplots(figsize=[20,20])
sns.heatmap(df2, annot=True, fmt='g', cmap="Blues", ax=ax)
ax.set_xlabel('predicted class')
ax.set_ylabel('actual class')
plt.show()

#### Retraining a Random Forest
with the best found parameters 
- with 5-fold cv on the "new training" dataset
(evaluating the performance on the cv, on the "new train" set, on the "new test" set and on the original test set)
- with the full "new training" dataset
(evaluating the performance on the "new train" set, on the "new test" set and on the original test set)

In [None]:
# Build a fresh, unfitted forest from the parameters of the best estimator of
# the second grid search. best_params2 comes from get_params() and therefore
# already contains 'random_state'; the seed is merged into the dict rather than
# passed as a second keyword, which would raise a "multiple values" TypeError.
final_rf = RandomForestClassifier(**{**best_params2, 'random_state': 42})
# 5-fold cross-validated scores on the "new training" dataset
cv_scores = cross_val_score(final_rf, train_flat_images_array,new_train_intclass, cv=5)
cv_scores

In [None]:
# Mean and standard deviation of the cross-validation scores, in percent.
# Uses cv_scores (the name assigned by the cross_val_score cell) and a valid
# "{:.2f}" format spec; "{0.2f}" is parsed as an attribute lookup and fails.
print("{:.2f}% accuracy with a standard deviation of {:.2f}%".format(cv_scores.mean()*100, cv_scores.std()*100))