# Classification binaire

In [None]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import random as rd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load the images and derive each label from the name of its class directory
def data_recovery(size=128, path="cats_dogs/Train/*"):
    """Load every readable ``*.jpg`` found in the directories matched by *path*.

    Each image is read in grayscale, resized to ``size`` x ``size`` and
    flattened to a 1-D vector. The label of an image is the name of the
    directory that contains it.

    Parameters
    ----------
    size : int
        Side length (in pixels) the images are resized to.
    path : str
        Glob pattern matching one directory per class.

    Returns
    -------
    tuple of numpy.ndarray
        ``(images_set, labels_set)``: the flattened images and, at the same
        positions, their labels. Both are empty when nothing is found.
    """
    images_set = []
    labels_set = []
    # Sorted so the sample order does not depend on the file-system listing order.
    for directory_path in sorted(glob.glob(path)):
        # basename/normpath works with both '/' and '\\' separators, whereas
        # splitting on '\\' only yields the directory name on Windows.
        label = os.path.basename(os.path.normpath(directory_path))
        print(label)
        for img_path in sorted(glob.glob(os.path.join(directory_path, "*.jpg"))):
            # Forward slashes are used for display only; the real path is
            # handed to OpenCV untouched.
            print('---', img_path.replace('\\', '/'))
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # imread returns None for files it cannot decode: skip them.
                continue
            img = cv2.resize(img, (size, size))
            images_set.append(img.flatten())
            labels_set.append(label)

    return np.array(images_set), np.array(labels_set)

In [None]:
# Load the dataset with the default size and path: data_recovery returns two
# arrays, the flattened images first and their labels second.
images_set, labels_set = data_recovery()

In [None]:
# Save the image and label arrays with pickle.
# They are written under 'pick/' because that is where they are read back
# from; the directory is created when it does not exist yet.
os.makedirs('pick', exist_ok=True)
# The with-statement closes both files even if a dump raises.
with open('pick/images_set.pickle', 'wb') as pick_img, \
        open('pick/labels_set.pickle', 'wb') as pick_label:
    pickle.dump(images_set, pick_img)
    pickle.dump(labels_set, pick_label)

In [None]:
# Load the pickled image and label arrays.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
# The with-statement closes both files even if a load raises.
with open('pick/images_set.pickle', 'rb') as pick_img, \
        open('pick/labels_set.pickle', 'rb') as pick_label:
    img_set = pickle.load(pick_img)      # images set
    label_set = pickle.load(pick_label)  # labels set

In [None]:
# Wrap the loaded arrays in DataFrames; the labels get a single 'Label' column
labels_set = pd.DataFrame(data=label_set, columns=['Label'])
images_set = pd.DataFrame(data=img_set)

In [None]:
# Preview the image DataFrame and report the total number of images
print(images_set.head())
nb_img = img_set.shape[0]
print('nombre d\'image au total: ', nb_img)

# First three and last three rows of the label DataFrame
for preview in (labels_set.head(3), labels_set.tail(3)):
    print(preview)

In [None]:
# Count the labels and plot the counts as a bar graph.
def count_labels():
    """Plot how many 'Dog' and 'Cat' entries the global ``label_set`` holds.

    Draws one bar per class, writes the count above each bar, then shows the
    figure. Returns the result of ``plt.show()``.
    """
    dog_total = sum(1 for name in label_set if name == 'Dog')
    cat_total = sum(1 for name in label_set if name == 'Cat')
    totals = [dog_total, cat_total]

    plt.bar(['dog','cat'], totals, color=['c','y'])
    # Write each count just above its bar, shifted slightly to the left.
    for position, total in enumerate(totals):
        plt.text(position-0.1, total, (total))
    sns.set(font_scale=1.6)
    return plt.show()

In [None]:
# Count the 'Cat' and 'Dog' labels of the loaded dataset and show them as a bar graph
count_labels()

In [None]:
def pick_random(data):
    """Display one randomly chosen image of *data* with its label.

    Parameters
    ----------
    data : pandas.DataFrame
        One flattened square image per row. Rows are matched by position with
        the global ``labels_set`` DataFrame.

    Returns
    -------
    The object returned by ``plt.imshow``.

    Raises
    ------
    ValueError
        If *data* is empty, or a row cannot be reshaped to a square image.
    """
    # randrange excludes the upper bound; randint(0, len(data)) could return
    # len(data) itself, which is one past the last row.
    n = rd.randrange(len(data))
    # Positional access, so the lookup does not depend on the index labels.
    img = data.iloc[n].values
    # Derive the side from the pixel count instead of assuming 128 x 128.
    side = int(round(np.sqrt(img.size)))
    img = img.reshape(side, side)
    plt.xlabel('label : '+labels_set['Label'].iloc[n])
    return plt.imshow(img,cmap='gray')

In [None]:
# Show one random image of the dataset together with its label
pick_random(images_set)

In [None]:
# Encode the string labels as integers.
# LabelEncoder expects a 1-D sequence, so the 'Label' column is passed rather
# than the whole one-column DataFrame (which triggers a DataConversionWarning).
encoder = LabelEncoder()
labels_set_encoded = encoder.fit_transform(labels_set['Label'])

In [None]:
# Rescale the pixel values to the [0, 1] range with a min-max scaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split -- consider fitting it on the training split only.
scaler = MinMaxScaler()
images_set = scaler.fit(images_set).transform(images_set)

In [None]:
# Split the data into train and test sets; stratify keeps the class ratio the
# same in both. random_state pins the shuffle so the split (and the pickles
# saved from it) is reproducible from one run to the next.
# NOTE(review): test_size=0.01 holds out only 1% of the samples -- confirm
# this is intended.
x_train, x_test, y_train, y_test = train_test_split(images_set, labels_set_encoded,
                                                    test_size=0.01, stratify=labels_set_encoded,
                                                    random_state=42)

In [None]:
# Save the four split arrays under 'pick/'; the directory is created when it
# does not exist yet.
os.makedirs('pick', exist_ok=True)
# The with-statement closes every file even if one of the dumps raises.
with open('pick/x_train.pickle', 'wb') as pick_xtrain, \
        open('pick/x_test.pickle', 'wb') as pick_xtest, \
        open('pick/y_train.pickle', 'wb') as pick_ytrain, \
        open('pick/y_test.pickle', 'wb') as pick_ytest:
    pickle.dump(x_train, pick_xtrain)
    pickle.dump(x_test, pick_xtest)
    pickle.dump(y_train, pick_ytrain)
    pickle.dump(y_test, pick_ytest)

In [None]:
# Load the four split arrays back from 'pick/'.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
# The with-statement closes every file even if one of the loads raises.
with open('pick/x_train.pickle', 'rb') as pick_xtrain, \
        open('pick/x_test.pickle', 'rb') as pick_xtest, \
        open('pick/y_train.pickle', 'rb') as pick_ytrain, \
        open('pick/y_test.pickle', 'rb') as pick_ytest:
    x_train = pickle.load(pick_xtrain)
    x_test = pickle.load(pick_xtest)
    y_train = pickle.load(pick_ytrain)
    y_test = pickle.load(pick_ytest)

In [None]:
# Print the shape of each of the four split arrays as a quick sanity check
print(f'Xtrain: {x_train.shape} \nXtest: {x_test.shape} \nYtrain: {y_train.shape} \nYtest: {y_test.shape}')

In [None]:
# Define the classifier: a random forest of 50 trees.
# random_state pins the bootstrap samples and feature draws so that training
# gives the same model, and the same scores, on every run.
model = RandomForestClassifier(n_estimators=50, random_state=42)

In [None]:
# Fit the model on the training data (x_train: samples, y_train: encoded labels)
model.fit(x_train, y_train) 

In [None]:
# Predict the test set, then map both the predictions and the expected
# classes back from their integer codes to the original string labels.
true_label = encoder.inverse_transform(y_test)
test_prediction = encoder.inverse_transform(model.predict(x_test))

In [None]:
# Print the overall accuracy on the test set
overall_accuracy = accuracy_score(true_label, test_prediction)
print ("Accuracy score = ", overall_accuracy, '\n\n')

# Draw the confusion matrix as a heatmap annotated with integer counts
cm = confusion_matrix(true_label, test_prediction)
sns.heatmap(cm, annot=True, fmt='.0f')
plt.show()

In [None]:
# Save the trained model.
# It is written under 'pick/' because that is where it is read back from;
# the directory is created when it does not exist yet.
os.makedirs('pick', exist_ok=True)
# The with-statement closes the file even if the dump raises.
with open('pick/model.sav', 'wb') as pick:
    pickle.dump(model, pick)

In [None]:
# Load the trained model.
# NOTE: unpickling can execute arbitrary code; only load a model file that was
# produced by this notebook.
# The with-statement closes the file even if the load raises.
with open('pick/model.sav', 'rb') as pick:
    model = pickle.load(pick)

In [None]:
def test_model(X):
    """Predict one randomly chosen sample of *X* and display it.

    Prints the predicted and the actual label, then shows the image with both
    as axis label and title. Uses the globals ``model``, ``encoder`` and
    ``true_label``; ``true_label`` is read at the same position as the sample,
    so *X* has to be the array ``true_label`` was computed for.

    Parameters
    ----------
    X : numpy.ndarray
        One flattened square image per row.

    Returns
    -------
    The result of ``plt.show()``.

    Raises
    ------
    ValueError
        If *X* is empty, or a row cannot be reshaped to a square image.
    """
    # randrange excludes the upper bound; randint(0, len(X)) could return
    # len(X) itself, which is one past the last sample.
    n = rd.randrange(len(X))
    sample = X[n]
    # predict expects a 2-D input, hence the single-sample list.
    get_prediction = encoder.inverse_transform(model.predict([sample]))
    org_img = true_label[n]
    print("The prediction for this image is: ", get_prediction[0])
    print("The actual label for this image is: ", org_img)

    plt.xlabel('label:'+ org_img)
    plt.title('prediction: '+ get_prediction[0])
    # Derive the side from the pixel count instead of assuming 128 x 128.
    side = int(round(np.sqrt(sample.size)))
    plt.imshow(sample.reshape(side, side),cmap='gray')
    return plt.show()

In [None]:
# Try the model on one random image of the test set
test_model(x_test)