# Import des librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time
import cv2

from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV # Pour répartir les données
from sklearn.cluster import MeanShift, KMeans, estimate_bandwidth
from sklearn.ensemble import RandomForestClassifier # Pour instancier et entraîner un modèle Random Forest
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from Filtering import filter_color_threshold 
from Filtering import filter_Kmeans1,filter_Kmeans2,filter_KmeansXYRGB
from Filtering import filter_MeanShift
from Filtering import EqualizerImg

import warnings
warnings.filterwarnings('ignore')

# 1.1 Charger les données

# Import du Dataframe df_Infos
#### Ce dataframe contient les informations de chaque image du dataset (chemin, type, clé, nom, extension, taille, width, height)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time

from tqdm import tqdm

import cv2

from Filtering import filter_color_threshold 
from Filtering import filter_Kmeans1,filter_Kmeans2,filter_KmeansXYRGB
from Filtering import filter_MeanShift
from Filtering import EqualizerImg

In [3]:
# Load the per-image metadata table from df_infos.csv, then display it as the cell output.
df_infos = pd.read_csv("df_infos.csv")
df_infos

Unnamed: 0,path_image,category,name,extension,width,height,dim,size
0,PBC_dataset_normal_DIB/basophil/BA_689200.jpg,basophil,BA_689200,jpg,363,360,3,363 x 360
1,PBC_dataset_normal_DIB/basophil/BA_883452.jpg,basophil,BA_883452,jpg,363,360,3,363 x 360
2,PBC_dataset_normal_DIB/basophil/BA_382161.jpg,basophil,BA_382161,jpg,369,366,3,369 x 366
3,PBC_dataset_normal_DIB/basophil/BA_175579.jpg,basophil,BA_175579,jpg,363,360,3,363 x 360
4,PBC_dataset_normal_DIB/basophil/BA_775722.jpg,basophil,BA_775722,jpg,363,360,3,363 x 360
...,...,...,...,...,...,...,...,...
17087,PBC_dataset_normal_DIB/platelet/PLATELET_49591...,platelet,PLATELET_495918,jpg,363,360,3,363 x 360
17088,PBC_dataset_normal_DIB/platelet/PLATELET_89723...,platelet,PLATELET_897238,jpg,363,360,3,363 x 360
17089,PBC_dataset_normal_DIB/platelet/PLATELET_75043...,platelet,PLATELET_750430,jpg,363,360,3,363 x 360
17090,PBC_dataset_normal_DIB/platelet/PLATELET_81043...,platelet,PLATELET_810431,jpg,363,360,3,363 x 360


In [4]:
# Total number of images, followed by the number of images in each category.
print(df_infos.shape[0],'\n')
print(df_infos["category"].value_counts())

17092 

neutrophil      3329
eosinophil      3117
ig              2895
platelet        2348
erythroblast    1551
monocyte        1420
basophil        1218
lymphocyte      1214
Name: category, dtype: int64


# Faire une fonction pour créer le dataset (.parquet)

In [5]:
# Filter names understood by dataset_creation.
_SUPPORTED_FILTERS = ('rgb', 'gray', 'hsv', 'lab', 'canny', 'kmeans1', 'kmeans2', 'kmeansXYRGB',
                      'threshold_color', 'equalizer', 'mean_shift')


def _read_image(filename, flag):
    """Read an image with cv2.imread and raise when nothing could be loaded."""
    img = cv2.imread(filename, flag)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None instead of raising.
        raise FileNotFoundError(f"Unable to read image file: {filename}")
    return img


def _apply_filter(filename, f):
    """Load the image stored at `filename` and return it transformed by the filter named `f`."""
    if f == "rgb":
        return cv2.cvtColor(_read_image(filename, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
    if f == "gray":
        return _read_image(filename, cv2.IMREAD_GRAYSCALE)
    if f == "hsv":
        return cv2.cvtColor(_read_image(filename, cv2.IMREAD_COLOR), cv2.COLOR_BGR2HSV)
    if f == "lab":
        return cv2.cvtColor(_read_image(filename, cv2.IMREAD_COLOR), cv2.COLOR_BGR2LAB)
    if f == "canny":
        img = _read_image(filename, cv2.IMREAD_COLOR)
        img = cv2.GaussianBlur(img,(3,3),1)
        return cv2.Canny(img,125,250)
    # The remaining filters come from the project's Filtering module and take the file path directly.
    if f == "kmeans1":
        return filter_Kmeans1(filename)
    if f == "kmeans2":
        return filter_Kmeans2(filename)
    if f == "kmeansXYRGB":
        return filter_KmeansXYRGB(filename)
    if f == "threshold_color":
        return filter_color_threshold(filename)
    if f == "equalizer":
        return EqualizerImg(filename)
    if f == "mean_shift":
        return filter_MeanShift(filename)
    raise ValueError(f"Unknown filter {f!r}. Select one of {_SUPPORTED_FILTERS}")


def dataset_creation(dataset, nb_img, size_img, filter_list, save_directory='',save_option ='parquet'):
    """Build one flattened-pixel dataset per filter and save each of them to disk.

    For every filter in `filter_list`, each selected image is loaded, filtered,
    resized to `size_img` x `size_img`, flattened, and stored as one row whose first
    column `type` is the image category and whose remaining columns are
    `pixel_1 ... pixel_N`. One file is written per filter, named
    `dataset_<nb_img>_img__s_<size>x<size>__f_<filter>.<ext>`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with at least the columns `path_image` and `category`.
    nb_img : int or "all"
        Number of images sampled from each category, or "all" to use `dataset` as is.
    size_img : int
        Side length (pixels) of the square the images are resized to.
    filter_list : iterable of str
        Filter names; each must be one of `_SUPPORTED_FILTERS`.
    save_directory : str, default ''
        Output folder. An empty value keeps the historical 'created_datasets' folder.
    save_option : {'parquet', 'csv'}, default 'parquet'
        Output file format.

    Raises
    ------
    ValueError
        If a filter name or `save_option` is not supported (checked before any image is processed).
    FileNotFoundError
        If an image read through cv2.imread cannot be loaded.

    Notes
    -----
    Rows are collected separately for each filter, so a saved file only contains
    the images produced by its own filter. Pixel columns keep the numeric dtype of
    the filtered image; the label is stored in its own column rather than being
    concatenated with the pixels (which turned every pixel into a string).
    """
    import os  # local import: only needed to build/create the output folder

    start = time.time()

    # Fail fast on bad arguments instead of discovering them after minutes of image processing.
    filter_list = list(filter_list)
    unknown = [f for f in filter_list if f not in _SUPPORTED_FILTERS]
    if unknown:
        raise ValueError(f"Unknown filter(s) {unknown}. Select among {_SUPPORTED_FILTERS}")
    if save_option not in ('parquet', 'csv'):
        raise ValueError(f"Unsupported save_option {save_option!r}: use 'parquet' or 'csv'")

    # When nb_img is not 'all', draw nb_img random images from every category.
    if nb_img != "all":
        samples = [dataset[dataset["category"] == category].sample(n=nb_img)
                   for category in dataset["category"].value_counts().index]
        df = pd.concat(samples, axis=0).reset_index(drop=True)
    # Otherwise use the complete dataset.
    else:
        df = dataset

    out_dir = save_directory if save_directory else 'created_datasets'
    os.makedirs(out_dir, exist_ok=True)

    for f in tqdm_notebook(filter_list):
        # Fresh containers for every filter: rows must not leak from one filter's file into the next.
        labels = []
        rows = []

        for i in tqdm_notebook(df.index):
            img = _apply_filter(df.loc[i, "path_image"], f)

            # Resize to the requested square size, then flatten to a 1-D pixel vector.
            img = cv2.resize(img,dsize = (size_img,size_img))
            rows.append(img.reshape(-1))
            labels.append(df.loc[i, "category"])

        # Same column layout as before: 'type' first, then pixel_1 ... pixel_N.
        pixels = np.stack(rows)
        data_df = pd.DataFrame(pixels, columns=[f"pixel_{k}" for k in range(1, pixels.shape[1] + 1)])
        data_df.insert(0, 'type', labels)

        file_stem = 'dataset'+'_'+str(nb_img) + '_img__s_' + str(size_img) +'x' + str(size_img) + '__f_' + str(f)
        if save_option == 'parquet':
            data_df.to_parquet(os.path.join(out_dir, file_stem + '.parquet'))
        else:
            data_df.to_csv(os.path.join(out_dir, file_stem + '.csv'))

    end = time.time()
    duration = round(end - start, 2)
    print(f"durée: {duration}")
    

# Fonction pour charger le dataset (.parquet)

In [6]:
def load_dataset(dataset_name):
    """Load a dataset file and measure how long the read takes.

    Parameters
    ----------
    dataset_name : str or path-like
        Path of the file to load. The format is chosen from the file name's
        extension(s): 'parquet' is read with pandas.read_parquet, 'csv' with
        pandas.read_csv using the first column as index.

    Returns
    -------
    (pandas.DataFrame, float)
        The loaded data and the load time in seconds, rounded to 3 decimals.

    Raises
    ------
    ValueError
        If the file name carries neither a 'parquet' nor a 'csv' extension.
    """
    start = time.time()

    # Look only at the file name's extensions: matching the whole path would misroute
    # e.g. "parquet_exports/data.csv" to the parquet reader.
    file_name = str(dataset_name).replace("\\", "/").rsplit("/", 1)[-1].lower()
    suffixes = file_name.split(".")[1:]

    if "parquet" in suffixes:
        data_df = pd.read_parquet(dataset_name)
    elif "csv" in suffixes:
        data_df = pd.read_csv(dataset_name, index_col=0)
    else:
        # Previously this only printed a message and then failed on an unbound variable.
        raise ValueError(
            f'Unknown format for {dataset_name!r}. Only "csv" and "parquet" formats are supported'
        )

    end = time.time()
    load_time = round(end - start,3)

    return data_df, load_time

# Fonction pour les modeles

In [7]:
def modeling(dataset,model,display_results_list, param, cv):
    """Fit `model` on a flattened-image dataset and report its accuracies.

    The dataset is split 80/20 (shuffled, random_state=123). The model is fitted
    on the training part and scored on both parts. When `model` is a
    RandomForestClassifier, a RandomizedSearchCV (n_iter=5) over `param` is run
    on the training part as well and scored on both parts.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a `type` column (target); all other columns are features.
    model : estimator
        Object exposing fit/score; RandomForestClassifier and SVC get the short
        labels 'RF' and 'SVM', any other estimator is labelled with its class name.
    display_results_list : sequence
        Currently unused; kept so existing callers keep working.
    param : dict
        Parameter distributions given to RandomizedSearchCV (random forest only).
    cv : int or cross-validation generator
        Passed to RandomizedSearchCV.

    Returns
    -------
    tuple
        (modele, acc_train, acc_test, acc_train_cv, acc_test_cv, duration).
        The two *_cv values are the string 'null' when no search was run.
        duration is the elapsed wall-clock time in seconds, rounded to 2 decimals.
    """
    start = time.time()

    # Separate the target from the features.
    target = dataset["type"]
    features = dataset.drop(["type"], axis = 1)

    # Split the data.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, shuffle = True, random_state = 123)

    model.fit(X_train, y_train)
    acc_train = round(model.score(X_train, y_train),2) # accuracy on the training split
    acc_test = round(model.score(X_test, y_test),2) # accuracy on the test split

    # Detect the model by type: comparing str(model) with 'RandomForestClassifier()' only
    # matched estimators built with default arguments and left `modele` unbound otherwise.
    is_random_forest = isinstance(model, RandomForestClassifier)

    #----------- Hyper-parameter optimisation with RandomizedSearchCV ------------#
    if is_random_forest:
        clf = RandomizedSearchCV(estimator = model,
                                 param_distributions = param,
                                 cv = cv,
                                 n_iter = 5
                                )
        clf.fit(X_train, y_train)

        acc_train_cv = round(clf.score(X_train, y_train),3) # accuracy on the training split
        acc_test_cv = round(clf.score(X_test, y_test),2) # accuracy on the test split
    else :
        acc_train_cv = 'null'
        acc_test_cv = 'null'

    end = time.time()
    duration = round(end - start,2)

    # Short label reported for the model.
    if is_random_forest:
        modele = 'RF'
    elif isinstance(model, SVC):
        modele = 'SVM'
    else:
        modele = type(model).__name__

    return modele, acc_train, acc_test , acc_train_cv, acc_test_cv, duration

# Etude pour définir le rapport Taille/ Nombre d'images

Nous possédons un dataset de 17092 images réparties en 8 catégories. Nous partons du principe que nous allons utiliser l'intégralité du dataset.

Nous allons donc tenter de définir une taille d'image permettant d'utiliser l'intégralité du dataset sans détériorer les résultats, tout en prenant en compte l'overfitting.

Par défaut, nous utiliserons les images brutes sans filtre, c'est-à-dire en RGB.

## Fonction pour ressortir les résultats sous forme d'un DataFrame

In [8]:
def Result_modeling (filter_list, size_img, nb_img, models, param, cv) :
    """Run `modeling` on several filtered datasets and gather the results in a DataFrame.

    All datasets share the same number of images (`nb_img`) and image size
    (`size_img`); they differ only by the filter applied. For each filter the
    matching parquet file is loaded from 'created_datasets/' and every model in
    `models` is evaluated on it. One row is produced per (filter, model) pair.
    The 'methode' column is always filled with 'RandomizedSearchCV'.
    """
    # Output columns, in the order they appear in the returned DataFrame.
    columns = ('nbr_images', 'filter', 'taille', 'load_time (s)', 'model', 'duration (s)',
               'accuracy_train', 'accuracy_test', 'methode', 'accuracy_train_cv', 'accuracy_test_cv')
    collected = {column: [] for column in columns}

    size_label = str(size_img) + 'x' + str(size_img)

    for filter_img in tqdm_notebook(filter_list) :
        dataset_path = ('created_datasets/' + 'dataset' + '_' + str(nb_img) + '_img__s_' + size_label
                        + '__f_' + str(filter_img) + '.parquet')
        data_df, load_time = load_dataset(dataset_path)

        for model in models :
            outcome = modeling(data_df, model, ("confusion_matrix","classification_report" ),param,cv)
            modele, acc_train, acc_test , acc_train_cv, acc_test_cv, duration = outcome

            # One value per output column, in the same order as `columns`.
            row = (str(nb_img), str(filter_img), size_label, load_time, modele, duration,
                   acc_train, acc_test, 'RandomizedSearchCV', acc_train_cv, acc_test_cv)
            for column, value in zip(columns, row):
                collected[column].append(value)

    return pd.DataFrame.from_dict(collected)
    

In [9]:
# Hyper-parameter search spaces used to fine-tune the baseline 'RandomForestClassifier()' and 'SVC()' models

# Random Forest parameters
# 'auto' was dropped from max_features: for classifiers it was an alias of 'sqrt' (so the
# axis was effectively constant) and scikit-learn >= 1.3 rejects it. 'log2' is offered instead.
param_rf = {'max_depth': [10,50, 100, 150,200, None], 
            'n_estimators':[10, 50, 100, 150, 200], 
            'criterion' : ['gini', 'entropy'],
            'bootstrap':[True, False],
            'max_leaf_nodes':[None,10, 50, 100, 150, 200],
            'max_features':['sqrt', 'log2']
           }

# SVM parameters
param_svc = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf', 'linear'],
            }

In [20]:
# Build the 45x45 datasets, one file per listed filter, from every image in df_infos.
# NOTE(review): this cell's execution count (In [20]) is higher than that of the cells below it,
# and the 60x60 files loaded in the next section are not created by any cell visible here —
# confirm the notebook still works after Restart Kernel -> Run All.
dataset_creation(df_infos,'all',45,['rgb','hsv','lab','gray','canny','kmeans1','kmeans2','kmeansXYRGB','threshold_color','equalizer', 'mean_shift'])


  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

  0%|          | 0/17092 [00:00<?, ?it/s]

durée: 113.06


# All Taille 60x60 

In [10]:
# Evaluate a Random Forest on each 60x60 filtered dataset,
# then display the results as a DataFrame.
Resultats_60 = Result_modeling(
    filter_list=['rgb','hsv','lab','gray','canny','kmeans1','kmeans2','kmeansXYRGB','threshold_color',
                 'equalizer', 'mean_shift'],
    size_img=60,
    nb_img="all",
    models=[RandomForestClassifier()],
    param=param_rf,
    cv=5,
)
Resultats_60

  0%|          | 0/11 [00:00<?, ?it/s]

Unnamed: 0,nbr_images,filter,taille,load_time (s),model,duration (s),accuracy_train,accuracy_test,methode,accuracy_train_cv,accuracy_test_cv
0,all,rgb,60x60,5.333,RF,1190.17,1.0,0.85,RandomizedSearchCV,0.883,0.82
1,all,hsv,60x60,5.311,RF,2111.64,1.0,0.86,RandomizedSearchCV,1.0,0.86
2,all,lab,60x60,5.342,RF,1027.62,1.0,0.83,RandomizedSearchCV,1.0,0.84
3,all,gray,60x60,2.196,RF,433.15,1.0,0.79,RandomizedSearchCV,0.795,0.76
4,all,canny,60x60,1.296,RF,274.87,1.0,0.54,RandomizedSearchCV,1.0,0.55
5,all,kmeans1,60x60,4.64,RF,647.18,1.0,0.83,RandomizedSearchCV,1.0,0.83
6,all,kmeans2,60x60,4.326,RF,664.74,1.0,0.86,RandomizedSearchCV,1.0,0.85
7,all,kmeansXYRGB,60x60,5.275,RF,1177.25,1.0,0.86,RandomizedSearchCV,1.0,0.86
8,all,threshold_color,60x60,5.463,RF,517.46,1.0,0.78,RandomizedSearchCV,0.75,0.7
9,all,equalizer,60x60,6.644,RF,2113.41,1.0,0.83,RandomizedSearchCV,1.0,0.81


In [12]:
# Save the 60x60 results table to an Excel workbook, without the row index.
Resultats_60.to_excel('Analysis/' + 'Resultats_MachineLearning_RF_60.xlsx', index=False)

# All Taille 45x45

In [21]:
# Evaluate a Random Forest on each 45x45 filtered dataset, then display the results.
Resultats_45 = Result_modeling(
    filter_list=['rgb','hsv','lab','gray','canny','kmeans1','kmeans2','threshold_color'],
    size_img=45,
    nb_img="all",
    models=[RandomForestClassifier()],
    param=param_rf,
    cv=5,
)

Resultats_45

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,nbr_images,filter,taille,load_time (s),model,duration (s),accuracy_train,accuracy_test,methode,accuracy_train_cv,accuracy_test_cv
0,all,rgb,45x45,2.871,RF,1582.75,1.0,0.84,RandomizedSearchCV,0.889,0.83
1,all,hsv,45x45,2.698,RF,716.02,1.0,0.85,RandomizedSearchCV,0.881,0.81
2,all,lab,45x45,2.716,RF,1103.64,1.0,0.83,RandomizedSearchCV,1.0,0.83
3,all,gray,45x45,0.883,RF,581.36,1.0,0.79,RandomizedSearchCV,0.86,0.78
4,all,canny,45x45,0.714,RF,139.55,1.0,0.53,RandomizedSearchCV,0.631,0.52
5,all,kmeans1,45x45,2.464,RF,334.56,1.0,0.83,RandomizedSearchCV,0.857,0.8
6,all,kmeans2,45x45,2.35,RF,359.85,1.0,0.85,RandomizedSearchCV,1.0,0.87
7,all,threshold_color,45x45,2.291,RF,356.26,1.0,0.78,RandomizedSearchCV,0.805,0.73


In [22]:
# Save the 45x45 results table to an Excel workbook, without the row index.
Resultats_45.to_excel('Analysis/' + 'Resultats_MachineLearning_RF_45.xlsx', index=False)

In [27]:
# Stack the 60x60 and 45x45 result tables. ignore_index=True gives every row a unique label
# instead of repeating each input table's own 0..n-1 labels.
Resultats_final = pd.concat([Resultats_60, Resultats_45], ignore_index=True)

In [28]:
# Save the combined results table to an Excel workbook, without the row index.
Resultats_final.to_excel('Analysis/' + 'Resultats_MachineLearning_RF.xlsx', index=False)

In [29]:
# Read the combined results workbook back from disk and display it.
Resultats=pd.read_excel('Analysis/' + 'Resultats_MachineLearning_RF.xlsx')
Resultats

Unnamed: 0,nbr_images,filter,taille,load_time (s),model,duration (s),accuracy_train,accuracy_test,methode,accuracy_train_cv,accuracy_test_cv
0,all,rgb,60x60,5.333,RF,1190.17,1,0.85,RandomizedSearchCV,0.883,0.82
1,all,hsv,60x60,5.311,RF,2111.64,1,0.86,RandomizedSearchCV,1.0,0.86
2,all,lab,60x60,5.342,RF,1027.62,1,0.83,RandomizedSearchCV,1.0,0.84
3,all,gray,60x60,2.196,RF,433.15,1,0.79,RandomizedSearchCV,0.795,0.76
4,all,canny,60x60,1.296,RF,274.87,1,0.54,RandomizedSearchCV,1.0,0.55
5,all,kmeans1,60x60,4.64,RF,647.18,1,0.83,RandomizedSearchCV,1.0,0.83
6,all,kmeans2,60x60,4.326,RF,664.74,1,0.86,RandomizedSearchCV,1.0,0.85
7,all,kmeansXYRGB,60x60,5.275,RF,1177.25,1,0.86,RandomizedSearchCV,1.0,0.86
8,all,threshold_color,60x60,5.463,RF,517.46,1,0.78,RandomizedSearchCV,0.75,0.7
9,all,equalizer,60x60,6.644,RF,2113.41,1,0.83,RandomizedSearchCV,1.0,0.81
