## Problème de prédiction du type de cancer de la peau : distinction entre tumeurs bénignes et malignes à l’aide de modèles d’apprentissage automatique

#### Importer les bibliothèques essentielles

In [3]:
! pip install pandas
! pip install xgboost
! pip install scikit-learn
! pip install opencv-python
! pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.1.21-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!

In [4]:
pip install --upgrade keras

Note: you may need to restart the kernel to use updated packages.


In [5]:
from pathlib import Path
from  glob import glob
import os
import cv2
import numpy as  np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import xgboost as xgb
import time
import matplotlib.pyplot as plt

In [6]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [7]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, Input

2025-01-23 13:24:59.477087: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 13:24:59.477657: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-23 13:24:59.479964: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-23 13:24:59.485377: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737635099.495172   16503 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737635099.49

#### Définition d'une classe pour charger les données

In [5]:

class CustomDataset:
    """Load a fraction of the benign/malignant image folders as flat vectors.

    Images are looked up under ``<root>/<train|test>/<label>/*.jpg`` for each
    label listed in ``self.target``.
    """

    def __init__(self, proportion: float, root: Path, train: bool, size: tuple):
        """
        Store the loading configuration; no file is read here.

        :param proportion: Fraction of each class folder to load (0 to 1).
            Values outside that range are clamped when loading.
        :param root: Root directory containing the ``train``/``test`` folders.
        :param train: Read from the ``train`` sub-folder when true, otherwise
            from ``test``.
        :param size: Target size passed as-is to ``cv2.resize``
            (OpenCV interprets it as ``(width, height)``).
        """
        self.proportion = proportion
        self.root = root
        self.data = {"Features": [], "Target": []}
        self.target = ["benign", "malignant"]
        self.dataType = "train" if train else "test"
        self.size = size

    def dataloader(self):
        """
        Read, resize and flatten the selected images of every class folder.

        For each label, the matching ``*.jpg`` files are sorted by path and the
        first ``int(len(files) * proportion)`` of them are loaded. Files that
        OpenCV cannot decode are skipped with a warning, so the two returned
        lists always have the same length.

        :return: Tuple ``(features, targets)`` where ``features`` is a list of
            flattened ``float32`` arrays and ``targets`` the matching labels.
        """
        import warnings  # local import: only needed to report skipped files

        # Start from fresh lists so calling dataloader() again does not
        # append a second copy of the data to the previous result.
        features, targets = [], []
        self.data = {"Features": features, "Target": targets}

        for item in self.target:
            pattern = os.path.join(self.root, f"{self.dataType}/{item}/*.jpg")
            # glob() returns files in filesystem order; sorting makes the
            # selected subset reproducible from one run/machine to the next.
            files = sorted(glob(pattern))
            # Clamp so an out-of-range proportion cannot select a negative
            # slice or more entries than there are files.
            num_files = max(0, min(len(files), int(len(files) * self.proportion)))

            for image_path in files[:num_files]:
                image = cv2.imread(image_path)
                if image is None:
                    # cv2.imread signals an unreadable/corrupt file with None.
                    warnings.warn(f"Skipping unreadable image: {image_path}")
                    continue
                features.append(
                    cv2.resize(image, self.size).flatten().astype(np.float32)
                )
                # Label is appended only for images that were actually loaded.
                targets.append(item)

        return features, targets


### Charger les données disponibles dans un dossier dont vous connaissez le chemin

##### Définir le chemin vers le dossier contenant les données

In [None]:
root = ...

##### Définir la proportion des données à charger

In [6]:
proportion = ...

In [7]:
# Configure a loader for the training folder; images are resized to 224x224.
datasets = CustomDataset(
    proportion=proportion,
    root=root,
    train=True,
    size=(224, 224),
)

In [8]:
# Read the images from disk: one flattened vector and one label per image.
features, target = datasets.dataloader()

In [9]:
# Two-column frame: each row holds one flattened image and its class label.
train = pd.DataFrame({"features": features, "target": target})

In [10]:
train

Unnamed: 0,features,target
0,"[162.0, 158.0, 223.0, 161.0, 160.0, 222.0, 168...",benign
1,"[161.0, 154.0, 229.0, 166.0, 159.0, 232.0, 169...",benign
2,"[164.0, 149.0, 241.0, 160.0, 147.0, 239.0, 164...",benign
3,"[60.0, 79.0, 112.0, 68.0, 83.0, 115.0, 71.0, 8...",benign
4,"[112.0, 126.0, 154.0, 113.0, 124.0, 154.0, 111...",benign
...,...,...
786,"[105.0, 95.0, 125.0, 100.0, 93.0, 120.0, 102.0...",malignant
787,"[0.0, 0.0, 10.0, 0.0, 0.0, 24.0, 43.0, 55.0, 1...",malignant
788,"[81.0, 95.0, 153.0, 111.0, 127.0, 186.0, 128.0...",malignant
789,"[154.0, 146.0, 183.0, 145.0, 143.0, 185.0, 142...",malignant


In [11]:
# Expand every flattened image into one column per value (feature_1..feature_N);
# N is taken from the length of the first row's vector.
n_features = len(train['features'][0])
feature_names = [f'feature_{idx}' for idx in range(1, n_features + 1)]
Train = pd.DataFrame(train['features'].tolist(), columns=feature_names)

In [12]:
Train

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_150519,feature_150520,feature_150521,feature_150522,feature_150523,feature_150524,feature_150525,feature_150526,feature_150527,feature_150528
0,162.0,158.0,223.0,161.0,160.0,222.0,168.0,161.0,222.0,171.0,...,200.0,132.0,140.0,199.0,130.0,141.0,198.0,132.0,144.0,202.0
1,161.0,154.0,229.0,166.0,159.0,232.0,169.0,162.0,237.0,167.0,...,218.0,158.0,153.0,222.0,158.0,153.0,222.0,157.0,152.0,214.0
2,164.0,149.0,241.0,160.0,147.0,239.0,164.0,146.0,239.0,164.0,...,221.0,145.0,137.0,220.0,143.0,135.0,218.0,138.0,132.0,214.0
3,60.0,79.0,112.0,68.0,83.0,115.0,71.0,86.0,118.0,69.0,...,155.0,125.0,132.0,157.0,120.0,127.0,154.0,118.0,124.0,153.0
4,112.0,126.0,154.0,113.0,124.0,154.0,111.0,125.0,154.0,107.0,...,153.0,111.0,122.0,150.0,104.0,118.0,146.0,100.0,114.0,143.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,105.0,95.0,125.0,100.0,93.0,120.0,102.0,93.0,120.0,110.0,...,155.0,146.0,134.0,152.0,144.0,132.0,152.0,145.0,133.0,153.0
787,0.0,0.0,10.0,0.0,0.0,24.0,43.0,55.0,109.0,67.0,...,183.0,97.0,118.0,186.0,100.0,119.0,186.0,90.0,109.0,176.0
788,81.0,95.0,153.0,111.0,127.0,186.0,128.0,144.0,211.0,123.0,...,202.0,111.0,130.0,197.0,116.0,134.0,199.0,104.0,122.0,187.0
789,154.0,146.0,183.0,145.0,143.0,185.0,142.0,141.0,181.0,158.0,...,181.0,166.0,151.0,182.0,181.0,152.0,185.0,162.0,147.0,179.0


#### Encoder la variable cible pour pouvoir faire l'entraînement

In [12]:
# Replace the string class names in the "target" column with integer codes.
# The fitted encoder is kept so the codes can be mapped back to names later.
label_encoder = preprocessing.LabelEncoder()
train["target"] = label_encoder.fit_transform(train["target"])

In [13]:
# Plain NumPy views of the data: feature matrix and encoded label vector.
X_Train, Y_Train = Train.values, train["target"].values

#### À l'aide de sklearn, écrivez une fonction pour diviser les données en ensemble d'apprentissage et de validation

In [14]:
# Hold out 25% of the samples for validation. The split is shuffled with a
# fixed seed for reproducibility and stratified on the labels so that both
# subsets keep the same benign/malignant ratio.
x_train, x_val, y_train, y_val = train_test_split(
    X_Train,
    Y_Train,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=Y_Train,
)

### À l'aide de différents algorithmes de scikit-learn, entraînez le modèle. 

In [None]:
...

### Faire l'inférence du modèle construit

In [None]:
...

## Problème de prédiction du type de cancer de la peau : distinction entre tumeurs bénignes et malignes à l’aide de modèles d’apprentissage profond

#### En utilisant l'ensemble d'entraînement et l'ensemble de validation, définissez un modèle d'apprentissage profond.


In [None]:
model = Sequential()  # Create sequential model

# Add network layers.
# Placeholder: replace each ... with the layer arguments before running.
model.add(Dense(..., ...))


#### Sommaire des différentes couches de votre modèle

In [None]:
model.summary()

#### Définir la fonction coût

In [None]:
# Placeholder: replace ... with the loss function to optimise.
loss = ...


#### Définir l'algorithme d'optimisation

In [None]:
optimizer = ...

#### Définir la métrique d'évaluation

In [None]:
metrics = ...

#### Compiler votre modèle

In [None]:
# Compile model
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics)

### Définir le nombre d'époques et la taille des lots (nombre d'exemples traités à chaque mise à jour)

In [None]:
# Training settings handed to model.fit.
verbose = 1       # value passed as fit's `verbose` argument
epochs = ...      # placeholder: number of epochs
batch_size = ...  # placeholder: number of samples per batch

#### Entraîner le modèle

In [None]:
# Fit the model 
history = model.fit(train, train, 
                              epochs=epochs, 
                              batch_size=batch_size, 
                              verbose=verbose,
                              validation_split=0.2,
                              shuffle=True)

#### Précision et fonction coût pour les ensembles d'entraînement et de validation

In [None]:
# Show which quantities were recorded, then the last validation accuracy.
recorded = history.history
print(recorded.keys())
print(recorded['val_accuracy'][-1])

# One figure per recorded quantity (accuracy first, then loss): the training
# curve is drawn first and the validation curve second, matching the legend.
for metric in ('accuracy', 'loss'):
    plt.plot(recorded[metric])
    plt.plot(recorded[f'val_{metric}'])
    plt.title(f'model {metric}')
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

####  Continuer