In [2]:
! pip install pandas
! pip install xgboost
! pip install scikit-learn
! pip install opencv-python



In [3]:
from pathlib import Path
from  glob import glob
import os
import cv2
import numpy as  np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import xgboost as xgb
import time

In [4]:
# import machine learning algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [5]:

class CustomDataset:
    def __init__(self, proportion: float, root: Path, train: bool, size : tuple):
        """
        Initialisation de l'objet CustomDataset avec la proportion des données à utiliser
        et le répertoire racine contenant les images.

        :param proportion: Proportion des données à charger (0 à 1).
        :param root: Répertoire racine où se trouvent les images.
        """
        self.proportion = proportion
        self.root = root
        self.data = {"Features": [], "Target": []}
        self.target = ["benign", "malignant"]
       
        self.dataType = "train" if train else "test" 

        self.size =  size

    def dataloader(self):
        """
        Charge et traite les images depuis les répertoires 'benign' et 'malignant', 
        en appliquant une proportion des données spécifiée.

        :return: Tuple contenant les caractéristiques (Features) et les cibles (Target) des données.
        """
        # Chargement des images et valeurs cibles benign et malignant
        for item in self.target:
            
            files = glob(os.path.join(self.root, f"{self.dataType}/{item}/*.jpg"))
            num_files = int(len(files) * self.proportion)

            self.data["Features"].extend([cv2.resize(cv2.imread(imagePath), self.size).flatten().astype(np.float32) for imagePath in files[:num_files]])
            self.data["Target"].extend([item for _ in range(num_files)])
   
        return self.data["Features"], self.data["Target"]


In [6]:
proportion = 0.3

In [7]:
datasets = CustomDataset(proportion=proportion, root = "/home/laris/laris/cours/CV50/inputs/raw/data/", train=True, size=(224,224))

In [8]:
features , target = datasets.dataloader()

In [9]:
train = pd.DataFrame(data={"features": features, "target": target})

In [10]:
train

Unnamed: 0,features,target
0,"[162.0, 158.0, 223.0, 161.0, 160.0, 222.0, 168...",benign
1,"[161.0, 154.0, 229.0, 166.0, 159.0, 232.0, 169...",benign
2,"[164.0, 149.0, 241.0, 160.0, 147.0, 239.0, 164...",benign
3,"[60.0, 79.0, 112.0, 68.0, 83.0, 115.0, 71.0, 8...",benign
4,"[112.0, 126.0, 154.0, 113.0, 124.0, 154.0, 111...",benign
...,...,...
786,"[105.0, 95.0, 125.0, 100.0, 93.0, 120.0, 102.0...",malignant
787,"[0.0, 0.0, 10.0, 0.0, 0.0, 24.0, 43.0, 55.0, 1...",malignant
788,"[81.0, 95.0, 153.0, 111.0, 127.0, 186.0, 128.0...",malignant
789,"[154.0, 146.0, 183.0, 145.0, 143.0, 185.0, 142...",malignant


In [11]:
Train = pd.DataFrame(train['features'].tolist(), columns=[f'feature_{i+1}' for i in range(len(train['features'][0]))])

In [12]:
Train

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_150519,feature_150520,feature_150521,feature_150522,feature_150523,feature_150524,feature_150525,feature_150526,feature_150527,feature_150528
0,162.0,158.0,223.0,161.0,160.0,222.0,168.0,161.0,222.0,171.0,...,200.0,132.0,140.0,199.0,130.0,141.0,198.0,132.0,144.0,202.0
1,161.0,154.0,229.0,166.0,159.0,232.0,169.0,162.0,237.0,167.0,...,218.0,158.0,153.0,222.0,158.0,153.0,222.0,157.0,152.0,214.0
2,164.0,149.0,241.0,160.0,147.0,239.0,164.0,146.0,239.0,164.0,...,221.0,145.0,137.0,220.0,143.0,135.0,218.0,138.0,132.0,214.0
3,60.0,79.0,112.0,68.0,83.0,115.0,71.0,86.0,118.0,69.0,...,155.0,125.0,132.0,157.0,120.0,127.0,154.0,118.0,124.0,153.0
4,112.0,126.0,154.0,113.0,124.0,154.0,111.0,125.0,154.0,107.0,...,153.0,111.0,122.0,150.0,104.0,118.0,146.0,100.0,114.0,143.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,105.0,95.0,125.0,100.0,93.0,120.0,102.0,93.0,120.0,110.0,...,155.0,146.0,134.0,152.0,144.0,132.0,152.0,145.0,133.0,153.0
787,0.0,0.0,10.0,0.0,0.0,24.0,43.0,55.0,109.0,67.0,...,183.0,97.0,118.0,186.0,100.0,119.0,186.0,90.0,109.0,176.0
788,81.0,95.0,153.0,111.0,127.0,186.0,128.0,144.0,211.0,123.0,...,202.0,111.0,130.0,197.0,116.0,134.0,199.0,104.0,122.0,187.0
789,154.0,146.0,183.0,145.0,143.0,185.0,142.0,141.0,181.0,158.0,...,181.0,166.0,151.0,182.0,181.0,152.0,185.0,162.0,147.0,179.0


In [12]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column. 
train["target"]= label_encoder.fit_transform(train["target"]) 

In [13]:
X_Train = Train.values
Y_Train = train["target"].values

In [14]:
x_train, x_val, y_train, y_val = train_test_split(X_Train, Y_Train, test_size=0.25, shuffle=True, random_state=42)

# Gradient Boosting Classifier

In [15]:
gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)

In [16]:
start = time.time()
gb.fit(x_train, y_train)
stop = time.time()

In [17]:
y_pred = gb.predict(x_val)

In [18]:
print(f"training time for {proportion}% of training data is : {stop-start}")

training time for 0.7% of training data is : 0.07137298583984375


In [24]:
# Évaluer la performance
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.789587852494577
Confusion Matrix:
[[211  49]
 [ 48 153]]


# XGBoost

In [20]:
# Convertir les données en DMatrix (format efficace pour XGBoost)
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_val, label=y_val)

In [21]:

# Paramètres de l'algorithme XGBoost pour classification binaire
params = {
    'objective': 'binary:logistic',  # Classification binaire
    'eval_metric': 'logloss',  # Mesure d'évaluation (logarithmic loss)
    'max_depth': 3,  # Profondeur maximale des arbres
    'learning_rate': 0.1,  # Taux d'apprentissage
    'silent': 1,  # Ne pas afficher de messages
}

# Entraîner le modèle
num_round = 100  # Nombre d'itérations



In [22]:
start = time.time()
bst = xgb.train(params, dtrain, num_round)
stop = time.time()

Parameters: { "silent" } are not used.



In [23]:
print(f"training time for {proportion}% of training data is : {stop-start}")

training time for 0.7% of training data is : 92.23999762535095


In [26]:
# Prédictions sur le jeu de test
y_pred_prob = bst.predict(dtest)  # Probabilités des prédictions
y_pred = (y_pred_prob > 0.5).astype(int)  # Conversion des probabilités en classes (0 ou 1)

# Évaluer la performance
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.841648590021692
Confusion Matrix:
[[218  42]
 [ 31 170]]


Proportion: 0.3%


    Gradient Boosting Classifier

        training time for 0.3% of training data is : 0.047365665435791016

        Accuracy: 0.8232323232323232
        Confusion Matrix:
        [[96 20]
        [15 67]]

    XGBoost

        training time for 0.3% of training data is : 61.51552224159241

        Accuracy: 0.8484848484848485
        Confusion Matrix:
        [[99 17]
        [13 69]]


Proportion: 0.5%


    Gradient Boosting Classifier

        training time for 0.5% of training data is : 0.05202317237854004

        Accuracy: 0.7272727272727273
        Confusion Matrix:
        [[143  40]
        [ 50  97]]

    XGBoost

        training time for 0.3% of training data is : 76.7379252910614

        Accuracy: 0.796969696969697
        Confusion Matrix:
        [[155  28]
        [ 39 108]]

Proportion: 0.7%


    Gradient Boosting Classifier

        training time for 0.7% of training data is : 92.23999762535095

        Accuracy: 0.789587852494577
        Confusion Matrix:
        [[211  49]
        [ 48 153]]

    XGBoost

        training time for 0.3% of training data is : 76.7379252910614
        
        Accuracy: 0.841648590021692
        Confusion Matrix:
        [[218  42]
        [ 31 170]]

Proportion: 0.95%

   Erreur: 
   
      The Kernel crashed while executing code in the current cell or a previous cell. 
      Please review the code in the cell(s) to identify a possible cause of the failure. 
      Click here for more info. 
      View Jupyter log for further details.