# **Seminário de Introdução a Imagens Médicas - Inteligência artificial aplica a imagens médicas**

### *Breast Cancer Detection using Machine Learning Techniques*

**Alunos:** _Caio Fernandes Lott Primola_     - 20193001742<br>
_Henrique Rodrigues Lima_         - 20193009473

Este trabalho consiste na simulação e análise de diferentes modelos de machine learning acerca da detecção de cancêr de mama, replicando as metodologias utilizadas no artigo referência.  

As técnicas utilizadas foram:
    - Redes neurais convolucionais (CNN);<br>
    - K-nearest neighbor (KNN);<br>
    - Random Forest;<br>
    - Regressão Logística;<br>
    - Support Vector Machine (SVM);<br>
    - Naive Bayes (GNB)<br>

Para instalar as bibliotecas necessárias, utilize a célula abaixo

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

### Inicialização e carregamento dos dados

In [2]:
import os
import cv2
import re
from collections import Counter
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from skimage import filters, measure

In [3]:
dataset_path = "histology_slides"  
regex_label = r"[A-Z]+_([A-Z])_[A-Z]+-\d{2}-[A-Z\d]+-(\d+)-\d+\.png"
magnification = "400X" #diretorio da magnificação desejada

In [4]:
def load_data(dataset_path, magnification, img_size):
    X = []
    y = []
    num_files = 0

    for root, dirs, files in os.walk(dataset_path):
        if os.path.basename(root) == magnification: 
            for file in files:
                if file.endswith('.png'):
                    img_path = os.path.join(root, file)
                    img = cv2.imread(img_path)
                    img = cv2.resize(img, img_size)
                    X.append(img)

                    # Assumindo que na label B (benigno) e M (maligno)
                    match_obj = re.search(regex_label, file)
                    label = match_obj.group(1)  # B or M
                    y.append(True if label == "M" else False)

                    num_files += 1

    print(f"Total images processed: {num_files}")

    X = np.array(X)
    y = np.array(y)
    return X, y

In [5]:
img_size = (700, 460)
X, y = load_data(dataset_path, magnification, img_size)

Total images processed: 1820


In [6]:
count = Counter(y)
print(count)

Counter({True: 1232, False: 588})


## Preprocessemamento dos dados

### Normalização dos dados

In [7]:
X = X / 255

### Feature Selection

In [8]:
X_flat = X.reshape(X.shape[0], -1)
selector = SelectKBest(f_classif, k=500) 
X_new = selector.fit_transform(X_flat, y)

### Recursive Feature Elimination

In [9]:
lr = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=lr, n_features_to_select=100, step=50)
X_rfe = rfe.fit_transform(X_new, y)

### Separação entre dados de treino e teste

In [10]:
X_rfe, y = shuffle(X_rfe, y)
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42)

## Modelos
### Redes Neurais Convolucionais

In [11]:
def build_cnn(input_shape):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(len(np.unique(y)), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [12]:
cnn_model = build_cnn((X_train.shape[1],))
cnn_model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6117 - loss: 0.6894 - val_accuracy: 0.6648 - val_loss: 0.6654
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6448 - loss: 0.6673 - val_accuracy: 0.6648 - val_loss: 0.6395
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6854 - loss: 0.6293 - val_accuracy: 0.6648 - val_loss: 0.6348
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6610 - loss: 0.6542 - val_accuracy: 0.6648 - val_loss: 0.6241
Epoch 5/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6925 - loss: 0.6065 - val_accuracy: 0.6731 - val_loss: 0.6108
Epoch 6/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6918 - loss: 0.5948 - val_accuracy: 0.6841 - val_loss: 0.6040
Epoch 7/10
[1m46/46[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x191de734190>

In [13]:
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test, y_test)
cnn_predictions = np.argmax(cnn_model.predict(X_test), axis=-1)
cnn_precision = precision_score(y_test, cnn_predictions, average='weighted')

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774us/step - accuracy: 0.7266 - loss: 0.5870
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


### K-nearest Neighbors

In [14]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions, average='weighted')

### Random Forest Generator

In [15]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')

### Logistic Regression

In [16]:
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='weighted')

### Naive Bayes

In [17]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_predictions = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')

### SVM

In [18]:
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
svm_predictions = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='weighted')

In [19]:
print(f"CNN - Accuracy: {cnn_accuracy:.4f}, Precision: {cnn_precision:.4f}")
print(f"KNN - Accuracy: {knn_accuracy:.4f}, Precision: {knn_precision:.4f}")
print(f"Random Forest - Accuracy: {rf_accuracy:.4f}, Precision: {rf_precision:.4f}")
print(f"Logistic Regression - Accuracy: {lr_accuracy:.4f}, Precision: {lr_precision:.4f}")
print(f"Naive Bayes - Accuracy: {nb_accuracy:.4f}, Precision: {nb_precision:.4f}")
print(f"SVM - Accuracy: {svm_accuracy:.4f}, Precision: {svm_precision:.4f}")

CNN - Accuracy: 0.7253, Precision: 0.7151
KNN - Accuracy: 0.7115, Precision: 0.6967
Random Forest - Accuracy: 0.7390, Precision: 0.7321
Logistic Regression - Accuracy: 0.7500, Precision: 0.7508
Naive Bayes - Accuracy: 0.6456, Precision: 0.6965
SVM - Accuracy: 0.7335, Precision: 0.7322
