<a href="https://colab.research.google.com/github/FranciscoBPereira/Bioinformatica25-26/blob/main/Bioinfo2526_Aula12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


**0. Leitura e preparação dos Datasets**

In [None]:
# Load the dataset. The file 'B.xlsx' must be placed in the working directory.
# The workbook has 3 sheets: Original, Best, PCA (see the previous class).
# The class labels are only present in the Original sheet.
# The original data are standardised here; the other sheets were already transformed.

from sklearn.preprocessing import StandardScaler

# Request the three sheets in a single call so the workbook is opened and parsed
# once instead of three times; the result is a dict {sheet name: DataFrame}.
folhas = pd.read_excel('B.xlsx', sheet_name=['Original', 'PCA', 'Best'])

df = folhas['Original']

# Separate the features from the class label column ('ID')
D_Or = df.drop(['ID'], axis=1)
Classe = df['ID']

# Standardise the original data (column names are preserved)
scaler = StandardScaler()
D_Or = pd.DataFrame(scaler.fit_transform(D_Or), columns = D_Or.columns)

D_PCA = folhas['PCA']
D_Best = folhas['Best']


In [None]:
# Encode the class labels as numeric values: 0 (Normal) and 1 (Tumor)

rotulos = np.asarray(Classe, dtype=object)

# 'N' -> 0 and 'T' -> 1 in a single nested selection; any other value is passed
# through unchanged and must be convertible by the final integer cast.
Classe = np.where(rotulos == 'T', 1,
                  np.where(rotulos == 'N', 0, rotulos)).astype('int64')

In [None]:
# Overview of the Original dataset: dimensions and summary statistics

resumo_original = D_Or.describe().round(2)

print('Dataset Original: ', D_Or.shape)

# Last expression of the cell, so the notebook renders it as a rich table
resumo_original

In [None]:
# Overview of the Best dataset: dimensions, summary statistics and one boxplot per column

resumo_best = D_Best.describe().round(2)

print('Dataset Best: ', D_Best.shape)

print(resumo_best)

D_Best.boxplot(figsize=(10,10))
plt.show()

In [None]:
# Overview of the PCA dataset: dimensions, summary statistics and one boxplot per column

resumo_pca = D_PCA.describe().round(2)

print('Dataset PCA: ', D_PCA.shape)

print(resumo_pca)

D_PCA.boxplot(figsize=(10,10))
plt.show()

**1. Aprendizagem Supervisionada**

Vão ser testados vários algoritmos supervisionados nos 3 datasets: KNN, Decision Tree, MLP, Random Forest

Será adotada uma estratégia de cross-validation com leave one out.


In [None]:
# Função auxiliar para mostrar matriz de confusão

from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        True labels. They are used to index `classes`, so they must be
        integer positions into it.
    y_pred : array-like
        Predicted labels, encoded like `y_true`.
    classes : array-like of str
        Display names; `classes[k]` is the name shown for label `k`.
    normalize : bool, default False
        If True, every row is divided by its total so that each cell is the
        fraction of the true class. Rows with no samples are shown as zeros.
    title : str or None, default None
        Plot title. When empty, a default title is chosen from `normalize`.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used to draw the matrix.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on (a new figure is created).

    Notes
    -----
    Calls `sns.set(font_scale=1.5)`, which changes the global plotting style.
    """
    # Imported locally so the function does not depend on which notebook cell
    # happened to import it first.
    from sklearn.metrics import confusion_matrix

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data, and use the very same label
    # list for the matrix and for the tick names so they cannot get out of sync.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # np.asarray lets callers pass a plain list/tuple of names as well.
    classes = np.asarray(classes)[labels]

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that only occurs in y_pred has an all-zero row; leave it at 0
        # instead of dividing by zero (which would produce NaN and a warning).
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    sns.set(font_scale=1.5)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Text switches to white on dark cells. Fractions are compared against 0.5;
    # raw counts are compared against half of the largest count, otherwise every
    # non-zero count would be drawn white, even on the lightest cells.
    thresh = 0.5 if normalize else cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
from sklearn.metrics import recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Treinar um modelo KNN
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html


# Completar com valores para os parâmetros. No caso do KNN, o número de vizinhos é a única variável a considerar

viz = ##### COMPLETAR ####

KNN = KNeighborsClassifier(n_neighbors = viz)


# Escolher Dataset: D_Or, D_Best, D_PCA

D = ### COMPLETAR #####

predKNN = cross_val_predict(KNN, D, Classe, cv=LeaveOneOut())

print('Accuracy: %.2f ' %accuracy_score(predKNN, Classe))

print('Precision: %.2f ' %precision_score(Classe, predKNN, average='binary'))

print('Recall: %.2f ' %recall_score(Classe, predKNN, average='binary'))

# Plot normalized confusion matrix
plt.figure(figsize=(10,10))
class_names = np.asarray(['N', 'T'])
np.set_printoptions(precision=2)

plot_confusion_matrix(Classe, predKNN, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.tight_layout()
plt.show()



**Questões:**

1. Qual o impacto do parâmetro número de vizinhos?

2. Qual o impacto da transformação nos dados (Original, Best, PCA)?


In [None]:
# Treinar uma árvore de decisão
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# Verificar quais os parâmetros que podem ser ajustados e testar alternativas

#### Definir parâmetros da árvore de decisão #####

DT = DecisionTreeClassifier(random_state=10)

# Escolher Dataset: Original, Best, PCA

D = #### COMPLETAR #####

pred = cross_val_predict(DT, D, Classe, cv=LeaveOneOut())

print('Accuracy: %.2f ' %accuracy_score(pred, Classe))

print('Precision: %.2f ' %precision_score(Classe, pred, average='binary'))

print('Recall: %.2f ' %recall_score(Classe, pred, average='binary'))

# Plot normalized confusion matrix
plt.figure(figsize=(10,10))
class_names = np.asarray(['N', 'T'])
np.set_printoptions(precision=2)

plot_confusion_matrix(Classe, pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.tight_layout()
plt.show()


In [None]:
# Treinar um modelo MLP
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html



In [None]:
# Treinar uma Random Forest
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html




**Efetuar uma análise abrangente de resultados**

1. Impacto dos algoritmos de aprendizagem

2. Impacto da parametrização

3. Impacto da transformação de dados
