# Importando bibliotecas

In [2]:
import numpy as np
import os
from pandas import read_csv
import time
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import normalize
import argparse
import csv

## Carregando o dataset

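O dataset se encontra no link abaixo. Como o arquivo não possui cabeçalho, informamos manualmente os nomes das 32 colunas: o ID, o diagnóstico e 30 atributos numéricos (média, desvio padrão e máximo de 10 medidas).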

In [10]:
# Location of the raw data file; the column names are supplied through `names=` below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
colunas = ["ID","Diagnóstico",
           "Raio - Média","Textura - Média","Perímetro - Média","Área - Média","Suavidade - Média","Compactação - Média","Concavidade - Média","Pontos Concavos - Média","Simetria - Média","Dimensão Fractal - Média",
           "Raio - Desvio Padrão","Textura - Desvio Padrão","Perímetro - Desvio Padrão","Área - Desvio Padrão","Suavidade - Desvio Padrão","Compactação - Desvio Padrão","Concavidade - Desvio Padrão","Pontos Concavos - Desvio Padrão","Simetria - Desvio Padrão","Dimensão Fractal - Desvio Padrão",
           "Raio - Máximo","Textura - Máximo","Perímetro - Máximo","Área - Máximo","Suavidade - Máximo","Compactação - Máximo","Concavidade - Máximo","Pontos Concavos - Máximo","Simetria - Máximo","Dimensão Fractal - Máximo"]

# Encode only the diagnosis column (B -> 0, M -> 1).
# A frame-wide `.replace(['B','M'], [0,1])` touches every column and relies on
# pandas silently downcasting the object column to integers, which is deprecated.
# `Series.map` yields a numeric column directly, so `dataset.values` stays numeric.
codigos_diagnostico = {"B": 0, "M": 1}
dataset = read_csv(url, names=colunas).assign(
    **{"Diagnóstico": lambda df: df["Diagnóstico"].map(codigos_diagnostico)}
)

# Any label other than B/M would have been mapped to NaN; fail loudly in that case.
assert dataset["Diagnóstico"].isin([0, 1]).all(), "unexpected diagnosis label"

print("Shape do dataset:", dataset.shape)
dataset.head(5)

Shape do dataset: (569, 32)


Unnamed: 0,ID,Diagnóstico,Raio - Média,Textura - Média,Perímetro - Média,Área - Média,Suavidade - Média,Compactação - Média,Concavidade - Média,Pontos Concavos - Média,...,Raio - Máximo,Textura - Máximo,Perímetro - Máximo,Área - Máximo,Suavidade - Máximo,Compactação - Máximo,Concavidade - Máximo,Pontos Concavos - Máximo,Simetria - Máximo,Dimensão Fractal - Máximo
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [12]:
# Raw table as a NumPy array: column 0 is the ID, column 1 the diagnosis,
# and every remaining column is a feature.
array = dataset.values

# Target vector: the diagnosis column.
y = array[:, 1]

# Feature matrix, with each column (axis=0) divided by its maximum absolute value.
X = normalize(array[:, 2:], norm='max', axis=0)

In [21]:
def create_clasifiers(cl=None):
    """Build the grid of candidate classifiers evaluated by this notebook.

    Four model families are instantiated, each over a small list of values
    for a single hyper-parameter:

    * k-nearest neighbours, varying ``n_neighbors``;
    * linear SVM, varying ``C``;
    * decision tree, varying ``max_features``;
    * multi-layer perceptron, varying the size of its single hidden layer.

    Args:
        cl: Unused. Kept (now optional) only so existing one-argument calls
            keep working.

    Returns:
        A tuple ``(classifiers, names, parameters)`` of three parallel lists:
        the unfitted estimators, a printable name for each one, and the
        hyper-parameter value each one was built with.
    """
    classifiers = []
    parameters = []
    names = []

    # KNN
    neighbors = [1, 5, 10, 15, 20]
    for k in neighbors:
        classifiers.append(KNeighborsClassifier(n_neighbors=k))
        parameters.append(k)
        names.append("knn_" + str(k) + '_neighbors')

    # SVM
    c = [0.1, 1, 5, 10]
    for Cval in c:
        classifiers.append(LinearSVC(random_state=42, C=Cval, max_iter=10000))
        parameters.append(Cval)
        names.append("svm_" + str(Cval) + '_Cval')

    # DT
    # 'auto' is intentionally not in this list: for classifiers it was an alias
    # of 'sqrt' (so it only duplicated that entry), it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes fit() fail.
    max_f = ['sqrt', 'log2', None]
    for maxf in max_f:
        classifiers.append(DecisionTreeClassifier(random_state=42, max_features=maxf))
        parameters.append(maxf)
        names.append("dt_" + str(maxf) + '_feat')

    # MLP
    # Each value is the number of units in one hidden layer, not a layer count
    # (the '_layers' suffix is kept so the printed names do not change).
    layers = [5, 10, 30, 50]
    for lay in layers:
        classifiers.append(MLPClassifier(random_state=42, hidden_layer_sizes=(lay,), max_iter=1000))
        parameters.append(lay)
        names.append("mlp_" + str(lay) + '_layers')

    return classifiers, names, parameters

In [22]:
# create_clasifiers never reads its argument, and no `cl` variable is defined
# in the visible notebook, so `create_clasifiers(cl)` raises NameError on a
# fresh kernel. Pass None explicitly instead.
classifiers, names, parameters = create_clasifiers(None)

In [25]:
# 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [28]:
def sum_confusion_matrix(conf_matrix, conf_matrix_sum):
    """Add one confusion matrix into a running total, in place.

    The addition is done cell by cell over the full shape of the matrices, so
    it works for any number of classes and not only for the 2x2 binary case.

    Args:
        conf_matrix: 2-D table of counts to add (NumPy array or nested list).
        conf_matrix_sum: Accumulator with the same shape as ``conf_matrix``.
            It is modified in place and keeps its own element type.

    Returns:
        The same ``conf_matrix_sum`` object, after the addition.

    Raises:
        ValueError: If the two matrices do not have the same shape.
    """
    if np.shape(conf_matrix) != np.shape(conf_matrix_sum):
        raise ValueError(
            "confusion matrix shape %s does not match accumulator shape %s"
            % (np.shape(conf_matrix), np.shape(conf_matrix_sum))
        )

    # Element-wise assignment (rather than `+=` on the whole array) keeps the
    # accumulator's dtype even when the incoming counts use a different
    # integer type, and also works for plain nested lists.
    for i, row in enumerate(conf_matrix):
        for j, count in enumerate(row):
            conf_matrix_sum[i][j] = conf_matrix_sum[i][j] + count

    return conf_matrix_sum

In [29]:
# Cross-validate every candidate classifier on the same stratified folds.
# For each model, the per-fold confusion matrices are summed and
# macro-averaged precision / recall / F-score are computed over the pooled
# out-of-fold predictions.

# Fixed label order, so every fold produces a confusion matrix of the same
# shape even if some class is never seen or predicted in that fold.
class_labels = np.unique(y)

for name, clf in zip(names, classifiers):
    print(name)
    y_test_folds = []
    y_pred_folds = []
    conf_matrix_sum = np.zeros((len(class_labels), len(class_labels)), dtype=np.uint16)

    for train_index, test_index in kf.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit() retrains the estimator from scratch, so reusing `clf` across
        # folds does not leak information between them.
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        y_test_folds.append(y_test)
        y_pred_folds.append(y_pred)
        conf_matrix_sum = sum_confusion_matrix(conf_matrix, conf_matrix_sum)

    # Pool the out-of-fold labels and predictions once per classifier.
    y_test_all = np.concatenate(y_test_folds)
    y_pred_all = np.concatenate(y_pred_folds)

    print(conf_matrix_sum)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test_all, y_pred_all, average='macro')
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('Fscore: ', fscore)
    print('\n=======================================================\n')

knn_1_neighbors
[[344  13]
 [ 16 196]]
Precision:  0.9466772993088783
Recall:  0.944056868030231
Fscore:  0.9453350427520316


knn_5_neighbors
[[351   6]
 [ 13 199]]
Precision:  0.9675087108013938
Recall:  0.9609362612969716
Fscore:  0.9640420811755588


knn_10_neighbors
[[355   2]
 [ 15 197]]
Precision:  0.974704604101589
Recall:  0.9618215210612546
Fscore:  0.9676268503365161


knn_15_neighbors
[[354   3]
 [ 16 196]]
Precision:  0.9708406899361673
Recall:  0.9580624702711273
Fscore:  0.9638182444937533


knn_20_neighbors
[[353   4]
 [ 21 191]]
Precision:  0.9616687234334294
Recall:  0.944869457216849
Fscore:  0.9521876060863748


svm_0.1_Cval
[[355   2]
 [ 24 188]]
Precision:  0.9630745729759755
Recall:  0.9405951059669151
Fscore:  0.9499986480640277


svm_1_Cval
[[354   3]
 [ 13 199]]
Precision:  0.9748630857636172
Recall:  0.9651379419692405
Fscore:  0.9696266047455094


svm_5_Cval
[[354   3]
 [ 12 200]]
Precision:  0.9762173948154729
Recall:  0.9674964325352783
Fscore:  0.97155426

Rascunho:

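Utilizar KNN para classificação, Regressão Logística para regressão e KMeans para agrupamento (clustering), todos avaliados com K-Fold. Com isso já abrangemos 3 tópicos que o professor comentou. (Atenção: apesar do nome, a Regressão Logística é um método de classificação; revisar qual modelo usar para o tópico de regressão.)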