In [2]:
# Bibliotecas básicas para manipular qualquer modelo
import numpy as np
import pandas as pd
import sklearn

# Separador entre base de treino e de teste
from sklearn.model_selection import train_test_split

# Ferramente de normalização, essencial para o modelo
from sklearn.preprocessing import StandardScaler

# O modelo de classificação propramente dito
from sklearn.neighbors import KNeighborsClassifier

# Algumas funções para testar o modelo posteriormente
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [3]:
dataset = pd.read_csv('dataset/diabetes.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
len(dataset) # Como o dataset é pequeno, podemos usar tranquilamente o algoritmo do KNN

768

In [7]:
dataset.describe() 
# Há algumas variáveis que são impossíveis para um ser humano apresentar tal valor, como por exemplo:
# Glicose = 0, Pressão Sanguínea = 0, Espessura da pele = 0, Insulina = 0, IMC = 0
# Portanto é necessário substituir esses valores.

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
# Construímos uma lista com esses dados propriamente ditos
nao_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for A in nao_zero:
    # Percorre cada feature na lista substituindo 0 por NaN
    dataset[A] = dataset[A].replace(0, np.NaN)
    # Define a média das colunas
    media = int(dataset[A].mean(skipna=True))
    # Substitui os dados não preenchidos pela média
    dataset[A] = dataset[A].replace(np.NaN, media)
    

In [10]:
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.682292,72.386719,29.108073,155.28125,32.450911,0.471876,33.240885,0.348958
std,3.369578,30.435999,12.096642,8.791221,85.02155,6.875366,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,155.0,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [57]:
# Separando dataset em treino e teste
X = dataset.iloc[:, 0:8] # Todas as colunas, menos o diagnóstico (features)
y = dataset['Outcome'] # Resultado que nós queremos (target)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [58]:
# Normalizando
sc_X = StandardScaler()

In [59]:
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [60]:
# Aplicação do modelo
np.sqrt(len(dataset)*0.2) 
# Calculando a raiz da quantidade de data points na base de test, e, escolhendo um ímpar próximo, temos que K = 13

12.393546707863734

In [61]:
classifier = KNeighborsClassifier(n_neighbors=13, metric='euclidean')

In [62]:
classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=13)

In [63]:
y_previsao = classifier.predict(X_test)
y_previsao

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
      dtype=int64)

In [64]:
# Avaliando o teste

# Matriz de confusao
confusion = confusion_matrix(y_test, y_previsao)
print(confusion)

[[89 10]
 [19 36]]


In [65]:
# F1 Score
f1_score(y_test, y_previsao)

0.7128712871287128

In [66]:
# Acúracia
accuracy_score(y_test, y_previsao)

0.8116883116883117

In [92]:
X_test[1]

array([-0.83752176,  0.99333807, -1.2022432 , -0.1235376 ,  0.04566497,
       -1.05199526, -0.94288191, -0.95299549])

In [93]:
pessoa1 = [[-0.83752176,  0.99333807, -1.2022432 , -0.1235376 ,  0.04566497,
       -1.05199526, -0.94288191, -0.95299549]]

In [94]:
pessoa1 = np.array(pessoa1)

In [95]:
y_prev1 = classifier.predict(pessoa1)
y_prev1

array([0], dtype=int64)

In [96]:
X_test

array([[ 0.98334907,  0.49634632,  0.10422673, ..., -1.06628413,
         0.41446124,  1.55648438],
       [-0.83752176,  0.99333807, -1.2022432 , ..., -1.05199526,
        -0.94288191, -0.95299549],
       [ 0.6798706 , -0.39823885, -1.2022432 , ..., -1.20917289,
        -0.86457365, -0.52032655],
       ...,
       [ 1.28682754,  2.21925108,  0.47750385, ...,  2.06297959,
        -1.06469476,  0.86421407],
       [ 0.37639213,  0.49634632,  0.85078098, ..., -0.20895161,
         0.3941591 ,  3.11409258],
       [ 0.07291366, -1.02776175, -0.08241183, ..., -0.78050662,
         0.30714993, -0.17419139]])