In [1]:
import numpy as np
from sklearn import neighbors, datasets

## Iris Data Set 
https://archive.ics.uci.edu/ml/datasets/iris

Informações dos atributos:
1. comprimento da sépala em cm
2. largura da sépala em cm
3. comprimento da pétala em cm
4. largura da pétala em cm
5. classe: 
 * Iris Setosa => 0
 * Iris Versicolour => 1
 * Iris Virginica => 2



In [2]:
# Load the Iris data set that ships with scikit-learn.
# The inputs (the 4 measurement columns) and the outputs (the classes) are
# pulled out of this object into x and y in the cells that follow.
iris = datasets.load_iris()

In [3]:
# Show the full textual description bundled with the data set.
print(iris.DESCR)

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [4]:
# Inputs: the measurement matrix, one row per sample.
x = iris.data
print(len(x))

print(x)

150
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.4  3.7  1.5  0.2]
 [ 4.8  3.4  1.6  0.2]
 [ 4.8  3.   1.4  0.1]
 [ 4.3  3.   1.1  0.1]
 [ 5.8  4.   1.2  0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 5.4  3.9  1.3  0.4]
 [ 5.1  3.5  1.4  0.3]
 [ 5.7  3.8  1.7  0.3]
 [ 5.1  3.8  1.5  0.3]
 [ 5.4  3.4  1.7  0.2]
 [ 5.1  3.7  1.5  0.4]
 [ 4.6  3.6  1.   0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 4.8  3.4  1.9  0.2]
 [ 5.   3.   1.6  0.2]
 [ 5.   3.4  1.6  0.4]
 [ 5.2  3.5  1.5  0.2]
 [ 5.2  3.4  1.4  0.2]
 [ 4.7  3.2  1.6  0.2]
 [ 4.8  3.1  1.6  0.2]
 [ 5.4  3.4  1.5  0.4]
 [ 5.2  4.1  1.5  0.1]
 [ 5.5  4.2  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.   3.2  1.2  0.2]
 [ 5.5  3.5  1.3  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 4.4  3.   1.3  0.2]
 [ 5.1  3.4  1.5  0.2]
 [ 5.   3.5  1.3  0.3]
 [ 4.5  2.3  1.3  0.3]
 [ 4.4  3.2  1.3  0.2]
 [ 5.  

In [5]:
# Outputs: the class of each sample, already encoded as an integer (target).
y = iris.target
print(len(y))
print(y)

150
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
# train_test_split is used to separate the samples used for training
# from the samples held out for testing.
from sklearn.model_selection import train_test_split

In [7]:
# train_test_split returns four pieces, unpacked here in the order:
# training inputs, test inputs, training outputs, test outputs.
perc = 0.30
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x,  # attributes
    y, # classes
    test_size=perc,  # fraction of the samples reserved for testing
    random_state=42  # fixed seed for the pseudo-random split; keeps the result reproducible
)

In [8]:
# Number of samples that ended up in the training split.
len(x_treino)

105

In [9]:
# Training inputs, one row per sample.
print(x_treino)

[[ 5.5  2.4  3.7  1. ]
 [ 6.3  2.8  5.1  1.5]
 [ 6.4  3.1  5.5  1.8]
 [ 6.6  3.   4.4  1.4]
 [ 7.2  3.6  6.1  2.5]
 [ 5.7  2.9  4.2  1.3]
 [ 7.6  3.   6.6  2.1]
 [ 5.6  3.   4.5  1.5]
 [ 5.1  3.5  1.4  0.2]
 [ 7.7  2.8  6.7  2. ]
 [ 5.8  2.7  4.1  1. ]
 [ 5.2  3.4  1.4  0.2]
 [ 5.   3.5  1.3  0.3]
 [ 5.1  3.8  1.9  0.4]
 [ 5.   2.   3.5  1. ]
 [ 6.3  2.7  4.9  1.8]
 [ 4.8  3.4  1.9  0.2]
 [ 5.   3.   1.6  0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 5.6  2.7  4.2  1.3]
 [ 5.1  3.4  1.5  0.2]
 [ 5.7  3.   4.2  1.2]
 [ 7.7  3.8  6.7  2.2]
 [ 4.6  3.2  1.4  0.2]
 [ 6.2  2.9  4.3  1.3]
 [ 5.7  2.5  5.   2. ]
 [ 5.5  4.2  1.4  0.2]
 [ 6.   3.   4.8  1.8]
 [ 5.8  2.7  5.1  1.9]
 [ 6.   2.2  4.   1. ]
 [ 5.4  3.   4.5  1.5]
 [ 6.2  3.4  5.4  2.3]
 [ 5.5  2.3  4.   1.3]
 [ 5.4  3.9  1.7  0.4]
 [ 5.   2.3  3.3  1. ]
 [ 6.4  2.7  5.3  1.9]
 [ 5.   3.3  1.4  0.2]
 [ 5.   3.2  1.2  0.2]
 [ 5.5  2.4  3.8  1.1]
 [ 6.7  3.   5.   1.7]
 [ 4.9  3.1  1.5  0.1]
 [ 5.8  2.8  5.1  2.4]
 [ 5.   3.4  1.5  0.2]
 [ 5.   3.5

In [10]:
# Training outputs: the integer class of each training sample.
print(y_treino)

[1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0
 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1
 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 2 0 1 2]


In [11]:
# Number of samples that ended up in the test split.
len(x_teste)

45

In [12]:
# Test inputs, one row per sample.
print(x_teste)

[[ 6.1  2.8  4.7  1.2]
 [ 5.7  3.8  1.7  0.3]
 [ 7.7  2.6  6.9  2.3]
 [ 6.   2.9  4.5  1.5]
 [ 6.8  2.8  4.8  1.4]
 [ 5.4  3.4  1.5  0.4]
 [ 5.6  2.9  3.6  1.3]
 [ 6.9  3.1  5.1  2.3]
 [ 6.2  2.2  4.5  1.5]
 [ 5.8  2.7  3.9  1.2]
 [ 6.5  3.2  5.1  2. ]
 [ 4.8  3.   1.4  0.1]
 [ 5.5  3.5  1.3  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.1  3.8  1.5  0.3]
 [ 6.3  3.3  4.7  1.6]
 [ 6.5  3.   5.8  2.2]
 [ 5.6  2.5  3.9  1.1]
 [ 5.7  2.8  4.5  1.3]
 [ 6.4  2.8  5.6  2.2]
 [ 4.7  3.2  1.6  0.2]
 [ 6.1  3.   4.9  1.8]
 [ 5.   3.4  1.6  0.4]
 [ 6.4  2.8  5.6  2.1]
 [ 7.9  3.8  6.4  2. ]
 [ 6.7  3.   5.2  2.3]
 [ 6.7  2.5  5.8  1.8]
 [ 6.8  3.2  5.9  2.3]
 [ 4.8  3.   1.4  0.3]
 [ 4.8  3.1  1.6  0.2]
 [ 4.6  3.6  1.   0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 6.7  3.1  4.4  1.4]
 [ 4.8  3.4  1.6  0.2]
 [ 4.4  3.2  1.3  0.2]
 [ 6.3  2.5  5.   1.9]
 [ 6.4  3.2  4.5  1.5]
 [ 5.2  3.5  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.2  4.1  1.5  0.1]
 [ 5.8  2.7  5.1  1.9]
 [ 6.   3.4  4.5  1.6]
 [ 6.7  3.1  4.7  1.5]
 [ 5.4  3.9

In [13]:
# Test outputs: the integer class of each test sample.
print(y_teste)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]


In [14]:
# Sanity check on the split: pull one row (index 70) out of the training
# inputs together with the class stored at the same index, so the pair can
# be compared against the original data set.
regist = x_treino[70]
print(regist)
classe = y_treino[70]
print(classe)

[ 6.9  3.1  5.4  2.1]
2


In [15]:
# Cross-check against the full data set: walk every (row, class) pair and
# report the ones whose measurements are identical to the sampled record,
# along with the row number where the match was found.
for i, (atributos, rotulo) in enumerate(zip(x, y)):
    if not np.array_equal(atributos, regist):
        continue
    print(atributos)
    print(rotulo)
    print('linha: %d' % i)

[ 6.9  3.1  5.4  2.1]
2
linha: 139


In [16]:
# Heuristic for k: the integer part of the square root of the training-set
# size, then forced to be odd.
import math
k = int(math.sqrt(len(y_treino)))
print(k)
# Setting the lowest bit leaves an odd value untouched and bumps an even one by 1.
k |= 1
print(k)

10
11


In [17]:
# k-nearest-neighbours classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Classifier with only the number of neighbours set explicitly (k from the previous cell)
knn = KNeighborsClassifier(n_neighbors=k)

# Fit on the training split; as the last expression, the fitted estimator is displayed
knn.fit(x_treino, y_treino)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [18]:
# Predict the class of every sample in the test inputs.
labels = knn.predict(x_teste)

In [19]:
# Count the correct predictions: the element-wise comparison yields one
# boolean per test sample, and summing them counts the True entries.
acertos = (labels == y_teste).sum()
testes = len(y_teste)
print('Testados %d registros' % testes)
print('Acertos em %d registros' % acertos)

Testados 45 registros
Acertos em 45 registros


In [20]:
# Accuracy as a percentage: number of correct predictions divided by the
# number of test samples, scaled by 100.
100 * (labels == y_teste).sum() / len(x_teste)

100.0

In [21]:
# Same measurement obtained from the classifier's own score() method;
# the value is kept in `score` and displayed as the cell result.
score = knn.score(x_teste, y_teste)
score

1.0

In [22]:
# List the misclassified test samples, if any. For each miss the cell prints
# the measurements, the predicted class, the true class and the position in
# the test split, in that order.
if score != 1.0:
    for i, (previsto, real) in enumerate(zip(labels, y_teste)):
        if real != previsto:
            print(x_teste[i])
            print(previsto)
            print(real)
            print(i)