In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load scikit-learn's bundled iris dataset; the cells below read its .keys(), .DESCR, .data and .target.
iris = datasets.load_iris()

In [3]:
# Show which fields the loaded dataset object exposes.
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
print(iris.DESCR)  # detailed description of the iris dataset

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [5]:
# Pull the feature matrix (x) and the label vector (y) out of the dataset object.
x, y = iris.data, iris.target
# One shape per line: features first, then labels.
print(x.shape, y.shape, sep="\n")

(150, 4)
(150,)


## 训练集和测试集的划分

In [6]:
# Shuffle the row indices so the train/test split below does not depend on the
# original row order of the dataset. A fixed seed (666, the same value the
# train_test_split call later in this notebook uses) keeps the split -- and the
# accuracy computed from it -- reproducible across Restart & Run All.
# RandomState is used (rather than the newer Generator API) so this also runs on
# older NumPy releases.
shuffle_indexs = np.random.RandomState(666).permutation(len(x))
shuffle_indexs

array([ 65,  43, 109,  59, 133,  73,  89, 122,  51,  29, 102, 137,  91,
        82,  11,   7,  78,  48, 100,  97, 134,  42,  95,  39, 127,  24,
       114, 124, 128,  70,  72,  57,   0,  63,  84,  60,  99,  58, 148,
        18, 143,   2,   9,  34, 146,  98, 145,  74, 131,  36,  25,  92,
        90,  37,  68,  44,  38, 130,  41,  85, 113,  93,  46,  56,   6,
        50,  61, 147,  76, 108,  69, 111,  94,  49,  27,  77, 144,  47,
         3,  55,  66, 132, 121,   8, 104,  75,  13, 112,  26,  45, 142,
       120,  64, 126,  30,  79,  22,  35, 106,  40,  83, 110,  81,  86,
        96, 141, 129, 117,  53,  15,  16,  20,  19,  62,  80,  54, 138,
         5,  31,  10,   4,  87, 139, 105,  21, 135, 116,  52, 107,  17,
       125,  14,  71,   1, 103, 101, 119, 115,  88, 140, 123, 149,  33,
       136, 118,  28,  32,  12,  67,  23])

In [7]:
# Fraction of rows to hold out for testing; int() truncates any fractional row count.
test_ratio = 0.2
test_size = int(test_ratio * len(x))

In [8]:
# The first test_size shuffled indices form the test set; the remainder is the training set.
test_indexs, train_indexs = shuffle_indexs[:test_size], shuffle_indexs[test_size:]

In [9]:
# Index features and labels with the same index arrays so each row stays paired with its label.
X_train, y_train = x[train_indexs], y[train_indexs]
X_test, y_test = x[test_indexs], y[test_indexs]

In [10]:
# Sanity check: shapes of the training features and training labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [11]:
from sklearn.neighbors import KNeighborsClassifier
# Build a 3-nearest-neighbour classifier and fit it on the training split in one
# expression (fit() returns the estimator itself, so the calls can be chained).
kNN_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Predict a label for every held-out row; the bare name on the last line displays the result.
y_predict = kNN_classifier.predict(X_test)
y_predict

array([1, 0, 2, 1, 1, 1, 1, 2, 1, 0, 2, 2, 1, 1, 0, 0, 1, 0, 2, 1, 2, 0,
       1, 0, 2, 0, 2, 2, 2, 2])

In [12]:
# Number of correct predictions. np.sum reduces the boolean comparison array in
# vectorized code instead of looping over its elements with the built-in sum().
np.sum(y_predict == y_test)

28

In [13]:
# Accuracy = fraction of test labels predicted correctly. The mean of the boolean
# comparison array is exactly that fraction, computed in one vectorized call.
np.mean(y_predict == y_test)

0.9333333333333333

## 使用sklearn中的train_test_split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Let scikit-learn shuffle and split the data: 20% of the rows go to the test set,
# and random_state=666 is the random seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=666
)

In [16]:
print(X_train.shape)  # shape of the training features returned by train_test_split

(120, 4)


In [17]:
print(X_test.shape)  # shape of the test features returned by train_test_split

(30, 4)
