## 归一化标准化

In [1]:
from sklearn.preprocessing import MinMaxScaler

def test():
    """Min-max normalise a small sample matrix and print the result.

    The matrix has three samples with four features each; it is scaled by
    a ``MinMaxScaler`` created with its default settings.
    """
    # 1. Prepare the data: three samples, four features each.
    samples = [
        [90, 2, 10, 40],
        [60, 4, 15, 45],
        [75, 3, 13, 46],
    ]
    # 2-3. Create the scaler, then fit and transform in a single call.
    normalised = MinMaxScaler().fit_transform(samples)
    # 4. Show the normalised matrix.
    print(normalised)

In [2]:
# Run the demo defined above (at this point `test` is the MinMaxScaler version).
test()

In [3]:
from sklearn.preprocessing import StandardScaler

def test():
    """Standardise a small sample matrix and print the result.

    The matrix has three samples with four features each; it is scaled by
    a ``StandardScaler`` created with its default settings.
    """
    # 1. Prepare the data: three samples, four features each.
    samples = [
        [90, 2, 10, 40],
        [60, 4, 15, 45],
        [75, 3, 13, 46],
    ]
    # 2. Create the standardisation object.
    scaler = StandardScaler()
    # 3. Fit on the raw features and transform them.
    standardised = scaler.fit_transform(samples)
    # 4. Show the standardised matrix.
    print(standardised)

In [4]:
# Run the demo defined above (`test` has been redefined as the StandardScaler version).
test()

## K近邻算法Sklearn API

In [5]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

if __name__ == '__main__':
    # 1. Load the dataset
    iris = load_iris()
    # 2. Standardise the features
    transformer = StandardScaler()
    x = transformer.fit_transform(iris.data)
    # 3. Train the model (k-nearest neighbours with 3 neighbours)
    estimator = KNeighborsClassifier(n_neighbors=3)
    estimator.fit(x,iris.target)
    # 4. Predict with the trained model.
    # NOTE: the prediction is made on the same samples the model was fitted
    # on, so this shows the API call sequence rather than a real evaluation.
    result = estimator.predict(x)
    print(result)

In [6]:
iris = load_iris()
# A bare expression is only echoed by the notebook when it is the last
# statement of a cell, so each piece is printed explicitly.
print(iris.data)  # feature matrix
print(iris.target)  # target labels
print(iris.DESCR)  # textual description of the dataset

## 数据集划分-留出法

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit
from collections import Counter
from sklearn.datasets import load_iris

def test01():
    """Compare random and stratified hold-out splits from ``train_test_split``.

    Prints the class counts of the full iris label vector, then the class
    counts of the train and test parts for a random split and for a
    stratified split (both with ``test_size=0.2``).
    """
    # Load the dataset
    x, y = load_iris(return_X_y=True)
    # Label the unsplit distribution as such, matching the other demos
    print('原始类别比例', Counter(y))
    # Hold-out (random split); only the label halves are inspected
    _, _, y_train, y_test = train_test_split(x, y, test_size=0.2)
    print('随机类别分割', Counter(y_train), Counter(y_test))
    # Hold-out (stratified split)
    _, _, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
    print('分层类别分割', Counter(y_train), Counter(y_test))

def test02():
    """Compare ``ShuffleSplit`` and ``StratifiedShuffleSplit`` hold-out splits.

    Prints the class counts of the full iris label vector, then the test-set
    class counts of each of the 5 splits produced by both splitters
    (``test_size=0.2``, ``random_state=0``).
    """
    # Load the dataset
    x, y = load_iris(return_X_y=True)
    # Label the unsplit distribution as such, matching the other demos
    print('原始类别比例', Counter(y))
    # Hold-out (random split); the training indices are not needed here
    spliter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for _, test in spliter.split(x, y):
        print('随机类别分割', Counter(y[test]))
    # Hold-out (stratified split)
    spliter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for _, test in spliter.split(x, y):
        print('分层分割', Counter(y[test]))

if __name__ == '__main__':
    # Run both hold-out demos: train_test_split first, then the splitter classes.
    test01()
    test02()

## 数据集划分-交叉验证

In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from sklearn.datasets import load_iris

def test():
    """Print per-fold test-set class counts for KFold and StratifiedKFold.

    Both splitters use 5 shuffled folds with ``random_state=0`` on the iris
    data; the plain KFold loop also prints the held-out indices.
    """
    features, labels = load_iris(return_X_y=True)
    print('原始类别比例', Counter(labels))

    # Plain (random) cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)
    for _, held_out in kfold.split(features, labels):
        print(held_out)
        print('随机交叉验证', Counter(labels[held_out]))

    # Stratified cross-validation
    stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for _, held_out in stratified.split(features, labels):
        print('分层交叉验证', Counter(labels[held_out]))
if __name__ == '__main__':
    # Run the cross-validation demo.
    test()

## 留一法

In [13]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.datasets import load_iris
from collections import Counter


def test01(max_lpo_splits=5):
    """Show the splits produced by ``LeaveOneOut`` and ``LeavePOut(p=3)`` on iris.

    Args:
        max_lpo_splits: Maximum number of leave-P-out splits to print.
            Pass ``None`` to print every split (the original behaviour).
    """
    from itertools import islice

    x, y = load_iris(return_X_y=True)
    print('原始类别比例', Counter(y))

    # Leave-one-out: one split per sample
    spliter = LeaveOneOut()
    for train, test in spliter.split(x, y):
        print('训练集:', len(train), '测试集:', len(test), test)

    # Leave-P-out: the number of splits grows combinatorially with the
    # number of samples, so report the total and print only the first few
    # instead of flooding the output with every combination.
    spliter = LeavePOut(p=3)
    print('留P法划分总数:', spliter.get_n_splits(x))
    for train, test in islice(spliter.split(x, y), max_lpo_splits):
        print('训练集:', len(train), '测试集:', len(test), test)
if __name__ == '__main__':
    # Run the leave-one-out / leave-P-out demo.
    test01()

## 自助法

In [23]:
import pandas as pd


if __name__ == '__main__':

    # 1. Build the dataset: four samples with four features each
    data = [[90, 2, 10, 40],
            [60, 4, 15, 45],
            [75, 3, 13, 46],
            [78, 2, 64, 22]]

    data = pd.DataFrame(data)
    print('数据集:\n',data)
    print('*' * 30)

    # 2. Build the training set: sample with replacement, as many rows as
    #    the original frame (frac=1), so some rows may repeat
    train = data.sample(frac=1,replace=True)
    print('训练集:',train)
    print('*' * 30)

    # 3. Build the test set from the rows whose index was never drawn
    #    into the training sample
    test = data.loc[data.index.difference(train.index)]
    print('测试集:',test)


## 分类模型评估方法

In [37]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the data
x,y = datasets.load_iris(return_X_y = True)
# Split into training and test sets (test_size=0.2)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
# Create the KNN classifier with K = 6 neighbours
knn_estimator = KNeighborsClassifier(n_neighbors=6)
# Fit on the training split only: fitting on the full (x, y) would let the
# model see the test samples and inflate the accuracy measured below
knn_estimator.fit(x_train,y_train)
# Predict on the held-out test samples
y_predict = knn_estimator.predict(x_test)
# Evaluate: fraction of test samples whose prediction matches the label
print('预测结果准确率为:',sum(y_predict== y_test)/y_test.shape[0])

In [39]:
# Show the labels predicted for the test samples.
print(y_predict)

In [40]:
# Show the true labels of the test samples for comparison.
print(y_test)

In [41]:
from sklearn.metrics import accuracy_score
# The documented argument order is (y_true, y_pred).
accuracy_score(y_test, y_predict)

In [42]:
# Score the fitted classifier on the test split via the estimator's own method.
knn_estimator.score(x_test,y_test)

In [4]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import joblib
from collections import Counter

In [2]:
def show_digit(idx):
    """Display the digit image stored at row ``idx`` of the digits CSV.

    The CSV is re-read on every call. Its first column is used as the label
    and the remaining columns as pixel values, reshaped to 28x28.

    Args:
        idx: Row of the sample to show. Values outside the valid row range
            make the function return without doing anything.
    """
    frame = pd.read_csv('data/手写数字识别.csv')
    last_row = len(frame) - 1
    if idx < 0 or idx > last_row:
        return

    labels = frame.iloc[:, 0]
    pixels = frame.iloc[:, 1:]

    print('当前的数字标签是：', labels[idx])

    # Turn the flat pixel row into a 28x28 image and draw it without axes.
    image = pixels.iloc[idx].values.reshape(28, 28)
    plt.axis('off')
    plt.imshow(image)
    plt.show()

In [4]:
# Display the sample at row 2 of the digits CSV.
show_digit(2)

In [5]:
data = pd.read_csv('data/手写数字识别.csv')
# Features are every column after the first, divided by 255;
# the first column holds the label
x = data.iloc[:,1:]/255
y = data.iloc[:,0]

# Print basic information about the data
print('数据基本信息：',x.shape)
print('数据类别比例：',Counter(y))

# Split the dataset (stratified by label, test_size=0.2, fixed seed)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=0)
# Train the model
estimator = KNeighborsClassifier(n_neighbors=3)
estimator.fit(x_train,y_train)
# Evaluate the model. The score is printed because a bare expression in the
# middle of a cell is evaluated and then silently discarded.
print('模型准确率：',estimator.score(x_test,y_test))
# Save the model
# NOTE(review): assumes the model/ directory already exists — confirm.
joblib.dump(estimator,'model/knn.pth')

In [2]:
import matplotlib.pyplot as plt
# Read the demo image from disk and display it.
img = plt.imread('temp/demo.png')
plt.imshow(img)

In [5]:
# Load the classifier from the path the training cell saved it to.
knn = joblib.load('model/knn.pth')
# Flatten the image into a single row and predict its label.
# NOTE(review): assumes temp/demo.png flattens to the same number of features
# the model was trained on (presumably a single-channel 28x28 image) and uses
# the same value scale as the training data — confirm.
y_pred = knn.predict(img.reshape(1,-1))
y_pred