In [1]:
import os.path
import time
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data_dir = 'data/digit-recognizer/'


def opencsv():
    """Load the training and test CSV files found under ``data_dir``.

    ``train.csv`` is split column-wise: column 0 is treated as the label and
    every remaining column as a feature. ``test.csv`` is returned whole.

    Returns:
        A tuple ``(train_data, train_label, test_data)`` of NumPy arrays:
        the training features (2-D), the training labels (1-D) and the
        complete test table (2-D).
    """
    train_path = os.path.join(data_dir, 'train.csv')
    test_path = os.path.join(data_dir, 'test.csv')

    # Convert each frame to an array once, then slice the array.
    train_matrix = pd.read_csv(train_path).to_numpy()
    test_matrix = pd.read_csv(test_path).to_numpy()

    # [rows, columns]: first column -> labels, the rest -> features.
    features, labels = train_matrix[:, 1:], train_matrix[:, 0]
    return features, labels, test_matrix

def knnClassify(trainData, trainLabel):
    """Fit a k-nearest-neighbours classifier on the given training set.

    Args:
        trainData: Training feature matrix, handed to ``fit`` unchanged.
        trainLabel: Training labels; flattened to a contiguous 1-D array
            with ``np.ravel`` before fitting.

    Returns:
        The fitted ``KNeighborsClassifier`` instance.
    """
    # Library defaults are used (scikit-learn's default is n_neighbors=5);
    # construct with KNeighborsClassifier(n_neighbors=10) to choose another k.
    classifier = KNeighborsClassifier()
    flat_labels = np.ravel(trainLabel)
    classifier.fit(trainData, flat_labels)
    return classifier


# Data preprocessing: dimensionality reduction with PCA (principal component analysis)
def dRPCA(x_train, x_test, COMPONENT_NUM):
    """Reduce both data sets with a PCA that is fitted on the training set.

    Usage notes (in Chinese): https://www.cnblogs.com/pinard/p/6243025.html

    Args:
        x_train: Training samples; copied into a NumPy array and used to fit
            the PCA.
        x_test: Test samples; copied into a NumPy array and projected with the
            PCA fitted on ``x_train``.
        COMPONENT_NUM: Forwarded as PCA's ``n_components``.
            ``n_components >= 1`` (e.g. ``NUM``) selects the number of
            components to keep; ``0 < n_components < 1`` (e.g. ``0.99``) is a
            threshold on the fraction of total variance to retain.

    Returns:
        A tuple ``(pcaTrainData, pcaTestData)`` with the transformed training
        and test data.
    """
    print('dimensionality reduction...')
    train_matrix = np.array(x_train)
    test_matrix = np.array(x_test)

    reducer = PCA(n_components=COMPONENT_NUM, whiten=False)
    # The projection is learned from the training data only; the same fitted
    # projection is then applied to the training data and to the test data.
    reducer.fit(train_matrix)
    reduced_train, reduced_test = (
        reducer.transform(matrix) for matrix in (train_matrix, test_matrix)
    )

    # Report per-component variance, per-component variance ratio and the
    # number of components that were kept.
    print(
        reducer.explained_variance_, '\n',
        reducer.explained_variance_ratio_, '\n',
        reducer.n_components_,
    )
    # Total fraction of variance covered by the kept components.
    print(sum(reducer.explained_variance_ratio_))
    return reduced_train, reduced_test

# End-to-end pipeline: load CSVs -> PCA reduction -> fit KNN -> predict test labels.
# Start of the wall-clock timer; only the data-loading step below is timed.
start_time = time.time()

# Load the data
trainData, trainLabel, testData = opencsv()
# Optional debugging output (types and shapes of the loaded arrays):
# print("trainData==>", type(trainData), np.shape(trainData))
# print("trainLabel==>", type(trainLabel), np.shape(trainLabel))
# print("testData==>", type(testData), np.shape(testData))
print("load data finish")
stop_time_l = time.time()
print('load data time used:%f' % (stop_time_l - start_time))

# Dimensionality reduction. 0.8 is forwarded to dRPCA as PCA's n_components;
# a value between 0 and 1 is a threshold on the fraction of total variance.
# NOTE: trainData/testData are rebound to the reduced arrays here, so the
# raw pixel data is no longer reachable under these names afterwards.
trainData, testData = dRPCA(trainData, testData, 0.8)

# Model training
knnClf = knnClassify(trainData, trainLabel)

# Prediction: testLabel holds the classifier's output for the reduced test data.
testLabel = knnClf.predict(testData)

load data finish
load data time used:5.131304
dimensionality reduction...
[334780.59149447 245885.06513832 211051.62698822 184726.37342815
 168070.0015068  147773.28274113 112534.61945611  99315.43743442
  95016.01443299  80660.73018992  72091.28112172  70706.5372777
  58466.0488886   58130.66595881  54296.21671756  50934.7635958
  45318.36268187  44049.12048052  40795.35308887  39585.84244148
  36819.27443543  34862.15972293  33134.94858928  31347.32791474
  30481.77568809  28803.41321758  27879.30252896  26696.27853362
  25433.56724803  23580.10141223  22595.24890274  21936.48003485
  20582.39066703  20223.41162321  19379.38992382  18576.92336602
  17486.78970718  16741.02306641  16331.15498464  16021.24495723
  15554.48384975  15281.02243785  14362.97506646] 
 [0.09748938 0.07160266 0.06145903 0.05379302 0.04894262 0.04303214
 0.03277051 0.02892103 0.02766902 0.02348871 0.02099325 0.02059001
 0.01702553 0.01692787 0.01581126 0.0148324  0.01319688 0.01282727
 0.01187976 0.01152755 0.

In [5]:
# Build the submission table: one row per prediction, keyed by a 1-based ImageId.
testLabel = pd.Series(testLabel, name="Label")
# Size ImageId from the predictions instead of hard-coding 28000 rows: with a
# mismatched length, pd.concat(axis=1) would silently pad the shorter column
# with NaN and produce a malformed submission.
image_ids = pd.Series(range(1, len(testLabel) + 1), name="ImageId")
submission = pd.concat([image_ids, testLabel], axis=1)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (needed on a fresh checkout).
output_path = "output/digit-recognizer/sklearn_knn.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)