In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import time
import struct
import os
from array import array

def load_mnist_images(filename):
    """Load an uncompressed MNIST image file (IDX3 format).

    Args:
        filename: Path to an ``*-images.idx3-ubyte`` file.

    Returns:
        A ``uint8`` array of shape ``(size, rows * cols)`` holding one
        flattened image per row, where ``size``, ``rows`` and ``cols`` are
        taken from the file header.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        ValueError: If the header is truncated, the magic number is not the
            IDX3 image magic (2051), or the file contains fewer pixels than
            the header announces.
    """
    with open(filename, 'rb') as f:
        header = f.read(16)
        if len(header) != 16:
            raise ValueError(f"{filename}: truncated IDX3 header")
        magic, size, rows, cols = struct.unpack(">IIII", header)
        # 2051 (0x00000803) marks an IDX file of unsigned bytes with 3 dims;
        # rejecting anything else catches label files or gzipped downloads.
        if magic != 2051:
            raise ValueError(
                f"{filename}: bad magic number {magic}, expected 2051"
            )
        expected = size * rows * cols
        # Read exactly the announced payload so trailing bytes are ignored.
        images = np.fromfile(f, dtype=np.uint8, count=expected)
    if images.size != expected:
        raise ValueError(
            f"{filename}: expected {expected} pixels, found {images.size}"
        )
    return images.reshape(size, rows * cols)

def load_mnist_labels(filename):
    """Load an uncompressed MNIST label file (IDX1 format).

    Args:
        filename: Path to an ``*-labels.idx1-ubyte`` file.

    Returns:
        A 1-D ``uint8`` array containing the ``size`` labels announced by the
        file header.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        ValueError: If the header is truncated, the magic number is not the
            IDX1 label magic (2049), or the file contains fewer labels than
            the header announces.
    """
    with open(filename, 'rb') as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError(f"{filename}: truncated IDX1 header")
        magic, size = struct.unpack(">II", header)
        # 2049 (0x00000801) marks an IDX file of unsigned bytes with 1 dim;
        # rejecting anything else catches image files passed by mistake.
        if magic != 2049:
            raise ValueError(
                f"{filename}: bad magic number {magic}, expected 2049"
            )
        # Read exactly the announced number of labels, ignoring any trailer.
        labels = np.fromfile(f, dtype=np.uint8, count=size)
    if labels.size != size:
        raise ValueError(
            f"{filename}: expected {size} labels, found {labels.size}"
        )
    return labels

# Load MNIST from local IDX files, falling back to OpenML when they are
# missing. Adjust these paths if your local MNIST files live elsewhere.
train_images_path = './datasets/train-images.idx3-ubyte'
train_labels_path = './datasets/train-labels.idx1-ubyte'
test_images_path = './datasets/t10k-images.idx3-ubyte'
test_labels_path = './datasets/t10k-labels.idx1-ubyte'

try:
    print("正在从本地加载MNIST数据集...")
    x_train = load_mnist_images(train_images_path)
    y_train = load_mnist_labels(train_labels_path)
    x_test = load_mnist_images(test_images_path)
    y_test = load_mnist_labels(test_labels_path)

    print("数据集加载成功！")

except FileNotFoundError:
    print("本地MNIST文件未找到，使用备选方案...")
    # Fallback: download MNIST from OpenML so the later cells still find
    # x_train / y_train / x_test / y_test defined. Imported here because it
    # is only needed on this path (requires network access on first use).
    from sklearn.datasets import fetch_openml

    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    # Match the dtype produced by the local IDX loaders.
    pixels = mnist.data.astype(np.uint8)
    digits = mnist.target.astype(np.uint8)
    # By convention the first 60000 samples of mnist_784 are the training
    # set and the remaining 10000 are the test set.
    x_train, x_test = pixels[:60000], pixels[60000:]
    y_train, y_test = digits[:60000], digits[60000:]

正在从本地加载MNIST数据集...
数据集加载成功！


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import time


# Report the dataset dimensions before running any experiment.
print(f"\n训练集形状: {x_train.shape}")
print(f"测试集形状: {x_test.shape}")

# 1. Baseline: KNN classification on the raw features, before reduction.
print("\n=== 降维前的KNN分类 ===")
knn_clf_original = KNeighborsClassifier()
knn_clf_original.fit(x_train, y_train)

# Only the scoring pass is timed; the previous timer start before fit() was
# overwritten without ever being read. perf_counter() is the monotonic,
# high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()
score_original = knn_clf_original.score(x_test, y_test)
test_time_original = time.perf_counter() - start_time
print(f"降维前测试时间: {test_time_original:.2f} 秒")
print(f"降维前准确率: {score_original:.4f}")

# 2. Reduce dimensionality with PCA.
print("\n=== 使用PCA降维 ===")
# A fractional n_components keeps enough components to retain 90% of the
# variance; fit() returns the estimator itself, so it can be chained.
pca = PCA(n_components=0.9).fit(x_train)

x_train_reduction = pca.transform(x_train)
print(f"降维后的训练集形状: {x_train_reduction.shape}")
print(f"主成分数量: {pca.n_components_}")

# 3. KNN classification on the PCA-reduced features.
print("\n=== 降维后的KNN分类 ===")
knn_clf_reduced = KNeighborsClassifier()
knn_clf_reduced.fit(x_train_reduction, y_train)

# Project the test set with the PCA fitted on the training set. This is done
# outside the timed region so only the KNN scoring pass is measured.
x_test_reduction = pca.transform(x_test)

# The previous timer start before fit() was overwritten without ever being
# read; perf_counter() is the monotonic clock intended for intervals.
start_time = time.perf_counter()
score_reduced = knn_clf_reduced.score(x_test_reduction, y_test)
test_time_reduced = time.perf_counter() - start_time
print(f"降维后测试时间: {test_time_reduced:.2f} 秒")
print(f"降维后准确率: {score_reduced:.4f}")

# 4. Compare timing and accuracy before and after reduction.
print("\n=== 结果对比 ===")

# Guard the ratio: on a coarse clock or a tiny dataset the reduced scoring
# time can measure as 0.0, which would raise ZeroDivisionError.
if test_time_reduced > 0:
    speedup = test_time_original / test_time_reduced
else:
    speedup = float("inf")

print("\n测试时间对比:")
print(f"  降维前: {test_time_original:.2f} 秒")
print(f"  降维后: {test_time_reduced:.2f} 秒")
print(f"  加速倍数: {speedup:.2f} 倍")

print("\n准确率对比:")
print(f"  降维前: {score_original:.4f}")
print(f"  降维后: {score_reduced:.4f}")
print(f"  准确率变化: {score_reduced - score_original:+.4f}")

# 5. Explain why results may improve after dimensionality reduction.
_reduction_benefits = (
    "\n=== 为什么降维后效果可能更好？ ===",
    "1. 降噪效果: PCA在降维过程中过滤掉了噪声成分",
    "2. 维度灾难: 高维空间中数据稀疏，距离度量可能失效",
    "3. 过拟合减少: 减少了特征数量，降低了过拟合风险",
    "4. 计算稳定性: 数值计算更加稳定",
)
# One line per entry, exactly as separate print() calls would emit them.
print(*_reduction_benefits, sep="\n")

# 6. Compare KNN accuracy across several retained-variance ratios.
print("\n=== 不同维度保留率的效果 ===")
variance_ratios = [0.8, 0.85, 0.9, 0.95, 0.99]

for ratio in variance_ratios:
    # Fit a fresh PCA on the training set only, then project both splits
    # with it so no test information leaks into the projection.
    pca_ratio = PCA(ratio).fit(x_train)
    x_train_ratio = pca_ratio.transform(x_train)
    x_test_ratio = pca_ratio.transform(x_test)

    knn_ratio = KNeighborsClassifier().fit(x_train_ratio, y_train)
    accuracy_ratio = knn_ratio.score(x_test_ratio, y_test)
    print(
        f"方差保留率 {ratio:.2f}: {pca_ratio.n_components_} 个主成分, "
        f"准确率: {accuracy_ratio:.4f}"
    )



训练集形状: (60000, 784)
测试集形状: (10000, 784)

=== 降维前的KNN分类 ===
降维前测试时间: 11.07 秒
降维前准确率: 0.9688

=== 使用PCA降维 ===
降维后的训练集形状: (60000, 87)
主成分数量: 87

=== 降维后的KNN分类 ===
降维后测试时间: 0.59 秒
降维后准确率: 0.9728

=== 结果对比 ===

测试时间对比:
  降维前: 11.07 秒
  降维后: 0.59 秒
  加速倍数: 18.78 倍

准确率对比:
  降维前: 0.9688
  降维后: 0.9728
  准确率变化: +0.0040

=== 为什么降维后效果可能更好？ ===
1. 降噪效果: PCA在降维过程中过滤掉了噪声成分
2. 维度灾难: 高维空间中数据稀疏，距离度量可能失效
3. 过拟合减少: 减少了特征数量，降低了过拟合风险
4. 计算稳定性: 数值计算更加稳定

=== 不同维度保留率的效果 ===
方差保留率 0.80: 44 个主成分, 准确率: 0.9742
方差保留率 0.85: 59 个主成分, 准确率: 0.9760
方差保留率 0.90: 87 个主成分, 准确率: 0.9728
方差保留率 0.95: 154 个主成分, 准确率: 0.9712
方差保留率 0.99: 331 个主成分, 准确率: 0.9687
