In [None]:
import numpy as np

class KNN:
    def __init__(self, k=3, distance='euclidean', p=2):
        self.k = k
        self.distance = distance
        self.p = p
        print(f"KNN initialized with k={k}, distance={distance}, p={p}")

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        print(f"Training data shape: X={X.shape}, y={y.shape}")
        print(f"Number of classes: {len(np.unique(y))}")

    # L2
    def euclidean_distance(self, x1, x2):
        dist = np.sqrt(np.sum((x1 - x2) ** 2))
        print(f"Euclidean distance: {dist:.4f}")
        return dist
    
    # L1
    def manhattan_distance(self, x1, x2):
        dist = np.sum(np.abs(x1 - x2))
        print(f"Manhattan distance: {dist:.4f}")
        return dist
    
    def minkowski_distance(self, x1, x2):
        dist = np.sum(np.abs(x1 - x2) ** self.p) ** (1/self.p)
        print(f"Minkowski distance (p={self.p}): {dist:.4f}")
        return dist
    
    def chebyshev_distance(self, x1, x2):
        dist = np.max(np.abs(x1 - x2))
        print(f"Chebyshev distance: {dist:.4f}")
        return dist
    
    def cosine_distance(self, x1, x2):
        dot_product = np.dot(x1, x2)
        norm_x1 = np.linalg.norm(x1)
        norm_x2 = np.linalg.norm(x2)
        if norm_x1 == 0 or norm_x2 == 0:
            print("Cosine distance: 1.0000 (zero vector)")
            return 1.0
        dist = 1 - (dot_product / (norm_x1 * norm_x2))
        print(f"Cosine distance: {dist:.4f}")
        return dist

    def predict(self, X):
        print(f"\nPredicting for {X.shape[0]} samples...")
        y_pred = [self._predict(x, i) for i, x in enumerate(X)]
        return np.array(y_pred)

    def _predict(self, x, sample_idx):
        print(f"\n--- Predicting sample {sample_idx + 1} ---")
        print(f"Test point: {x}")
        
        # Compute distances between x and all examples in the training set
        distances = []
        if self.distance == 'euclidean':
            distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        elif self.distance == 'manhattan':
            distances = [self.manhattan_distance(x, x_train) for x_train in self.X_train]
        elif self.distance == 'minkowski':
            distances = [self.minkowski_distance(x, x_train) for x_train in self.X_train]
        elif self.distance == 'chebyshev':
            distances = [self.chebyshev_distance(x, x_train) for x_train in self.X_train]
        elif self.distance == 'cosine':
            distances = [self.cosine_distance(x, x_train) for x_train in self.X_train]
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance}")
        
        print(f"All distances: {[f'{d:.4f}' for d in distances[:5]]}...")  # 只显示前5个
        
        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]
        print(f"K nearest neighbor indices: {k_indices}")
        
        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        print(f"K nearest neighbor labels: {k_nearest_labels}")
        
        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        print(f"Predicted label: {most_common}")
        
        return most_common

In [5]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 创建示例数据
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0, 
                          n_informative=2, n_clusters_per_class=1, 
                          random_state=42)

# 分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=42)

print(f"训练集大小: {X_train.shape[0]}")
print(f"测试集大小: {X_test.shape[0]}")

# 创建分类器
knn = KNN(k=3, distance='euclidean')

# 训练
knn.fit(X_train, y_train)

# 预测
predictions = knn.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, predictions)
print(f"\n准确率: {accuracy:.4f}")

训练集大小: 70
测试集大小: 30
KNN initialized with k=3, distance=euclidean, p=2
Training data shape: X=(70, 2), y=(70,)
Number of classes: 2

Predicting for 30 samples...

--- Predicting sample 1 ---
Test point: [ 1.68674524 -0.35904111]
Euclidean distance: 0.5856
Euclidean distance: 0.9722
Euclidean distance: 0.8565
Euclidean distance: 2.1976
Euclidean distance: 1.6894
Euclidean distance: 1.3309
Euclidean distance: 2.0973
Euclidean distance: 2.6961
Euclidean distance: 0.9012
Euclidean distance: 0.8208
Euclidean distance: 0.4537
Euclidean distance: 1.5321
Euclidean distance: 1.6590
Euclidean distance: 1.6241
Euclidean distance: 1.1747
Euclidean distance: 1.9282
Euclidean distance: 0.1805
Euclidean distance: 1.6015
Euclidean distance: 1.4137
Euclidean distance: 2.5169
Euclidean distance: 3.5444
Euclidean distance: 1.4923
Euclidean distance: 1.6854
Euclidean distance: 1.2400
Euclidean distance: 0.4329
Euclidean distance: 1.9242
Euclidean distance: 1.4773
Euclidean distance: 1.7868
Euclidean distan