Without using a Python package that already provides it, write a class that implements KNN for regression. The class must choose the optimal number of neighbors by comparing prediction errors on a 20% holdout test set. First generate data from $Y=X_1^2-2X_2-2\sin(X_3) + \cos(X_4^3) +\log(|X_5|+1) + \epsilon,$ where $X=(X_1,X_2,X_3,X_4,X_5)$ follows the normal distribution $N(0,I_5)$ and $\epsilon$ is distributed as $0.5\,N(0,1)$ (that is, 0.5 times a standard normal variable). The sample size is $2000.$

In [123]:
import numpy as np
N = 2000  # number of observations to simulate
dim = 5  # number of feature columns per observation
def data_generation(n,dim):
    """Simulate ``n`` observations of the regression problem.

    Features are drawn from N(0, I_dim).  The response is
    ``x1**2 - 2*x2 - 2*sin(x3) + cos(x4**3) + log(|x5| + 1) + eps``, where
    ``eps`` is 0.5 times a standard normal draw.  Only feature columns 0-4
    enter the response.

    Returns:
        A tuple ``(features, response)`` with shapes ``(n, dim)`` and
        ``(n, 1)``.
    """
    features = np.random.multivariate_normal(mean=np.zeros(dim), cov=np.eye(dim), size=n)
    noise = 0.5*np.random.normal(0,1,n)

    # Name the five columns the formula uses so it reads like the maths.
    x1, x2, x3, x4, x5 = (features[:, col] for col in range(5))
    signal = x1**2 - 2*x2 - 2*np.sin(x3) + np.cos(x4**3) + np.log(np.abs(x5) + 1)
    response = signal + noise

    return (features, response.reshape(-1,1))

def data_split(x,y,percentage = 0.2):
    """Shuffle the rows of ``[x | y]`` and split them into two sets.

    Args:
        x: 2-D feature array.
        y: 2-D response column with the same number of rows as ``x``.
        percentage: Fraction of rows placed in the test set.

    Returns:
        A tuple ``(training_set, test_set)``; in both arrays the last
        column holds the response.
    """
    n_test = int(percentage*len(x))  # truncates towards zero
    joined = np.concatenate((x,y),axis=1)
    shuffled = np.random.permutation(joined)  # shuffles whole rows

    held_out = shuffled[:n_test,:]
    remainder = shuffled[n_test:,:]
    return remainder, held_out

def KNN_regression(data,N,a_test):
    """Predict the response at one point as the mean of its nearest neighbours.

    Args:
        data: 2-D training array; the last column is the response and the
            remaining columns are the features.
        N: Number of neighbours to average.  Must be at least 1; if it
            exceeds the number of training rows, all rows are averaged.
        a_test: Feature vector of the query point.

    Returns:
        The average response of the ``N`` training rows closest to
        ``a_test``.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    # A zero N would average an empty slice (NaN), and a negative N would
    # slice ``[:N]`` and average almost every row; reject both up front.
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    x = data[:,:-1]
    y = data[:,-1]
    # Squared Euclidean distance gives the same ordering as the true distance.
    dist = np.sum((a_test.reshape(1,-1) - x)**2, axis = 1)
    indices = np.argsort(dist)[:N]  # indices of the N smallest distances

    return np.average(y[indices])

def Vectorized_KNN(test,data,N):
    """Run ``KNN_regression`` on every row of ``test``.

    Args:
        test: 2-D array of query feature vectors, one per row.
        data: Training array passed through to ``KNN_regression``.
        N: Number of neighbours passed through to ``KNN_regression``.

    Returns:
        The per-row predictions, as produced by ``np.apply_along_axis``.
    """
    def predict_row(row):
        # Fix the training data and neighbour count; only the row varies.
        return KNN_regression(data,N,row)

    return np.apply_along_axis(predict_row, 1, test)

def Loss_func(prediction, original):
    """Return the sum of squared differences between the two inputs.

    The difference follows numpy broadcasting, so the two arguments should
    have the same shape.
    """
    residual = prediction - original
    return np.sum(np.square(residual))
    
def optimal_classnum(test, training, N_list):
    """Pick the neighbour count in ``N_list`` with the lowest test loss.

    Args:
        test: Test array; the last column is the response.
        training: Training array with the same column layout.
        N_list: Indexable sequence of candidate neighbour counts.

    Returns:
        The element of ``N_list`` whose predictions give the smallest
        ``Loss_func`` value on the test set (the first one on ties).
    """
    features = test[:,:-1]
    targets = test[:,-1]
    losses = [
        Loss_func(Vectorized_KNN(features, training, k), targets)
        for k in N_list
    ]
    best = np.argmin(losses)
    return N_list[best]

In [133]:
# Scratch checks of the random generators: draw two 5-dimensional normal
# vectors and a (dim, 1) column of noise scaled by 0.5.
a = np.random.multivariate_normal(mean=np.zeros(5), cov=np.eye(5), size=2)
a[0,:]  # bare expression; its value is not used
print(a[:,0])
epi = 0.5*np.random.normal(0,1,(dim,1))
print(epi)

# Simulate the data set, then hold out the default 20% of rows for testing.
x,y = data_generation(N,dim)
print(x,y)

training, test = data_split(x,y)

# Predict the first test row on its own (result discarded), then every test
# row, both with 5 neighbours.
KNN_regression(training,5,test[0,:-1])
predict = Vectorized_KNN(test[:,:-1],training,5)
print(predict.shape)

# Squared-error loss at 5 neighbours (not used afterwards), then the best
# neighbour count among the candidates 2..19.
loss = Loss_func(predict, test[:,-1])
optimal_classnum(test, training,np.arange(2,20))

[0.04048564 0.53210068]
[[ 0.33344939]
 [-0.74648381]
 [ 0.09875368]
 [ 0.14541873]
 [ 0.10421954]]
[[ 0.15738235 -0.98675802 -0.06818563 -1.36589746 -1.90251984]
 [-0.2540199   0.64192017  2.02419165 -0.2560324  -1.13938193]
 [ 1.60241914  0.60985916 -1.23926117 -1.21393687 -0.06428513]
 ...
 [ 0.2846106  -0.89869155  0.95880913  0.71076734 -0.52064496]
 [ 1.28839516  0.72075824 -1.50128217 -0.06944642 -2.05377468]
 [-0.38431382  0.90005003  0.5568599  -0.37342675 -1.45145424]] [[ 2.13534743]
 [-0.89671747]
 [ 3.39061261]
 ...
 [ 0.39946042]
 [ 4.49208222]
 [-0.49610567]]
(400,)


6

In [138]:
import numpy as np

class KNN_Regression(object):
    """K-nearest-neighbour regression on a simulated data set.

    Intended call order: ``data_generation()``, then ``data_split()``, then
    ``KNN_regression`` / ``Vectorized_KNN`` / ``optimal_classnum``.  The
    later methods read attributes (``x``, ``y``, ``training``, ``test``)
    that only exist once the earlier ones have run.
    """

    def __init__(self, sample_size, dim, num_class=None):
        """Store the simulation settings.

        Args:
            sample_size: Number of observations to simulate.
            dim: Number of features per observation.  ``data_generation``
                reads feature columns 0-4, so this must be at least 5.
            num_class: Stored as given; no method of this class reads it.
        """
        self.N = sample_size
        self.dim = dim
        self.num_class = num_class

    def data_generation(self):
        """Simulate features and responses and store them as ``x`` and ``y``.

        Features are drawn from N(0, I_dim).  The response is
        ``x1**2 - 2*x2 - 2*sin(x3) + cos(x4**3) + log(|x5| + 1) + eps`` with
        ``eps`` equal to 0.5 times a standard normal draw.  ``y`` is stored
        as a column of shape ``(N, 1)``.
        """
        x = np.random.multivariate_normal(mean=np.zeros(self.dim), cov=np.eye(self.dim), size=self.N)
        epi = 0.5 * np.random.normal(0, 1, self.N)
        y = x[:, 0]**2 - 2 * x[:, 1] - 2 * np.sin(x[:, 2]) + np.cos(x[:, 3]**3) + np.log(np.abs(x[:, 4]) + 1) + epi
        self.x = x
        self.y = y.reshape(-1, 1)

    def data_split(self, percentage=0.2):
        """Shuffle the rows of ``[x | y]`` and store ``training`` and ``test``.

        Args:
            percentage: Fraction of rows placed in the test set; the row
                count is truncated to an integer.
        """
        data = np.concatenate((self.x, self.y), axis=1)
        shuffled_data = np.random.permutation(data)  # shuffles whole rows
        n_test = int(percentage * len(self.x))
        self.training = shuffled_data[n_test:, :]
        self.test = shuffled_data[:n_test, :]

    def KNN_regression(self, a_test, N):
        """Predict the response at a single point.

        Args:
            a_test: Feature vector of the query point.
            N: Number of neighbours to average.  Must be at least 1; if it
                exceeds the number of training rows, all rows are averaged.

        Returns:
            The mean response of the ``N`` training rows closest to
            ``a_test``.

        Raises:
            ValueError: If ``N`` is smaller than 1.
        """
        # A zero N would average an empty slice (NaN), and a negative N
        # would slice ``[:N]`` and average almost every row; reject both.
        if N < 1:
            raise ValueError(f"N must be at least 1, got {N}")

        x_train = self.training[:, :-1]
        y_train = self.training[:, -1]
        # Squared Euclidean distance gives the same ordering as the true
        # distance, so the square root is skipped.
        dist = np.sum((a_test.reshape(1, -1) - x_train)**2, axis=1)
        indices = np.argsort(dist)[:N]
        return np.average(y_train[indices])

    def Vectorized_KNN(self, test_features, N):
        """Predict the response for every row of ``test_features``.

        Args:
            test_features: 2-D array of query feature vectors, one per row.
            N: Number of neighbours, forwarded to ``KNN_regression``.

        Returns:
            1-D array with one prediction per row.
        """
        # apply_along_axis forwards trailing positional arguments to the
        # callable, so each row is evaluated as KNN_regression(row, N).
        return np.apply_along_axis(self.KNN_regression, 1, test_features, N)

    def Loss_func(self, prediction, original):
        """Return the sum of squared differences between the two inputs.

        The difference follows numpy broadcasting, so both arguments should
        have the same shape.
        """
        return np.sum((prediction - original)**2)

    def optimal_classnum(self, N_list):
        """Return the neighbour count in ``N_list`` with the lowest test loss.

        Args:
            N_list: Indexable sequence of candidate neighbour counts.

        Returns:
            The element of ``N_list`` whose predictions minimise
            ``Loss_func`` on the stored test set (the first one on ties).
        """
        test_features = self.test[:, :-1]
        test_labels = self.test[:, -1]
        loss = [
            self.Loss_func(self.Vectorized_KNN(test_features, n), test_labels)
            for n in N_list
        ]
        return N_list[np.argmin(loss)]


# Example usage
sample = 2000
dim = 5
num_class = 6  # handed to the constructor, which only stores it
my_class = KNN_Regression(sample, dim, num_class)

# Generate the data
my_class.data_generation()

# Split the data
my_class.data_split()

# Try optimal_classnum on the candidate neighbour counts 2..19
N_list = np.arange(2,20)
optimal_N = my_class.optimal_classnum(N_list)
print(f"Optimal N: {optimal_N}")

Optimal N: 9


In [115]:
import numpy as np

class KNN_Regression(object):
    """K-nearest-neighbour regression on a simulated data set.

    Intended call order: ``data_generation()``, then ``data_split()``, then
    ``KNN_regression`` / ``Vectorized_KNN``.  The later methods read
    attributes (``x``, ``y``, ``training``, ``test``) that only exist once
    the earlier ones have run.
    """

    def __init__(self, sample_size, dim, num_class=None):
        """Store the simulation settings.

        Args:
            sample_size: Number of observations to simulate.
            dim: Number of features per observation.  ``data_generation``
                reads feature columns 0-4, so this must be at least 5.
            num_class: Stored as given; no method of this class reads it.
        """
        self.N = sample_size
        self.dim = dim
        self.num_class = num_class

    def data_generation(self):
        """Simulate features and responses and store them as ``x`` and ``y``.

        Features are drawn from N(0, I_dim).  The response is
        ``x1**2 - 2*x2 - 2*sin(x3) + cos(x4**3) + log(|x5| + 1) + eps`` with
        ``eps`` equal to 0.5 times a standard normal draw.  ``y`` is stored
        as a column of shape ``(N, 1)``.
        """
        x = np.random.multivariate_normal(mean=np.zeros(self.dim), cov=np.eye(self.dim), size=self.N)
        epi = 0.5 * np.random.normal(0, 1, self.N)
        y = x[:, 0]**2 - 2 * x[:, 1] - 2 * np.sin(x[:, 2]) + np.cos(x[:, 3]**3) + np.log(np.abs(x[:, 4]) + 1) + epi
        self.x = x
        self.y = y.reshape(-1, 1)

    def data_split(self, percentage=0.2):
        """Shuffle the rows of ``[x | y]`` and store ``training`` and ``test``.

        Args:
            percentage: Fraction of rows placed in the test set; the row
                count is truncated to an integer.
        """
        data = np.concatenate((self.x, self.y), axis=1)
        shuffled_data = np.random.permutation(data)  # shuffles whole rows
        n_test = int(percentage * len(self.x))
        self.training = shuffled_data[n_test:, :]
        self.test = shuffled_data[:n_test, :]

    def KNN_regression(self, a_test, N):
        """Predict the response at a single point.

        Args:
            a_test: Feature vector of the query point.
            N: Number of neighbours to average.  Must be at least 1; if it
                exceeds the number of training rows, all rows are averaged.

        Returns:
            The mean response of the ``N`` training rows closest to
            ``a_test``.

        Raises:
            ValueError: If ``N`` is smaller than 1.
        """
        # A zero N would average an empty slice (NaN), and a negative N
        # would slice ``[:N]`` and average almost every row; reject both.
        if N < 1:
            raise ValueError(f"N must be at least 1, got {N}")

        x_train = self.training[:, :-1]
        y_train = self.training[:, -1]
        # Squared Euclidean distance gives the same ordering as the true
        # distance, so the square root is skipped.
        dist = np.sum((a_test.reshape(1, -1) - x_train)**2, axis=1)
        indices = np.argsort(dist)[:N]
        return np.average(y_train[indices])

    def Vectorized_KNN(self, N):
        """Predict the response for every row of the stored test set.

        Args:
            N: Number of neighbours, forwarded to ``KNN_regression``.

        Returns:
            1-D array with one prediction per test row.
        """
        test_features = self.test[:, :-1]
        # apply_along_axis forwards trailing positional arguments to the
        # callable, so each row is evaluated as KNN_regression(row, N).
        return np.apply_along_axis(self.KNN_regression, 1, test_features, N)


# Instantiate and run
sample = 2000
dim = 5
num_class = 6  # handed to the constructor, which only stores it
my_class = KNN_Regression(sample, dim, num_class)

# Generate the data
my_class.data_generation()

# Split the data
my_class.data_split()

# Single-point check
a_test_sample = my_class.test[0, :-1]  # features of the first row of the shuffled test set
single_prediction = my_class.KNN_regression(a_test_sample, N=10)  # regress this one point
print(f"Prediction for a single test point: {single_prediction:.4f}")

# Vectorised regression over the whole test set
all_predictions = my_class.Vectorized_KNN(N=10)  # regress every test point
print(f"Predictions for all test points:\n{all_predictions[:5]}")  # show the first 5 predictions

Prediction for a single test point: 1.3047
Predictions for all test points:
[1.30465923 6.30396091 1.120483   4.28986842 5.03158275]
