## 线性回归

### 1. 手写代码

#### > 最小二乘法

In [152]:
import math
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


# 手写线性模型
class LinearModel:
    def __init__(self):
        self.w = np.array([])
        pass

    # 训练，传入训练集的 X 向量的列表和 Y 的列表
    def train(self, x: np.ndarray, y: np.ndarray):
        if x.shape[0] != y.shape[0]:
            print(f"error: x's len is not same as y, {x.shape} and {y.shape}")
            exit(1)
        x = np.concatenate((np.array([1] * n).reshape((n, 1)), x), axis=1)
        # 最小二乘法
        self.w = np.linalg.inv(x.T.dot(x)).dot(x.T).dot(y)

    # 加载模型，设置 W 向量
    def load(self, w: np.ndarray):
        self.w = w

    # 预测，传入的为 X 向量
    def predict(self, x: np.ndarray):
        # print(self.w.shape)
        # print(x.shape)
        return self.w.T.dot(np.concatenate((np.array([1]), x)).T.reshape(self.w.shape))

    # 传入用于测试的 x，y
    def test(self, test_x: np.ndarray, test_y: np.ndarray):
        y = []
        for e in test_x:
            y.append(self.predict(e)[0])
        print(f'mse: {mean_squared_error(y, test_y)}')

    def __str__(self):
        arr = []
        for i, wi in enumerate(self.w):
            arr.append(f'{wi[0]}' + (f'x{i}' if i != 0 else ''))
        return ' + '.join(arr)


df = pd.read_csv('./实验一 线性模型 数据集/housing-data.csv')
l = len(df)

# 取前 80% 作为训练数据，后 20% 为测试数据
train_data = df[:math.floor(l * 0.8)]
test_data = df[math.floor(l * 0.8):]

# 训练集 y 和 x
train_y = np.array(train_data[['price']])
train_x = np.array(
    train_data[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']])
n = len(train_y)

# 测试集 y 和 x
test_y = np.array(train_data[['price']])
test_x = np.array(
    train_data[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']])

In [153]:
linear_model = LinearModel()

# 使用训练数据训练模型
linear_model.train(train_x, train_y)
print(f'训练结果，W 向量：{linear_model.w}')

# 预测
ans = linear_model.predict(np.array(test_x[0]))
print(f'预测 x: {test_x[0]}，\n预测 y: {ans}，\n真实 y:{test_y[0]}')

# 评估（计算mse）
linear_model.test(test_x, test_y)

训练结果，W 向量：[[ 3.00771669e+01]
 [-2.02135297e-01]
 [ 4.41276341e-02]
 [ 5.26739364e-02]
 [ 1.88474315e+00]
 [-1.49281487e+01]
 [ 4.76038673e+00]
 [ 2.88734527e-03]
 [-1.30025278e+00]
 [ 4.61661953e-01]
 [-1.55434673e-02]
 [-8.11632369e-01]
 [-1.97174433e-03]
 [-5.32273431e-01]]
预测 x: [6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
 4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00]，
预测 y: [[29.13979681]]，
真实 y:[24.]
mse: 22.778379521800783


### 2. 调用 API

In [154]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 样本特征
X = np.array(train_x)
y = np.array(train_y)

# 创建线性回归模型
model = LinearRegression()

# 训练模型
model.fit(X, y)
print(f'系数：{model.coef_}')

# 预测
y_pred = model.predict([test_x[0]])

# 打印预测结果
print(f'预测 x: {test_x[0]}，\n预测 y: {y_pred}，\n真实 y:{test_y[0]}')

y = model.predict(test_x)
print(f'mse: {mean_squared_error(y, test_y)}')

系数：[[-2.02135297e-01  4.41276341e-02  5.26739364e-02  1.88474315e+00
  -1.49281487e+01  4.76038673e+00  2.88734527e-03 -1.30025278e+00
   4.61661953e-01 -1.55434673e-02 -8.11632369e-01 -1.97174433e-03
  -5.32273431e-01]]
预测 x: [6.320e-03 1.800e+01 2.310e+00 0.000e+00 5.380e-01 6.575e+00 6.520e+01
 4.090e+00 1.000e+00 2.960e+02 1.530e+01 3.969e+02 4.980e+00]，
预测 y: [[29.13979681]]，
真实 y:[24.]
mse: 22.77837952180079


## 逻辑回归

数据预处理
剔除缺省值，将数据保存为 csv。
这里将class从原先的2与4改为0与1，方便后续计算

In [155]:
with open('实验一 线性模型 数据集/breast-cancer-wisconsin.data', 'r') as f:
    data = f.readlines()
    arr = []
    for line in data:
        if line.find('?') > 0:
            continue
        line = line.replace('\n', '').split(',')
        # print(line)
        arr.append({
            'Sample code number': int(line[0]),
            'Clump Thickness': int(line[1]),
            'Uniformity of Cell Size': int(line[2]),
            'Uniformity of Cell Shape': int(line[3]),
            'Marginal Adhesion': int(line[4]),
            'Single Epithelial Cell Size': int(line[5]),
            'Bare Nuclei': int(line[6]),
            'Bland Chromatin': int(line[7]),
            'Normal Nucleoli': int(line[8]),
            'Mitoses': int(line[9]),
            'Class': 1 if int(line[10]) == 4 else 0,
        })

    df = pd.DataFrame(arr)
    df.to_csv('实验一 线性模型 数据集/breast-cancer-wisconsin.csv')

### 1. 手写代码

In [156]:
import numpy as np
import pandas as pd
from tqdm import tqdm


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


# 手写逻辑回归模型
class LogisticModel:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = None

    # 交叉熵损失函数
    def _cost(self, h, y):
        m = len(y)
        cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
        return cost

    # 训练，传入训练集的 X 向量的列表和 Y 的列表
    def train(self, _X: np.ndarray, y: np.ndarray):
        X = np.hstack((np.ones((_X.shape[0], 1)), _X))  # 添加偏置
        m, n_features = X.shape
        self.w = np.zeros(n_features)

        for _ in tqdm(range(self.num_iterations)):
            h = sigmoid(np.dot(X, self.w))

            dw = (1 / m) * np.dot((h - y), X)
            dw = np.sum(dw, axis=0)

            self.w -= self.learning_rate * dw.T

    # 加载模型，设置 W 向量
    def load(self, w: np.ndarray):
        self.w = w

    # 预测，传入的为 X 向量
    def predict(self, _X: np.ndarray):
        X = np.hstack((np.ones((_X.shape[0], 1)), _X))  # 添加偏置
        h = np.dot(X, self.w)
        predictions = sigmoid(h)
        predictions_cls = [1 if p >= 0.5 else 0 for p in predictions]
        return predictions_cls

    # 传入用于测试的 x，y
    def test(self, _X: np.ndarray, _y: np.ndarray):
        pred_y = self.predict(_X)
        correct = 0
        total = len(_y)
        for i in range(total):
            if pred_y[i] == _y[i]:
                correct += 1

        acc = correct / total
        print(f'acc: {acc}')


df = pd.read_csv('./实验一 线性模型 数据集/breast-cancer-wisconsin.csv')
l = len(df)

# 取前 80% 作为训练数据，后 20% 为测试数据
train_data = df[:math.floor(l * 0.8)]
test_data = df[math.floor(l * 0.8):]

# 训练集 y 和 x
train_y = np.array(train_data[['Class']])
train_x = np.array(
    train_data[['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion',
                'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']])
n = len(train_y)

# 测试集 y 和 x
test_y = np.array(train_data[['Class']])
test_x = np.array(
    train_data[['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion',
                'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']])

logistic_model = LogisticModel()

# 归一化
std = train_x.std(axis=0)
mean = train_x.mean(axis=0)
train_x_norm = (train_x - mean) / std

logistic_model.train(train_x_norm, train_y)
print(logistic_model.w)

# 归一化
std = test_x.std(axis=0)
mean = test_x.mean(axis=0)
test_x_norm = (test_x - mean) / std

y_pred = logistic_model.predict(test_x[0:10])

# 打印预测结果
print(f'预测 x: {test_x[0:10]}，\n预测 y: {y_pred}，\n真实 y: {test_y[0:10]}')
logistic_model.test(test_x_norm, test_y)
# 未进行归一化时 acc 仅有 0.6，而归一化后可达 0.9

100%|██████████| 1000/1000 [00:01<00:00, 757.95it/s]

[-4.06142267  1.52247686  0.15107139  0.5453952   0.32536976  0.67277934
  1.36675944  0.6249277   0.586551   -0.14265028]
预测 x: [[ 5  1  1  1  2  1  3  1  1]
 [ 5  4  4  5  7 10  3  2  1]
 [ 3  1  1  1  2  2  3  1  1]
 [ 6  8  8  1  3  4  3  7  1]
 [ 4  1  1  3  2  1  3  1  1]
 [ 8 10 10  8  7 10  9  7  1]
 [ 1  1  1  1  2 10  3  1  1]
 [ 2  1  2  1  2  1  3  1  1]
 [ 2  1  1  1  2  1  1  1  5]
 [ 4  2  1  1  2  1  2  1  1]]，
预测 y: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]，
真实 y: [[0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]]
acc: 0.9084249084249084





### 2. 调用 API

In [157]:
from sklearn.linear_model import LogisticRegression

# 创建逻辑回归模型
model = LogisticRegression()

# 拟合模型
model.fit(train_x, train_y)

# 使用模型进行预测
y_pred = model.predict(test_x[0:10])

# 打印预测结果
print(f'预测 x: {test_x[0:10]}，\n预测 y: {y_pred}，\n真实 y: {test_y[0:10]}')

print(f'coef: {model.coef_}')

pred_y = model.predict(test_x)
correct = 0
total = len(test_y)
for i in range(total):
    if pred_y[i] == test_y[i]:
        correct += 1

acc = correct / total
print(f'acc: {acc}')

# 经过测试归一化与否并无影响

预测 x: [[ 5  1  1  1  2  1  3  1  1]
 [ 5  4  4  5  7 10  3  2  1]
 [ 3  1  1  1  2  2  3  1  1]
 [ 6  8  8  1  3  4  3  7  1]
 [ 4  1  1  3  2  1  3  1  1]
 [ 8 10 10  8  7 10  9  7  1]
 [ 1  1  1  1  2 10  3  1  1]
 [ 2  1  2  1  2  1  3  1  1]
 [ 2  1  1  1  2  1  1  1  5]
 [ 4  2  1  1  2  1  2  1  1]]，
预测 y: [0 1 0 1 0 1 0 0 0 0]，
真实 y: [[0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]]
coef: [[ 0.52954629 -0.06077739  0.32907828  0.30437928  0.12910377  0.36743094
   0.36931925  0.19495615  0.44411127]]
acc: 0.9633699633699634


  y = column_or_1d(y, warn=True)
