In [1]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

## 数据准备

In [2]:
import numpy as np
from datasets.dataset import load_breast_cancer
from model_selection.train_test_split import train_test_split

In [3]:
data=load_breast_cancer()
X=data.data
Y=data.target

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

数据$X$是一个$(n{\times}m)$的矩阵，每一行是一个样本，每一列代表一个特征：

In [4]:
n=X_train.shape[0]
m=X_train.shape[1]

标签$Y$是一个列向量，其行数与$X$相同：

In [5]:
Y_train = Y_train.reshape((n, 1))
Y_test = Y_test.reshape((-1, 1))

## 粗略模型

模型表达式为：
$$
\hat{Y}=\sigma{(XW+b)}
$$
其中
$$
\sigma(x)=\frac{1}{1+e^{-x}}
$$
权重系数$W$的形状为$(m,1)$，偏置系数$b$为单变量系数。这里注意sigmoid函数的曲线，**只有当$XW+b$处于非常有限的范围内如$[-5,5]$时才能被sigmoid函数划分到$[0,1]$区间，而$XW+b$相当于是对原始数据的一个线性回归，想要线性回归的结果落在$[-5,5]$的范围内，需要对数据做预处理，或者缩小初始的$W$与$b$。实际发现缩小$W$的效果远优于对数据的预处理。**

In [6]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

# 缩小的初始权重参数
W = 0.001*np.random.randn(m).reshape((m, 1))  # 权重
b = 0  # 偏置

Y_hat=sigmoid(np.dot(X_train, W)+b)

模型的损失函数为：
$$
\begin{aligned}
L&=-\sum\limits_{i=1}^n[y^{(i)}\ln{\hat{y}^{(i)}}+(1-y^{(i)})\ln{(1-\hat{y}^{(i)})}] \\
&=-\frac{1}{n}[Y^{T}\ln{\hat{Y}}+(1-Y)^{T}\ln{(1-\hat{Y})}] \\
\end{aligned}
$$
损失函数关于参数$W$与$b$的梯度可以求得：
$$
\begin{aligned}
\frac{\partial{L}}{\partial{W}}&=\frac{1}{n}X^{T}{\cdot}(\hat{Y}-Y) \\
\frac{\partial{L}}{\partial{b}}&=\frac{1}{n}{\cdot}[1,1,...,1](\hat{Y}-Y) \\
\end{aligned}
$$

In [7]:
dW = X_train.T.dot(Y_hat - Y_train) / n
db = np.sum(Y_hat - Y_train) / n

参数的迭代更新公式：
$$
W:=W-{\alpha}\frac{\partial{L}}{\partial{W}}, \quad b:b-{\alpha}\frac{\partial{L}}{\partial{b}}
$$

In [8]:
max_iter=2000
alpha=0.00001        # 注意学习率过大会导致震荡，然后误差越来越大

for i in range(max_iter):
    Y_hat=sigmoid(np.dot(X_train, W)+b)
    
    dW = X_train.T.dot(Y_hat - Y_train) / n
    db = np.sum(Y_hat - Y_train) / n
    
    W = W - alpha * dW
    b = b - alpha * db

使用该模型分别对训练集与预测集做预测。**注意这里有一个实现上的坑，就是经过信号函数处理过的输出无法用于计算logistics regression的交叉熵计算，因为$ln(x)$函数不能接受0作为参数。**所以说如果要设计一个函数可以计算训练模型的交叉熵损失，必须提供模型的$W$与$b$，使用模型的原始输出概率进行计算。

In [9]:
threshold = 0.5

# 注意以下输出值不能用于计算交叉熵，只能用于计算准确率
Y_pred_train = np.squeeze(
    np.where(sigmoid(np.dot(X_train, W)+b) > threshold, 1, 0))
Y_pred_test = np.squeeze(
    np.where(sigmoid(np.dot(X_test, W)+b) > threshold, 1, 0))

定义一个Precision函数来评价模型的表现：

In [10]:
def ACC(Y_true,Y_pred):
    return np.sum(Y_true==Y_pred)/len(Y_true)

print(ACC(np.squeeze(Y_train),Y_pred_train),ACC(np.squeeze(Y_test),Y_pred_test))

0.9120879120879121 0.9473684210526315


模型简单打包：

In [11]:
def logit_reg(X,Y,alpha=0.0001,max_iter=2000,threshold=0.5):
    n=X.shape[0]
    m=X.shape[1]
    
    W = 0.001*np.random.rand(m).reshape((m, 1))  # 权重
    b = 0  # 偏置

    for i in range(max_iter):
        Y_hat=sigmoid(np.dot(X_train, W)+b)

        dW = X.T.dot(Y_hat - Y) / n
        db = np.sum(Y_hat - Y) / n

        W = W - alpha * dW
        b = b - alpha * db

        if i%200==200-1:
            Y_hat=sigmoid(np.dot(X_train, W)+b)
            L=np.sum(-np.dot(Y.T,np.log(Y_hat))-np.dot((1-Y).T,np.log(1-Y_hat)))/n
            print(L,end=' ')

    return W,b

W,b=logit_reg(X_train,Y_train)

0.6741321736913188 0.5256622014802633 0.4396315510561449 0.4419725424410019 0.3745682568891109 0.3319956988216062 0.4164478937940294 0.3679559784971766 0.33263426830286424 0.4103189660707004 

## 数据归一化
**Normalization：**
$$
x=\frac{x-x_{min}}{x_{max}-x_{min}}
$$

In [12]:
X=np.row_stack((X_train,X_test))

X_max=X.max(axis=0)
X_min=X.min(axis=0)

X_train_norm=(X_train-X_min)/(X_max-X_min)
X_test_norm=(X_test-X_min)/(X_max-X_min)

对数据归一化之后再测试模型表现：

In [13]:
W,b=logit_reg(X_train_norm,Y_train)

0.672021062910098 0.6623831579053243 0.6502731868055389 0.6384413361573875 0.6272882617586929 0.6167991348107899 0.6069213403740612 0.5976037249806252 0.5888000848490065 0.5804689725772003 

因为数据做了归一化，整个数据集上的梯度分布得到了改良，所以可以调大学习率，由此可以看出数据标准化在logistic regression上的威力：

In [14]:
W,b=logit_reg(X_train_norm,Y_train,alpha=0.1)

0.2178013227443486 0.17037648124448207 0.16455334108979822 0.14799782683921445 0.1398401231894581 0.13376739149466366 0.12859573235845057 0.12411286024441638 0.12018714868013629 0.1167212508622003 

**Standardization：**
$$
x=\frac{x-x_{\mu}}{\sigma}
$$

In [15]:
X=np.row_stack((X_train,X_test))

X_avg=X.mean(axis=0)
X_std=X.std(axis=0)

X_train_std=(X_train-X_avg)/X_std
X_test_std=(X_test-X_avg)/X_std

In [16]:
W,b=logit_reg(X_train_std,Y_train)

4.585594948928191 10.015572317554604 15.452489526828806 20.88948901258511 26.326490511963755 31.763492076384765 37.200493643142025 42.63749520998671 48.074496776834714 53.51149834368241 

In [17]:
W,b=logit_reg(X_train_std,Y_train,alpha=0.0001)

4.724188592470386 10.154844017876874 15.591766645603109 21.02876624157395 26.465767744252595 31.90276930878883 37.33977087555038 42.776772442395234 48.21377400924323 53.650775576090915 