## 数据准备

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
data=load_breast_cancer()
X=data.data
Y=data.target

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

数据$X$是一个$(n{\times}m)$的矩阵，每一行是一个样本，每一列代表一个特征：

In [3]:
n=X_train.shape[0]
m=X_train.shape[1]

标签$Y$是一个列向量，其行数与$X$相同：

In [4]:
Y_train = Y_train.reshape((n, 1))
Y_test = Y_test.reshape((-1, 1))

## 粗略模型

模型表达式为：
$$
\hat{Y}=\sigma{(XW^{T}+b)}
$$
其中
$$
\sigma(x)=\frac{1}{1+e^{-x}}
$$
权重系数$W$的形状为$(1,m)$，偏置系数$b$为单变量系数。

In [6]:
# 注意这里有一个计算精度问题。在实际中，sigmoid函数只有在正负无穷处才能取到1或0，
# 而在计算机中因为精度问题，在值稍大或稍小时即会使sigmoid函数取到1或0，需要对这种情况做处理。
def sigmoid(x):
    epsilon=1e-8
    val=1/(1+np.exp(-x))
    
    ones_index=np.where(val>1-epsilon)
    val[ones_index]=1-epsilon
    
    zeros_index=np.where(val<epsilon)
    val[zeros_index]=epsilon
    
    return val

W = np.random.rand(m).reshape((1, -1))  # 权重，行向量
b = np.ones((1, 1))  # 偏置

Y_hat=np.dot(X_train, W.T)+b

模型的损失函数为：
$$
\begin{align}
L&=-\sum\limits_{i=1}^n[y^{(i)}\ln{y^{(i)}}+(1-y^{(i)})\ln{(1-y^{(i)})}] \\
&=-\frac{1}{n}[Y^{T}\ln{\hat{Y}}+(1-Y)^{T}\ln{(1-\hat{Y})}] \\
\end{align}
$$
损失函数关于参数$W$与$b$的梯度可以求得：
$$
\begin{align}
\frac{\partial{L}}{\partial{W}}&=\frac{1}{n}(\hat{Y}-Y)^{T}{\cdot}X \\
\frac{\partial{L}}{\partial{b}}&=\frac{1}{n}(\hat{Y}-Y)^{T}{\cdot}[1,1,...,1]^{T} \\
\end{align}
$$

In [7]:
dW = (Y_hat - Y_train).T.dot(X_train) / n
db = (Y_hat - Y_train).T.dot(np.ones((n, 1))) / n

参数的迭代更新公式：
$$
W:=W-{\alpha}\frac{\partial{L}}{\partial{W}}, \quad b:b-{\alpha}\frac{\partial{L}}{\partial{b}}
$$

In [8]:
max_iter=1000
alpha=0.000001        # 注意学习率过大会导致震荡，然后误差越来越大

for i in range(max_iter+1):
    Y_hat=sigmoid(np.dot(X_train, W.T)+b)
    
    dW = (Y_hat - Y_train).T.dot(X_train) / n
    db = (Y_hat - Y_train).T.dot(np.ones((n, 1))) / n
    
    W = W - alpha * dW
    b = b - alpha * db

使用该模型分别对训练集与预测集做预测：

In [9]:
threshold=0.5
Y_pred_train=np.where(sigmoid(np.dot(X_train, W.T)+b)>threshold,1,0)
Y_pred_test=np.where(sigmoid(np.dot(X_test, W.T)+b)>threshold,1,0)

定义一个Precision函数来评价模型的表现：

In [10]:
def ACC(Y_true,Y_pred):
    return np.sum(Y_true==Y_pred)/Y_true.shape[0]

print(ACC(Y_train,Y_pred_train),ACC(Y_test,Y_pred_test))

0.7098901098901099 0.7631578947368421


模型简单打包：

In [11]:
def logit_reg(X,Y,alpha=0.000001,max_iter=2000,threshold=0.5):
    n=X.shape[0]
    m=X.shape[1]
    
    W = np.random.rand(m).reshape((1, -1))  # 权重，行向量
    b = np.ones((1, 1))  # 偏置
#     epsilon=1e-10        # 因计算精度问题，在做ln运算时加上这个极小值

    for i in range(max_iter+1):
        Y_hat=sigmoid(np.dot(X_train, W.T)+b)

        dW = (Y_hat - Y).T.dot(X) / n
        db = (Y_hat - Y).T.dot(np.ones((n, 1))) / n

        W = W - alpha * dW
        b = b - alpha * db

        if i%200==0:
            Y_hat=sigmoid(np.dot(X_train, W.T)+b)
            L=np.sum(-np.dot(Y.T,np.log(Y_hat))-np.dot((1-Y).T,np.log(1-Y_hat)))
            print(L,end=' ')

    return W,b

W,b=logit_reg(X_train,Y_train)

3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3181.3082791878833 2186.2648280676976 914.9527068357011 705.9708180021119 678.5474657340123 649.2502241361556 

  """


## 数据归一化
**Normalization：**
$$
x=\frac{x-x_{min}}{x_{max}-x_{min}}
$$

In [12]:
X=np.row_stack((X_train,X_test))

X_max=X.max(axis=0)
X_min=X.min(axis=0)

X_train_norm=(X_train-X_min)/(X_max-X_min)
X_test_norm=(X_test-X_min)/(X_max-X_min)

对数据归一化之后再测试模型表现：

In [13]:
W,b=logit_reg(X_train_norm,Y_train)

3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 

因为数据做了归一化，整个数据集上的梯度分布得到了改良，所以可以调大学习率，由此可以看出数据标准化在logistic regression上的威力：

In [20]:
W,b=logit_reg(X_train_norm,Y_train,alpha=0.1)

3223.6191321123324 1061.4825061220477 141.96491849822374 62.43757154310592 56.70962883830205 53.534686901478565 51.140023280276885 49.20319119624459 47.570556619499115 46.15747225749712 44.9126731507272 

**Standardization：**
$$
x=\frac{x-x_{\mu}}{\sigma}
$$

In [15]:
X=np.row_stack((X_train,X_test))

X_avg=X.mean(axis=0)
X_std=X.std(axis=0)

X_train_std=(X_train-X_avg)/X_std
X_test_std=(X_test-X_avg)/X_std

In [21]:
W,b=logit_reg(X_train_std,Y_train)

3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 

In [66]:
W,b=logit_reg(X_train_std,Y_train,alpha=0.001)

3223.6191321123324 3223.6191321123324 3223.6191321123324 3223.6191321123324 3181.350301464521 1337.6074968055227 919.0454196438369 778.6788065482548 724.2800249855735 698.1279193400684 675.5638209393013 

初步观察，在logistic regression中好像有负数就不太行的样子，后续待补充。

In [None]:
X=np.row_stack((X_train,X_test))

X_max=X.max(axis=0)
X_min=X.min(axis=0)
X_avg=X.mean(axis=0)

X_train_norm=(X_train-X_avg)/(X_max-X_min)
X_test_norm=(X_test-X_avg)/(X_max-X_min)