# 逻辑回归
## 二分类算法
* Sigmiod函数，1/(1+exp(theta * X)),用于把特征值变成范围为0-1，相当于一个概率值，当大于0.5时分为1


In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 乳腺癌数据集，希望做个二分类模型去判别是否为癌症
from sklearn.datasets import load_breast_cancer 
cancer = load_breast_cancer()

data=cancer["data"]
col = cancer['feature_names']       #提取特征的名字
x = pd.DataFrame(data,columns=col)  #那些特征的数值,30个特征，569个数据
target = cancer.target.astype(int)
y = pd.DataFrame(target,columns=['target'])#对应特征组合下的类别标签

# 划分数据集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

model = LogisticRegression()#默认参数
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
# 计算准确率  
accuracy = np.mean(y_pred == y_test.T)  
print(f'Accuracy: {accuracy * 100:.2f}%') 

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        47
           1       0.97      0.94      0.95        67

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.95       114
weighted avg       0.95      0.95      0.95       114

Accuracy: 94.74%


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


下面是用梯度下降优化器来实现

In [94]:
from sklearn.linear_model import SGDClassifier  

# 创建 SGDClassifier，使用逻辑回归  

model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3,random_state=0)  # 设置 loss='log' 用于逻辑回归  

# 训练模型  
model.fit(x_train, y_train)  

# 进行预测  
y_pred = model.predict(x_test)  

# 输出分类报告  
report = classification_report(y_test, y_pred)  
print(report)
# 计算准确率  
accuracy = np.mean(y_pred == y_test.T)  
print(f'Accuracy: {accuracy * 100:.2f}%') 

              precision    recall  f1-score   support

           0       0.63      0.96      0.76        47
           1       0.95      0.61      0.75        67

    accuracy                           0.75       114
   macro avg       0.79      0.78      0.75       114
weighted avg       0.82      0.75      0.75       114

Accuracy: 75.44%


  y = column_or_1d(y, warn=True)


下面我们来手动实现一下逻辑回归，还是用上面的数据来分类癌症患者

In [96]:
import warnings  
# 忽略所有的 RuntimeWarning  
warnings.simplefilter(action='ignore', category=RuntimeWarning)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 癌症数据集，希望做个二分类模型去判别是否为癌症
from sklearn.datasets import load_breast_cancer 
cancer = load_breast_cancer()

data=cancer["data"]
col = cancer['feature_names']       #提取特征的名字
X = pd.DataFrame(data,columns=col)  #那些特征的数值,30个特征，569个数据
target = cancer.target.astype(int)
y = pd.DataFrame(target,columns=['target'])#对应特征组合下的类别标签
y=y.to_numpy()

X_b=np.concatenate([np.ones((len(y),1)),X],axis=1)

# 划分数据集
x_train,x_test,y_train,y_test=train_test_split(X_b,y,test_size=0.2,random_state=0)

# 学习率衰减策略，迭代到后面学习率逐渐变小
t0,t1=5,50
def learning_schedule(t):
    return t0/(t+t1)

# 迭代次数
n_epoch=10000
# 初始化待求参数
theta=np.ones((31,1))
# 使用样本数
m=len(y_test)


for epoch in range(n_epoch):
    Sigmoid=1/(1+np.exp(-theta.T.dot(x_train.T)))

    gradients= -1/m * (y_train.T-Sigmoid).dot(x_train)
    eta=learning_schedule(epoch)
    theta=theta - eta*gradients.T

# 验证集检验
y_test_pred=(1/(1+np.exp(-theta.T.dot(x_test.T)))>0.5).astype(int)

# 计算准确率  
accuracy = np.mean(y_test_pred == y_test.T)  
print(f'Accuracy: {accuracy * 100:.2f}%') 

Accuracy: 90.35%


下面是去掉常数项之后的结果，比原本的结果略差

In [98]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 癌症数据集，希望做个二分类模型去判别是否为癌症
from sklearn.datasets import load_breast_cancer 
cancer = load_breast_cancer()

data=cancer["data"]
col = cancer['feature_names']       #提取特征的名字
X = pd.DataFrame(data,columns=col)  #那些特征的数值,30个特征，569个数据
target = cancer.target.astype(int)
y = pd.DataFrame(target,columns=['target'])#对应特征组合下的类别标签

# 划分数据集
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# 学习率衰减策略，迭代到后面学习率逐渐变小
t0,t1=5,50
def learning_schedule(t):
    return t0/(t+t1)

# 迭代次数
n_epoch=1000
# 初始化待求参数
theta=np.ones((30,1))
# 使用样本数
m=len(y_test)


for epoch in range(n_epoch):
    Sigmoid=1/(1+np.exp(-theta.T.dot(x_train.T)))
    gradients= -1/m * (y_train.T-Sigmoid).dot(x_train)
    eta=learning_schedule(epoch)
    theta=theta - eta*gradients.T

# 验证集检验
y_test_pred=(1/(1+np.exp(-theta.T.dot(x_test.T)))>0.5).astype(int)

# 计算准确率  
accuracy = np.mean(y_test_pred == y_test.T)  
print(f'Accuracy: {accuracy * 100:.2f}%') 

Accuracy: 86.84%


## 多类别分类   
* 多分类用到solfmax，就是多个类别的概率归一化，然后求损失函数最小，
    这里用到一个鸢尾花的数据集

In [None]:
from sklearn.datasets import load_iris
data = load_iris()
# 可以打印数据集的说明
print(data.DESCR)

In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X=data['data']
y=data["target"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)
log_res=LogisticRegression(solver='lbfgs')
log_res.fit(X_train,y_train)
y_pred=log_res.predict(X_test)
# 这个函数可以返回每个类别的概率值
print(log_res.predict_proba(X_test[1:2,:]))

accuracy=np.mean((y_pred==y_test).astype(int))
print(accuracy)

[[9.64127138e-01 3.58727086e-02 1.53820941e-07]]
[4.6 3.4 1.4 0.3]
0.9666666666666667
