# 环境说明

In [1]:
# Report the version of the interpreter that is actually running this kernel.
# A shell call such as `!python -V` launches whichever `python` is first on the
# shell PATH, which may differ from the kernel's interpreter, while the package
# versions shown below are read from the kernel itself.
import platform

print(f"Python {platform.python_version()}")

Python 3.11.5


In [7]:
import sklearn
import pandas
import statsmodels

In [3]:
# Display the installed scikit-learn version.
sklearn.__version__

'1.5.2'

In [4]:
# Display the installed pandas version.
pandas.__version__

'2.1.1'

In [8]:
# Display the installed statsmodels version.
statsmodels.__version__

'0.14.4'

# 代码实现

In [9]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [10]:
# Load the training data from the CSV file into a DataFrame.
df = pd.read_csv("data/train.csv")

In [11]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Age,Blood_Sugar,Blood_Pressure,Weight,Heart_Disease
0,58,99.975461,106.328124,62.448684,0
1,71,93.609515,134.190672,67.875853,0
2,48,119.131949,115.773278,49.263101,0
3,34,98.626509,117.651983,93.111787,0
4,62,114.056088,119.456831,71.043221,0


In [12]:
# Name of the target (label) column; the remaining columns are used as features.
label_name = 'Heart_Disease'

In [13]:
# Separate the target column from the feature columns.
y = df[label_name]
X = df.drop(columns=label_name)

# 回归结果

## Statsmodels

In [14]:
# statsmodels does not add an intercept on its own, so prepend a constant column.
X_ = sm.add_constant(X)

# Build the logistic regression model, fit it, and print the summary table.
model = sm.Logit(endog=y, exog=X_)
result = model.fit()
print(result.summary())

# Predicted probabilities on the same data the model was fitted on,
# then hard 0/1 class labels using a 0.5 cut-off.
y_pred_prob = result.predict(exog=X_)
y_pred_class = (y_pred_prob > 0.5).astype(int)

# Accuracy of the thresholded predictions.
accuracy = accuracy_score(y_true=y, y_pred=y_pred_class)
print(f"准确率: {accuracy:.4f}")

# AUC (area under the ROC curve) computed from the raw probabilities.
auc = roc_auc_score(y_true=y, y_score=y_pred_prob)
print(f"AUC: {auc:.4f}")

Optimization terminated successfully.
         Current function value: 0.022594
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:          Heart_Disease   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      995
Method:                           MLE   Df Model:                            4
Date:                Mon, 17 Feb 2025   Pseudo R-squ.:                  0.8405
Time:                        17:15:59   Log-Likelihood:                -22.594
converged:                       True   LL-Null:                       -141.63
Covariance Type:            nonrobust   LLR p-value:                 2.421e-50
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const           -109.6707     24.800     -4.422      0.000    -158.278     -61.064
Age             

## sklearn

In [17]:
# Logistic regression with an intercept and no penalty term; `fit` returns the
# estimator itself, so construction and fitting are chained.
lr = LogisticRegression(penalty=None, fit_intercept=True).fit(X, y)

# Coefficient table: features sorted by coefficient (largest first),
# followed by the intercept as the last row.
coef_df = pd.DataFrame({'col': X.columns, 'coef': lr.coef_.reshape(-1)})
intercept_df = pd.DataFrame({'col': ['intercept'], 'coef': lr.intercept_})
sorted_coef_df = coef_df.sort_values(by='coef', ascending=False)
total_coef_df = pd.concat([sorted_coef_df, intercept_df])

print(total_coef_df)

# Class labels and class probabilities on the same data the model was fitted on.
pred = lr.predict(X)
prob = lr.predict_proba(X)

# Accuracy of the predicted labels.
accuracy = accuracy_score(y_true=y, y_pred=pred)
print(f"准确率: {accuracy:.4f}")

# AUC (area under the ROC curve), scored with the second probability column.
auc = roc_auc_score(y_true=y, y_score=prob[:, 1])
print(f"AUC: {auc:.4f}")

              col        coef
1     Blood_Sugar    0.555694
0             Age    0.495265
3          Weight    0.064683
2  Blood_Pressure    0.013461
0       intercept -109.655631
准确率: 0.9910
AUC: 0.9971
