# Logistic Regression Explained


我们使用作业里的野火数据，通过图表和数据展示 Logistic Regression 的原理：

1. 读取与预处理数据（目标二值化 + 特征标准化）。
2. 查看类别分布与 Sigmoid 曲线。
3. 训练 scikit-learn 的 LogisticRegression 并输出指标与混淆矩阵。
4. 手写梯度下降版本，观察损失和准确率随迭代的变化。
5. 仅用温度与湿度两个特征画出决策边界。


In [None]:

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Apply matplotlib's 'seaborn-v0_8' style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')

# Both CSV files are looked up in the current working directory.
DATA_DIR = Path.cwd()
TRAIN_PATH = DATA_DIR.joinpath('wildfires_training.csv')
TEST_PATH = DATA_DIR.joinpath('wildfires_test.csv')


In [None]:

# Read the training split first, then the test split, with the same reader.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
# Final expression of the cell, so the notebook renders the first rows.
train_df.head()


In [None]:

# Binarize the target: the 'fire' column holds 'yes'/'no' strings, which are
# mapped to 1/0. Any other value maps to NaN and makes astype(int) raise.
LABEL_TO_INT = {'yes': 1, 'no': 0}

y_train = train_df['fire'].map(LABEL_TO_INT).astype(int)
X_train = train_df.drop(columns='fire')
y_test = test_df['fire'].map(LABEL_TO_INT).astype(int)
X_test = test_df.drop(columns='fire')

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test data leaks into preprocessing.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

print('Scaled train shape:', X_train_std.shape)
print('Scaled test shape :', X_test_std.shape)
# The leading "\n" prints a blank line before the class counts.
print('\nTraining class counts:')
print(y_train.value_counts())


In [None]:

# Count the samples per class and order the bars by label value.
class_counts = y_train.value_counts().sort_index()
ax = class_counts.plot.bar(color=['tab:blue', 'tab:red'])
ax.set(
    xlabel='Label (0=no fire, 1=fire)',
    ylabel='Count',
    title='Training class distribution',
)
plt.show()


In [None]:

# Evaluate the logistic function on 400 evenly spaced inputs in [-8, 8].
z = np.linspace(-8, 8, 400)
sigmoid_curve = 1 / (1 + np.exp(-z))
plt.plot(z, sigmoid_curve)

# Dashed grey guides crossing at (0, 0.5), where the sigmoid equals one half.
guide_style = dict(color='0.5', linestyle='--', alpha=0.6)
plt.axvline(0, **guide_style)
plt.axhline(0.5, **guide_style)

plt.title('Sigmoid function')
plt.xlabel('Linear combination z')
plt.ylabel('Sigmoid(z)')
plt.grid(alpha=0.3)
plt.show()


In [None]:

# Baseline: scikit-learn logistic regression trained on the standardized features.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_std, y_train)

# Hard class predictions for both splits.
train_pred = model.predict(X_train_std)
test_pred = model.predict(X_test_std)

print(f"Train accuracy: {accuracy_score(y_train, train_pred):.4f}")
# The trailing "\n" leaves a blank line before the classification report.
print(f"Test accuracy : {accuracy_score(y_test, test_pred):.4f}\n")
print('Classification report (test set):')
print(classification_report(y_test, test_pred))


In [None]:

# Confusion matrix of the baseline model on the test split, drawn as an
# annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, test_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion matrix (baseline model)',
)
plt.show()


In [None]:

def _sigmoid(scores):
    """Map linear scores to probabilities with the logistic function."""
    return 1 / (1 + np.exp(-scores))


def _accuracy(probabilities, labels):
    """Return the fraction of samples whose 0.5-thresholded prediction equals its label."""
    return ((probabilities >= 0.5).astype(int) == labels).mean()


# Plain float arrays for the hand-written implementation.
X_train_np = X_train_std.astype(float)
X_test_np = X_test_std.astype(float)
y_train_np = y_train.to_numpy(dtype=float)
y_test_np = y_test.to_numpy(dtype=float)

# Start from the all-zero model: every sample gets probability 0.5.
weights = np.zeros(X_train_np.shape[1])
bias = 0.0
learning_rate = 0.1
epochs = 400

# Small constant added inside the logs so log(0) is never evaluated.
LOG_EPS = 1e-15

# One (epoch, loss, train_acc, test_acc) tuple per epoch, recorded BEFORE
# that epoch's parameter update.
history = []

for epoch in range(1, epochs + 1):
    # Forward pass on the training split.
    scores = X_train_np @ weights + bias
    probs = _sigmoid(scores)

    # Binary cross-entropy averaged over the training samples.
    loss = -np.mean(
        y_train_np * np.log(probs + LOG_EPS)
        + (1 - y_train_np) * np.log(1 - probs + LOG_EPS)
    )
    train_acc = _accuracy(probs, y_train_np)
    test_probs = _sigmoid(X_test_np @ weights + bias)
    test_acc = _accuracy(test_probs, y_test_np)
    history.append((epoch, loss, train_acc, test_acc))

    # Gradient of the mean cross-entropy w.r.t. weights and bias, followed by
    # one full-batch gradient-descent step.
    error = probs - y_train_np
    grad_w = X_train_np.T @ error / len(y_train_np)
    grad_b = error.mean()
    weights -= learning_rate * grad_w
    bias -= learning_rate * grad_b

    if epoch % 50 == 0 or epoch == 1:
        print(f"Epoch {epoch:3d} | loss={loss:.4f} | train_acc={train_acc:.3f} | test_acc={test_acc:.3f}")

# Re-evaluate with the parameters obtained after the final update; the last
# history entry was measured before that update.
final_train_acc = _accuracy(_sigmoid(X_train_np @ weights + bias), y_train_np)
final_test_acc = _accuracy(_sigmoid(X_test_np @ weights + bias), y_test_np)
print(f"\nManual GD model -> train_acc={final_train_acc:.4f}, test_acc={final_test_acc:.4f}")


In [None]:

# Turn the per-epoch records into a table for plotting.
history_df = pd.DataFrame(history, columns=['epoch', 'loss', 'train_acc', 'test_acc'])
epoch_values = history_df['epoch']

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

# Left panel: training loss per epoch.
loss_ax.plot(epoch_values, history_df['loss'])
loss_ax.set(xlabel='Epoch', ylabel='Binary cross-entropy loss', title='Loss over epochs')
loss_ax.grid(alpha=0.3)

# Right panel: train and test accuracy per epoch, y-axis fixed to [0.6, 1.0].
for column, legend_label in (('train_acc', 'train'), ('test_acc', 'test')):
    acc_ax.plot(epoch_values, history_df[column], label=legend_label)
acc_ax.set(xlabel='Epoch', ylabel='Accuracy', ylim=(0.6, 1.0), title='Accuracy over epochs')
acc_ax.legend()
acc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:

# Keep only the two features used for the 2-D decision-boundary plot.
cols = ['temp', 'humidity']
X_train_2d, X_test_2d = X_train[cols], X_test[cols]

# Dedicated scaler for the two-column subset, fitted on the training split only.
scaler_2d = StandardScaler().fit(X_train_2d)
X_train_2d_std = scaler_2d.transform(X_train_2d)
X_test_2d_std = scaler_2d.transform(X_test_2d)

# Same estimator settings as the full-feature baseline.
model_2d = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_2d_std, y_train)
accuracy_2d = accuracy_score(y_test, model_2d.predict(X_test_2d_std))
print(f"2D model test accuracy: {accuracy_2d:.3f}")


In [None]:

# Column 0 is drawn on the x-axis and column 1 on the y-axis.
x_feature, y_feature = X_train_2d_std[:, 0], X_train_2d_std[:, 1]

# 200 x 200 grid covering the scaled training data plus a margin of 1 on each side.
xx, yy = np.meshgrid(
    np.linspace(x_feature.min() - 1, x_feature.max() + 1, 200),
    np.linspace(y_feature.min() - 1, y_feature.max() + 1, 200),
)

# Second column of predict_proba at every grid point, reshaped back to the grid.
grid = np.column_stack([xx.ravel(), yy.ravel()])
z = model_2d.predict_proba(grid)[:, 1].reshape(xx.shape)

plt.figure(figsize=(6, 5))
# Filled probability bands in steps of 0.1, with the 0.5 contour in black as
# the decision boundary.
plt.contourf(xx, yy, z, levels=np.linspace(0, 1, 11), cmap='coolwarm', alpha=0.7)
plt.contour(xx, yy, z, levels=[0.5], colors='k', linewidths=2)

# Overlay the training points, one scatter call per class (label 1 first).
for label_value, point_color, legend_label in (
    (1, 'tab:red', 'fire (train)'),
    (0, 'tab:blue', 'no fire (train)'),
):
    class_mask = np.asarray(y_train == label_value)
    plt.scatter(
        x_feature[class_mask],
        y_feature[class_mask],
        c=point_color,
        edgecolor='k',
        label=legend_label,
        alpha=0.7,
    )

plt.xlabel('temp (scaled)')
plt.ylabel('humidity (scaled)')
plt.title('Decision boundary (temp vs humidity)')
plt.legend()
plt.tight_layout()
plt.show()



**总结**：Sigmoid 把线性组合映射成概率，标准化让优化更稳定；梯度下降逐步降低损失；决策边界在标准化后的空间里是一条直线。
