<a href="https://colab.research.google.com/github/CoreTheGreat/HBPU-Machine-Learning-Course/blob/main/ML_Chapter3_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第三章：分类
湖北理工学院《机器学习》课程资料

作者：李辉楚吴

笔记内容概述:
* 3.1 逻辑回归与二分类问题
* 3.2 常用的二分类模型——支持向量机
* 3.3 常用的二分类模型——决策树和随机森林
* 3.4 二分类模型的度量
* 3.5 由二分类到多分类
* 3.6 实验3：基于机器学习方法的手写数字识别


## 3.1 逻辑回归与二分类问题

3.1.1 利用torchvision载入训练数据MNIST

MNIST是一个小型的基于灰度图像(图像大小1x28x28)的手写数字识别数据集，包含60000个训练数据，10000个测试数据。

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

import numpy as np

from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import roc_curve, auc

import time

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager
# Register the SimHei font file and make it the default family; later plots
# in this notebook use Chinese axis and legend labels.
font_manager.fontManager.addfont('./Data/simhei.ttf') # Add the font
matplotlib.rc('font', family='SimHei') # Set the font

# Colour cycle shared by the ROC plots (one entry per curve / class)
color_list = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']

# Use the first CUDA device when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Font sizes reused by every figure below
label_size = 18 # Label size
ticklabel_size = 14 # Tick label size

# Load the MNIST test split without any transform so that the raw images
# can be displayed; each item is an (image, label) pair.
imgDisp = torchvision.datasets.MNIST(root='./data', train=False, download=True)
img, label = imgDisp[0]

print(f'Image size is {img.size}')

# Show the first test image together with its label
fig, ax = plt.subplots(figsize=(7,7))
ax.imshow(img, cmap='gray')
ax.tick_params(axis='both', which='major', labelsize=ticklabel_size) # Set tick label size
ax.set_title(f"Label: {label}", fontsize=label_size)
# plt.savefig(f'exp_character{label}.png', dpi=300) # Make figure clearer
plt.show()

In [None]:
class ftrExtract(object):
    """Transform that turns an image tensor into a 1-D vector of axis statistics.

    The tensor is squeezed first; the output concatenates, in this order,
    the mean along axis 0, the mean along axis 1, the standard deviation
    along axis 0 and the standard deviation along axis 1.
    """

    def __call__(self, tensor):
        image = tensor.squeeze()

        # Means first, then standard deviations, each for axis 0 and axis 1
        statistics = [
            image.mean(dim=0),
            image.mean(dim=1),
            image.std(dim=0),
            image.std(dim=1),
        ]

        return torch.cat(statistics)

# Preprocessing pipeline: convert each image to a tensor, then reduce it to
# the per-axis mean/std feature vector produced by ftrExtract
transform = transforms.Compose([transforms.ToTensor(), ftrExtract()])

# Load the MNIST dataset
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Count the samples of each class from the stored label tensors. Iterating
# over the datasets instead would run the feature transform on every image
# only to throw the features away.
from collections import Counter

train_class_counts = Counter(trainset.targets.tolist())
test_class_counts = Counter(testset.targets.tolist())

# Print the per-class counts and their share of each split
for i in range(10):
    cls_counts_train = train_class_counts.get(i, 0)
    cls_ratio_train = cls_counts_train / len(trainset)
    cls_counts_test = test_class_counts.get(i, 0)
    cls_ratio_test = cls_counts_test / len(testset)

    print(f"Class {i}: Trainset - {cls_counts_train} ({cls_ratio_train:.2%}), Testset - {cls_counts_test} ({cls_ratio_test:.2%})")

batch_size = 42
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=0)

# Get a batch of training data
dataiter = iter(trainloader)
data, labels = next(dataiter)

# Length of one feature vector, read from the first sample of the batch
input_size = data[0].numpy().shape[0]
print(f'Input_size is {input_size}')

3.1.2 使用线性回归识别手写数字

In [None]:
# Materialise the training loader as NumPy arrays: one feature row per
# sample and one integer label per sample
train_batches = [
    (batch_image.view(-1, input_size).numpy(), batch_label.numpy())
    for batch_image, batch_label in trainloader
]
X_train = np.vstack([features for features, _ in train_batches])
y_train = np.concatenate([targets for _, targets in train_batches])

print(f'Shapes of X_train and Y_train: {X_train.shape} and {y_train.shape}')

# Same conversion for the test loader
test_batches = [
    (batch_image.view(-1, input_size).numpy(), batch_label.numpy())
    for batch_image, batch_label in testloader
]
X_test = np.vstack([features for features, _ in test_batches])
y_test = np.concatenate([targets for _, targets in test_batches])

print(f'Shapes of X_test and y_test: {X_test.shape} and {y_test.shape}')

使用回归的方法做分类

In [None]:
# Fit an ordinary linear regressor directly on the integer class labels
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Continuous predictions on the test set, snapped to the nearest integer so
# they can be compared with the class labels
y_pred = lr_model.predict(X_test)
y_pred_rounded = np.round(y_pred).astype(int)
print(f"Predicted classes: {np.unique(y_pred_rounded)}")

# Share of test samples whose rounded prediction equals the true label
accuracy = (y_pred_rounded == y_test).mean()
print(f"Real classes: {np.unique(y_test)}")
print(f"Accuracy of linear regression model: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rounded))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_rounded)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')

plt.savefig('Regression_for_classification.png', dpi=300)
plt.show()

使用逻辑回归做分类

In [None]:
# Fit a logistic regression classifier on the same features and labels
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test set
y_pred = lr_model.predict(X_test)
y_pred_proba = lr_model.predict_proba(X_test)

# Rounded copy of the predictions, printed to list the predicted classes
y_pred_rounded = np.round(y_pred).astype(int)
print(f"Predicted classes: {np.unique(y_pred_rounded)}")

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of logistic regression model: {accuracy:.4f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')

plt.savefig('Logicregression_for_classification.png', dpi=300)
plt.show()


3.1.3 使用逻辑回归识别手写数字"1"

构建二分类数据集

In [None]:
# Training split: feature vectors plus a binary target that is 1 for the
# digit 1 and 0 for every other digit
train_pairs = [(image.numpy(), 1 if label == 1 else 0) for image, label in trainset]
x_train = np.array([features for features, _ in train_pairs])
y_train = np.array([flag for _, flag in train_pairs])

# Test split, encoded the same way
test_pairs = [(image.numpy(), 1 if label == 1 else 0) for image, label in testset]
x_test = np.array([features for features, _ in test_pairs])
y_test = np.array([flag for _, flag in test_pairs])

训练逻辑回归模型

In [None]:
# Binary logistic regression model ("is this sample the digit 1?")
mdl_logic = LogisticRegression(max_iter=1000)

# Fit the model and measure the wall-clock training time
start_time = time.time()
mdl_logic.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

模型测试

In [None]:
# Class probabilities and hard predictions on the binary test set
y_proba_logic = mdl_logic.predict_proba(x_test)
y_pred_logic = mdl_logic.predict(x_test)

# Standard binary metrics computed from the hard predictions
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred_logic),
    precision_score(y_test, y_pred_logic),
    recall_score(y_test, y_pred_logic),
    f1_score(y_test, y_pred_logic),
)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

案例演示：随机选取图片，输出判断结果

In [None]:
# Draw three random test indices (fixed seed so the demo is repeatable)
np.random.seed(42)
idx = np.random.choice(len(imgDisp), 3)

# Pick the same samples from the raw images and from the feature arrays
imgDisp_select = [imgDisp[i] for i in idx]
x_select = x_test[idx]
y_select = y_test[idx]

# Probability of each class for the selected samples
y_select_proba = mdl_logic.predict_proba(x_select)

# For every sample: print the original label next to the binary target,
# then show the image with the predicted probability of class 1
for i, (picture, digit) in enumerate(imgDisp_select):
    print(f'Sample {i+1}: imgDisp label is {digit}, x label is {y_select[i]}')

    fig, ax = plt.subplots(figsize=(7,7))
    ax.imshow(picture, cmap='gray')
    ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)
    ax.set_title(f"Label: {digit}, Prediction: {y_select_proba[i,1]:.4f}", fontsize=label_size)

    # plt.savefig(f'binary_prediction_{i+1}.png', dpi=300) # Make figure clearer
    plt.show()

## 3.2 常用的二分类模型——支持向量机

In [None]:
# Linear-kernel SVM; probability=True enables predict_proba
mdl_svm = svm.SVC(kernel='linear', probability=True)

# Fit the model and measure the wall-clock training time
start_time = time.time()
mdl_svm.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

In [None]:
# Class probabilities and hard predictions of the SVM on the test set
y_proba_svm = mdl_svm.predict_proba(x_test)
y_pred_svm = mdl_svm.predict(x_test)

# Standard binary metrics computed from the hard predictions
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred_svm),
    precision_score(y_test, y_pred_svm),
    recall_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm),
)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

## 3.3 常用的二分类模型——决策树和随机森林

In [None]:
# Decision tree: fit and report the wall-clock training time
mdl_dt = tree.DecisionTreeClassifier()

start_time = time.time()
mdl_dt.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

# Random forest with 100 trees: fit and report the wall-clock training time
mdl_rf = RandomForestClassifier(n_estimators=100)

start_time = time.time()
mdl_rf.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

In [None]:
# Decision tree: probabilities, hard predictions and binary metrics
y_proba_dt = mdl_dt.predict_proba(x_test)
y_pred_dt = mdl_dt.predict(x_test)

accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred_dt),
    precision_score(y_test, y_pred_dt),
    recall_score(y_test, y_pred_dt),
    f1_score(y_test, y_pred_dt),
)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

# Random forest: probabilities, hard predictions and binary metrics
y_proba_rf = mdl_rf.predict_proba(x_test)
y_pred_rf = mdl_rf.predict(x_test)

accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred_rf),
    precision_score(y_test, y_pred_rf),
    recall_score(y_test, y_pred_rf),
    f1_score(y_test, y_pred_rf),
)

print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

## 3.4 二分类模型的度量

准确率、召回率、敏感性、特异性、精确度、F1-Score

In [None]:
def cls_counts(y_test, y_proba, th=0.5):
    """Count confusion-matrix entries of a binary classifier at threshold ``th``.

    A sample is predicted positive when its value in column 1 of
    ``y_proba`` is strictly greater than ``th``.

    Args:
        y_test: Array of true labels (1 = positive, 0 = negative).
        y_proba: 2-D array whose column 1 holds the positive-class score.
        th: Decision threshold.

    Returns:
        ``(th, (tp, fp, tn, fn))``.
    """
    predicted_pos = y_proba[:, 1] > th
    actual_pos = y_test == 1
    actual_neg = y_test == 0

    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum(actual_neg & predicted_pos)
    tn = np.sum(actual_neg & ~predicted_pos)
    fn = np.sum(actual_pos & ~predicted_pos)

    return th, (tp, fp, tn, fn)

# Confusion counts of the logistic model at the default threshold
th, counts = cls_counts(y_test, y_proba_logic)
tp, fp, tn, fn = counts
print(f'Threshold {th}, TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}')

In [None]:
def plot_confusion_matrix(th, tp, fp, tn, fn):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    Rows are the true class and columns the predicted class, both ordered
    negative then positive, so the cells are ``[[TN, FP], [FN, TP]]``.

    Args:
        th: Decision threshold, shown in the title.
        tp: Number of true positives.
        fp: Number of false positives.
        tn: Number of true negatives.
        fn: Number of false negatives.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    cm = np.array([[tn, fp], [fn, tp]])

    # Display the confusion matrix as a heatmap
    fig, ax = plt.subplots(figsize=(5,5))
    ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

    # Both axes enumerate the classes negative/positive. The rows are true
    # classes, so they take the class names rather than true/false markers.
    tick_marks = np.arange(2)
    class_names = ['阴(N)', '阳(P)']
    ax.set_xticks(tick_marks, class_names, fontsize=ticklabel_size)
    ax.set_yticks(tick_marks, class_names, fontsize=ticklabel_size)

    # Write each count into its cell, centred both ways; switch to white
    # text on dark cells so the number stays readable
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    fontsize=ticklabel_size,
                    horizontalalignment="center",
                    verticalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

    ax.tick_params(axis='both', which='major', labelsize=ticklabel_size) # Set tick label size

    ax.set_ylabel('客观事实（Real Label）', fontsize=label_size)
    ax.set_xlabel('主观判断（Predicted Label）', fontsize=label_size)
    ax.set_title(f'判断阈值(Threshold): {th}', fontsize=label_size)

    return fig, ax

def get_scores(tp, fp, tn, fn):
    """Compute binary classification metrics from confusion-matrix counts.

    Any metric whose denominator is zero (for example precision when no
    sample is predicted positive) is reported as 0.0 instead of raising
    ``ZeroDivisionError`` or producing NaN.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        tn: Number of true negatives.
        fn: Number of false negatives.

    Returns:
        ``(precision, recall, specificity, accuracy, f1)``; recall is also
        known as sensitivity.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn) # Also called sensitivity
    accuracy = _ratio(tp + tn, tp + fp + tn + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)

    specificity = _ratio(tn, tn + fp)

    return precision, recall, specificity, accuracy, f1

# Metrics for the counts obtained at the current threshold
scores = get_scores(tp, fp, tn, fn)
precision, recall, specificity, accuracy, f1 = scores
print(f'Precision: {precision:.4f}, Recall (Sensitivity): {recall:.4f}, Specificity: {specificity:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

# Draw the matching confusion matrix
fig, ax = plot_confusion_matrix(th, tp, fp, tn, fn)

# plt.savefig(f'binary_confusion_matrix.png', dpi=300) # Make figure clearer
plt.show()

In [None]:
# Repeat the evaluation with a low threshold of 0.1
th, (tp, fp, tn, fn) = cls_counts(y_test, y_proba_logic, th=0.1)

scores = get_scores(tp, fp, tn, fn)
precision, recall, specificity, accuracy, f1 = scores
print(f'Precision: {precision:.4f}, Recall (Sensitivity): {recall:.4f}, Specificity: {specificity:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

fig, ax = plot_confusion_matrix(th, tp, fp, tn, fn)
# plt.savefig(f'binary_confusion_matrix_0D1.png', dpi=300) # Make figure clearer
plt.show()

In [None]:
# Repeat the evaluation with a high threshold of 0.9
th, (tp, fp, tn, fn) = cls_counts(y_test, y_proba_logic, th=0.9)

scores = get_scores(tp, fp, tn, fn)
precision, recall, specificity, accuracy, f1 = scores
print(f'Precision: {precision:.4f}, Recall (Sensitivity): {recall:.4f}, Specificity: {specificity:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

fig, ax = plot_confusion_matrix(th, tp, fp, tn, fn)
# plt.savefig(f'binary_confusion_matrix_0D9.png', dpi=300) # Make figure clearer
plt.show()

ROC（Receiver operating characteristic curve）接收者操作特征曲线

In [None]:
def plot_roc_curve_base():
    """Create an empty ROC figure containing only the dashed diagonal.

    Both axes are limited to [0, 1] and labelled with the false and true
    positive rate; curves are added to the returned axes afterwards.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    fig, ax = plt.subplots(figsize=(8,6))

    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
    ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.0])

    ax.set_xlabel('False Positive Rate (FPR)', fontsize=label_size)
    ax.set_ylabel('True Positive Rate (TPR)', fontsize=label_size)
    ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)

    return fig, ax

def add_roc_curve(ax, y_true, y_proba, curve_color, curve_label):
    """Draw one ROC curve on ``ax`` and report its area under the curve.

    Args:
        ax: Axes to draw on.
        y_true: True binary labels.
        y_proba: Scores of the positive class.
        curve_color: Line colour.
        curve_label: Legend name; the AUC value is appended to it.

    Returns:
        ``(roc_auc, fpr, tpr, thresholds)``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)
    roc_auc = auc(fpr, tpr)

    legend_text = f'{curve_label} (AUC = {roc_auc:.4f})'
    ax.plot(fpr, tpr, color=curve_color, lw=2, label=legend_text)

    return roc_auc, fpr, tpr, thresholds

# Compare the four binary classifiers on one ROC plot. Each model keeps its
# own result variables so that no curve's AUC/FPR/TPR is overwritten by the
# next model.
fig, ax = plot_roc_curve_base()

roc_auc_logic, fpr_logic, tpr_logic, thresholds_logic = add_roc_curve(ax, y_test, y_proba_logic[:,1], color_list[0], '逻辑回归')
roc_auc_svm, fpr_svm, tpr_svm, thresholds_svm = add_roc_curve(ax, y_test, y_proba_svm[:,1], color_list[1], '支持向量机')
roc_auc_dt, fpr_dt, tpr_dt, thresholds_dt = add_roc_curve(ax, y_test, y_proba_dt[:,1], color_list[2], '决策树')
roc_auc_rf, fpr_rf, tpr_rf, thresholds_rf = add_roc_curve(ax, y_test, y_proba_rf[:,1], color_list[3], '随机森林')

plt.legend(loc="lower right", fontsize=ticklabel_size)
# plt.savefig(f'binary_roc_curve.png', dpi=300) # Make figure clearer
plt.show()


## 3.5 由二分类到多分类

In [None]:
# Training split: feature vectors with the original ten-class labels
train_pairs = [(image.numpy(), label) for image, label in trainset]
x_train = np.array([features for features, _ in train_pairs])
y_train = np.array([target for _, target in train_pairs])

# Test split, built the same way
test_pairs = [(image.numpy(), label) for image, label in testset]
x_test = np.array([features for features, _ in test_pairs])
y_test = np.array([target for _, target in test_pairs])

3.5.1 一对多（One-vs-Rest）方法

In [None]:
# One-vs-Rest wrapper around logistic regression
mdl_logic_ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000))

# Fit the wrapper and measure the wall-clock training time
start_time = time.time()
mdl_logic_ovr.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

# Class probabilities and hard predictions on the test set
y_proba_logic_ovr = mdl_logic_ovr.predict_proba(x_test)
y_pred_logic_ovr = mdl_logic_ovr.predict(x_test)

accuracy = accuracy_score(y_test, y_pred_logic_ovr)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Sorted list of the distinct class labels found in the training targets
class_list = np.sort(np.unique(y_train))

# One independent binary logistic regression per class
mdl_logic_list = [LogisticRegression(max_iter=1000) for _ in class_list]

# Train each model on "this class vs. all others" targets and time it
for i, current_class in enumerate(class_list):
    binary_target = (y_train == current_class).astype(int)

    start_time = time.time()
    mdl_logic_list[i].fit(x_train, binary_target)
    end_time = time.time()
    print(f'Training class {current_class}, Training time: {end_time - start_time:.2f} seconds')

In [None]:
# One ROC curve per one-vs-rest classifier, all on the same axes
fig, ax = plot_roc_curve_base()

for i, current_class in enumerate(class_list):
    # Binary ground truth for this class and the model's class probabilities
    y_test_trans = (y_test == current_class).astype(int)
    y_proba = mdl_logic_list[i].predict_proba(x_test)

    roc_auc_logic, fpr_logic, tpr_logic, thresholds_logic = add_roc_curve(ax, y_test_trans, y_proba[:,1], color_list[i], f'{current_class}')

plt.legend(loc="lower right", fontsize=ticklabel_size)
# plt.savefig(f'binary_roc_curve_ovr.png', dpi=300) # Make figure clearer
plt.show()

In [None]:
sample_num = 10

# Randomly select sample_num examples (fixed seed so the demo is repeatable)
np.random.seed(1)
idx = np.random.choice(len(imgDisp), sample_num)

# Pick the same samples from the raw images and from the feature dataset
imgDisp_select = [imgDisp[i] for i in idx]
testset_select = [testset[i] for i in idx]

for i in range(sample_num):
    # Feature vector of the sample as a single-row NumPy array
    x = testset_select[i][0].view(-1, input_size).numpy()

    # Score the sample with every one-vs-rest model using the probability of
    # its positive class. Hard 0/1 predictions cannot be ranked: when no
    # model fires argmax falls back to class 0, and when several fire the
    # lowest index wins regardless of confidence.
    y_score_list = [mdl.predict_proba(x)[:, 1] for mdl in mdl_logic_list]

    # The most confident model decides; map its position back to a label
    y_pred = class_list[np.argmax(np.array(y_score_list), axis=0)[0]]

    # Display image from imgDisp
    fig, ax = plt.subplots(figsize=(7,7))
    ax.imshow(imgDisp_select[i][0], cmap='gray')
    ax.tick_params(axis='both', which='major', labelsize=ticklabel_size) # Set tick label size
    ax.set_title(f"Label: {imgDisp_select[i][1]}, Prediction Label: {y_pred}", fontsize=label_size)

    print(f'Sample {i+1}: imgDisp label is {imgDisp_select[i][1]}, testset label is {testset_select[i][1]}, predict label is {y_pred}')

In [None]:
# Prediction: collect, for every test sample, the positive-class probability
# of each one-vs-rest model (one column per class). Ranking probabilities
# instead of hard 0/1 predictions avoids defaulting to class 0 when no model
# fires and to the lowest index when several fire.
y_proba_list = [mdl.predict_proba(x_test)[:, 1] for mdl in mdl_logic_list]
y_score = np.column_stack(y_proba_list)

# The most confident model decides; map its column back to a class label
y_pred = class_list[np.argmax(y_score, axis=1)]

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

混淆矩阵

In [None]:
# Create confusion matrix: rows are true labels, columns predicted labels.
# An integer dtype keeps the cell annotations as plain counts ("980")
# instead of floats ("980.0").
cm_test = np.zeros((10, 10), dtype=int)
for true_label, pred_label in zip(y_test, y_pred):
    cm_test[true_label, pred_label] += 1

# Display confusion matrix
fig, ax = plt.subplots(figsize=(9,9))
im = ax.imshow(cm_test, cmap=plt.cm.Blues, interpolation='nearest')

# Annotate every cell with its count; use white text on dark cells
half_max = cm_test.max() / 2.
for i in range(cm_test.shape[0]):
    for j in range(cm_test.shape[1]):
        ax.text(j, i, cm_test[i, j], fontsize=ticklabel_size, ha="center", va="center",
                color="white" if cm_test[i, j] > half_max else "black")

ax.set_xlabel('Predicted label', fontsize=label_size)
ax.set_ylabel('True label', fontsize=label_size)

ax.set_xticks(np.arange(10))
ax.set_xticklabels(np.arange(10))

ax.set_yticks(np.arange(10))
ax.set_yticklabels(np.arange(10))

ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)

# plt.savefig(f'confusion_matrix_numel.png', dpi=300) # Make figure clearer

plt.show()

In [None]:
# Accumulate the confusion matrix: rows are true labels, columns predictions
cm_test = np.zeros((10, 10))
for true_label, pred_label in zip(y_test, y_pred):
    cm_test[true_label, pred_label] += 1

# Normalise every row so each cell is the share of that true class
row_totals = cm_test.sum(axis=1, keepdims=True)
cm_test = cm_test / row_totals

# Display confusion matrix
fig, ax = plt.subplots(figsize=(9,9))
im = ax.imshow(cm_test, cmap=plt.cm.Blues, interpolation='nearest')

# Annotate every cell with its ratio; use white text on dark cells
half_max = cm_test.max() / 2.
for i in range(cm_test.shape[0]):
    for j in range(cm_test.shape[1]):
        cell = cm_test[i, j]
        ax.text(j, i, f'{cell:.2f}', fontsize=ticklabel_size, ha="center", va="center",
                color="white" if cell > half_max else "black")

ax.set_xlabel('Predicted label', fontsize=label_size)
ax.set_ylabel('True label', fontsize=label_size)

class_ticks = np.arange(10)
ax.set_xticks(class_ticks)
ax.set_xticklabels(class_ticks)

ax.set_yticks(class_ticks)
ax.set_yticklabels(class_ticks)

ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)

# plt.savefig(f'confusion_matrix_ratio.png', dpi=300) # Make figure clearer

plt.show()

3.5.2 一对一（One-vs-One）方法

In [None]:
# One-vs-One wrapper around logistic regression
mdl_logic_ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000))

# Fit the wrapper and measure the wall-clock training time
start_time = time.time()
mdl_logic_ovo.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

# Hard predictions on the test set and their accuracy
y_pred = mdl_logic_ovo.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
# Sorted list of the distinct class labels found in the training targets
class_list = np.sort(np.unique(y_train))

# Nested dict of models: mdl_logic_matrix[p][n] is the binary classifier that
# treats class p as positive and class n as negative. Every ordered pair gets
# its own model, so (p, n) and (n, p) are both created.
mdl_logic_matrix = {}
for cls_p in class_list:
    mdl_logic_matrix[cls_p] = {}
    for cls_n in class_list:
        if cls_p == cls_n:
            continue
        mdl_logic_matrix[cls_p][cls_n] = LogisticRegression(max_iter=1000)

# Train and evaluate one model per ordered (positive, negative) class pair
for cls_p in class_list:
    # Training data of positive class (target 1)
    x_train_ovo_p = x_train[(y_train == cls_p), :]
    y_train_ovo_p = np.ones(x_train_ovo_p.shape[0])

    # Testing data of positive class (target 1)
    x_test_ovo_p = x_test[(y_test == cls_p), :]
    y_test_ovo_p = np.ones(x_test_ovo_p.shape[0])

    for cls_n in class_list:
        if cls_p == cls_n:
            continue

        # Training data of negative class (target 0)
        x_train_ovo_n = x_train[(y_train == cls_n), :]
        y_train_ovo_n = np.zeros(x_train_ovo_n.shape[0])

        # Testing data of negative class (target 0)
        x_test_ovo_n = x_test[(y_test == cls_n), :]
        y_test_ovo_n = np.zeros(x_test_ovo_n.shape[0])

        # Concatenate data for training: only samples of the two classes are used
        x_train_ovo = np.concatenate((x_train_ovo_p, x_train_ovo_n), axis=0)
        y_train_ovo = np.concatenate((y_train_ovo_p, y_train_ovo_n), axis=0)

        # Model training, timed with wall-clock time
        start_time = time.time()
        mdl_logic_matrix[cls_p][cls_n].fit(x_train_ovo, y_train_ovo)
        end_time = time.time()

        # Concatenate data for testing
        x_test_ovo = np.concatenate((x_test_ovo_p, x_test_ovo_n), axis=0)
        y_test_ovo = np.concatenate((y_test_ovo_p, y_test_ovo_n), axis=0)

        # Test model on sub-task: class probabilities for the two-class test subset
        y_proba_ovo = mdl_logic_matrix[cls_p][cls_n].predict_proba(x_test_ovo) # Output ratio

        # Display results: confusion counts at the default threshold, then the derived metrics
        _, (tp, fp, tn, fn) = cls_counts(y_test_ovo, y_proba_ovo)
        precision, recall, specificity, accuracy, f1 = get_scores(tp, fp, tn, fn)
        print(f'Training class {cls_p} ({x_train_ovo_p.shape[0]}) vs class {cls_n} ({x_train_ovo_n.shape[0]}), Training time: {end_time - start_time:.2f} seconds, Precision: {precision:.4f}, Recall (Sensitivity): {recall:.4f}, Specificity: {specificity:.4f}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}')

In [None]:
# Use the complete test set for the vote
x_test_select = x_test[:, :]

# Vote table: one row per sample, one column per candidate class
y_pred_counts = np.zeros((x_test_select.shape[0], len(class_list)))

# Every pairwise model with cls_p as its positive class adds its 0/1
# prediction to the column of cls_p
for cls_p in class_list:
    for cls_n in class_list:
        if cls_p != cls_n:
            pair_model = mdl_logic_matrix[cls_p][cls_n]
            y_pred_counts[:, cls_p] = y_pred_counts[:, cls_p] + pair_model.predict(x_test_select)

# The class with the most votes wins
y_pred = np.argmax(y_pred_counts, axis=1)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

3.5.3 Softmax回归

In [None]:
# Multi-class logistic regression trained with the lbfgs solver
mdl_softmax = LogisticRegression(max_iter=1000, solver='lbfgs')

# Fit the model and measure the wall-clock training time
start_time = time.time()
mdl_softmax.fit(x_train, y_train)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

# Accuracy of the hard predictions on the test set
y_pred = mdl_softmax.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# One-hot encoding
def one_hot_encode(y, num_classes):
    """Convert a 1-D array of integer labels into a one-hot matrix.

    Args:
        y: 1-D integer array of class indices.
        num_classes: Number of columns of the result.

    Returns:
        Float array of shape ``(len(y), num_classes)`` with a single 1 per row.
    """
    # Row k of the identity matrix is the one-hot code of class k, so
    # indexing it with the labels yields one encoded row per sample.
    return np.eye(num_classes)[y]

# Encode the training labels with one column per known class
num_classes = len(class_list)
y_train_onehot = one_hot_encode(y_train, num_classes)

# Print the encoding of ten randomly drawn training samples
n_encoded = y_train_onehot.shape[0]
for _ in range(10):
    idx = np.random.randint(0, n_encoded)

    print(f'Sample {idx+1},\t Class {y_train[idx]}: {y_train_onehot[idx,:]}')

In [None]:
# Softmax function
def softmax(x):
    """Apply the softmax function to every row of a 2-D score array.

    The row maximum is subtracted before exponentiation so that large
    scores do not overflow; this leaves the result unchanged.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    row_totals = exp_scores.sum(axis=1, keepdims=True)
    return exp_scores / row_totals

# Cross-entropy loss
def cross_entropy_loss(y, y_pred):
    """Return the mean cross-entropy between targets and predictions.

    Args:
        y: Target array (one row per sample), e.g. one-hot encoded labels.
        y_pred: Predicted probabilities with the same shape as ``y``.

    Returns:
        The summed cross-entropy divided by the number of rows of ``y``.
    """
    epsilon = 1e-15  # Keeps the logarithm finite when a probability is 0
    weighted_log = y * np.log(y_pred + epsilon)
    return -weighted_log.sum() / y.shape[0]

def gradient_descent(x, y, learning_rate, num_iterations, l2_lambda=0.1, log_every=100):
    """Fit softmax-regression weights by batch gradient descent.

    Args:
        x: Feature matrix of shape ``(num_samples, num_features)``.
        y: One-hot target matrix of shape ``(num_samples, num_classes)``.
        learning_rate: Step size of every parameter update.
        num_iterations: Number of full-batch updates to perform.
        l2_lambda: Strength of the L2 penalty applied to the weights (the
            bias is not penalised). Defaults to the previously hard-coded 0.1.
        log_every: Print the loss every ``log_every`` iterations; a falsy
            value disables logging. Defaults to the previous interval of 100.

    Returns:
        ``(w, b)``: the weight matrix ``(num_features, num_classes)`` and the
        bias vector ``(num_classes,)``.
    """
    num_samples, num_features = x.shape
    num_classes = y.shape[1]

    # Initialize weights randomly and the bias with zeros
    w = np.random.randn(num_features, num_classes)
    b = np.zeros(num_classes)

    for i in range(num_iterations):
        # Forward pass
        y_pred = softmax(np.dot(x, w) + b)

        # The loss is only reported and never enters the update, so it is
        # evaluated on logging iterations only
        if log_every and i % log_every == 0:
            loss = cross_entropy_loss(y, y_pred)
            print(f'Iteration {i}, Loss: {loss}')

        # Backward pass: cross-entropy gradient plus the L2 penalty on w
        error = y_pred - y
        dw = (1 / num_samples) * np.dot(x.T, error) + l2_lambda * w
        db = (1 / num_samples) * np.sum(error, axis=0)

        # Update parameters
        w -= learning_rate * dw
        b -= learning_rate * db

    return w, b

def predict(x, w, b):
    """Return the most probable class index for every row of ``x``.

    Args:
        x: Feature matrix, one sample per row.
        w: Weight matrix of the softmax model.
        b: Bias vector of the softmax model.
    """
    probabilities = softmax(np.dot(x, w) + b)
    return probabilities.argmax(axis=1)

In [None]:
# Train the hand-written softmax regression and time the optimisation
start_time = time.time()
w, b = gradient_descent(x_train, y_train_onehot, learning_rate=0.1, num_iterations=1000)
end_time = time.time()

elapsed_seconds = end_time - start_time
print(f'Training time: {elapsed_seconds:.2f} seconds')

# Predict the test set with the learned parameters and report the accuracy
y_pred = predict(x_test, w, b)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## 3.6 实验3：基于机器学习方法的手写数字识别

此部分需要同学自行完成各个任务要求：
* 数据读取、特征提取及分析
* 分别使用逻辑回归、SVM、决策树、随机森林将手写数字分为大数（5-9）和小数（0-4）
* 尝试结合随机森林的思想，联合多个不同的分类器进行判断
* 使用ROC展示并分析二分类模型的结果
* 分别以One-vs-Rest, One-vs-One和softmax的方式识别手写数字
* 画出手写数字识别精度的分布以及混淆矩阵，并进行必要的描述与分析