### 步骤1: 编写贝叶斯二值分类器
- 贝叶斯分类器基于贝叶斯定理，最常见的是朴素贝叶斯分类器。二值分类器是最简单的形式，只区分两个类别。
- 语料地址： https://huggingface.co/datasets/dair-ai/emotion/tree/main/data

In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# 加载数据
def load_data(filename):
    return pd.read_csv(filename)

train = load_data('train.csv')
validation = load_data('validation.csv')
test = load_data('test.csv')

# 合并数据集进行词汇表构建
full_data = pd.concat([train, validation, test])

# 文本向量化
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(full_data['text']).toarray()

# 标签编码
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(full_data['label'])

# 分割数据集
train_size = len(train)
valid_size = len(validation)
test_size = len(test)

X_train = X[:train_size]
y_train = y[:train_size]
X_valid = X[train_size:train_size+valid_size]
y_valid = y[train_size:train_size+valid_size]
X_test = X[-test_size:]
y_test = y[-test_size:]

# 计算先验概率
p_y = np.bincount(y_train) / float(len(y_train))

# 计算条件概率
p_xy = np.zeros((len(np.unique(y)), X_train.shape[1]))  # 类别数 × 词汇表大小
for label in np.unique(y_train):
    X_label = X_train[y_train == label]
    p_xy[label, :] = (X_label.sum(axis=0) + 1) / (X_label.sum() + X_train.shape[1])  # 加上词汇表大小


# 预测函数
def predict(texts):
    X_test = vectorizer.transform(texts).toarray()
    log_p_xy = np.log(p_xy)  # 对数概率
    log_p_y = np.log(p_y)  # 对数先验概率
    log_p = X_test @ log_p_xy.T + log_p_y
    return label_encoder.inverse_transform(np.argmax(log_p, axis=1))

# 使用验证集评估模型
predictions = predict(validation['text'])
accuracy = np.mean(predictions == validation['label'])
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# 使用测试集评估模型
predictions_test = predict(test['text'])
accuracy_test = np.mean(predictions_test == test['label'])
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")


Validation Accuracy: 76.10%
Test Accuracy: 76.40%


In [6]:
from sklearn.metrics import confusion_matrix, classification_report

# 使用验证集评估模型
predictions = predict(validation['text'])
print("Validation Confusion Matrix:")
print(confusion_matrix(validation['label'], predictions))
print("Validation Classification Report:")
print(classification_report(validation['label'], predictions))

# 使用测试集评估模型
predictions_test = predict(test['text'])
print("Test Confusion Matrix:")
print(confusion_matrix(test['label'], predictions_test))
print("Test Classification Report:")
print(classification_report(test['label'], predictions_test))

Validation Confusion Matrix:
[[515  23   1   3   7   1]
 [ 29 665   5   4   0   1]
 [ 48  80  49   1   0   0]
 [ 60  38   0 174   3   0]
 [ 52  41   0   6 112   1]
 [ 36  28   0   1   9   7]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.94      0.80       550
           1       0.76      0.94      0.84       704
           2       0.89      0.28      0.42       178
           3       0.92      0.63      0.75       275
           4       0.85      0.53      0.65       212
           5       0.70      0.09      0.15        81

    accuracy                           0.76      2000
   macro avg       0.80      0.57      0.60      2000
weighted avg       0.78      0.76      0.73      2000

Test Confusion Matrix:
[[545  26   0   6   4   0]
 [ 17 669   5   1   2   1]
 [ 28  92  34   4   1   0]
 [ 66  41   0 161   7   0]
 [ 64  35   0   6 119   0]
 [ 22  30   0   1  13   0]]
Test Classification Report:
              pre