In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus (subset='all'), using ./data as the local data directory
newsgroups = fetch_20newsgroups(subset='all', data_home='data')

# Show the first raw document together with its integer label
sample_text, sample_label = newsgroups.data[0], newsgroups.target[0]
print("特征（文本数据）示例：")
print(sample_text)
print("\n标签示例：", sample_label)

# Quick overview of the labels and of the container types returned by the loader
overview = (
    ("标签总数：", len(newsgroups.target)),
    ("类别名称：", newsgroups.target_names),
    ("data类型：", type(newsgroups.data)),
    ("target类型：", type(newsgroups.target)),
    ("target最小值：", newsgroups.target.min()),
    ("target最大值：", newsgroups.target.max()),
)
for caption, value in overview:
    print(caption, value)



In [None]:
# Train/test split followed by TF-IDF feature extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 25% of the documents for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.25,
    random_state=42,
)

# Fit the vectorizer on the training documents only and vectorize them
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Report the number of features (columns) in the TF-IDF matrix
n_samples, n_features = X_train_tfidf.shape
print("处理后的特征数：", n_features)

# Peek at ten consecutive feature names starting at index 100000
# (the slice is simply empty if there are fewer features than that)
feature_names = vectorizer.get_feature_names_out()
print(feature_names[100000:100010])

In [None]:
# Number of documents held out in the test split
len(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Fit a multinomial Naive Bayes classifier (alpha=1 smoothing) on the training TF-IDF matrix
nb = MultinomialNB(alpha=1).fit(X_train_tfidf, y_train)

# Vectorize the test documents with the already-fitted vectorizer (transform only, no refit)
X_test_tfidf = vectorizer.transform(X_test)

# Predict class labels for the test documents
y_pred = nb.predict(X_test_tfidf)

# Evaluate: overall accuracy plus the per-class report
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)
print("准确率：", test_accuracy)
print("\n详细分类报告：\n", report)


In [None]:
# Manually compute the precision for the alt.atheism class

# Integer label that corresponds to alt.atheism
alt_atheism_index = newsgroups.target_names.index('alt.atheism')

# Boolean mask: test samples that were predicted as alt.atheism
predicted_as_alt_atheism = (y_pred == alt_atheism_index)

# Boolean mask: test samples whose true label is alt.atheism
actual_alt_atheism = (y_test == alt_atheism_index)

# True positives (TP): predicted alt.atheism and truly alt.atheism
TP = (predicted_as_alt_atheism & actual_alt_atheism).sum()

# False positives (FP): predicted alt.atheism but truly some other class
FP = (predicted_as_alt_atheism & ~actual_alt_atheism).sum()

# precision = TP / (TP + FP); fall back to 0 when nothing was predicted as this class
predicted_positive = TP + FP
if predicted_positive > 0:
    precision = TP / predicted_positive
else:
    precision = 0

print("alt.atheism 类别的精确率（手动计算）：", precision)


In [None]:
# Less important -- for reference only
from sklearn.metrics import roc_auc_score
import numpy as np

# Integer label that corresponds to alt.atheism
alt_atheism_index = newsgroups.target_names.index('alt.atheism')

# One-vs-rest view of the problem: 1 = alt.atheism, 0 = any other class
y_pred_binary = (nb.predict(X_test_tfidf) == alt_atheism_index).astype(int)
y_true_binary = (y_test == alt_atheism_index).astype(int)
print("y_pred_binary:", y_pred_binary)
print("y_true_binary:", y_true_binary)

# AUC computed from hard 0/1 predictions. With only two distinct score values the
# ROC curve has a single operating point, so this number does not reflect how well
# the model ranks samples; it is kept for comparison with the value below.
auc_score = roc_auc_score(y_true_binary, y_pred_binary)

print("alt.atheism 类别的AUC值（使用predict）：", auc_score)

# roc_auc_score is meant to receive continuous scores for the positive class.
# Columns of predict_proba follow nb.classes_, so look the column up there instead
# of assuming that the label value equals the column position.
# (Raises IndexError if the classifier never saw this class during training.)
alt_atheism_column = np.flatnonzero(nb.classes_ == alt_atheism_index)[0]
y_score = nb.predict_proba(X_test_tfidf)[:, alt_atheism_column]
auc_score_proba = roc_auc_score(y_true_binary, y_score)

print("alt.atheism 类别的AUC值（使用predict_proba）：", auc_score_proba)
