In [9]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups dataset (subset='all'), using ./data as the download/cache folder
newsgroups = fetch_20newsgroups(data_home='data',subset='all')

# Short aliases for the raw documents and their integer class labels
documents = newsgroups.data
labels = newsgroups.target

# Show one raw document together with its label
print("特征（文本数据）示例：")
print(documents[0])
print("\n标签示例：", labels[0])

# Summarise the label vector and the containers that hold the data
summary_rows = [
    ("标签总数：", len(labels)),
    ("类别名称：", newsgroups.target_names),
    ("data类型：", type(documents)),
    ("target类型：", type(labels)),
    ("target最小值：", min(labels)),
    ("target最大值：", max(labels)),
]
for caption, value in summary_rows:
    print(caption, value)



特征（文本数据）示例：
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!



标签示例： 10
标签总数： 18846
类别名称： ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc

  cache = pickle.loads(uncompressed_content)


In [10]:
# 数据分割，tfidf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 25% of the documents for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Fit the TF-IDF vectorizer on the training documents only and vectorize them
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# The number of columns of the TF-IDF matrix is the number of extracted features
n_features = X_train_tfidf.shape[1]
print("处理后的特征数：", n_features)

# Peek at ten feature names (positions 100000-100009)
feature_names = vectorizer.get_feature_names_out()
print(feature_names[100000:100010])

处理后的特征数： 146060
['o_uv' 'o_wcp_' 'o_yd' 'oa' 'oa0' 'oa13' 'oa2' 'oa3' 'oa44o' 'oa4no']


In [11]:
# Number of documents in the held-out test split produced by train_test_split
len(X_test)

4712

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Vectorize the test documents with the vectorizer already fitted on the training set
X_test_tfidf = vectorizer.transform(X_test)

# Train a multinomial naive Bayes classifier (alpha=1 is its additive smoothing parameter)
nb = MultinomialNB(alpha=1).fit(X_train_tfidf, y_train)

# Predict a class index for every test document
y_pred = nb.predict(X_test_tfidf)

# Evaluate: overall accuracy, then the per-class precision / recall / F1 report
test_accuracy = accuracy_score(y_test, y_pred)
print("准确率：", test_accuracy)

per_class_report = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups.target_names,
)
print("\n详细分类报告：\n", per_class_report)


准确率： 0.8425297113752123

详细分类报告：
                           precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.med       0.97      

In [19]:
# Manually compute the precision of the alt.atheism class

# Integer label that encodes alt.atheism
alt_atheism_index = newsgroups.target_names.index('alt.atheism')
print(y_pred)

# Boolean mask: test samples that were predicted as alt.atheism
predicted_as_alt_atheism = (y_pred == alt_atheism_index)

# Boolean mask: test samples whose true label is alt.atheism
actual_alt_atheism = (y_test == alt_atheism_index)

# True positives (TP): predicted alt.atheism and actually alt.atheism
TP = (predicted_as_alt_atheism & actual_alt_atheism).sum()

# False positives (FP): predicted alt.atheism but actually another class
FP = (predicted_as_alt_atheism & ~actual_alt_atheism).sum()

# precision = TP / (TP + FP); fall back to 0 when nothing was predicted as the class
n_predicted_positive = TP + FP
if n_predicted_positive > 0:
    precision = TP / n_predicted_positive
else:
    precision = 0

print("alt.atheism 类别的精确率（手动计算）：", precision)


[ 9 12 14 ...  9  3  8]
alt.atheism 类别的精确率（手动计算）： 0.8827160493827161


In [14]:
# 相对没那么重要,了解
from sklearn.metrics import roc_auc_score
import numpy as np

# Integer label that encodes alt.atheism
alt_atheism_index = newsgroups.target_names.index('alt.atheism')

# One-vs-rest binarisation: 1 where the (predicted / true) class is alt.atheism, else 0
y_pred_binary = (nb.predict(X_test_tfidf) == alt_atheism_index).astype(int)
y_true_binary = (y_test == alt_atheism_index).astype(int)
print("y_pred_binary:", y_pred_binary)
print("y_true_binary:", y_true_binary)

# AUC computed from hard 0/1 predictions. With thresholded labels the ROC curve
# has a single operating point, so this value is not a ranking-quality measure;
# it is kept only for comparison with the score-based AUC below.
auc_score = roc_auc_score(y_true_binary, y_pred_binary)

print("alt.atheism 类别的AUC值（使用predict）：", auc_score)

# Score-based AUC: roc_auc_score expects continuous scores (probabilities or
# decision values), so rank the test samples by the predicted probability of
# alt.atheism. predict_proba columns follow nb.classes_, so look the column up
# there instead of assuming it equals the label value.
alt_atheism_column = list(nb.classes_).index(alt_atheism_index)
alt_atheism_proba = nb.predict_proba(X_test_tfidf)[:, alt_atheism_column]
auc_score_proba = roc_auc_score(y_true_binary, alt_atheism_proba)

print("alt.atheism 类别的AUC值（使用predict_proba）：", auc_score_proba)


y_pred_binary: [0 0 0 ... 0 0 0]
y_true_binary: [0 0 0 ... 0 0 0]
alt.atheism 类别的AUC值（使用predict）： 0.8590065475311377
