In [5]:
from types import SimpleNamespace

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Load the complete 20 newsgroups corpus (subset='all'), shuffled with a fixed seed
# so that repeated runs see the documents in the same order.
news = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Quick look at what was loaded: description, size, class names, one raw post.
print("数据集描述:", news.DESCR)
print("\n数据集大小:", len(news.data))
print("\n目标分类:", news.target_names)
print("\n示例文本:\n", news.data[0])

# Hold out 20% of the documents as a test set; random_state pins the split so the
# reported metrics are reproducible.
# NOTE(review): the split is not stratified by class; consider passing
# stratify=news.target if per-class balance between the two sets matters.
X_train, X_test, y_train, y_test = train_test_split(
    news.data,      # raw text documents
    news.target,    # integer class labels
    test_size=0.2,
    random_state=42
)

# Bundle each split into a lightweight attribute container so later cells can use
# the same .data / .target / .target_names access pattern as the loaded dataset.
# SimpleNamespace is the standard-library tool for this; it replaces the previous
# approach of calling type() to build two throwaway classes that were both named 'obj'.
newsgroups_train = SimpleNamespace(
    data=X_train,                     # training documents
    target=y_train,                   # training labels
    target_names=news.target_names,   # class names, shared by both splits
)

newsgroups_test = SimpleNamespace(
    data=X_test,                      # test documents
    target=y_test,                    # test labels
    target_names=news.target_names,
)

print("训练集大小:", len(X_train))
print("测试集大小:", len(X_test))




数据集描述: .. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features               

In [6]:
# Turn the raw posts into TF-IDF feature matrices.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary pruning applied by the vectorizer:
#   stop_words='english' -> drop the built-in English stop-word list
#   min_df=5             -> drop terms that occur in fewer than 5 documents
#   max_df=0.5           -> drop terms that occur in more than 50% of the documents
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5)

# The vocabulary and IDF weights are learned from the training documents only;
# the test documents are merely projected onto that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(newsgroups_train.data)
X_test_tfidf = vectorizer.transform(newsgroups_test.data)

# Both matrices must have the same number of columns (one per vocabulary term).
print("训练集向量化后的形状:", X_train_tfidf.shape)
print("测试集向量化后的形状:", X_test_tfidf.shape)


训练集向量化后的形状: (15076, 30254)
测试集向量化后的形状: (3770, 30254)


In [7]:
# Train and evaluate a multinomial naive Bayes classifier on the TF-IDF features.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# alpha is the additive smoothing strength. fit() returns the estimator itself,
# so the classifier is constructed and trained in a single expression.
nb_classifier = MultinomialNB(alpha=0.01).fit(X_train_tfidf, newsgroups_train.target)

# Predicted class index for every held-out document.
y_pred = nb_classifier.predict(X_test_tfidf)

# Overall fraction of test documents classified correctly.
accuracy = accuracy_score(newsgroups_test.target, y_pred)
print("分类准确率:", accuracy)

# Per-class precision / recall / F1 / support, labelled with the newsgroup names.
print("\n分类报告:")
print(
    classification_report(
        newsgroups_test.target,
        y_pred,
        target_names=newsgroups_test.target_names,
    )
)


分类准确率: 0.9103448275862069

分类报告:
                          precision    recall  f1-score   support

             alt.atheism       0.89      0.93      0.91       151
           comp.graphics       0.81      0.89      0.84       202
 comp.os.ms-windows.misc       0.87      0.81      0.84       195
comp.sys.ibm.pc.hardware       0.72      0.83      0.77       183
   comp.sys.mac.hardware       0.89      0.88      0.88       205
          comp.windows.x       0.90      0.88      0.89       215
            misc.forsale       0.88      0.78      0.83       193
               rec.autos       0.91      0.95      0.93       196
         rec.motorcycles       0.95      0.96      0.95       168
      rec.sport.baseball       0.98      0.99      0.98       211
        rec.sport.hockey       0.97      0.97      0.97       198
               sci.crypt       0.96      0.95      0.96       201
         sci.electronics       0.92      0.86      0.88       202
                 sci.med       0.96      0

In [9]:
# Summarise precision and recall over all classes as single numbers.
from sklearn.metrics import precision_score, recall_score

# average='weighted' averages the per-class scores weighted by each class's support.
# Support-weighted recall is mathematically the same quantity as overall accuracy,
# so the recall printed here is expected to match the accuracy reported earlier.
precision = precision_score(newsgroups_test.target, y_pred, average='weighted')
recall = recall_score(newsgroups_test.target, y_pred, average='weighted')

print("精确率:", precision)
print("召回率:", recall)


精确率: 0.9117019821190391
召回率: 0.9103448275862069
