In [None]:
import numpy as np
import pandas as pd

from collections import defaultdict

from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)

from naive_bayes import NaiveBayes

SEED = 42

In [2]:
categories = [
    'sci.space',
    'comp.graphics',
    'soc.religion.christian'
]

X, y = fetch_20newsgroups(
    categories=categories,
    remove=('headers', 'footers', 'quotes') ,
    return_X_y=True,
)

In [None]:
target_names = categories

vectorizer = CountVectorizer(stop_words="english")
X_vec = vectorizer.fit_transform(X) 

vocab = vectorizer.get_feature_names_out()
vocab_set = set(vocab)

docs_by_class = defaultdict(list)
lengths_by_class = defaultdict(list)

for doc, label in zip(X, y):
    words = doc.split()
    lengths_by_class[target_names[label]].append(len(words))
    docs_by_class[target_names[label]].append(doc)


word_sets_by_class = {}
for class_name, docs in docs_by_class.items():
    class_text = " ".join(docs)
    tokens = vectorizer.build_analyzer()(class_text) 
    word_sets_by_class[class_name] = set(tokens)

print(f"📄 Общее количество документов: {len(X)}\n")

print("📊 Распределение по классам:")
for class_name in target_names:
    print(f"- {class_name}: {len(docs_by_class[class_name])} документов")
print()

print("📏 Длина документов (в словах) по классам:")
for class_name in target_names:
    lengths = lengths_by_class[class_name]
    print(f"- {class_name}:")
    print(f"  • Средняя длина : {np.mean(lengths):.1f}")
    print(f"  • Медианная     : {np.median(lengths):.1f}")
    print(f"  • Мин           : {np.min(lengths)}")
    print(f"  • Макс          : {np.max(lengths)}")

    min_idx = np.argmin(lengths)
    max_idx = np.argmax(lengths)
    print(f"  • 🔽 Самый короткий документ: «{docs_by_class[class_name][min_idx][:100]}...»")
    print(f"  • 🔼 Самый длинный документ : «{docs_by_class[class_name][max_idx][:100]}...»\n")

print("🔡 Количество уникальных слов (после CountVectorizer):")
for class_name in target_names:
    print(f"- {class_name}: {len(word_sets_by_class[class_name])} слов")

print(f"\n🧠 Всего уникальных слов во всех документах: {len(vocab_set)}")


📄 Общее количество документов: 1776

📊 Распределение по классам:
- sci.space: 584 документов
- comp.graphics: 593 документов
- soc.religion.christian: 599 документов

📏 Длина документов (в словах) по классам:
- sci.space:
  • Средняя длина : 157.8
  • Медианная     : 62.0
  • Мин           : 0
  • Макс          : 9109
  • 🔽 Самый короткий документ: «...»
  • 🔼 Самый длинный документ : «Archive-name: jpeg-faq
Last-modified: 18 April 1993

This FAQ article discusses JPEG image compressi...»

- comp.graphics:
  • Средняя длина : 202.3
  • Медианная     : 82.0
  • Мин           : 0
  • Макс          : 6109
  • 🔽 Самый короткий документ: «...»
  • 🔼 Самый длинный документ : «COMMERCIAL SPACE NEWS/SPACE TECHNOLOGY INVESTOR NUMBER 22

   This is number twenty-two in an irregu...»

- soc.religion.christian:
  • Средняя длина : 262.8
  • Медианная     : 157.0
  • Мин           : 0
  • Макс          : 2939
  • 🔽 Самый короткий документ: «...»
  • 🔼 Самый длинный документ : «I have come across wh

In [4]:
pipeline = make_pipeline(
    CountVectorizer(stop_words="english"), 
    NaiveBayes(alpha=1.0)
)

sk_pipeline = make_pipeline(
    CountVectorizer(stop_words="english"), 
    MultinomialNB(alpha=1.0)
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

In [5]:
def evaluate_model_with_cv(model, X, y, cv=5, average='macro'):
    scoring = {
        'accuracy':  make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average=average, zero_division=0),
        'recall':    make_scorer(recall_score, average=average, zero_division=0),
        'f1':        make_scorer(f1_score, average=average, zero_division=0)
    }

    scores = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=False)

    metrics = ['accuracy', 'precision', 'recall', 'f1']
    result_df = pd.DataFrame({m: scores[f'test_{m}'] for m in metrics})
    result_df.loc['mean'] = result_df.mean()

    print(f"📊 Результаты кросс-валидации ({cv.n_splits}-fold, усреднение: '{average}'):\n")
    print(result_df.round(4))

In [6]:
evaluate_model_with_cv(pipeline, X, y, cv=cv, average='macro')

📊 Результаты кросс-валидации (5-fold, усреднение: 'macro'):

      accuracy  precision  recall      f1
0       0.8933     0.9043  0.8930  0.8933
1       0.8986     0.9017  0.8985  0.8982
2       0.9183     0.9215  0.9181  0.9182
3       0.9127     0.9164  0.9121  0.9121
4       0.9099     0.9139  0.9094  0.9095
mean    0.9065     0.9116  0.9062  0.9063


In [7]:
evaluate_model_with_cv(sk_pipeline, X, y, cv=cv, average='macro')

📊 Результаты кросс-валидации (5-fold, усреднение: 'macro'):

      accuracy  precision  recall      f1
0       0.8933     0.9043  0.8930  0.8933
1       0.8986     0.9017  0.8985  0.8982
2       0.9183     0.9215  0.9181  0.9182
3       0.9127     0.9164  0.9121  0.9121
4       0.9099     0.9139  0.9094  0.9095
mean    0.9065     0.9116  0.9062  0.9063


In [8]:
%timeit cross_validate(pipeline, X, y, scoring='accuracy', cv=cv, return_train_score=False)

696 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%timeit cross_validate(sk_pipeline, X, y, scoring='accuracy', cv=cv, return_train_score=False)

707 ms ± 5.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
