In [None]:
import numpy as np
import pandas as pd

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from lda import LdaPreprocessor, LdaCoherenceEvaluator, LatentDirichletAllocation as LDA


In [2]:
# Seed passed as `random_state` to both LDA models fitted further down.
SEED = 42

# The six newsgroups that make up the corpus for this experiment.
categories = [
    'sci.space', 'comp.graphics', 'soc.religion.christian',
    'alt.atheism', 'talk.politics.guns', 'rec.sport.hockey',
]

# Download (or load from cache) the raw posts with headers, footers and
# quoted replies stripped; the integer labels are discarded here.
X, _ = fetch_20newsgroups(
    return_X_y=True,
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

In [None]:
# Exploratory statistics for the selected newsgroups: number of documents,
# document length (in whitespace-separated words) and vocabulary size,
# all reported per class.
categories = [
    'sci.space',
    'comp.graphics',
    'soc.religion.christian',
    'alt.atheism',
    'talk.politics.guns',
    'rec.sport.hockey'
]

X, y = fetch_20newsgroups(
    categories=categories,
    remove=('headers', 'footers', 'quotes') ,
    return_X_y=True,
)

# fetch_20newsgroups numbers the labels according to its own `target_names`,
# which follow the dataset's alphabetical ordering rather than the order of
# the `categories` argument. With return_X_y=True the names are not returned,
# so they are rebuilt by sorting; using `categories` directly would attribute
# every statistic below to the wrong newsgroup.
target_names = sorted(categories)

vectorizer = CountVectorizer(stop_words="english")
X_vec = vectorizer.fit_transform(X)

vocab = vectorizer.get_feature_names_out()
vocab_set = set(vocab)

# Group the raw documents and their word counts by class name.
docs_by_class = defaultdict(list)
lengths_by_class = defaultdict(list)

for doc, label in zip(X, y):
    class_name = target_names[label]
    lengths_by_class[class_name].append(len(doc.split()))
    docs_by_class[class_name].append(doc)

# The analyzer is the same for every class, so it is built once up front.
analyzer = vectorizer.build_analyzer()

# Per-class vocabulary: unique tokens the vectorizer's analyzer extracts
# from all documents of the class joined together.
word_sets_by_class = {
    class_name: set(analyzer(" ".join(docs)))
    for class_name, docs in docs_by_class.items()
}

print(f"📄 Общее количество документов: {len(X)}\n")

print("📊 Распределение по классам:")
for class_name in target_names:
    print(f"- {class_name}: {len(docs_by_class[class_name])} документов")
print()

print("📏 Длина документов (в словах) по классам:")
for class_name in target_names:
    lengths = lengths_by_class[class_name]
    print(f"- {class_name}:")
    print(f"  • Средняя длина : {np.mean(lengths):.1f}")
    print(f"  • Медианная     : {np.median(lengths):.1f}")
    print(f"  • Мин           : {np.min(lengths)}")
    print(f"  • Макс          : {np.max(lengths)}")

    # Show a 100-character preview of the shortest and longest documents.
    min_idx = np.argmin(lengths)
    max_idx = np.argmax(lengths)
    print(f"  • 🔽 Самый короткий документ: «{docs_by_class[class_name][min_idx][:100]}...»")
    print(f"  • 🔼 Самый длинный документ : «{docs_by_class[class_name][max_idx][:100]}...»\n")

print("🔡 Количество уникальных слов (после CountVectorizer):")
for class_name in target_names:
    print(f"- {class_name}: {len(word_sets_by_class[class_name])} слов")

print(f"\n🧠 Всего уникальных слов во всех документах: {len(vocab_set)}")

In [3]:
# Turn the raw posts into the count matrix both LDA models are fitted on.
# NOTE(review): the keyword names mirror scikit-learn's CountVectorizer
# (unigrams only, English stop words, document-frequency bounds, vocabulary
# capped at 1000 terms) — confirm the exact semantics in the `lda` module.
processor = LdaPreprocessor(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=5,
    max_df=0.9,
    max_features=1000,
)
X_counts = processor.fit_transform(X)

In [4]:
# Reference model: scikit-learn's LDA with one topic per newsgroup,
# trained with the batch variational Bayes update and a fixed seed.
lda_sklearn = LatentDirichletAllocation(n_components=6, learning_method='batch', random_state=SEED)

# Kept as the cell's last expression so the fitted estimator is displayed.
lda_sklearn.fit(X_counts)

In [5]:
# For every topic of the scikit-learn model, list its ten highest-weighted
# terms. The indices come from an ascending sort, so the strongest term is
# printed last.
for topic_idx, weights in enumerate(lda_sklearn.components_):
    top_positions = np.argsort(weights)[-10:]
    top_terms = processor.feature_names_[top_positions]
    print(f"Topic {topic_idx}: {' '.join(top_terms)}")

Topic 0: state crime pts law weapons control firearms file guns gun
Topic 1: time christian faith does people christ bible church jesus god
Topic 2: moon data lunar shuttle satellite orbit earth launch nasa space
Topic 3: jpeg files ftp data available software file graphics edu image
Topic 4: good time does say know like just think people don
Topic 5: league players year nhl games season play hockey game team


In [6]:
# Project implementation of LDA (imported from the local `lda` module as
# `LDA`), configured with the same number of topics and seed as the
# scikit-learn model.
lda_own = LDA(n_topics=6, random_state=SEED)

# Kept as the cell's last expression so the fit result is displayed.
lda_own.fit(X_counts)

<lda.LatentDirichletAllocation at 0x2b085021710>

In [7]:
# For every topic of the project's own model, list its ten highest-weighted
# terms. The indices come from an ascending sort, so the strongest term is
# printed last.
for topic_idx, weights in enumerate(lda_own.components_):
    top_positions = weights.argsort()[-10:]
    top_terms = processor.feature_names_[top_positions]
    print(f"Topic {topic_idx}: {' '.join(top_terms)}")

Topic 0: control year period team pts file firearms new guns gun
Topic 1: time christian faith does people christ bible church jesus god
Topic 2: moon data lunar shuttle satellite orbit earth launch nasa space
Topic 3: jpeg files ftp data available software file graphics edu image
Topic 4: good time does say know like just think don people
Topic 5: points van win goal hockey play games season team game


In [8]:
# One shared evaluator is built from the raw documents and the fitted
# preprocessor, and is then used to score both models.
# NOTE(review): the coherence metric itself is defined in the `lda` module —
# confirm which variant it implements.
evaluator = LdaCoherenceEvaluator(processor=processor, docs=X)

In [9]:
# Topic-coherence score of the scikit-learn model, computed by the shared evaluator.
coherence_sklearn = evaluator.coherence(lda_sklearn)

In [10]:
# Topic-coherence score of the project's own LDA model, computed by the same evaluator.
coherence_own = evaluator.coherence(lda_own)

In [11]:
# Report both coherence scores, one per line, using the self-documenting
# `name = value` f-string form rounded to four decimals.
for report_line in (f'{coherence_sklearn = :.4f}', f'{coherence_own = :.4f}'):
    print(report_line)

coherence_sklearn = 0.7767
coherence_own = 0.7147


In [12]:
# Benchmark training of the scikit-learn model; every timing run calls
# fit() again on the already-constructed estimator.
%timeit lda_sklearn.fit(X_counts)

9.05 s ± 4.53 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# Benchmark training of the project's own model on the same count matrix;
# every timing run calls fit() again on the already-constructed estimator.
%timeit lda_own.fit(X_counts)

27.7 s ± 7.4 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
