In [90]:
import math

from natasha import Doc, Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger
from nltk.corpus import stopwords

filepath: str = "your filepath"

with open(filepath, "r") as file:
    text: str = file.read().replace("\n", " ")

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

stopwords = stopwords.words("russian")


def preprocess_text(text: str) -> list[str]:
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    tokens: list = []
    for token in doc.tokens:
        if token.pos == "PUNCT" or token.pos == "NUM":
            continue

        token.lemmatize(morph_vocab)
        lemma = token.lemma.lower()

        if lemma not in stopwords:
            tokens.append(lemma)

    return tokens


preprocessed_text = preprocess_text(text)

print("preprocessed_text: ", preprocessed_text[:100])

FileNotFoundError: [Errno 2] No such file or directory: 'your filepath'

In [88]:
def bag_of_words(text: list[str]) -> list[int]:
    vocab_set = set()
    for tokens in text:
        vocab_set.update(tokens)
    vocabulary = sorted(list(vocab_set))

    bow_matrix = []
    for tokens in text:
        row = [0] * len(vocabulary)
        for token in tokens:
            if token in vocabulary:
                j = vocabulary.index(token)
                row[j] += 1
        bow_matrix.append(row)

    return bow_matrix


matrix = bag_of_words([preprocessed_text, preprocessed_text])

print("\n--- Bag of Words ---")
print("matrix: ", matrix[:10])
for i, row in enumerate(matrix):
    print(f"Документ {i}:", row)


--- Bag of Words ---
matrix:  [[1, 1, 2, 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 5, 1, 1, 1, 2, 1, 1, 1, 12, 1, 7, 1, 5, 3, 5, 3, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 1, 1, 6, 1, 2, 15, 3, 9, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 8, 1, 11, 7, 2, 1, 1, 2, 1, 1, 1, 3, 1, 9, 17, 1, 2, 3, 12, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 1, 1, 5, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 6, 8, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 4, 1, 1, 2, 3, 1, 1, 3, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 2, 3, 1, 1, 2, 1, 5, 1, 1, 3, 1, 1, 5, 3, 2, 1, 1, 1, 19, 3, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 3, 1, 1, 18, 4, 1, 1, 2, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 4, 2, 2, 6, 4, 1, 1, 1, 1, 2, 2, 1, 1, 44, 1, 3, 1, 3, 12, 1, 2, 3, 1, 1, 10, 8, 5, 4, 3, 1, 1, 2, 5, 2, 5, 5, 19, 19, 1, 3, 30, 2, 4, 2, 2, 2, 2, 1, 1, 2, 24, 1, 1, 7, 1, 1, 1, 15, 2, 7, 18, 1, 6, 2, 1, 2, 1, 1, 1, 2, 6, 5, 15, 17, 1, 1, 1, 3, 1, 1, 1, 1, 2, 6, 1, 1, 1, 1, 7, 1, 18, 2, 1, 2, 1, 1, 1, 1, 24, 1, 9, 1, 5, 1, 1, 1, 11

In [89]:
def tf_idf(text: list[str]) -> tuple:
    """
    Принимает список списков токенов.
    Возвращает:
    - vocabulary: список уникальных слов (отсортированных)
    - tfidf_matrix: матрицу (число документов) x (число слов),
      где [i, j] = TF-IDF слова j в документе i.
    """
    # 1. Собираем словарь
    vocab_set = set()
    for tokens in text:
        vocab_set.update(tokens)
    vocabulary = sorted(list(vocab_set))
    N = len(text)  # число документов

    # 2. Подсчитаем df(t) для каждого слова
    df = [0] * len(vocabulary)
    for i, word in enumerate(vocabulary):
        for tokens in text:
            if word in tokens:
                df[i] += 1

    # 3. Вычислим IDF
    idf = [math.log(N / (1 + df[i])) for i in range(len(vocabulary))]

    # 4. Собираем TF-IDF матрицу
    tfidf_matrix = []
    for tokens in text:
        total_words = len(tokens)
        word_counts = {}
        for t in tokens:
            word_counts[t] = word_counts.get(t, 0) + 1

        row = []
        for i, word in enumerate(vocabulary):
            tf = word_counts.get(word, 0) / total_words if total_words > 0 else 0
            tfidf_value = tf * idf[i]
            row.append(tfidf_value)
        tfidf_matrix.append(row)

    return vocabulary, tfidf_matrix

vocab_tfidf, tfidf_matrix = tf_idf(preprocessed_text)
print("\n--- TF-IDF ---")
print("Словарь:", vocab_tfidf[:10])
print("Матрица TF-IDF:")
for i, row in enumerate(tfidf_matrix[:10]):
    print(f"Документ {i}:", row)



--- TF-IDF ---
Словарь: ['-', '1', '7', '8', '9', 'a', 'c', 'd', 'e', 'f', 'h', 'i', 'k', 'n', 'o', 'p', 'r', 's', 't', 'v', 'x', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я']
Матрица TF-IDF:
Документ 0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25351202406893186, 0.0, 0.0, 0.08429220877156443, 0.0, 0.0, 0.12070343375368542, 0.0, 0.09352538719943379, 0.0, 0.2685686663268061, 0.09457389098969145, 0.08247568415916456, 0.05570386505083651, 0.1573922870561724, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Документ 1: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2722669630883656, 0.0, 0.0, 0.0, 0.0, 0.11267201069730304, 0.0, 0.23738025074653155, 0.11238961169541925, 0.0, 0.18169309368279768, 0.0, 0.0, 0.249401032