In [None]:
import pandas as pd
import numpy as np
import ast
import re
from collections import defaultdict

# PROPER FIX FOR pymorphy2 in Python 3.11+
!pip install -q pymorphy3  # New fork compatible with Python 3.11+
from pymorphy3 import MorphAnalyzer

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Initialize Russian lemmatizer.
# A single analyzer instance is created here with default arguments and reused
# through the module-level name `morph` by normalize_word() and
# clean_topic_name(); those functions fail with NameError if this cell has not
# been run first.
morph = MorphAnalyzer()

In [None]:
# Special cases and abbreviations.
# SPECIAL_CASES maps a lowercased input token to the exact text that
# normalize_word() returns for it, bypassing morphological analysis entirely.
# Keys must be lowercase because the lookup is done on `word.lower()`.
SPECIAL_CASES = {
    # Abbreviations: restored to their upper-case spelling.
    'сша': 'США',
    'сизо': 'СИЗО',
    'мчс': 'МЧС',
    'цб': 'ЦБ',
    'дтп': 'ДТП',
    'хамас': 'ХАМАС',
    'нато': 'НАТО',
    'ндфл': 'НДФЛ',
    # Inflected proper names: mapped directly to a capitalized base form.
    'россии': 'Россия',
    'хезболлы': 'Хезболла',
    'росавиации': 'Росавиация',
    'дону': 'Дон',
    'украины': 'Украина',
    'украина': 'Украина',
    # Empty-string values mark tokens to drop: clean_topic_name() skips any
    # word whose normalized form is falsy.
    'подписаться': '',
    'на': '',
    'это': '',
    # Single-letter prepositions are mapped to themselves, so they are returned
    # unchanged without being passed to the analyzer.
    'в': 'в',
    'с': 'с'
}

# Upper-case abbreviations.
# NOTE(review): this set is not referenced by any code visible in this notebook
# section, and it omits 'НАТО' and 'НДФЛ' even though SPECIAL_CASES produces
# them — confirm whether it is used elsewhere before extending or removing it.
ABBREVIATIONS = {'США', 'МЧС', 'ЦБ', 'ДТП', 'ХАМАС', 'СИЗО'}

In [None]:
def normalize_word(word, prev_word=None):
    """Return a display form of ``word``, preferring a nominative/base form.

    Resolution order:

    1. If ``word.lower()`` is a key of ``SPECIAL_CASES``, the mapped value is
       returned as-is (it may be an empty string, which callers treat as
       "drop this word").
    2. A numeric token (digits, optionally with dots) whose ``prev_word`` is
       one of the unit words ``рублей``/``млн``/``млрд``/``кв`` is returned
       unchanged.
    3. Otherwise the first parse from the module-level ``morph`` analyzer is
       used. If its tag contains one of NOUN, ADJF, ADJS, VERB, PRTF or GRND,
       the word is inflected to the nominative case; that inflected form is
       kept only when it coincides (case-insensitively) with one of the
       normal forms of any parse, otherwise the first parse's normal form is
       used. Words with any other tag are returned unchanged.

    When the input is title-cased, the result is capitalized.

    Args:
        word: Token to normalize (a non-empty string is expected).
        prev_word: The previously accepted token, or ``None``. Only consulted
            for the numeric-token rule above.

    Returns:
        The normalized string. Analysis is best-effort: if the analyzer raises
        an ordinary ``Exception``, ``word`` is returned unchanged.
    """
    lower_word = word.lower()

    # Hand-maintained overrides win over any morphological analysis.
    if lower_word in SPECIAL_CASES:
        return SPECIAL_CASES[lower_word]

    # Numbers next to a unit word are kept verbatim.
    if word.replace('.', '').isdigit() and prev_word:
        if prev_word.lower() in ('рублей', 'млн', 'млрд', 'кв'):
            return word

    inflectable = ('NOUN', 'ADJF', 'ADJS', 'VERB', 'PRTF', 'GRND')

    # `except Exception` (not a bare `except`) keeps the best-effort fallback
    # while letting KeyboardInterrupt / SystemExit propagate.
    try:
        # Parse once and reuse the list for both the primary parse and the
        # set of candidate normal forms.
        parses = morph.parse(word)
        parsed = parses[0]

        if not any(tag in parsed.tag for tag in inflectable):
            return word

        normal_forms = {p.normal_form.lower() for p in parses}

        result = parsed.normal_form
        try:
            # inflect() may return None when the requested form cannot be
            # built (e.g. verbs have no nominative); handle that explicitly.
            inflected = parsed.inflect({'nomn'})
            nominative = inflected.word if inflected is not None else None
        except Exception:
            nominative = None

        # Accept the nominative only if it is itself a known normal form;
        # this rejects e.g. nominative plurals in favour of the lemma.
        if nominative is not None and nominative.lower() in normal_forms:
            result = nominative

        # Preserve original capitalization for proper nouns
        if word.istitle() and not any(c.isupper() for c in word[1:]):
            return result.capitalize()
        return result
    except Exception:
        return word

In [None]:
def construct_phrase(words):
    """Join already-normalized words into a single space-separated phrase.

    A digit-only word that is immediately followed by a unit word
    (``рублей``, ``млн``, ``млрд`` or ``кв``, compared case-insensitively) is
    merged with that unit into one "<number> <unit>" item; the unit is
    lowercased in the result. All other words are kept unchanged and in order.

    Args:
        words: Sequence of strings.

    Returns:
        The joined phrase, or ``""`` when ``words`` is empty.
    """
    if not words:
        return ""

    units = ('рублей', 'млн', 'млрд', 'кв')
    last_index = len(words) - 1
    phrase = []
    skip_next = False

    for i, word in enumerate(words):
        # The unit was already consumed together with the preceding number.
        if skip_next:
            skip_next = False
            continue

        # Handle number+unit combinations
        if word.isdigit() and i < last_index:
            next_word = words[i + 1].lower()
            if next_word in units:
                phrase.append(f"{word} {next_word}")
                skip_next = True
                continue

        # Note: a separate "digit followed by 'млн'" branch used to live here.
        # It could never run, because that pair is always merged above and the
        # 'млн' item is then skipped, so it has been removed.
        phrase.append(word)

    phrase_text = ' '.join(phrase)

    # Normalize the whitespace character that follows a standalone "в"/"с" to
    # a plain space. This does not insert prepositions; it only matters when
    # an input word itself contains a non-space whitespace character.
    return re.sub(r'(\s[вс])\s', r'\1 ', phrase_text)

In [None]:
def clean_topic_name(name, max_words=3):
    """Turn a raw topic label into a short, readable topic name.

    The label is expected to look like ``"<id>_<word>_<word>..."`` or
    ``"<id>____<word>_<word>..."``. Everything up to the first ``_`` (or up to
    the last ``____`` when present) is discarded, the remainder is split on
    underscores and whitespace, and each token is passed through
    ``normalize_word``. Tokens are dropped when they are one of
    ``на``/``за``/``подписаться``/``nan``, when they normalize to an empty
    string, or when their lemma (first parse of the module-level ``morph``
    analyzer) has already been seen. The surviving words are limited to
    ``max_words`` and joined by ``construct_phrase``.

    Finally the first character is upper-cased and a whitespace between a
    digit and a following lowercase Cyrillic letter is replaced by ``_``.

    Args:
        name: Raw topic label. Non-string values (for example NaN coming from
            a missing cell) yield an empty string instead of raising.
        max_words: Maximum number of words kept in the result; ``None`` keeps
            all of them. Defaults to 3.

    Returns:
        The cleaned topic name, or ``""`` if nothing usable remains.
    """
    # Robustness: a missing label is not a string and has no .split().
    if not isinstance(name, str):
        return ""

    # Extract meaningful part (drop the leading topic id).
    if '____' in name:
        topic_part = name.split('____')[-1]
    else:
        topic_part = name.split('_', 1)[-1]

    # Tokens that never contribute to a topic name.
    skipped = {'на', 'за', 'подписаться', 'nan'}

    stripped = (raw.strip() for raw in re.split(r'_|\s+', topic_part))
    words = [w for w in stripped if w and w.lower() not in skipped]

    processed_words = []
    seen_lemmas = set()

    for word in words:
        prev_word = processed_words[-1] if processed_words else None
        normalized = normalize_word(word, prev_word)

        # normalize_word returns "" for words that should be dropped.
        if not normalized:
            continue

        # De-duplicate on the lemma so inflected repeats are kept only once;
        # non-alphabetic tokens (numbers etc.) are compared verbatim.
        lemma = morph.parse(word)[0].normal_form if word.isalpha() else word

        if lemma not in seen_lemmas:
            seen_lemmas.add(lemma)
            processed_words.append(normalized)

    # Build natural phrase from at most `max_words` components.
    topic_text = construct_phrase(processed_words[:max_words])

    # Final capitalization and cleaning
    if topic_text:
        topic_text = topic_text[0].upper() + topic_text[1:]
        topic_text = re.sub(r'(\d)\s([а-яё])', r'\1_\2', topic_text)
        topic_text = topic_text.replace('  ', ' ').strip()

    return topic_text

In [None]:
def post_process_names(df):
    """Tidy the ``Clean_Name`` column of ``df`` and return the same frame.

    Two passes are applied to the column, which is overwritten in place:
    every run of whitespace is collapsed to one space, and then, in string
    values only, a single space between a digit and a non-digit character is
    replaced by an underscore. Non-string values are passed through as they
    are by the second pass.
    """
    digit_gap = re.compile(r'(\d) (\D)')

    def _glue_number_to_text(value):
        # Leave anything that is not a string (e.g. missing values) alone.
        if not isinstance(value, str):
            return value
        return digit_gap.sub(r'\1_\2', value)

    single_spaced = df['Clean_Name'].str.replace(r'\s+', ' ', regex=True)
    df['Clean_Name'] = single_spaced
    df['Clean_Name'] = df['Clean_Name'].apply(_glue_number_to_text)
    return df

In [None]:
# Main execution
if __name__ == "__main__":
    # Read the semicolon-separated topic table
    df = pd.read_csv("rbc_BERTopic.csv", sep=';')

    # Derive readable names from the raw labels, then polish them
    df['Clean_Name'] = df['Name'].apply(clean_topic_name)
    df = post_process_names(df)

    # Keep the bulky Representative_Docs column, if there is one, at the far right
    if 'Representative_Docs' not in df.columns:
        print("\n'Representative_Docs' column not found in DataFrame")
    else:
        cols = [col for col in df.columns if col != 'Representative_Docs']
        cols.append('Representative_Docs')
        df = df[cols]
        print("\nMoved 'Representative_Docs' to last column position")

    # Preview the first rows of the result
    print("\nFinal cleaned topic names:")
    print(df.head(10))

    # Write the cleaned table next to the input; the BOM-prefixed UTF-8
    # encoding is kept exactly as before
    output_filename = "rbc_BERTopic_cleaned_v3.csv"
    df.to_csv(output_filename, sep=';', index=False, encoding='utf-8-sig')

    print(f"\nCleaned data successfully saved to '{output_filename}'")


Moved 'Representative_Docs' to last column position

Final cleaned topic names:
   Topic  Count                                         Name  \
0     -1   7044                         -1____сша_это_россии   
1      0   1914              0_nan_nan nan_дону nan_nan сизо   
2      1    421  1_спорт_канал спорт_подписаться канал_канал   
3      2    341             2_израиля_хамас_израиль_хезболлы   
4      3    315          3_пост_должность_должности_отставку   
5      4    299                        4_пожара_мчс_пожар_кв   
6      5    281    5_ограничения_росавиации_аэропорту_полеты   
7      6    226        6_вино_вина_алкоголя_подписаться вино   
8      7    219                7_делу_взятки_млн руб_бывшего   
9      8    212         8_недвижимость_жилья_недвижимости_кв   

                                      Representation  \
0  ['__', 'сша', 'это', 'россии', 'заявил', 'года...   
1  ['nan', 'nan nan', 'дону nan', 'nan сизо', 'си...   
2  ['спорт', 'канал спорт', 'подписаться канал