In [None]:
#!pip install pandas transformers torch
import pandas as pd
import re
from transformers import pipeline
from collections import defaultdict

# Text Cleaning Function
def clean_entity_text(entity):
    """Return ``entity`` with all whitespace stripped and 'ё'/'Ё' folded to 'е'/'Е'.

    Only whitespace is removed; punctuation and any other characters are
    kept as they are.
    """
    # Drop every run of whitespace (spaces, tabs, newlines, ...).
    without_whitespace = re.sub(r'\s+', '', entity)
    # Fold both cases of the Russian letter "yo" in a single pass.
    yo_to_ye = str.maketrans({'ё': 'е', 'Ё': 'Е'})
    return without_whitespace.translate(yo_to_ye)

# Entity Processing
def _store_entity(entities, entity_text, entity_type):
    """Clean ``entity_text`` and append it to ``entities[entity_type]`` if meaningful."""
    if not entity_text:
        return
    cleaned = clean_entity_text(entity_text)
    if len(cleaned) > 1:  # Single characters are treated as noise
        entities[entity_type].append(cleaned)


def _word_prefix(text, start, piece):
    """Return the part of the word in ``text`` that precedes offset ``start``.

    Used when an entity begins on a ``##`` continuation piece whose leading
    piece(s) were not returned by the pipeline. Returns ``""`` whenever the
    offset is missing or does not line up with ``piece`` in ``text``, so the
    caller silently falls back to using the piece alone.
    """
    if start is None or not isinstance(text, str):
        return ""
    try:
        start = int(start)
    except (TypeError, ValueError):
        return ""
    if not 0 < start <= len(text):
        return ""
    # Only trust the offset if it really points at this piece.
    if text[start:start + len(piece)] != piece:
        return ""
    begin = start
    while begin > 0 and text[begin - 1].isalnum():
        begin -= 1
    return text[begin:start]


def extract_entities(text, ner_pipeline):
    """Extract named entities from ``text`` and group them by entity type.

    Args:
        text: The text to analyse. Empty or NaN input yields ``{}``.
        ner_pipeline: A callable that takes the text and returns an iterable
            of token dicts with at least ``'word'`` and ``'entity'`` keys
            (``'entity'`` holding tags such as ``B-PER`` / ``I-PER``). The
            optional ``'index'`` and ``'start'`` keys are used when present.

    Returns:
        A dict mapping entity type (the tag without its ``B-``/``I-`` prefix)
        to a list of cleaned entity strings. The pieces of an entity are
        concatenated without separators and passed through
        ``clean_entity_text``; results of length 1 or less are discarded.
        If the pipeline raises, ``{}`` is returned (best effort).
    """
    if not text or pd.isna(text):
        return {}

    try:
        ner_results = ner_pipeline(text)
    except Exception:
        # Best effort: one failing text must not abort a whole batch, but
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return {}

    entities = defaultdict(list)
    current_entity = ""
    current_type = ""
    prev_index = None

    for token in ner_results:
        raw_word = token['word']
        word = raw_word.replace('##', '')
        tag = token['entity']
        is_subword = raw_word.startswith('##')

        # Token positions tell us whether this token directly follows the
        # previous one. Without positions we assume it does.
        index = token.get('index')
        adjacent = (
            prev_index is None or index is None or index == prev_index + 1
        )
        prev_index = index

        # Text to use if this token opens a new entity: for a continuation
        # piece, try to recover the beginning of the word from the source.
        opening = word
        if is_subword:
            opening = _word_prefix(text, token.get('start'), word) + word

        if is_subword and current_entity and adjacent:
            # A '##' piece continues the word of the previous token, so it
            # belongs to the open entity whatever tag the model gave it.
            current_entity += word
        elif tag.startswith('B-'):
            _store_entity(entities, current_entity, current_type)
            current_entity = opening
            current_type = tag[2:]
        elif tag.startswith('I-'):
            if current_entity:
                current_entity += word  # Concatenate directly for Russian
            else:
                # An I- tag with no open entity starts one, typed from the
                # tag instead of being filed under an empty type name.
                current_entity = opening
                current_type = tag[2:]
        else:
            # Any tag without a B-/I- prefix closes the open entity.
            _store_entity(entities, current_entity, current_type)
            current_entity = ""
            current_type = ""

    _store_entity(entities, current_entity, current_type)

    return dict(entities)

# Main Processing
def process_to_3columns(df, ner_pipeline):
    """Build one output row per unique (entity, type) pair found in each message.

    Args:
        df: DataFrame whose rows are read through ``row.get`` for the keys
            ``'message'``, ``'time'`` and ``'sha'``. Rows whose message is
            missing (absent, ``None`` or NaN) are skipped.
        ner_pipeline: Passed through unchanged to ``extract_entities``.

    Returns:
        A DataFrame with the columns ``news_content`` (the message, cut to
        500 characters plus ``"..."`` when longer), ``entity_text``,
        ``entity_type``, ``time`` and ``sha``. The columns are present even
        when no entity was found, so callers can select or de-duplicate on
        them without special-casing an empty result.
    """
    columns = ['news_content', 'entity_text', 'entity_type', 'time', 'sha']
    three_column_data = []

    for _, row in df.iterrows():
        raw_message = row.get('message')
        # Test for missing values BEFORE converting to str: str(nan) is the
        # non-missing string 'nan', which would be sent to the model.
        if raw_message is None or (
            pd.api.types.is_scalar(raw_message) and pd.isna(raw_message)
        ):
            continue
        message = str(raw_message)

        entities = extract_entities(message, ner_pipeline)
        if not entities:
            continue

        # dict.fromkeys removes duplicates while keeping first-seen order,
        # so the row order is reproducible between runs (a set is not).
        unique_entities = dict.fromkeys(
            (entity, entity_type)
            for entity_type, entity_list in entities.items()
            for entity in entity_list
        )

        news_content = message[:500] + "..." if len(message) > 500 else message
        time_value = row.get('time')
        sha_value = row.get('sha')

        for entity, entity_type in unique_entities:
            three_column_data.append({
                'news_content': news_content,
                'entity_text': entity,
                'entity_type': entity_type,
                'time': time_value,
                'sha': sha_value,
            })

    return pd.DataFrame(three_column_data, columns=columns)

# Execution
try:
    # Load the data. UTF-8 is what read_csv uses by default, so retrying with
    # 'utf-8' after a decode failure could never succeed; fall back straight
    # to latin1, which accepts any byte sequence. Only a decoding failure
    # triggers the fallback: a missing or malformed file is reported as is.
    # NOTE(review): latin1 never fails but may produce mojibake; confirm
    # whether another encoding (e.g. cp1251) fits this dataset better.
    try:
        df = pd.read_csv('rbc.csv', encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv('rbc.csv', encoding='latin1')

    # Initialize NER pipeline
    ner_pipeline = pipeline("ner", model="Davlan/bert-base-multilingual-cased-ner-hrl")

    # Process to 3-column format
    final_df = process_to_3columns(df, ner_pipeline)

    # Remove duplicates (same entity in same news item)
    final_df = final_df.drop_duplicates(
        subset=['sha', 'entity_text', 'entity_type'],
        keep='first'
    )

    # Save the result
    final_df.to_csv('entities_3column.csv', index=False, encoding='utf-8')

    print("Success! Sample of 3-column output:")
    print(final_df[['news_content', 'entity_text', 'entity_type']].head())

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure instead of
    # showing a traceback.
    print(f"Error: {e}")

Device set to use cuda:0


Success! Sample of 3-column output:
                                        news_content       entity_text  \
0  **Разворот в 2025 году: ждать или нет? **\n\nБ...   АлексейКорнилов   
1  **Разворот в 2025 году: ждать или нет? **\n\nБ...  ВТБМоиИнвестиции   
2  **Разворот в 2025 году: ждать или нет? **\n\nБ...   СтаниславКлещев   
3  **Разворот в 2025 году: ждать или нет? **\n\nБ...        ртемМаркин   
4  **Разворот в 2025 году: ждать или нет? **\n\nБ...            России   

  entity_type  
0         PER  
1         ORG  
2         PER  
3         PER  
4         LOC  
