In [None]:
pip install pandas deeppavlov torch torchcrf nltk

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import sys
import subprocess
import pandas as pd
import re
from collections import defaultdict
import ssl

def fix_ssl():
    """Work around SSL certificate errors by turning verification off.

    Replaces ``ssl._create_default_https_context`` with the interpreter's
    unverified-context factory, so HTTPS connections that rely on the default
    context stop validating certificates for the rest of the process.
    Does nothing when the interpreter does not expose
    ``ssl._create_unverified_context``.
    """
    unverified_factory = getattr(ssl, "_create_unverified_context", None)
    if unverified_factory is not None:
        # NOTE: this is a process-wide switch, not scoped to a single request.
        ssl._create_default_https_context = unverified_factory

def check_dependencies():
    """Make sure every package the pipeline needs is importable.

    ``required`` maps the importable module name to the PyPI distribution that
    provides it. The two differ for ``torchcrf``: that module is shipped by
    ``pytorch-crf``, so installing a distribution called ``torchcrf`` leaves
    DeepPavlov unable to import it.

    Missing packages are installed with pip for the running interpreter.

    Raises:
        subprocess.CalledProcessError: if a pip invocation exits with a
            non-zero status.
    """
    import importlib

    required = {
        'pandas': 'pandas',
        'deeppavlov': 'deeppavlov',
        'torch': 'torch',
        'torchcrf': 'pytorch-crf',
        'nltk': 'nltk',
        'stanza': 'stanza'
    }

    pip_command = [sys.executable, '-m', 'pip', 'install']
    # pip refuses `--user` inside a virtual environment, so only request a
    # user-site install when running on the base interpreter.
    if sys.prefix == getattr(sys, 'base_prefix', sys.prefix):
        pip_command.append('--user')

    print("Checking dependencies...")
    for pkg, install_name in required.items():
        try:
            __import__(pkg)
            print(f"{pkg} already installed")
        except ImportError:
            print(f"Installing {pkg}...")
            subprocess.run(pip_command + [install_name], check=True)
            # Let the import system notice files written after startup.
            importlib.invalidate_caches()

def load_model():
    """Load an NER backend, preferring DeepPavlov and falling back to Stanza.

    Returns:
        tuple: ``(model, model_type)`` where ``model_type`` is ``'deeppavlov'``
        or ``'stanza'``.

    Raises:
        ImportError: if neither backend could be loaded.
    """
    def _deeppavlov_backend():
        # Imported lazily so a missing/broken install only disables this backend.
        from deeppavlov import configs
        from deeppavlov.core.commands.infer import build_model
        print("Loading DeepPavlov model...")
        ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True)
        return ner_model, 'deeppavlov'

    def _stanza_backend():
        import stanza
        print("Using Stanza fallback...")
        stanza.download('ru')
        pipeline = stanza.Pipeline('ru', processors='tokenize,ner')
        return pipeline, 'stanza'

    # First choice: DeepPavlov. Any failure is reported and we move on.
    try:
        return _deeppavlov_backend()
    except Exception as primary_error:
        print(f"DeepPavlov failed: {primary_error}")

    # Second choice: Stanza. A failure here means no backend is available.
    try:
        return _stanza_backend()
    except Exception as fallback_error:
        print(f"Stanza failed: {fallback_error}")
        raise ImportError("Could not load any NER model")

def clean_entity_text(entity):
    """Return a canonical form of an entity string.

    All whitespace is removed (so multi-word entities are fused into a single
    token) and the Cyrillic letter "yo" is folded to "ye" in both cases.
    """
    without_whitespace = re.sub(r'\s+', '', entity)
    yo_to_ye = str.maketrans('ёЁ', 'еЕ')
    return without_whitespace.translate(yo_to_ye)

def extract_entities(text, model, model_type):
    """Run NER on one text and group the normalized entities by type.

    Args:
        text: The message to analyse. Empty or NaN input yields ``{}``.
        model: The loaded NER model (a DeepPavlov model or a Stanza pipeline).
        model_type: ``'deeppavlov'`` or ``'stanza'``; selects how ``model`` is
            called and how its output is read.

    Returns:
        dict: entity type -> list of cleaned entity strings (duplicates are
        kept, entities of length <= 1 after cleaning are dropped). Returns an
        empty dict for empty input, an unknown ``model_type``, or when the
        model raises.
    """
    if not text or pd.isna(text):
        return {}

    try:
        entities = defaultdict(list)

        def _flush(tokens, tag):
            # Store the accumulated entity, skipping empty/one-character noise.
            if not tokens:
                return
            cleaned = clean_entity_text("".join(tokens))
            if len(cleaned) > 1:
                entities[tag].append(cleaned)

        if model_type == 'deeppavlov':
            # DeepPavlov NER returns [tokens_batch, tags_batch]; we sent a
            # single-item batch, so take element 0 of each and pair them up.
            outputs = model([text])
            tokens, tags = outputs[0][0], outputs[1][0]

            current_entity, current_tag = [], None
            for word, tag in zip(tokens, tags):
                if tag == "O":
                    _flush(current_entity, current_tag)
                    current_entity, current_tag = [], None
                    continue

                entity_tag = tag.split("-")[-1]
                # A B- tag or a change of type starts a new entity.
                if tag.startswith("B-") or entity_tag != current_tag:
                    _flush(current_entity, current_tag)
                    current_entity, current_tag = [word], entity_tag
                else:
                    current_entity.append(word)

            # The text may end while still inside an entity.
            _flush(current_entity, current_tag)

        elif model_type == 'stanza':
            for ent in model(text).ents:
                _flush([ent.text], ent.type)

        # Unknown model types fall through and yield an empty mapping, so
        # callers can always iterate over the result.
        return dict(entities)

    except Exception as e:
        # Best effort: one bad document must not abort the whole run.
        print(f"Error processing text: {e}")
        return {}

def get_output_path():
    """Return a location where ``entities_output.csv`` can be written.

    Candidates are tried in order: the current directory (returned as a
    relative path), the user's home directory, then ``~/Desktop``.

    Writability is checked on the containing directory instead of creating
    and deleting a probe file, so an existing output file from an earlier run
    is never truncated or removed by this check.

    Returns:
        str: path of the output file.

    Raises:
        PermissionError: if none of the candidate directories is writable.
    """
    filename = "entities_output.csv"

    # Try current directory first
    if os.access(".", os.W_OK):
        return filename

    # Then the home directory, then the desktop as a last resort
    candidates = (
        os.path.expanduser("~/" + filename),
        os.path.expanduser("~/Desktop/" + filename),
    )
    for candidate in candidates:
        directory = os.path.dirname(candidate)
        # Creating a file needs both write and search permission on the directory.
        if os.path.isdir(directory) and os.access(directory, os.W_OK | os.X_OK):
            return candidate

    raise PermissionError("Could not find a writable location for output file")

def main(input_path="/Users/alyona/Desktop/rbc.csv"):
    """Run the full pipeline: load a model, extract entities, write a CSV.

    Args:
        input_path: CSV file to read. The code reads the ``message``, ``time``
            and ``sha`` columns from each row. Defaults to the path the
            notebook was originally written against.

    Every failure is caught and reported on stdout; the function never raises.
    """
    try:
        # Fix SSL first
        fix_ssl()

        # Check and install dependencies
        check_dependencies()

        # Load model
        model, model_type = load_model()
        print(f"Using {model_type} model")

        # Load data: UTF-8 first, latin1 only when the bytes are not valid UTF-8.
        try:
            df = pd.read_csv(input_path, encoding="utf-8")
        except UnicodeDecodeError:
            df = pd.read_csv(input_path, encoding="latin1")

        # Process data
        result_columns = ['news_content', 'entity_text', 'entity_type', 'time', 'sha']
        results = []
        for _, row in df.iterrows():
            raw_message = row.get('message')
            # Test for missing values BEFORE str(): str(nan) is the text "nan",
            # which would otherwise be sent through the NER model.
            if pd.isna(raw_message):
                continue
            message = str(raw_message)

            entities = extract_entities(message, model, model_type)
            if not entities:
                continue

            # Keep the output compact: store at most 500 characters of the text.
            preview = message[:500] + "..." if len(message) > 500 else message

            for entity_type, entity_list in entities.items():
                for entity in set(entity_list):
                    results.append({
                        'news_content': preview,
                        'entity_text': entity,
                        'entity_type': entity_type,
                        'time': row.get('time'),
                        'sha': row.get('sha')
                    })

        # Save results to a writable location
        output_path = get_output_path()
        # Explicit columns keep drop_duplicates from raising KeyError when no
        # entity was found at all.
        final_df = pd.DataFrame(results, columns=result_columns).drop_duplicates(
            subset=['sha', 'entity_text', 'entity_type'],
            keep='first'
        )
        final_df.to_csv(output_path, index=False, encoding="utf-8")

        print(f"\nSuccess! Results saved to: {output_path}")
        print("Sample output:")
        print(final_df[['news_content', 'entity_text', 'entity_type']].head())

    except Exception as e:
        print(f"Fatal error: {e}")
        # The hints below are about write permissions only, so show them only
        # when that is actually what went wrong.
        if isinstance(e, PermissionError):
            print("\nTroubleshooting steps:")
            print("1. Try running VSCode as administrator")
            print("2. Or run these commands in terminal:")
            print("   mkdir -p ~/ner_output")
            print("   chmod 777 ~/ner_output")
            print("3. Then modify the script to save to ~/ner_output/entities_output.csv")

# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Checking dependencies...
pandas already installed
deeppavlov already installed
torch already installed
Installing torchcrf...


You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.


nltk already installed
stanza already installed
Loading DeepPavlov model...


2025-05-05 14:11:33.932 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/v1/ner/ner_ontonotes_bert_mult_torch_crf.tar.gz download because of matching hashes


DeepPavlov failed: No module named 'torchcrf'
Using Stanza fallback...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 28.4MB/s]                    
2025-05-05 14:11:39 INFO: Downloaded file to /Users/alyona/stanza_resources/resources.json
2025-05-05 14:11:39 INFO: Downloading default packages for language: ru (Russian) ...
2025-05-05 14:11:44 INFO: File exists: /Users/alyona/stanza_resources/ru/default.zip
2025-05-05 14:12:02 INFO: Finished downloading models and saved to /Users/alyona/stanza_resources
2025-05-05 14:12:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 6.98MB/s]                    
2025-05-05 14:12:03 INFO: Downloaded file to /Users/alyona/stanza_resources/resources.json
2025-05-05 14:12:04 INFO: Loading these m

Using stanza model

Success! Results saved to: /Users/alyona/entities_output.csv
Sample output:
                                        news_content       entity_text  \
0  **Разворот в 2025 году: ждать или нет? **\n\nБ...               ДКП   
1  **Разворот в 2025 году: ждать или нет? **\n\nБ...     @selfinvestor   
2  **Разворот в 2025 году: ждать или нет? **\n\nБ...        **Разворот   
3  **Разворот в 2025 году: ждать или нет? **\n\nБ...  ВТБМоиИнвестиции   
4  **Разворот в 2025 году: ждать или нет? **\n\nБ...               ВТБ   

  entity_type  
0        MISC  
1        MISC  
2        MISC  
3         ORG  
4         ORG  
