In [18]:
import pandas as pd
import ast
import chardet
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
import os

def prepare_ner_data(input_file='ner_results.csv', *, encoding=None):
    """Deduplicate NER entities from a CSV file and export dashboard-ready tables.

    The input is read as a semicolon-separated CSV and must provide a ``text``
    column and an ``entities`` column. Each ``entities`` cell is expected to be
    the string form of a list of ``{'text': ..., 'type': ...}`` dicts (an
    already-parsed list is accepted too). Entities are deduplicated per row:
    ``PER`` and ``LOC`` entities are compared by the lemmas of all their
    tokens, every other type by its lowercased text. The first occurrence is
    kept with its original casing.

    Two tab-separated UTF-8 files are written to the working directory:
    ``processed_ner_results.csv`` (the input columns plus ``entities_clean``,
    ``per_count``, ``loc_count`` and ``org_count``) and
    ``flattened_ner_entities.csv`` (one row per deduplicated entity).

    Args:
        input_file: Path of the CSV file to process.
        encoding: Encoding used to read ``input_file``. When ``None`` (the
            default) the encoding is guessed with ``chardet`` from the first
            10000 bytes; if the guess cannot decode the file, the read is
            retried with ``windows-1251``.

    Returns:
        ``(df, entity_df)`` on success. ``(None, None)`` if the input file is
        missing, the Natasha components cannot be created, a required column
        is absent, or any other error occurs while processing; errors are
        printed rather than raised.
    """
    required_columns = ['text', 'entities']
    lemmatized_types = ('PER', 'LOC')
    count_columns = (('PER', 'per_count'), ('LOC', 'loc_count'), ('ORG', 'org_count'))
    flat_columns = ['original_text', 'entity_text', 'entity_type']
    fallback_encoding = 'windows-1251'
    # Exceptions ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found in the working directory.")
        return None, None

    # Initialize Natasha components for lemmatization
    try:
        segmenter = Segmenter()
        morph_vocab = MorphVocab()
        emb = NewsEmbedding()
        morph_tagger = NewsMorphTagger(emb)
    except Exception as e:
        print(f"Error initializing Natasha components: {e}")
        print("Ensure natasha is installed: pip install natasha")
        return None, None

    # The same entity usually appears in many rows; tag each distinct string once.
    lemma_cache = {}

    def normalize_russian_word(word):
        """Return the space-joined lemmas of every token in ``word``.

        Falls back to ``word.lower()`` when the text yields no tokens or
        lemmatization fails.
        """
        if word in lemma_cache:
            return lemma_cache[word]
        try:
            doc = Doc(word)
            doc.segment(segmenter)
            doc.tag_morph(morph_tagger)

            # Lemmatize every token: keying on the first token alone would
            # merge distinct multi-word entities that share a first word.
            lemmas = []
            for token in doc.tokens:
                token.lemmatize(morph_vocab)
                lemmas.append(token.lemma or token.text.lower())
            normalized = ' '.join(lemmas) if lemmas else word.lower()
        except Exception as e:
            print(f"Error normalizing word '{word}': {e}")
            normalized = word.lower()
        lemma_cache[word] = normalized
        return normalized

    def process_entities(entity_list):
        """Parse one ``entities`` cell and return its deduplicated entity dicts."""
        try:
            if isinstance(entity_list, str):
                if entity_list == '[]':
                    return []
                try:
                    entities = ast.literal_eval(entity_list)
                except literal_errors:
                    # Clean common formatting issues
                    cleaned = entity_list.replace('"[{', '[{').replace('}]"', '}]')
                    try:
                        entities = ast.literal_eval(cleaned)
                    except literal_errors:
                        return []
            elif isinstance(entity_list, (list, tuple)):
                # Already parsed. Checked before any NaN test because
                # pd.isna() on a list returns an array, not a bool.
                entities = entity_list
            else:
                # NaN / None / any other non-list scalar carries no entities.
                return []

            if not isinstance(entities, (list, tuple)):
                return []

            # Deduplicate with normalized keys
            seen = set()
            unique_entities = []

            for entity in entities:
                if not isinstance(entity, dict):
                    continue

                entity_text = entity.get('text', '')
                entity_type = entity.get('type', '')

                # Skip malformed entries instead of failing the whole row.
                if not isinstance(entity_text, str) or not isinstance(entity_type, str):
                    continue
                if not entity_text or not entity_type:
                    continue

                # Normalize text for deduplication
                lowered = entity_text.lower()
                norm_text = normalize_russian_word(lowered) if entity_type in lemmatized_types else lowered
                entity_key = (norm_text, entity_type)

                if entity_key not in seen:
                    seen.add(entity_key)
                    unique_entities.append({
                        'text': entity_text,  # Preserve original casing
                        'type': entity_type
                    })

            return unique_entities
        except Exception as e:
            print(f"Error processing entities: {str(e)}")
            return []

    def clean_column(name):
        """Drop stray trailing semicolons/whitespace from a header name."""
        return str(name).split(';')[0].strip()

    def read_input(csv_encoding):
        """Read only the required columns of the semicolon-delimited input."""
        # A callable usecols tolerates untidy header names and lets the
        # explicit missing-column check below report absent columns.
        return pd.read_csv(
            input_file,
            encoding=csv_encoding,
            on_bad_lines='skip',
            sep=';',
            usecols=lambda name: clean_column(name) in required_columns,
        )

    # Read and process the data
    try:
        if encoding is None:
            # Detect file encoding
            with open(input_file, 'rb') as f:
                rawdata = f.read(10000)
            csv_encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
            print(f"Detected encoding: {csv_encoding}")
        else:
            csv_encoding = encoding

        try:
            df = read_input(csv_encoding)
        except (UnicodeDecodeError, LookupError) as e:
            if encoding is not None:
                # An explicitly requested encoding is never second-guessed.
                raise
            print(f"Could not read file as {csv_encoding} ({e}); retrying with {fallback_encoding}")
            df = read_input(fallback_encoding)

        # Print actual column names
        print("Actual columns in the file:", df.columns.tolist())

        # Clean column names (remove extra semicolons or trailing characters)
        df.columns = [clean_column(col) for col in df.columns]
        print("Cleaned columns:", df.columns.tolist())

        # Check if required columns exist
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Error: Missing required columns: {missing_columns}")
            return None, None

        # Print first few rows of raw data
        print("First few rows of raw data:")
        print(df.head())

        # Clean entities and create counts
        df['entities_clean'] = df['entities'].apply(process_entities)

        # Create counts with deduplicated entities
        for type_label, count_column in count_columns:
            df[count_column] = df['entities_clean'].apply(
                lambda ents, label=type_label: sum(e.get('type') == label for e in ents)
            )

        # Create flattened dataframe for dashboard use
        entity_rows = [
            {
                'original_text': text,
                'entity_text': entity['text'],
                'entity_type': entity['type'],
            }
            for text, entities in zip(df['text'], df['entities_clean'])
            for entity in entities
        ]

        # Explicit columns keep the header even when no entity was found.
        entity_df = pd.DataFrame(entity_rows, columns=flat_columns)

        # Save results
        df.to_csv('processed_ner_results.csv', index=False, sep='\t', encoding='utf-8')
        entity_df.to_csv('flattened_ner_entities.csv', index=False, sep='\t', encoding='utf-8')

        print("Data processing complete with improved deduplication")
        return df, entity_df

    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return None, None

# Execute the pipeline on the default input file and preview the first processed row
sample_df, sample_entities = prepare_ner_data()
if sample_df is not None:
    preview = sample_df[['text', 'entities', 'entities_clean', 'per_count', 'loc_count', 'org_count']]
    print(preview.head(1))

Detected encoding: windows-1251
Actual columns in the file: ['text', 'entities']
Cleaned columns: ['text', 'entities']
First few rows of raw data:
                                                text  \
0  Взрыв газа произошел в одной из квартир пятиэт...   
1  В Прионежском районе Карелии шесть снегоходов ...   
2  Министерство обороны Франции подтвердило, что ...   
3  Соединенные Штаты вывели из-под санкций ряд фи...   
4  Министр юстиции Сирии Шади Аль-Вайси, назначен...   

                                            entities  
0  [{'text': 'Луначарского', 'type': 'LOC'}, {'te...  
1  [{'text': 'Прионежском районе', 'type': 'LOC'}...  
2  [{'text': 'Министерство обороны', 'type': 'ORG...  
3  [{'text': 'Соединенные Штаты', 'type': 'LOC'},...  
4  [{'text': 'Сирии', 'type': 'LOC'}, {'text': 'Ш...  
Data processing complete with improved deduplication
                                                text  \
0  Взрыв газа произошел в одной из квартир пятиэт...   

                    