In [26]:
import pandas as pd
import pymorphy2
import csv

# Initialize the pymorphy2 MorphAnalyzer for Russian.
# Built once at module level; normalize_word() reads it as a global, so the
# same analyzer instance is reused for every word instead of being rebuilt.
morph = pymorphy2.MorphAnalyzer()

# Function to normalize a Russian word to its base form
def normalize_word(word):
    """Return the normal (dictionary) form of ``word`` as given by pymorphy2.

    Args:
        word: The text to normalize. Expected to be a ``str``; any other
            value (``None``, ``NaN`` from an empty CSV cell, numbers) is
            passed through untouched.

    Returns:
        The ``normal_form`` of the first parse returned by ``morph.parse``
        for a non-blank string; otherwise ``word`` itself, unchanged.

    Note:
        The whole string is handed to the analyzer as one token; it is not
        split on whitespace, so multi-word values are not normalized
        word by word.
    """
    # pandas represents missing cells as float NaN, which the analyzer cannot
    # parse; blank strings carry nothing to normalize. Return both as-is so
    # missing data stays missing instead of raising.
    if not isinstance(word, str) or not word.strip():
        return word

    parses = morph.parse(word)
    # Defensive: never index into an empty result list.
    if not parses:
        return word

    # The first parse is the analyzer's top-ranked interpretation.
    return parses[0].normal_form

# Load the input CSV: cp1251-encoded, semicolon-delimited, possibly messy.
df = pd.read_csv('NER_step1_processed.csv',
                 encoding='cp1251',
                 quoting=csv.QUOTE_ALL,
                 # Malformed rows are still skipped, but each one is reported
                 # as a warning instead of disappearing silently.
                 on_bad_lines='warn',
                 sep=';')  # Use semicolon as the delimiter

# Print the column names to confirm 'entity_text' is present before using it
print("Column names in the DataFrame:", df.columns.tolist())

# Apply normalization to the 'entity_text' column.
# na_action='ignore' leaves missing cells (NaN) as they are, so
# normalize_word is only ever called on values that are actually present.
df['entity_text'] = df['entity_text'].map(normalize_word, na_action='ignore')

# Save the updated DataFrame back to a CSV file (using UTF-8 for the output).
# NOTE: no separator is passed, so the output uses pandas' default comma
# delimiter, unlike the semicolon-delimited input.
df.to_csv('NER_step2_processed.csv', index=False, encoding='utf-8')

print("Normalization complete. File saved as 'NER_step2_processed.csv'.")

Column names in the DataFrame: ['original_text', 'entity_text', 'entity_type']
Normalization complete. File saved as 'NER_step2_processed.csv'.
