In [36]:
import pandas as pd
import country_converter as coco
import logging
from datetime import datetime

In [37]:
class EnhancedDataPostProcessor:
    """Post-process a DataFrame of extracted entities.

    The processor de-duplicates rows on the ``text`` column, normalizes the
    ';'-separated ``LOC`` and ``PER`` cells (trimmed, de-duplicated, sorted),
    maps location names through ``country_converter`` and replaces missing
    entity values with the ``'Unknown'`` placeholder.
    """

    # Placeholder written wherever a cell carries no usable entity.
    UNKNOWN = 'Unknown'
    # Columns whose missing values are replaced by the placeholder.
    ENTITY_COLUMNS = ('LOC', 'PER', 'ORG', 'CRIME_TYPES')

    def __init__(self):
        self.cc = coco.CountryConverter()
        # Raw location text -> standardized name. The same place names recur
        # across rows, so each distinct value is converted only once.
        self._location_cache = {}
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _split_entities(value) -> list:
        """Split a ';'-separated cell into stripped, non-empty items.

        Missing values (NaN/None/NA) and empty strings yield an empty list.
        Other non-string scalars are converted with ``str()`` before
        splitting instead of failing on ``.split``.
        """
        if not isinstance(value, str):
            if pd.isna(value):
                return []
            value = str(value)
        stripped = (part.strip() for part in value.split(';'))
        # Drop blanks left behind by input such as "a; ;b" or a trailing ';'.
        return [item for item in stripped if item]

    def _standardize_one_location(self, location: str) -> str:
        """Return the short country name for ``location``.

        The original text is returned when the converter reports
        'not found', raises, or returns something other than a single
        string.
        """
        if location in self._location_cache:
            return self._location_cache[location]

        try:
            std_name = self.cc.convert(names=location, to='name_short')
        except Exception as e:
            # Best effort: a converter failure must not abort the whole run.
            self.logger.warning(f"Error standardizing location {location}: {str(e)}")
            return location

        # A non-string result (presumably a list when the text matches several
        # countries - TODO confirm against country_converter) cannot serve as
        # one name and is unhashable, so the original text is kept instead.
        if not isinstance(std_name, str) or std_name == 'not found':
            std_name = location

        self._location_cache[location] = std_name
        return std_name

    def standardize_location(self, locations: str) -> str:
        """Standardize a ';'-separated list of locations.

        Args:
            locations: Cell value such as ``"USA; Singapore"``; may be NaN.

        Returns:
            The distinct standardized names, sorted and joined with ``'; '``,
            or ``'Unknown'`` when the cell holds no non-blank item.
        """
        names = self._split_entities(locations)
        if not names:
            return self.UNKNOWN

        standardized = {self._standardize_one_location(name) for name in names}
        return '; '.join(sorted(standardized))

    def standardize_person(self, persons: str) -> str:
        """Normalize a ';'-separated list of person names.

        Args:
            persons: Cell value such as ``"Bob; Alice"``; may be NaN.

        Returns:
            The distinct names, sorted and joined with ``'; '``, or
            ``'Unknown'`` when the cell holds no non-blank item.
        """
        names = self._split_entities(persons)
        if not names:
            return self.UNKNOWN
        return '; '.join(sorted(set(names)))

    def process_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Args:
            df: Frame with a ``text`` column and the ``LOC``, ``PER``,
                ``ORG`` and ``CRIME_TYPES`` entity columns.

        Returns:
            A new DataFrame with duplicate ``text`` rows dropped (first
            occurrence kept, original index labels preserved), ``LOC`` and
            ``PER`` standardized, and missing entity values set to
            ``'Unknown'``.

        Raises:
            KeyError: If any required column is absent.
        """
        self.logger.info("Starting data processing...")

        entity_columns = list(self.ENTITY_COLUMNS)
        missing = [col for col in ['text'] + entity_columns if col not in df.columns]
        if missing:
            # Fail up front with a readable message instead of midway through.
            raise KeyError(f"Input DataFrame is missing required columns: {missing}")

        initial_rows = len(df)
        # .copy() detaches the result from `df` so the column assignments
        # below never write through to (or warn about) the caller's frame.
        processed_df = df.drop_duplicates(subset=['text']).copy()
        self.logger.info(f"Removed {initial_rows - len(processed_df)} duplicate rows")

        self.logger.info("Standardizing locations...")
        processed_df['LOC'] = processed_df['LOC'].apply(self.standardize_location)

        self.logger.info("Standardizing persons...")
        processed_df['PER'] = processed_df['PER'].apply(self.standardize_person)

        # LOC and PER are already placeholder-filled by the standardizers;
        # this covers the remaining entity columns.
        processed_df[entity_columns] = processed_df[entity_columns].fillna(self.UNKNOWN)

        return processed_df

In [38]:
if __name__ == "__main__":
    # Source and destination CSV files for this cleaning stage.
    input_path = "/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/process1_3.csv"
    output_path = "/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/process2_cleaned.csv"

    # Load the raw rows, run the post-processor over them and persist the
    # result without the index column.
    df = pd.read_csv(input_path)
    processor = EnhancedDataPostProcessor()
    processed_df = processor.process_dataframe(df)
    processed_df.to_csv(output_path, index=False)

    # Row counts before and after cleaning.
    print()
    print("Original rows:", len(df))
    print("Cleaned rows:", len(processed_df))

    # Remaining null count for each entity column.
    print()
    print("Missing values after cleaning:")
    missing_counts = processed_df[['LOC', 'PER', 'ORG', 'CRIME_TYPES']].isna().sum()
    print(missing_counts)

INFO:__main__:Starting data processing...
INFO:__main__:Removed 4 duplicate rows
INFO:__main__:Standardizing locations...
INFO:__main__:Standardizing persons...



Original rows: 1553
Cleaned rows: 1549

Missing values after cleaning:
LOC            0
PER            0
ORG            0
CRIME_TYPES    0
dtype: int64
