In [2]:
import pickle
import pandas as pd
import numpy as np
import re
import os
import multiprocessing as mp
from tqdm import tqdm
import time

def load_data():
    """Load the dataset and the address to geo mapping"""
    # Load dataset
    df = pd.read_csv('../data/test.csv', encoding='utf-8', low_memory=False)
    
    # Load address to geo coordinates mapping
    with open('adres_to_geo.pickle', 'rb') as f:
        adres_to_geo = pickle.load(f)
    
    return df, adres_to_geo

def create_address_keys(row, first_col_name):
    """Create multiple possible address keys to maximize matching chances"""
    keys = []
    
    # Extract components from the first column (assuming it contains the address)
    try:
        base_address = row[first_col_name]  # Get the first column value
        if pd.isna(base_address):
            return keys
            
        # Strategy 1: Full address with original 본번/부번 format
        try:
            if pd.notna(row['본번']):
                if pd.notna(row['부번']) and row['부번'] != 0:
                    plot_number = f"{int(row['본번'])}-{int(row['부번'])}"
                else:
                    plot_number = str(int(row['본번']))
                
                full_address = f"{base_address} {plot_number}"
                keys.append(full_address)
        except:
            pass
        
        # Strategy 2: Address parts with original 본번/부번 format
        # Extract city, district, neighborhood if they exist in the address
        try:
            address_parts = base_address.split()
            if len(address_parts) >= 3:  # Ensure we have at least city+district+neighborhood
                neighborhood = address_parts[2]  # e.g., "개포동"
                district = address_parts[1]      # e.g., "강남구"
                city = address_parts[0]          # e.g., "서울특별시"
                
                # Different combinations
                if pd.notna(row['본번']):
                    if pd.notna(row['부번']) and row['부번'] != 0:
                        plot_number = f"{int(row['본번'])}-{int(row['부번'])}"
                    else:
                        plot_number = str(int(row['본번']))
                    
                    keys.append(f"{city} {district} {neighborhood} {plot_number}")
        except:
            pass
        
        # Strategy 3: Try with just the neighborhood and 번지
        try:
            if len(address_parts) >= 3 and pd.notna(row['본번']):
                neighborhood = address_parts[2]  # e.g., "개포동"
                
                if pd.notna(row['부번']) and row['부번'] != 0:
                    plot_number = f"{int(row['본번'])}-{int(row['부번'])}"
                else:
                    plot_number = str(int(row['본번']))
                
                keys.append(f"{neighborhood} {plot_number}")
        except:
            pass
        
        # Strategy 4: Extract address using regular expressions
        try:
            match = re.search(r'(서울[특별시]*|인천[광역시]*|부산[광역시]*|대구[광역시]*|광주[광역시]*|대전[광역시]*|울산[광역시]*|세종[특별자치시]*|경기도|강원도|충청북도|충청남도|전라북도|전라남도|경상북도|경상남도|제주[특별자치도]*)\s+([^\s]+)\s+([^\s]+)', base_address)
            if match:
                city = match.group(1)
                district = match.group(2)
                neighborhood = match.group(3)
                
                if pd.notna(row['본번']):
                    if pd.notna(row['부번']) and row['부번'] != 0:
                        plot_number = f"{int(row['본번'])}-{int(row['부번'])}"
                    else:
                        plot_number = str(int(row['본번']))
                    
                    keys.append(f"{city} {district} {neighborhood} {plot_number}")
        except:
            pass
            
    except Exception as e:
        pass
    
    return keys

def process_chunk(chunk_data):
    """Process a chunk of data in parallel"""
    chunk_df, adres_to_geo, first_col_name = chunk_data
    results = []
    
    for idx, row in chunk_df.iterrows():
        coord_x = row['좌표X']
        coord_y = row['좌표Y']
        
        # Skip if coordinates are already filled
        if pd.notna(coord_x) and pd.notna(coord_y):
            results.append((idx, coord_x, coord_y, None))
            continue
        
        # Try all key strategies
        possible_keys = create_address_keys(row, first_col_name)
        match_method = None
        
        # Try direct matches first
        for key in possible_keys:
            if key in adres_to_geo:
                latitude, longitude = adres_to_geo[key]
                results.append((idx, longitude, latitude, "exact_match"))
                match_method = "exact_match"
                break
        
        if match_method:
            continue
        
        # Try apartment name match
        try:
            if '아파트명' in chunk_df.columns and pd.notna(row['아파트명']):
                apt_name = row['아파트명']
                address_parts = row[first_col_name].split()
                
                if len(address_parts) >= 3:
                    for addr in adres_to_geo.keys():
                        # Check if both the neighborhood and apartment name match
                        if address_parts[2] in addr and apt_name in addr:
                            latitude, longitude = adres_to_geo[addr]
                            results.append((idx, longitude, latitude, "apartment_match"))
                            match_method = "apartment_match"
                            break
        except:
            pass
        
        if match_method:
            continue
        
        # Try neighborhood match
        try:
            address_parts = row[first_col_name].split()
            if len(address_parts) >= 3:
                neighborhood = address_parts[2]  # e.g., "개포동"
                
                # Find any address in the same neighborhood
                neighborhood_matches = [addr for addr in adres_to_geo.keys() if neighborhood in addr]
                
                if neighborhood_matches:
                    # Use the first match
                    latitude, longitude = adres_to_geo[neighborhood_matches[0]]
                    results.append((idx, longitude, latitude, "neighborhood_match"))
                    match_method = "neighborhood_match"
        except:
            pass
        
        # If no match found, add to results with original NaN values
        if not match_method:
            results.append((idx, coord_x, coord_y, None))
    
    return results

def parallel_process_coordinates():
    """Process the dataset in parallel to fill missing coordinates"""
    print("Starting parallel coordinate processing...")
    start_time = time.time()
    
    # Load data
    df, adres_to_geo = load_data()
    
    # Determine the first column name
    first_col_name = df.columns[0]
    
    # Check initial missing values
    initial_missing = df[df['좌표X'].isna() | df['좌표Y'].isna()].shape[0]
    print(f"Initial missing coordinates: {initial_missing}")
    
    # Identify rows with missing coordinates
    missing_df = df[df['좌표X'].isna() | df['좌표Y'].isna()].copy()
    
    # Determine the number of CPU cores to use
    num_cores = max(mp.cpu_count() - 1, 1)  # Leave one core free
    print(f"Using {num_cores} CPU cores for parallel processing")
    
    # Split the missing data into chunks for parallel processing
    chunk_size = max(1, len(missing_df) // (num_cores * 10))  # Create more chunks than cores
    chunks = []
    
    for i in range(0, len(missing_df), chunk_size):
        chunk = missing_df.iloc[i:i+chunk_size]
        chunks.append((chunk, adres_to_geo, first_col_name))
    
    print(f"Split data into {len(chunks)} chunks for processing")
    
    # Process chunks in parallel
    with mp.Pool(num_cores) as pool:
        results = list(tqdm(pool.imap(process_chunk, chunks), total=len(chunks), desc="Processing chunks"))
    
    # Flatten results
    all_results = []
    for chunk_result in results:
        all_results.extend(chunk_result)
    
    # Count matches by method
    match_counts = {"exact_match": 0, "apartment_match": 0, "neighborhood_match": 0}
    filled_count = 0

    # Before updating the dataframe:
    print(f"Aggregated {len(all_results)} results. Updating dataframe...")
    # Update the original dataframe with the results
    for idx, coord_x, coord_y, match_method in all_results:
        if match_method is not None:
            df.loc[idx, '좌표X'] = coord_x
            df.loc[idx, '좌표Y'] = coord_y
            match_counts[match_method] += 1
            filled_count += 1
    
    # Check how many coordinates were successfully filled
    final_missing = df[df['좌표X'].isna() | df['좌표Y'].isna()].shape[0]
    
    print(f"\nProcessing completed in {time.time() - start_time:.2f} seconds")
    print(f"Successfully filled {filled_count} out of {initial_missing} missing coordinates")
    print(f"Remaining missing coordinates: {final_missing}")
    print(f"Match methods statistics:")
    for method, count in match_counts.items():
        print(f"  - {method}: {count}")
    

    # Save the updated dataset
    output_path = 'updated_test_with_coordinates_parallel.csv'
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Updated dataset saved to {output_path}")
    
    return df

if __name__ == "__main__":
    parallel_process_coordinates()

Starting parallel coordinate processing...
Initial missing coordinates: 6562
Using 63 CPU cores for parallel processing
Split data into 657 chunks for processing


Processing chunks: 100%|██████████| 657/657 [00:02<00:00, 318.13it/s]


Aggregated 6562 results. Updating dataframe...

Processing completed in 4.13 seconds
Successfully filled 6557 out of 6562 missing coordinates
Remaining missing coordinates: 146
Match methods statistics:
  - exact_match: 6557
  - apartment_match: 0
  - neighborhood_match: 0
Updated dataset saved to updated_test_with_coordinates_parallel.csv
