In [3]:
import pickle
import pandas as pd
import numpy as np
import re

# 1. Read the training data; low_memory=False makes pandas parse each column
#    in one pass instead of guessing dtypes chunk by chunk.
df = pd.read_csv('../data/train.csv', encoding='utf-8', low_memory=False)

# Count the rows that lack at least one of the two coordinate columns.
initial_missing = len(df[df['좌표X'].isna() | df['좌표Y'].isna()])
print(f"Initial missing coordinates: {initial_missing}")

# 2. Load the address -> coordinates lookup table.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# use a pickle that was produced by a trusted source.
with open('adres_to_geo.pickle', 'rb') as pickle_file:
    adres_to_geo = pickle.load(pickle_file)

# 3. Create multiple address keys with different strategies

# Finds "<region> <district> <neighborhood>" anywhere inside an address string.
# NOTE(review): the bracketed suffixes are character classes, not optional
# words, so e.g. "서울[특별시]*" also accepts "서울시"; the pattern is kept
# unchanged so that matching behaves exactly as before.
_REGION_PATTERN = re.compile(
    r'(서울[특별시]*|인천[광역시]*|부산[광역시]*|대구[광역시]*|광주[광역시]*|대전[광역시]*|울산[광역시]*|세종[특별자치시]*|경기도|강원도|충청북도|충청남도|전라북도|전라남도|경상북도|경상남도|제주[특별자치도]*)\s+([^\s]+)\s+([^\s]+)'
)


def _format_plot_number(row):
    """Return "본번-부번" (or just "본번" when 부번 is missing or 0), else None.

    None is returned when 본번 is missing/NaN, when either column is absent
    from the row, or when a value cannot be converted to an integer.
    """
    try:
        main_number = row['본번']
        if pd.isna(main_number):
            return None
        sub_number = row['부번']
        if pd.notna(sub_number) and sub_number != 0:
            return f"{int(main_number)}-{int(sub_number)}"
        return str(int(main_number))
    except (KeyError, TypeError, ValueError, OverflowError):
        return None


def create_address_keys(row):
    """Build candidate lookup keys for one data row.

    The base address is taken from the first column of the row and combined
    with the plot number derived from the '본번' and '부번' columns.
    Candidates are produced in this order:

    1. the base address as-is followed by the plot number;
    2. the first three whitespace-separated address parts followed by the
       plot number;
    3. the third address part (neighborhood) followed by the plot number;
    4. the region/district/neighborhood found by a regular expression,
       followed by the plot number.

    Args:
        row: A pandas Series for one record.

    Returns:
        A list of unique key strings in priority order. The list is empty
        when the base address is missing or no plot number can be built.
    """
    keys = []

    try:
        base_address = row.iloc[0]
        if pd.isna(base_address):
            return keys

        # Every strategy needs a plot number, so bail out early without one.
        plot_number = _format_plot_number(row)
        if plot_number is None:
            return keys

        candidates = [f"{base_address} {plot_number}"]

        # The remaining strategies parse the address text, so they only
        # apply when the first column really holds a string.
        if isinstance(base_address, str):
            address_parts = base_address.split()
            if len(address_parts) >= 3:
                city, district, neighborhood = address_parts[:3]
                candidates.append(f"{city} {district} {neighborhood} {plot_number}")
                candidates.append(f"{neighborhood} {plot_number}")

            match = _REGION_PATTERN.search(base_address)
            if match:
                candidates.append(
                    f"{match.group(1)} {match.group(2)} {match.group(3)} {plot_number}"
                )

        # Several strategies usually yield the same string; drop repeats
        # while keeping the priority order so callers do no redundant lookups.
        keys.extend(dict.fromkeys(candidates))
    except Exception as e:
        # Best-effort boundary: report the problem and return what we have.
        print(f"Error creating address keys: {e}")

    return keys

# 4. Invert the mapping: each coordinate value maps to the list of every
#    address that shares it, in the iteration order of adres_to_geo.
coord_to_address = {}
for address, coordinate in adres_to_geo.items():
    coord_to_address.setdefault(coordinate, []).append(address)

# 5. Fill in missing coordinates
missing_indices = df[df['좌표X'].isna() | df['좌표Y'].isna()].index
filled_count = 0
match_methods = {
    "exact_match": 0,
    "address_only_match": 0,
    "neighborhood_match": 0,
    "apartment_match": 0
}

# Only string keys can take part in the substring fallbacks below.
_text_addresses = [addr for addr in adres_to_geo if isinstance(addr, str)]
# Memo of fallback searches: many rows share the same neighborhood / apartment
# name, and rescanning every key of adres_to_geo for each row is what makes a
# naive loop take effectively forever on a large number of missing rows.
_fallback_cache = {}


def _first_address_containing(*fragments):
    """Return the first key of adres_to_geo containing every fragment, or None.

    Keys are scanned in the mapping's iteration order and the result is
    memoized per fragment tuple.
    """
    if fragments not in _fallback_cache:
        _fallback_cache[fragments] = next(
            (
                addr
                for addr in _text_addresses
                if all(fragment in addr for fragment in fragments)
            ),
            None,
        )
    return _fallback_cache[fragments]


_has_apartment_name = '아파트명' in df.columns
_filled_indices = []
_filled_latitudes = []
_filled_longitudes = []

for idx in missing_indices:
    row = df.loc[idx]

    # Strategies 1-4: direct key lookup, first candidate that exists wins.
    matched_key = next(
        (key for key in create_address_keys(row) if key in adres_to_geo),
        None,
    )
    method = "exact_match"

    if matched_key is None:
        base_address = row.iloc[0]
        # A missing (NaN) or non-text address cannot be split into parts.
        address_parts = base_address.split() if isinstance(base_address, str) else []

        if len(address_parts) >= 3:
            neighborhood = address_parts[2]  # e.g., "개포동"

            # Strategy 5: a key mentioning both the neighborhood and the
            # apartment name (only when the column exists and holds text).
            apt_name = row['아파트명'] if _has_apartment_name else None
            if isinstance(apt_name, str):
                matched_key = _first_address_containing(neighborhood, apt_name)
                method = "apartment_match"

            # Strategy 6: any key in the same neighborhood. This is only an
            # approximation: it reuses the coordinates of the first such key.
            if matched_key is None:
                matched_key = _first_address_containing(neighborhood)
                method = "neighborhood_match"

    if matched_key is None:
        continue

    # Values in pickle are (latitude, longitude) tuples
    latitude, longitude = adres_to_geo[matched_key]
    _filled_indices.append(idx)
    _filled_latitudes.append(latitude)
    _filled_longitudes.append(longitude)
    filled_count += 1
    match_methods[method] += 1

# Write everything back in one pass; per-row .loc assignment is far slower.
# NOTE(review): assumes the index labels are unique (the default RangeIndex
# produced by read_csv) - confirm if the frame was re-indexed.
if _filled_indices:
    df.loc[_filled_indices, '좌표Y'] = _filled_latitudes    # Y coordinate is latitude
    df.loc[_filled_indices, '좌표X'] = _filled_longitudes   # X coordinate is longitude

# 6. Report how many rows still lack a coordinate after the fill pass.
still_missing = df['좌표X'].isna() | df['좌표Y'].isna()
final_missing = df[still_missing].shape[0]
print(f"Successfully filled {filled_count} out of {initial_missing} missing coordinates")
print(f"Remaining missing coordinates: {final_missing}")
print("Match methods statistics:")
for method_name, hit_count in match_methods.items():
    print(f"  - {method_name}: {hit_count}")

# 7. Persist the updated frame (row index is not written).
# NOTE(review): the data was read from train.csv but the output name says
# "test" - confirm the intended filename.
df.to_csv('updated_test_with_coordinates.csv', index=False, encoding='utf-8')

print("Updated dataset saved successfully!")

Initial missing coordinates: 869670


KeyboardInterrupt: 