# In [None]:
import pandas as pd
from geopy.distance import geodesic

# Load the light-pollution observations and the county reference table
light_pollution = pd.read_csv('light_pollution_2019.csv')
county_coords = pd.read_csv('GEOINFO2023.GEOINFO-Data.csv')

# Force the coordinate columns to numeric; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
county_coords['INTPTLAT'] = pd.to_numeric(county_coords['INTPTLAT'], errors='coerce')
county_coords['INTPTLON'] = pd.to_numeric(county_coords['INTPTLON'], errors='coerce')
light_pollution['Latitude'] = pd.to_numeric(light_pollution['Latitude'], errors='coerce')
light_pollution['Longitude'] = pd.to_numeric(light_pollution['Longitude'], errors='coerce')

# Report how many coordinates are invalid (NaN) in each table
print(county_coords[['INTPTLAT', 'INTPTLON']].isnull().sum())
print(light_pollution[['Latitude', 'Longitude']].isnull().sum())

# Counties without a usable coordinate pair cannot take part in the
# nearest-county search, so discard them.
county_coords = county_coords.dropna(subset=['INTPTLAT', 'INTPTLON'])

# NOTE: the nearest-county assignment is performed further down, after
# find_nearest_county has been defined; calling it at this point would raise
# NameError because the function does not exist yet.

def find_nearest_county(row, counties):
    """Return the name of the county closest to a light-pollution record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Latitude' and 'Longitude'.
        counties: DataFrame with 'INTPTLAT', 'INTPTLON' and 'NAME' columns.

    Returns:
        The 'NAME' of the county with the smallest geodesic distance (in
        kilometres) to the record. When several counties are equally close,
        the first one encountered wins. Returns None when ``counties`` has
        no rows or when the record is missing a coordinate.
    """
    latitude = row['Latitude']
    longitude = row['Longitude']
    # A record with a missing coordinate has no meaningful nearest county;
    # bail out instead of handing NaN to geodesic().
    if pd.isna(latitude) or pd.isna(longitude):
        return None

    light_point = (latitude, longitude)
    min_distance = float('inf')
    nearest_county = None
    # Walk the three columns in lockstep rather than iterrows(), which
    # builds a full Series object for every county on every call.
    for county_lat, county_lon, county_name in zip(
        counties['INTPTLAT'], counties['INTPTLON'], counties['NAME']
    ):
        distance = geodesic(light_point, (county_lat, county_lon)).km
        # Strict comparison keeps the earliest county on ties
        if distance < min_distance:
            min_distance = distance
            nearest_county = county_name
    return nearest_county

# Label every light-pollution record with the name of its nearest county.
# The county table is forwarded to the helper through apply()'s `args`.
nearest_names = light_pollution.apply(
    find_nearest_county, axis=1, args=(county_coords,)
)
light_pollution['Nearest_County'] = nearest_names

# Sum the brightness values recorded for each county
brightness_by_county = light_pollution.groupby('Nearest_County')['Brightness']
county_light_pollution = brightness_by_county.sum().reset_index()

# Write the per-county totals to disk, leaving out the index column
output_path = 'county_light_pollution_2019.csv'
county_light_pollution.to_csv(output_path, index=False)

print("光污染数据按县聚合完成并保存。")
