In [2]:
import os
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import re

In [9]:
def parse_dms_extended(coord_str):
    coord_str = coord_str.strip()
    if "°" in coord_str:
        pattern = r"(\d+)[°]\s*(\d+)[\'’]\s*(\d+(?:\.\d+)?)[\"']?\s*([NSEW])"
        match = re.search(pattern, coord_str)
        if match:
            deg, minute, second, direction = match.groups()
            dd = float(deg) + float(minute) / 60 + float(second) / 3600
            if direction in ['S', 'W']:
                dd = -dd
            return dd
        else:
            return None
    else:
        pattern = r"(\d{2,3})(\d{2})(\d{2})([NSEW])"
        m = re.match(pattern, coord_str.replace(" ", ""))
        if m:
            deg, minute, second, direction = m.groups()
            dd = int(deg) + int(minute) / 60 + int(second) / 3600
            if direction in ['S', 'W']:
                dd = -dd
            return dd
        return None

def extract_coordinates(text):
    text = text.replace("–", " ")
    pattern2 = r"(\d+[°]\s*\d+[\'’]\s*\d+(?:\.\d+)?[\"']?\s*[NSEW])"
    matches2 = re.findall(pattern2, text)
    if matches2:
        return matches2
    else:
        pattern1 = r"(\d{6,7}[NSEW])"
        return re.findall(pattern1, text)

# 1. 페이지 크롤링 및 전체 텍스트 추출
url = "https://droneportal.or.kr/subList/22000000157"
headers = {'User-Agent': 'Mozilla/5.0'}
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req) as response:
    html_data = response.read().decode('utf-8')
soup = BeautifulSoup(html_data, 'html.parser')
page_text = soup.get_text(separator="\n")

# 2. 대상 항목 추출
target_titles = [
    "대전 금강변 드론공원",
    "광주 북구 영산강변 드론공원",
    "광나루 한강변 드론공원"
]

results = []
for i, title in enumerate(target_titles):
    start_idx = page_text.find(title)
    if start_idx == -1:
        continue
    if i < len(target_titles) - 1:
        next_idx = page_text.find(target_titles[i+1], start_idx + len(title))
        block = page_text[start_idx: next_idx] if next_idx != -1 else page_text[start_idx:]
    else:
        block = page_text[start_idx:]
    
    # 전체 블록에서 좌표 문자열 추출 (위치 텍스트가 따로 없을 수 있으므로)
    coord_strs = extract_coordinates(block)
    converted = [parse_dms_extended(cs) for cs in coord_strs if parse_dms_extended(cs) is not None]
    pairs = []
    for j in range(0, len(converted), 2):
        if j+1 < len(converted):
            pairs.append((converted[j], converted[j+1]))
    
    results.append({
        "지역": title,
        "좌표": pairs
    })

df_dron_park = pd.DataFrame(results, columns=["지역", "좌표"])
print("추출된 신규 데이터:")
print(df_dron_park)

df_dron_park.to_csv('df_dron_park.csv', encoding='cp949', index=False)

추출된 신규 데이터:
                지역                                                 좌표
0      대전 금강변 드론공원  [(36.465, 127.39055555555557), (36.46583333333...
1  광주 북구 영산강변 드론공원          [(35.22166666666667, 126.86166666666666)]
2     광나루 한강변 드론공원  [(37.546638888888886, 127.12052777777777), (12...
