In [1]:
''' 
이 파일에서는 위도 경도를 이용해 거리를 측정하는 코드들을 다루겠습니다.
'''

import pandas as pd
import numpy as np

In [2]:
# Load the apartment data and discard the 'Unnamed: 0' column
# (presumably an index column left over from an earlier CSV export).
df = pd.read_csv('sample2.csv').drop(columns='Unnamed: 0')

In [3]:
# Reference tables used by the distance features below.
# The hospital and bus files are read as CP949; the others use UTF-8
# (explicitly, or through pandas' default for station.csv).
station  = pd.read_csv("station.csv")
hospital = pd.read_csv("Hospital.csv", encoding="cp949")
bus      = pd.read_csv("bus.csv", encoding="cp949")
school   = pd.read_csv("school.csv", encoding="utf-8")
hangang  = pd.read_csv("bridge.csv", encoding="utf-8")

In [4]:
station.head()

Unnamed: 0,Line,Station Name,Address,Latitude,Longitude
0,우이신설,4.19민주묘지,서울특별시 강북구 우이동 72-186,37.649456,127.013506
1,3호선,가락시장,서울특별시 송파구 가락동 184-23 가락시장역(3호선),37.495686,127.118405
2,1호선,가산디지털단지,서울특별시 금천구 가산동 468-4,37.482412,126.88224
3,우이신설,가오리,서울특별시 강북구 수유동 338-46,37.641701,127.016948
4,경의중앙,가좌,서울특별시 서대문구 남가좌동 296-12,37.568883,126.915167


In [5]:
'''
Haversine 함수를 정의합니다.
속도의 이점을 위해 Numpy 배열로 변환합니다.
'''

def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Great-circle distance between points given in decimal degrees.

    All four coordinate arguments may be scalars, sequences or NumPy arrays.
    They are converted to float arrays and combined with NumPy broadcasting,
    so passing an ``(n, 1)`` column against an ``(m,)`` vector yields an
    ``(n, m)`` distance matrix.

    Parameters
    ----------
    lat1, lon1 : array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : array-like
        Latitude / longitude of the second point(s), in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, the Earth's radius in kilometres.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Distance(s) along the sphere, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.array(value).astype(float))
        for value in (lat1, lon1, lat2, lon2)
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) NaN; clamp to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

In [8]:
'''
가장 가까운 지하철의 위치를 구하고, 역명과 거리를 구하는 과정
'''

def find_nearest_stations(df, station):
    """Annotate every row of ``df`` with its closest station.

    Builds the full (rows of ``df``) x (rows of ``station``) haversine
    distance matrix and writes two columns into ``df`` in place:
    'Nearest Station' (the station's 'Station Name') and 'Distance to NS'
    (the distance returned by ``haversine``). Returns ``None``.
    """
    # Apartment coordinates as column vectors so they broadcast against
    # the flat station coordinate vectors.
    apt_lat = df['Latitude'].values.reshape(-1, 1)
    apt_lon = df['Longitude'].values.reshape(-1, 1)

    dist_matrix = haversine(
        apt_lat,
        apt_lon,
        station['Latitude'].values,
        station['Longitude'].values,
    )
    closest = dist_matrix.argmin(axis=1)

    # Store the nearest station's name and its distance on the DataFrame.
    df['Nearest Station'] = station['Station Name'].iloc[closest].values
    df['Distance to NS'] = dist_matrix.min(axis=1)

find_nearest_stations(df, station)


In [9]:
df.head()

Unnamed: 0,Date,Name,Year,Month,Gu,Dong,Address,Longitude,Latitude,major,...,Price per Area Log Scale,Total Population,Children,Adolescent,Youth,Middle-Aged,Old Age,Consumption,Nearest Station,Distance to NS
0,20231231,블루힐하우스,2023,202312,서초구,잠원동,서초구 잠원동 0071-0001,127.00599,37.511931,0,...,7.675002,50621,0.107821,0.154027,0.188124,0.420339,0.12967,8.125861,잠원,0.424406
1,20231231,극동,2023,202312,서대문구,홍은동,서대문구 홍은동 0454-0000,126.95087,37.602631,0,...,6.379766,50678,0.07372,0.115297,0.222128,0.396839,0.191997,6.647932,홍제,1.097229
2,20231231,e편한세상상도노빌리티,2023,202312,동작구,상도동,동작구 상도동 0903-0000,126.94813,37.505278,2,...,7.459861,113485,0.071904,0.118174,0.262343,0.373468,0.174111,6.31837,상도,0.316074
3,20231231,화랑대디오베이션,2023,202312,노원구,공릉동,노원구 공릉동 0760-0000,127.082217,37.620393,0,...,6.907255,77249,0.079561,0.151277,0.231446,0.377351,0.160365,6.329894,화랑대(서울여대입구),1.169617
4,20231231,율창,2023,202312,강서구,화곡동,강서구 화곡동 0888-0001,126.850504,37.528725,0,...,6.010602,217278,0.066463,0.10007,0.285293,0.378695,0.169474,7.385394,신정(은행정),0.88372


In [10]:
hospital.head()

Unnamed: 0,기관ID,주소,병원분류,병원분류명,응급의료기관코드,응급의료기관코드명,응급실운영여부(1/2),비고,기관설명상세,간이약도,...,진료시간(목요일)S,진료시간(금요일)S,진료시간(토요일)S,진료시간(일요일)S,진료시간(공휴일)S,우편번호1,우편번호2,병원경도,병원위도,작업시간
0,A1124291,서울특별시 강남구 봉은사로 612 (삼성동),B,병원,G099,응급의료기관 이외,2,점심시간 13:00~14:00,,성신여대입구역 7번출구/ 골든타워 5~6층,...,900,900,900,900.0,,61,70,127.062147,37.514281,2024-02-07 14:56:04.0
1,B1105790,"서울특별시 종로구 자하문로11길 16-2, 1층 (통인동)",G,한의원,G099,응급의료기관 이외,2,"월,화,수,목,금,토,일 100% 예약진료","피부질환, 면역질환, 희귀성, 난치병 전문 경희 길 한의원 입니다.",명동역 8번출구 선샤인빌딩 9층,...,1200,1200,1200,1200.0,1200.0,30,36,126.970436,37.579801,2024-02-07 14:56:06.0
2,B1101372,"서울특별시 서초구 고무래로 22, 쌍동빌딩 서관 2층 (반포동)",G,한의원,G099,응급의료기관 이외,2,"월,화,수,목,금,토,일 100% 예약진료","피부질환, 면역질환, 희귀성, 난치병 전문 경희 길 한의원 입니다.",명동역 8번출구 선샤인빌딩 9층,...,930,930,930,1200.0,1200.0,65,93,127.010418,37.503008,2024-02-07 14:56:06.0
3,B1101127,"서울특별시 성동구 천호대로 312, 3층 (용답동)",G,한의원,G099,응급의료기관 이외,2,"월,화,수,목,금,토,일 100% 예약진료","피부질환, 면역질환, 희귀성, 난치병 전문 경희 길 한의원 입니다.",답십리역 5번출구 ?데리아 옆,...,930,930,930,1200.0,1200.0,48,5,127.054561,37.564981,2024-02-07 14:56:06.0
4,B1101611,"서울특별시 노원구 동일로 1676, 2층 (상계동, 한사랑메디컬빌딩)",G,한의원,G099,응급의료기관 이외,2,"월,화,수,목,금,토,일 100% 예약진료","피부질환, 면역질환, 희귀성, 난치병 전문 경희 길 한의원 입니다.",수락산역 2번 출구,...,1400,930,930,1200.0,1200.0,16,25,127.055661,37.677952,2024-02-07 14:56:06.0


In [11]:
# Split the hospital table: general hospitals ('종합병원') go to `medical_center`,
# every other row stays in `hospital`.
# NOTE: this cell rebinds `hospital`; re-running it without reloading
# Hospital.csv leaves `medical_center` empty.
is_general_hospital = hospital['병원분류명'] == '종합병원'

# Rename on the filtered result rather than with inplace=True: an in-place
# rename of a boolean-indexed slice can raise SettingWithCopyWarning.
medical_center = hospital[is_general_hospital].rename(
    columns={'병원위도': 'Latitude', '병원경도': 'Longitude'}
)
hospital = hospital[~is_general_hospital]


In [12]:
'''
가장 가까운 종합병원의 위치를 구하고, 종합병원의 이름과 거리를 구하는 과정
'''

def find_nearest_medical_centers(df, medical_center):
    """Annotate every row of ``df`` with its closest medical centre.

    Builds the full (rows of ``df``) x (rows of ``medical_center``) haversine
    distance matrix and writes two columns into ``df`` in place:
    'Nearest Medical Center' (taken from the '기관명' column) and
    'Distance to MC' (the distance returned by ``haversine``).
    Returns ``None``.
    """
    # Apartment coordinates as column vectors so they broadcast against
    # the flat medical-centre coordinate vectors.
    apt_lat = df['Latitude'].values.reshape(-1, 1)
    apt_lon = df['Longitude'].values.reshape(-1, 1)

    dist_matrix = haversine(
        apt_lat,
        apt_lon,
        medical_center['Latitude'].values,
        medical_center['Longitude'].values,
    )
    closest = dist_matrix.argmin(axis=1)

    df['Nearest Medical Center'] = medical_center['기관명'].iloc[closest].values
    df['Distance to MC'] = dist_matrix.min(axis=1)

find_nearest_medical_centers(df, medical_center)

In [13]:
df.head()

Unnamed: 0,Date,Name,Year,Month,Gu,Dong,Address,Longitude,Latitude,major,...,Children,Adolescent,Youth,Middle-Aged,Old Age,Consumption,Nearest Station,Distance to NS,Nearest Medical Center,Distance to MC
0,20231231,블루힐하우스,2023,202312,서초구,잠원동,서초구 잠원동 0071-0001,127.00599,37.511931,0,...,0.107821,0.154027,0.188124,0.420339,0.12967,8.125861,잠원,0.424406,학교법인가톨릭학원가톨릭대학교서울성모병원,1.131965
1,20231231,극동,2023,202312,서대문구,홍은동,서대문구 홍은동 0454-0000,126.95087,37.602631,0,...,0.07372,0.115297,0.222128,0.396839,0.191997,6.647932,홍제,1.097229,의료법인동신의료재단동신병원,2.704469
2,20231231,e편한세상상도노빌리티,2023,202312,동작구,상도동,동작구 상도동 0903-0000,126.94813,37.505278,2,...,0.071904,0.118174,0.262343,0.373468,0.174111,6.31837,상도,0.316074,중앙대학교병원,1.13476
3,20231231,화랑대디오베이션,2023,202312,노원구,공릉동,노원구 공릉동 0760-0000,127.082217,37.620393,0,...,0.079561,0.151277,0.231446,0.377351,0.160365,6.329894,화랑대(서울여대입구),1.169617,한국원자력의학원원자력병원,0.937532
4,20231231,율창,2023,202312,강서구,화곡동,강서구 화곡동 0888-0001,126.850504,37.528725,0,...,0.066463,0.10007,0.285293,0.378695,0.169474,7.385394,신정(은행정),0.88372,홍익병원,1.1609


In [14]:
# Give the bus table the same coordinate column names as the other tables
# (Y -> Latitude, X -> Longitude).
bus = bus.rename(columns={'Y좌표': 'Latitude', 'X좌표': 'Longitude'})

In [16]:
'''
정해진 거리 이내에 있는 버스정류장의 수를 세는 함수
오래 걸리는게 정상입니다. 저는 5분 걸립니다.
'''

from tqdm import tqdm #프로그레스바를 위해 도입


def count_bus_stations(apartments, bus, radius_km=0.5, window_deg=0.01):
    """Count bus stops within ``radius_km`` of every apartment.

    Parameters
    ----------
    apartments : pandas.DataFrame
        Must provide 'Latitude' and 'Longitude' columns (degrees).
    bus : pandas.DataFrame
        Bus-stop table with 'Latitude' and 'Longitude' columns (degrees).
    radius_km : float, optional
        Search radius passed to the haversine comparison. Defaults to 0.5.
    window_deg : float, optional
        Half-width, in degrees, of the bounding box used to pre-select
        candidate stops before the exact distance is computed. It must be
        wide enough to contain ``radius_km``; 0.01 degrees is roughly 1.1 km
        north-south and about 0.88 km east-west around latitude 37.5.
        Increase it together with ``radius_km``.

    Returns
    -------
    list
        One entry per apartment row: the number of stops inside the radius
        divided by 2, or 0 when the bounding box contains no stop.
    """
    # Pull the coordinate columns out once; filtering plain arrays avoids
    # building a new DataFrame slice and a Series row for every apartment.
    bus_lat = bus['Latitude'].values
    bus_lon = bus['Longitude'].values
    apt_lat = apartments['Latitude'].values
    apt_lon = apartments['Longitude'].values

    counts = []
    for lat, lon in tqdm(zip(apt_lat, apt_lon), total=apartments.shape[0], desc='Counting bus stations'):
        # The bus table is large, so restrict to stops inside a small
        # bounding box before computing exact distances.
        nearby = (
            (bus_lat >= lat - window_deg) & (bus_lat <= lat + window_deg)
            & (bus_lon >= lon - window_deg) & (bus_lon <= lon + window_deg)
        )
        if nearby.any():
            # One vectorized haversine call for all candidate stops.
            distances = haversine(lat, lon, bus_lat[nearby], bus_lon[nearby])
            # Stops exist for both travel directions, so halve the count.
            counts.append(np.sum(distances <= radius_km) / 2)
        else:
            counts.append(0)

    return counts

df['Bus Station Within 500m'] = count_bus_stations(df, bus)

Counting bus stations: 100%|██████████| 203663/203663 [04:59<00:00, 680.92it/s]


In [17]:
# Keep only the rows whose school type ('학교종류명') is elementary school.
is_elementary = school['학교종류명'].eq('초등학교')
elementary_school = school[is_elementary]

In [18]:
elementary_school.head()

Unnamed: 0,학교종류명,설립구분,학교명,영문학교명,남녀공학구분명,고등학교구분명,고등학교일반실업구분명,특수목적고등학교계열명,Longitude,Latitude,도로명주소,구주소,동,일반명문고
1,초등학교,공립,서울숭신초등학교,Seoul Soongshin Elementary School,남여공학,,일반계,,127.027827,37.567935,서울특별시 성동구 마장로 161 서울숭신초등학교,서울특별시 성동구 하왕십리동 1068 서울숭신초등학교,하왕십리동,x
25,초등학교,사립,한양초등학교,Hanyang Elementary School,남여공학,,일반계,,127.050387,37.557611,서울특별시 성동구 살곶이길 208 한양초등학교,서울특별시 성동구 사근동 120-2 한양초등학교,사근동,x
26,초등학교,사립,세종초등학교,Sejong Elementary School,남여공학,,일반계,,127.072906,37.553032,서울특별시 광진구 군자로 114 세종초등학교,서울특별시 광진구 군자동 98 세종초등학교,군자동,x
27,초등학교,사립,성동초등학교,Sungdong Elementary School,남여공학,,일반계,,127.089527,37.533178,서울특별시 광진구 자양로4길 63 성동초등학교,서울특별시 광진구 자양동 694-1 성동초등학교,자양동,x
28,초등학교,공립,서울행현초등학교,Seoul Haenghyun Elementary School,남여공학,,해당없음,,127.029405,37.557692,서울특별시 성동구 행당로 95 행현초등학교,서울특별시 성동구 행당동 317-18 행현초등학교,행당동,x


In [21]:
def count_schools(apartment, school, km=0.7):
    """Count, for every apartment, the schools within ``km`` kilometres.

    Parameters
    ----------
    apartment : pandas.DataFrame
        Must provide 'Latitude' and 'Longitude' columns (degrees).
    school : pandas.DataFrame
        School table with 'Latitude' and 'Longitude' columns (degrees).
    km : float, optional
        Search radius compared against the haversine distance.
        Defaults to 0.7, the value previously hard-coded here.

    Returns
    -------
    list
        One count per apartment row, in row order.
    """
    school_lat = school['Latitude'].values
    school_lon = school['Longitude'].values
    apt_lat = apartment['Latitude'].values
    apt_lon = apartment['Longitude'].values

    counts = []
    for lat, lon in tqdm(zip(apt_lat, apt_lon), total=len(apt_lat)):
        # Distances from this apartment to every school at once.
        distances = haversine(lat, lon, school_lat, school_lon)
        counts.append(np.sum(distances <= km))

    return counts

df['Elementary Schools Num'] = count_schools(df, elementary_school)

100%|██████████| 203663/203663 [00:08<00:00, 25080.34it/s]


In [22]:
df.head()

Unnamed: 0,Date,Name,Year,Month,Gu,Dong,Address,Longitude,Latitude,major,...,Youth,Middle-Aged,Old Age,Consumption,Nearest Station,Distance to NS,Nearest Medical Center,Distance to MC,Bus Station Within 500m,Elementary Schools Num
0,20231231,블루힐하우스,2023,202312,서초구,잠원동,서초구 잠원동 0071-0001,127.00599,37.511931,0,...,0.188124,0.420339,0.12967,8.125861,잠원,0.424406,학교법인가톨릭학원가톨릭대학교서울성모병원,1.131965,5.5,1
1,20231231,극동,2023,202312,서대문구,홍은동,서대문구 홍은동 0454-0000,126.95087,37.602631,0,...,0.222128,0.396839,0.191997,6.647932,홍제,1.097229,의료법인동신의료재단동신병원,2.704469,5.5,2
2,20231231,e편한세상상도노빌리티,2023,202312,동작구,상도동,동작구 상도동 0903-0000,126.94813,37.505278,2,...,0.262343,0.373468,0.174111,6.31837,상도,0.316074,중앙대학교병원,1.13476,23.5,3
3,20231231,화랑대디오베이션,2023,202312,노원구,공릉동,노원구 공릉동 0760-0000,127.082217,37.620393,0,...,0.231446,0.377351,0.160365,6.329894,화랑대(서울여대입구),1.169617,한국원자력의학원원자력병원,0.937532,10.5,2
4,20231231,율창,2023,202312,강서구,화곡동,강서구 화곡동 0888-0001,126.850504,37.528725,0,...,0.285293,0.378695,0.169474,7.385394,신정(은행정),0.88372,홍익병원,1.1609,8.0,3


In [23]:
# Reload the bridge coordinates.
# NOTE(review): `hangang` was already read from the same file in the
# data-loading cell near the top of the notebook, so this looks redundant.
hangang = pd.read_csv('bridge.csv', encoding='utf-8')

In [24]:
from scipy.interpolate import interp1d

# Order the bridge coordinates by longitude so longitude can serve as the
# x-axis of the interpolation.
hangang_sorted = hangang.sort_values(by='경도')
bridge_lon = hangang_sorted['경도']
bridge_lat = hangang_sorted['위도']

# Piecewise-linear latitude as a function of longitude through the bridge points.
linear_interp = interp1d(bridge_lon, bridge_lat, kind='linear')

# Sample 130 evenly spaced longitudes across the covered span and read the
# matching latitude off the interpolated line.
interpolate_lon = np.linspace(bridge_lon.min(), bridge_lon.max(), 130)
interpolate_lat = linear_interp(interpolate_lon)

selected_coords = np.column_stack((interpolate_lon, interpolate_lat))

# DataFrame holding the sampled (Longitude, Latitude) pairs.
selected_coords_df = pd.DataFrame(selected_coords, columns=['Longitude', 'Latitude'])

In [31]:
def apartment_distance(lat, lon, hangang):
    """Tell whether (lat, lon) lies within 1 km of any point in ``hangang``.

    ``hangang`` must provide 'Latitude' and 'Longitude' columns; the result
    is the boolean ``min(distance) <= 1.0`` over all of its rows.
    """
    river_lat = hangang['Latitude'].values
    river_lon = hangang['Longitude'].values

    # Wrap the single apartment coordinate as a 1x1 array so it broadcasts
    # against every sampled river point.
    to_river = haversine(np.array([[lat]]), np.array([[lon]]), river_lat, river_lon)
    return to_river.min() <= 1.0  # within 1 km of the Han River line


def _is_near_hangang(row):
    """Row-wise adapter: check one apartment row against the sampled river line."""
    return apartment_distance(row['Latitude'], row['Longitude'], selected_coords_df)


df['Nearby Hangang'] = df.apply(_is_near_hangang, axis=1).astype(int)

In [33]:
# Keep only the rows flagged 'o' in the '일반명문고' column.
# NOTE(review): this rebinds `school`, so any earlier cell that filters
# `school` (e.g. the elementary-school cell) will only see these rows if it
# is re-run afterwards. Presumably the flag marks prestigious general high
# schools - confirm against the data dictionary.
school = school[school['일반명문고'] == 'o']

In [34]:
def count_high_schools(apartment, school, km=2.5):
    """Count, for every apartment, the schools within ``km`` kilometres.

    Parameters
    ----------
    apartment : pandas.DataFrame
        Must provide 'Latitude' and 'Longitude' columns (degrees).
    school : pandas.DataFrame
        School table with 'Latitude' and 'Longitude' columns (degrees).
        The caller is expected to have filtered it to the schools of interest.
    km : float, optional
        Search radius compared against the haversine distance.
        Defaults to 2.5, the value previously hard-coded here.

    Returns
    -------
    list
        One count per apartment row, in row order.
    """
    school_lat = school['Latitude'].values
    school_lon = school['Longitude'].values
    apt_lat = apartment['Latitude'].values
    apt_lon = apartment['Longitude'].values

    counts = []
    for lat, lon in tqdm(zip(apt_lat, apt_lon), total=len(apt_lat)):
        # Distances from this apartment to every school at once.
        distances = haversine(lat, lon, school_lat, school_lon)
        counts.append(np.sum(distances <= km))

    return counts

df['High School Num'] = count_high_schools(df, school)

100%|██████████| 203663/203663 [00:04<00:00, 42142.31it/s]


In [35]:
facility = pd.read_csv('facility.csv', encoding='cp949')

In [36]:
# Use the same coordinate column names as the other tables.
facility = facility.rename(columns={'위도': 'Latitude', '경도': 'Longitude'})

In [37]:
# Split the facility table by its usage-category code ('시설용도분류').
# NOTE(review): the meaning of each FU_* code is inferred from the variable
# names only - confirm against the dataset's code book.
facility_code = facility['시설용도분류']

LibMus = facility[facility_code.isin(['FU_BB', 'FU_BC'])]
Mall = facility[facility_code == 'FU_BA']
Hospital = facility[facility_code.isin(['FU_BD', 'FU_BE', 'FU_BH', 'FU_BG'])]
Park = facility[facility_code.isin(['FU_BI', 'FU_BJ'])]

In [42]:
def count_facility(apartment, facility, km, count_mode=True):
    """Summarise, per apartment, the facilities lying within ``km``.

    With ``count_mode=True`` each entry is the number of facilities whose
    haversine distance is at most ``km``; with ``count_mode=False`` each
    entry is 1 if at least one facility is that close and 0 otherwise.
    Both tables must provide 'Latitude' and 'Longitude' columns.
    Returns a list with one entry per apartment row.
    """
    apt_lat = apartment['Latitude'].values
    apt_lon = apartment['Longitude'].values
    fac_lat = facility['Latitude'].values
    fac_lon = facility['Longitude'].values

    counts = []
    for i in tqdm(range(len(apt_lat))):
        within = haversine(apt_lat[i], apt_lon[i], fac_lat, fac_lon) <= km
        # Either the number of facilities inside the radius, or a 0/1
        # presence flag, depending on the requested mode.
        counts.append(np.sum(within) if count_mode else np.any(within).astype(int))

    return counts

df['Market Num'] = count_facility(df, Mall, 1)
df['Hospital Num'] = count_facility(df, Hospital, 1)
df['Park Presence'] = count_facility(df, Park, 1, count_mode=False)

100%|██████████| 203663/203663 [00:10<00:00, 20249.90it/s]
100%|██████████| 203663/203663 [00:09<00:00, 20946.85it/s]
100%|██████████| 203663/203663 [00:13<00:00, 15437.21it/s]


In [43]:
df.head()

Unnamed: 0,Date,Name,Year,Month,Gu,Dong,Address,Longitude,Latitude,major,...,Distance to NS,Nearest Medical Center,Distance to MC,Bus Station Within 500m,Elementary Schools Num,Nearby Hangang,High School Num,Market Num,Hospital Num,Park Presence
0,20231231,블루힐하우스,2023,202312,서초구,잠원동,서초구 잠원동 0071-0001,127.00599,37.511931,0,...,0.424406,학교법인가톨릭학원가톨릭대학교서울성모병원,1.131965,5.5,1,1,2,11,1,1
1,20231231,극동,2023,202312,서대문구,홍은동,서대문구 홍은동 0454-0000,126.95087,37.602631,0,...,1.097229,의료법인동신의료재단동신병원,2.704469,5.5,2,0,1,0,0,1
2,20231231,e편한세상상도노빌리티,2023,202312,동작구,상도동,동작구 상도동 0903-0000,126.94813,37.505278,2,...,0.316074,중앙대학교병원,1.13476,23.5,3,0,0,1,3,1
3,20231231,화랑대디오베이션,2023,202312,노원구,공릉동,노원구 공릉동 0760-0000,127.082217,37.620393,0,...,1.169617,한국원자력의학원원자력병원,0.937532,10.5,2,0,1,3,6,1
4,20231231,율창,2023,202312,강서구,화곡동,강서구 화곡동 0888-0001,126.850504,37.528725,0,...,0.88372,홍익병원,1.1609,8.0,3,0,4,5,5,1


In [44]:
# Persist the enriched table. `to_csv` writes the DataFrame index as an
# unnamed first column by default, so reading this file back with `read_csv`
# will show that column as 'Unnamed: 0'.
df.to_csv('price_list_real_real_final.csv')