# 1. Load data and Rectify

In [9]:
import pandas as pd
from tqdm import tqdm

# Load friendly facilities (the station table) from CSV into `df`.
df = pd.read_csv("datas/station_only.csv", encoding="utf-8")

# Load house data (one row per house record, with latitude/longitude) into `df2`.
df2 = pd.read_csv("datas/seoul_pri_2020_ll.csv", encoding="utf-8")


In [10]:
# Data cleaning: delete station rows whose coordinates are missing.
# Both drops use inplace=True, so `df` itself is modified by this cell.

# Mask of rows with a null longitude; those rows are removed first.
filt = df['longitude'].isnull()
df.drop(index=df[filt].index, inplace=True)

# Mask of rows with a null latitude, computed on the already-reduced frame.
filt2 = df['latitude'].isnull()
df.drop(index=df[filt2].index, inplace=True)


In [11]:
# Data cleaning: convert longitude from str to float.
# A single vectorized cast replaces the per-element lambda; the column ends up
# as float64 and any value that cannot be parsed still raises ValueError.
df['longitude'] = df['longitude'].astype(float)


In [12]:
# Inspect dtypes and non-null counts of the station table after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 738 entries, 0 to 737
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   명칭            738 non-null    object 
 1   latitude      738 non-null    float64
 2   longitude     738 non-null    float64
 3   Station_line  738 non-null    object 
 4   구분            738 non-null    object 
dtypes: float64(2), object(3)
memory usage: 50.8+ KB


In [13]:
# Inspect dtypes and non-null counts of the house table.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163821 entries, 0 to 163820
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   주소         163821 non-null  object 
 1   건물주용도      163821 non-null  object 
 2   건물명        163821 non-null  object 
 3   건물면적       163821 non-null  float64
 4   물건금액       163821 non-null  int64  
 5   latitude   163821 non-null  float64
 6   longitude  163821 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 8.7+ MB


## 1.1 Prepare empty df

In [None]:
# Empty result frame with one column per search radius (0.5 / 1.0 / 1.5 km)
# for the subway ('지하철') category. The processing loop further down fills it
# with one row per house.
df_complete = pd.DataFrame(columns=['0.5km지하철', '1.0km지하철', '1.5km지하철'])

# 2. Define functions

In [None]:
from scipy.spatial import distance

In [None]:
# Trial calculation: Euclidean distance between the house at index label 0 and
# the station at index label 1, computed directly on (longitude, latitude)
# pairs. The result is in coordinate units (presumably degrees — TODO confirm),
# not kilometres.
distance.euclidean((df2['longitude'][0], df2['latitude'][0]),(df['longitude'][1], df['latitude'][1]))

In [None]:
# distance's output is not km.
def make_km(x):
    """Turn a raw coordinate-space distance into an approximate kilometre value.

    The input is floor-divided into steps of 0.00001 and the step count is then
    divided by 1000, so the value is scaled by roughly 100 and truncated
    (not rounded) to three decimal places.

    NOTE(review): this presumably treats one degree as about 100 km in every
    direction — confirm that this flat approximation is acceptable.

    Args:
        x: distance in the same units as the input coordinates.

    Returns:
        The scaled, truncated distance.
    """
    # Whole number of 1e-5 steps contained in x; the remainder is discarded.
    step_count = x // 0.00001
    return step_count / 1000

## 2.2 범위 내 개수 구하기

In [None]:
def list_element_is_in(base_list, element):
    """Report whether ``element`` matches any item of ``base_list``.

    Each item is wrapped in a one-item list before the comparison, so
    ``element`` is expected to be a single-item list such as ``['2호선']``;
    a bare (unwrapped) value never matches.

    Args:
        base_list: iterable of items to search through.
        element: one-item list holding the value to look for.

    Returns:
        True if ``[item] == element`` for some item, otherwise False
        (including when ``base_list`` is empty).
    """
    # No explicit emptiness check is needed: an empty list produces no
    # iterations, so any() returns False on its own.
    return any([item] == element for item in base_list)

# Count the distinct station lines that have at least one station within range.
# Input: distance list [km], range threshold [km], station line labels (parallel to the distance list)
def st_count_in_distance(dist_list, range_km, station_lines):
    """Count the distinct station lines that have a station inside the range.

    Several stations on the same line within range count only once.

    Args:
        dist_list: distances from one house to every station.
        range_km: exclusive upper bound, in the same unit as ``dist_list``.
        station_lines: line label of each station, paired positionally with
            ``dist_list``. Labels must be hashable (e.g. strings).

    Returns:
        The number of distinct line labels among the stations whose distance
        satisfies ``0 < distance < range_km``.
    """
    # A set gives constant-time duplicate detection instead of rescanning a
    # list of already-counted lines for every station.
    lines_in_range = set()
    for dist, line in zip(dist_list, station_lines):
        # Strict bounds on both sides: a distance of exactly 0 and a distance
        # equal to range_km are both excluded.
        if 0 < dist < range_km:
            lines_in_range.add(line)
    return len(lines_in_range)

In [None]:
# Input: the house is a single data point (one lon/lat pair); the friendly facilities are passed as whole data columns
def st_num_by_distance(house_lon, house_lat, friendly_lons, friendly_lats, range_list, station_lines):
    """Count nearby station lines around one house for several radii.

    The Euclidean distance from the house to every facility is computed on the
    raw (longitude, latitude) pairs, converted with ``make_km``, and then
    ``st_count_in_distance`` is evaluated once per radius.

    Args:
        house_lon: longitude of the house.
        house_lat: latitude of the house.
        friendly_lons: indexable sequence of facility longitudes.
        friendly_lats: indexable sequence of facility latitudes.
        range_list: radii to evaluate, in the unit produced by ``make_km``.
        station_lines: line label of each facility, parallel to the
            coordinate sequences.

    Returns:
        A list with one count per entry of ``range_list``, in the same order.
    """
    house_point = (house_lon, house_lat)

    # Distance from the house to every facility, converted by make_km.
    distances_km = [
        make_km(distance.euclidean(house_point, (friendly_lons[idx], friendly_lats[idx])))
        for idx in range(len(friendly_lons))
    ]

    # One distinct-line count per requested radius.
    return [
        st_count_in_distance(distances_km, radius, station_lines)
        for radius in range_list
    ]

In [12]:
# Category to process and the search radii [km] evaluated for it.
selection = ['지하철']
selection_range = [[0.5, 1, 1.5]]
filt = df['구분'] == selection[0]

# The station subset does not change between houses, so extract its columns
# once instead of re-filtering the frame on every iteration.
stations = df[filt]
station_lons = stations['longitude'].values
station_lats = stations['latitude'].values
station_line_labels = stations['Station_line'].values

# Positional arrays avoid a label lookup per house.
house_lons = df2['longitude'].values
house_lats = df2['latitude'].values

# `df_complete` must already exist with the result columns (section 1.1).
# Rows are collected in a plain list and the frame is built from it, because
# growing a DataFrame row by row copies the whole frame each time and
# DataFrame.append no longer exists in pandas 2.x. Rebuilding from the list
# also makes re-running this cell produce the same result instead of
# stacking duplicate rows.
result_columns = list(df_complete.columns)
rows = []

for i in tqdm(range(len(house_lons)), desc='Progress..', mininterval=0.1):
    ret = st_num_by_distance(house_lons[i], house_lats[i], station_lons, station_lats, selection_range[0], station_line_labels)
    rows.append(ret)

    # Periodic checkpoint so a long run can be inspected or salvaged.
    if i % 10000 == 0:
        pd.DataFrame(rows, columns=result_columns).to_csv("station_processing_by_line_2020.csv", encoding="utf-8")


df_complete = pd.DataFrame(rows, columns=result_columns)
df_complete.to_csv("station_processing_by_line_2020.csv", encoding="utf-8")

Progress..: 100%|██████████| 163821/163821 [30:14<00:00, 90.28it/s] 


In [13]:
# Display the per-house counts (one column per search radius).
df_complete

Unnamed: 0,0.5km지하철,1.0km지하철,1.5km지하철
0,0,1,2
1,1,1,2
2,1,2,3
3,0,4,5
4,0,1,4
...,...,...,...
163816,1,1,3
163817,0,3,4
163818,0,2,3
163819,2,2,5


In [3]:
# Scratch data: two small integer lists used to try out np.concatenate.
list1 = list(range(1, 6))
list2 = list(range(6, 11))


In [4]:
# Scratch check: display list1.
list1

[1, 2, 3, 4, 5]

In [8]:
import numpy as np
# Scratch check: joining the two lists end to end yields a single 1-D array.
np.concatenate([list1, list2])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [14]:
# Same category / radius settings as used by the processing loop above;
# selection_range holds one list of radii per entry in selection.
selection = ['지하철']
selection_range = [[0.5, 1, 1.5]]


In [15]:
# Scratch check: the first selected category equals the subway label.
selection[0] == '지하철'

True