# GeoPackage development

## 1. Setting up modules

In [1]:
# load modules
import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import random
import os

# split
from sklearn.model_selection import train_test_split

# models 
from xgboost import XGBRegressor, DMatrix
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from supervised.automl import AutoML

# tuning
import optuna

# vif
from statsmodels.stats.outliers_influence import variance_inflation_factor

# visualization
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
from shapely.geometry import MultiPolygon
from shapely.wkt import loads

# metrics
from sklearn.metrics import mean_squared_log_error

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# ttest
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

%matplotlib inline

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Read the bus-stop table; the CSV is opened with the EUC-KR codec
busstop = pd.read_csv(
    '../Data/bus_stop.csv',
    encoding='euc-kr',
)

In [3]:
# Pin every random source used by the notebook
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to `random.seed` and `np.random.seed`; its string form
        is written to the ``PYTHONHASHSEED`` environment variable.
    """
    # The three seeding steps are independent of one another.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


# Fix the seed for the whole notebook
seed_everything(42)

In [4]:
# Read the Daegu grid GeoPackage into a GeoDataFrame
daegu100 = gpd.read_file(
    '../Data/external_open/대구 빅데이터 마트 데이터/0. Base/대구광역시_100.gpkg'
)

In [5]:
# Preview the grid GeoDataFrame (id, bounding-box columns and geometry)
daegu100

Unnamed: 0,id,left,top,right,bottom,geometry
0,349,1.077033e+06,1.745787e+06,1.077133e+06,1.745687e+06,"MULTIPOLYGON (((1077102.303 1745692.817, 10771..."
1,350,1.077033e+06,1.745687e+06,1.077133e+06,1.745587e+06,"MULTIPOLYGON (((1077057.173 1745639.546, 10770..."
2,351,1.077033e+06,1.745587e+06,1.077133e+06,1.745487e+06,"MULTIPOLYGON (((1077051.871 1745538.189, 10770..."
3,352,1.077033e+06,1.745487e+06,1.077133e+06,1.745387e+06,"MULTIPOLYGON (((1077049.547 1745487.411, 10771..."
4,353,1.077033e+06,1.745387e+06,1.077133e+06,1.745287e+06,"MULTIPOLYGON (((1077044.105 1745368.495, 10770..."
...,...,...,...,...,...,...
89311,168686,1.113833e+06,1.766487e+06,1.113933e+06,1.766387e+06,"MULTIPOLYGON (((1113905.212 1766486.883, 11138..."
89312,168687,1.113833e+06,1.766387e+06,1.113933e+06,1.766287e+06,"MULTIPOLYGON (((1113847.421 1766386.670, 11138..."
89313,168688,1.113833e+06,1.766287e+06,1.113933e+06,1.766187e+06,"MULTIPOLYGON (((1113852.491 1766284.918, 11138..."
89314,168701,1.113833e+06,1.764987e+06,1.113933e+06,1.764887e+06,"MULTIPOLYGON (((1113842.714 1764974.150, 11138..."


In [6]:
# Target CRS: WGS84 longitude/latitude (EPSG:4326)
target_crs = "EPSG:4326"

# CRS the layer was loaded with (the original note assumes EPSG:5179)
current_crs = daegu100.crs

# Reproject the grid polygons into the target CRS
daegu100_wgs84 = daegu100.to_crs(crs=target_crs)

In [7]:
# Compute one centroid per grid cell.
# Centroids are planar operations: taking them on lon/lat polygons (EPSG:4326)
# treats degrees as planar units, and GeoPandas warns that the result is likely
# incorrect (that warning is silenced by the notebook's warnings filter).
# So compute the centroid in the projected source CRS and reproject the points.
if daegu100.crs is not None and daegu100.crs.is_projected:
    daegu100_wgs84['centroid'] = (
        daegu100.geometry.centroid.to_crs(daegu100_wgs84.crs)
    )
else:
    # Source layer is not projected: fall back to the centroid of the
    # reprojected polygons (previous behaviour).
    daegu100_wgs84['centroid'] = daegu100_wgs84.geometry.centroid

# Split the centroid points into latitude (y) and longitude (x) columns using
# the vectorised GeoSeries accessors instead of a per-row Python lambda.
daegu100_wgs84['latitude'] = daegu100_wgs84['centroid'].y
daegu100_wgs84['longitude'] = daegu100_wgs84['centroid'].x

# Check the result
#print(daegu100_wgs84[['id', 'geometry', 'latitude', 'longitude']])

In [8]:
# Keep only the grid-cell id and its centroid coordinates for the join
cross_square = daegu100_wgs84[
    ['id', 'latitude', 'longitude']
]

In [9]:
# From the bus-stop table keep only district (구군), neighbourhood (동),
# longitude (경도) and latitude (위도)
bus = busstop[
    ['구군', '동', '경도', '위도']
]

In [10]:
# Cross join every grid cell with every bus stop (original note: takes about 3 min 30 s)
k = cross_square.merge(bus, how='cross')

In [11]:
# Row count of the cross join: roughly 290 million rows, which is very large
len(k)

290098368

In [12]:
# Haversine distance between each grid centroid and each bus stop
# (original note: takes about 5 min 20 s)
# Convert both coordinate pairs from degrees to radians
lat1 = np.deg2rad(k['latitude'])
lat2 = np.deg2rad(k['위도'])
lon1 = np.deg2rad(k['longitude'])
lon2 = np.deg2rad(k['경도'])
# Differences in latitude and longitude (radians)
dlat = lat2 - lat1 
dlon = lon2 - lon1 
# a: haversine term; c: central angle in radians (scaled to metres in a later cell)
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))

In [13]:
# Haversine distance in metres: central angle c times 6371 (presumably the
# mean Earth radius in km), then times 1000 to convert km to m
k['distance'] = c * 6371 * 1000

In [14]:
# Inspect summary statistics of the computed distances
k['distance'].describe()

count    2.900984e+08
mean     1.540455e+04
std      8.169061e+03
min      1.413405e+00
25%      9.342722e+03
50%      1.428957e+04
75%      2.032874e+04
max      5.243916e+04
Name: distance, dtype: float64

In [15]:
# Show the first 10 rows of the cross-joined table, including the distance column
k.head(10)

Unnamed: 0,id,latitude,longitude,구군,동,경도,위도,distance
0,349,35.704564,128.352507,달성군,하빈면,128.424005,35.890744,21683.319487
1,349,35.704564,128.352507,남구,대명3동,128.57713,35.85869,26538.350773
2,349,35.704564,128.352507,중구,남산4동,128.577815,35.85864,26581.981106
3,349,35.704564,128.352507,남구,대명3동,128.580848,35.857112,26683.572433
4,349,35.704564,128.352507,중구,남산4동,128.580007,35.857675,26664.905932
5,349,35.704564,128.352507,중구,대신동,128.575672,35.86312,26759.569817
6,349,35.704564,128.352507,중구,남산4동,128.57676,35.863052,26828.512752
7,349,35.704564,128.352507,중구,대신동,128.577978,35.863695,26958.307801
8,349,35.704564,128.352507,중구,대신동,128.581183,35.864692,27249.292041
9,349,35.704564,128.352507,중구,남산2동,128.58674,35.866082,27730.350904


In [16]:
# For every grid cell keep the row of its nearest bus stop (minimum distance),
# giving the cell's district (구군) and neighbourhood (동)
cross_square_address = (
    k.loc[
        k.groupby('id')['distance'].idxmin(),
        ['id', 'distance', '구군', '동'],
    ]
    .reset_index(drop=True)
)

In [21]:
# Write the grid-to-address lookup to disk (CP949-encoded CSV, no index column)
cross_square_address.to_csv(
    '../Data/cross100_address.csv',
    index=False,
    encoding='cp949',
)