In [1]:
import pandas as pd

In [2]:
commercial = pd.read_csv("data/commercial_with_xy.csv")
building = pd.read_csv("data/outer.csv")

In [3]:
commercial.columns

Index(['기준_년_코드', '상권_구분_코드', '상권_구분_코드_명', '상권_코드', '상권_코드_명', '총_생활인구_수',
       '집객시설_수', '아파트_평균_면적', '아파트_평균_시가', '상권_변화_지표', '상권_변화_지표_명',
       '운영_영업_개월_평균', '폐업_영업_개월_평균', '서울_운영_영업_개월_평균', '서울_폐업_영업_개월_평균', '구분',
       '점포_수', '매출_금액', '매출_건수', '엑스좌표_값', '와이좌표_값'],
      dtype='object')

In [4]:
commercial = commercial[['상권_코드_명','총_생활인구_수','집객시설_수','점포_수','아파트_평균_면적','아파트_평균_시가','서울_운영_영업_개월_평균','서울_폐업_영업_개월_평균','매출_건수', "매출_금액",'구분', '엑스좌표_값', '와이좌표_값']]
commercial = commercial.drop_duplicates()
commercial.rename(columns={'서울_운영_영업_개월_평균': '영업_개월_평균', '서울_폐업_영업_개월_평균':'폐업_개월_평균','구분':'업종_명'},inplace=True)
commercial = commercial.reset_index(drop=True)

In [5]:
building.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       '관리_건축물대장_PK', '관리_상위_건축물대장_PK', '새주소_지상지하_코드', '주소', '지상층수', '지하층수',
       '승강기_수', '주건축물수', '세대수', '가구수', '동명칭', '호_명', '층_구분_코드_x', '층_구분_코드_y',
       '층_번호', '면적', '용적율', '주차_대수', 'y', 'x', 'commercial_row'],
      dtype='object')

In [6]:
x1 = building["x"][1]
y1 = building["y"][1]
(x1, y1)

(37.489084, 126.9731714)

In [7]:
commercial.iloc[:, -2:]

Unnamed: 0,엑스좌표_값,와이좌표_값
0,196193,456641
1,196193,456641
2,196193,456641
3,196193,456641
4,196193,456641
...,...,...
105193,200025,451782
105194,200025,451782
105195,200025,451782
105196,200025,451782


In [8]:
import math

def get_distance(point1, point2):
    x1, y1 = point1
    x2, y2 = point2
    return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)

In [9]:
import geopandas as gpd

def convert_coordinates(df):
    # 좌표 변환
    geometry = gpd.points_from_xy(df['엑스좌표_값'], df['와이좌표_값'])
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:5181")
    gdf = gdf.to_crs("EPSG:4326")

    # 변환된 좌표 열 추가
    df['new_x'] = gdf.geometry.y
    df['new_y'] = gdf.geometry.x

    return df

In [13]:
commercial = convert_coordinates(commercial)

In [14]:
commercial.shape

(105198, 16)

In [15]:
building_coordinates = list(zip(building['x'], building['y']))
commercial_coordinates = list(zip(commercial['new_x'], commercial['new_y']))

In [16]:
min_idx = 0
min_dist = 10000000000000000
building["commercial_row"] = None

for building_idx in range(len(building_coordinates)):
    for commercial_idx in range(len(commercial_coordinates)):
        dist = get_distance(building_coordinates[building_idx], commercial_coordinates[commercial_idx])
        if dist < min_dist:
            min_idx = commercial_idx
            min_dist = dist
    building.loc[building_idx, "commercial_row"] = min_idx
    min_idx = 0
    min_dist = 10000000000000000

In [17]:
building["commercial_row"].max()

105180

In [18]:
onehot = pd.get_dummies(commercial, columns = ['업종_명'])
onehot.columns  

Index(['상권_코드_명', '총_생활인구_수', '집객시설_수', '점포_수', '아파트_평균_면적', '아파트_평균_시가',
       '영업_개월_평균', '폐업_개월_평균', '매출_건수', '매출_금액', '엑스좌표_값', '와이좌표_값',
       'geometry', 'new_x', 'new_y', '업종_명_가전제품_및_통신기기_도소매업', '업종_명_개인_서비스업',
       '업종_명_교육_및_사업자원_서비스업', '업종_명_기타_도소매업', '업종_명_기타_서비스업', '업종_명_부동산_및_임대업',
       '업종_명_생활용품_도소매업', '업종_명_수리업', '업종_명_숙박업', '업종_명_스포츠_및_오락_서비스업',
       '업종_명_식료품_도소매업', '업종_명_음식점_및_주점업', '업종_명_제조업', '업종_명_중고상품_도소매업'],
      dtype='object')

In [19]:
onehot['업종_명_가전제품_및_통신기기_도소매업'] = 0
onehot['업종_명_개인_서비스업'] = 0
onehot['업종_명_교육_및_사업자원_서비스업'] = 0
onehot['업종_명_기타_도소매업'] = 0
onehot['업종_명_기타_서비스업'] = 0
onehot['업종_명_부동산_및_임대업'] = 0
onehot['업종_명_생활용품_도소매업'] = 0
onehot['업종_명_수리업'] = 0
onehot['업종_명_숙박업'] = 0
onehot['업종_명_스포츠_및_오락_서비스업'] = 0
onehot['업종_명_식료품_도소매업'] = 0
onehot['업종_명_음식점_및_주점업'] = 0
onehot['업종_명_제조업'] = 0
onehot['업종_명_중고상품_도소매업'] = 0

In [20]:
onehot

Unnamed: 0,상권_코드_명,총_생활인구_수,집객시설_수,점포_수,아파트_평균_면적,아파트_평균_시가,영업_개월_평균,폐업_개월_평균,매출_건수,매출_금액,...,업종_명_기타_서비스업,업종_명_부동산_및_임대업,업종_명_생활용품_도소매업,업종_명_수리업,업종_명_숙박업,업종_명_스포츠_및_오락_서비스업,업종_명_식료품_도소매업,업종_명_음식점_및_주점업,업종_명_제조업,업종_명_중고상품_도소매업
0,이북5도청사,139405.75,22.5,116,124.0,376996219.5,98.75,51.25,398,3.230693e+07,...,0,0,0,0,0,0,0,0,0,0
1,이북5도청사,139405.75,22.5,128,124.0,376996219.5,98.75,51.25,166,2.660860e+07,...,0,0,0,0,0,0,0,0,0,0
2,이북5도청사,139405.75,22.5,98,124.0,376996219.5,98.75,51.25,96,2.564806e+06,...,0,0,0,0,0,0,0,0,0,0
3,이북5도청사,139405.75,22.5,151,124.0,376996219.5,98.75,51.25,358,3.070350e+07,...,0,0,0,0,0,0,0,0,0,0
4,이북5도청사,139405.75,22.5,24,124.0,376996219.5,98.75,51.25,874,1.999051e+08,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105193,중부시장(신중부시장),927336.00,18.0,17,15.0,148636943.0,121.25,55.50,308,3.510748e+07,...,0,0,0,0,0,0,0,0,0,0
105194,중부시장(신중부시장),927336.00,18.0,25,15.0,148636943.0,121.25,55.50,375,1.061767e+07,...,0,0,0,0,0,0,0,0,0,0
105195,중부시장(신중부시장),927336.00,18.0,1663,15.0,148636943.0,121.25,55.50,147094,7.184864e+09,...,0,0,0,0,0,0,0,0,0,0
105196,중부시장(신중부시장),927336.00,18.0,243,15.0,148636943.0,121.25,55.50,19070,5.655092e+08,...,0,0,0,0,0,0,0,0,0,0


In [21]:
import xgboost as xgb

xgb_model = xgb.Booster()
xgb_model.load_model('xgb_model.model')

In [22]:
xgb_model

<xgboost.core.Booster at 0x1279c6bf0>

In [23]:
X = onehot[["총_생활인구_수", "집객시설_수", "점포_수", "아파트_평균_면적", "아파트_평균_시가", "영업_개월_평균", "폐업_개월_평균", "매출_건수", '업종_명_가전제품_및_통신기기_도소매업',
       '업종_명_개인_서비스업', '업종_명_교육_및_사업자원_서비스업', '업종_명_기타_도소매업', '업종_명_기타_서비스업',
       '업종_명_부동산_및_임대업', '업종_명_생활용품_도소매업', '업종_명_수리업', '업종_명_숙박업',
       '업종_명_스포츠_및_오락_서비스업', '업종_명_식료품_도소매업', '업종_명_음식점_및_주점업', '업종_명_제조업',
       '업종_명_중고상품_도소매업']]
y = onehot["매출_금액"]

sectors = ['업종_명_가전제품_및_통신기기_도소매업',
       '업종_명_개인_서비스업', '업종_명_교육_및_사업자원_서비스업', '업종_명_기타_도소매업', '업종_명_기타_서비스업',
       '업종_명_부동산_및_임대업', '업종_명_생활용품_도소매업', '업종_명_수리업', '업종_명_숙박업',
       '업종_명_스포츠_및_오락_서비스업', '업종_명_식료품_도소매업', '업종_명_음식점_및_주점업', '업종_명_제조업',
       '업종_명_중고상품_도소매업']

commercial_idx = building.loc[1, "commercial_row"]
row = onehot.loc[commercial_idx].copy()

dtest = xgb.DMatrix(X)

for sector in sectors:
       row[sector] = 1
       data = xgb.DMatrix(row[X.columns].values.reshape(1, -1))  # DMatrix로 변환
       pred = xgb_model.predict(data, ntree_limit=xgb_model.best_iteration)
       row[sector] = 0
       print(f"{sector}'s prediction = {pred}")

업종_명_가전제품_및_통신기기_도소매업's prediction = [1.7098928e+08]
업종_명_개인_서비스업's prediction = [1.5288058e+08]
업종_명_교육_및_사업자원_서비스업's prediction = [2.374113e+08]
업종_명_기타_도소매업's prediction = [1.7098928e+08]
업종_명_기타_서비스업's prediction = [1.7098928e+08]
업종_명_부동산_및_임대업's prediction = [1.7098928e+08]
업종_명_생활용품_도소매업's prediction = [2.0240483e+08]
업종_명_수리업's prediction = [1.7098928e+08]
업종_명_숙박업's prediction = [1.7098928e+08]
업종_명_스포츠_및_오락_서비스업's prediction = [1.43762e+08]
업종_명_식료품_도소매업's prediction = [1.791652e+08]
업종_명_음식점_및_주점업's prediction = [2.177781e+08]
업종_명_제조업's prediction = [1.7098928e+08]
업종_명_중고상품_도소매업's prediction = [3.118639e+09]




In [24]:
sectors = ['업종_명_가전제품_및_통신기기_도소매업',
       '업종_명_개인_서비스업', '업종_명_교육_및_사업자원_서비스업', '업종_명_기타_도소매업', '업종_명_기타_서비스업',
       '업종_명_부동산_및_임대업', '업종_명_생활용품_도소매업', '업종_명_수리업', '업종_명_숙박업',
       '업종_명_스포츠_및_오락_서비스업', '업종_명_식료품_도소매업', '업종_명_음식점_및_주점업', '업종_명_제조업',
       '업종_명_중고상품_도소매업']

X = onehot[["총_생활인구_수", "집객시설_수", "점포_수", "아파트_평균_면적", "아파트_평균_시가", "영업_개월_평균", "폐업_개월_평균", "매출_건수", '업종_명_가전제품_및_통신기기_도소매업',
       '업종_명_개인_서비스업', '업종_명_교육_및_사업자원_서비스업', '업종_명_기타_도소매업', '업종_명_기타_서비스업',
       '업종_명_부동산_및_임대업', '업종_명_생활용품_도소매업', '업종_명_수리업', '업종_명_숙박업',
       '업종_명_스포츠_및_오락_서비스업', '업종_명_식료품_도소매업', '업종_명_음식점_및_주점업', '업종_명_제조업',
       '업종_명_중고상품_도소매업']]
y = onehot["매출_금액"]

max_value = 0
max_sector = None
max_sector_idx = 0

building["업종"] = None
building["업종_idx"] = None

dtest = xgb.DMatrix(X)

for idx in range(len(building)):
       commercial_idx = building.loc[idx, "commercial_row"]
       row = onehot.loc[commercial_idx].copy()
       
       for sector in sectors:
              row[sector] = 1
              data = xgb.DMatrix(row[X.columns].values.reshape(1, -1))  # DMatrix로 변환
              pred = xgb_model.predict(data, iteration_range=(0, xgb_model.best_iteration))
              row[sector] = 0

              if pred > max_value:
                     max_value = pred
                     max_sector = sector
                     max_sector_idx = sectors.index(sector)

       building.loc[idx, "업종"] = max_sector
       building.loc[idx, "업종_idx"] = max_sector_idx
       max_value = 0
       max_sector = None
       max_sector_idx = 0

In [25]:
building["업종_idx"].unique()

array([13, 0, 6], dtype=object)

In [27]:
building.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0',
       '관리_건축물대장_PK', '관리_상위_건축물대장_PK', '새주소_지상지하_코드', '주소', '지상층수', '지하층수',
       '승강기_수', '주건축물수', '세대수', '가구수', '동명칭', '호_명', '층_구분_코드_x', '층_구분_코드_y',
       '층_번호', '면적', '용적율', '주차_대수', 'y', 'x', 'commercial_row', '업종',
       '업종_idx'],
      dtype='object')

In [29]:
building.drop(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'], axis=1, inplace=True)

KeyError: "['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'] not found in axis"

In [32]:
building.drop(["commercial_row", "업종"], axis=1, inplace=True)

In [34]:
building.to_csv("data/building_with_sector.csv", index=False)