# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KDTree
from geopy.distance import geodesic

# 0) Parameters and paths
# Speeds are in km/h; they convert distances (km) into minutes.
WALK_SPEED_KMPH = 4.8
TRAIN_SPEED_KMPH = 32

# Multipliers applied to the straight-line rail leg: one for stations whose
# line label contains '2', one for every other station.
TRACK_FACTOR_LINE2 = 1.35
TRACK_FACTOR_OTHER = 1.25

# Flat waiting time (minutes) added to every trip.
AVG_WAIT_MIN = 2.5

# Input and output files all live under a single data directory.
DATA_DIR = '/mnt/data'

SUBWAY_CSV = os.path.join(DATA_DIR, 'subway_feature.csv')
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
TEST_CSV = os.path.join(DATA_DIR, 'test.csv')

OUT_TRAIN = os.path.join(DATA_DIR, 'train_time_gangnam.csv')
OUT_TEST = os.path.join(DATA_DIR, 'test_time_gangnam.csv')


# 1) Load the subway station table
# After dropping rows without coordinates the index is reset so that row
# labels equal row positions. The KD-tree below returns *positional* indices,
# and later steps use them to look stations up in this frame.
subway = (
    pd.read_csv(SUBWAY_CSV,
                usecols=['역사명','호선','위도','경도'])
      .rename(columns={'역사명':'station_name','호선':'line',
                       '위도':'lat','경도':'lon'})
      .dropna(subset=['lat','lon'])
      .reset_index(drop=True)
)

# Coordinates of Gangnam station (line 2).
# na=False keeps the mask strictly boolean when a station name is missing.
gangnam_rows = subway[
    subway['station_name'].str.contains('강남', na=False) &
    subway['line'].astype(str).str.contains('2')
]
if gangnam_rows.empty:
    raise ValueError(
        f"No station matching '강남' on a line containing '2' in {SUBWAY_CSV}"
    )
g = gangnam_rows.iloc[0]
gangnam_lat, gangnam_lon = g.lat, g.lon

# Straight-line (geodesic) distance from every station to Gangnam, in km.
subway['dist2gn_km'] = subway.apply(
    lambda r: geodesic((r.lat, r.lon),(gangnam_lat,gangnam_lon)).km,
    axis=1
)

# KD-tree over raw (lat, lon) degrees.
# NOTE(review): Euclidean distance in degree space only approximates ground
# distance (a degree of longitude is shorter than a degree of latitude away
# from the equator), so the "nearest" station can occasionally differ from
# the true nearest one.
kdt = KDTree(subway[['lat','lon']].values, metric='euclidean')


# 2) Load apartment records (including lot-number columns)
def load_apt(path):
    """Read an apartment CSV and return it with English column names.

    Only the address, lot-number, apartment-name and coordinate columns are
    read. An ``apt_id`` column holding each row's index value (as int) is
    appended.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or buffer).

    Returns:
        DataFrame with columns ``addr``, ``b``, ``b_main``, ``b_sub``,
        ``apt_name``, ``orig_lat``, ``orig_lon`` and ``apt_id``.
    """
    column_map = {
        '시군구': 'addr',
        '번지': 'b',
        '본번': 'b_main',
        '부번': 'b_sub',
        '아파트명': 'apt_name',
        '좌표Y': 'orig_lat',
        '좌표X': 'orig_lon',
    }
    frame = pd.read_csv(path, usecols=list(column_map))
    frame = frame.rename(columns=column_map)
    frame['apt_id'] = frame.index.astype(int)
    return frame

apt_train, apt_test = load_apt(TRAIN_CSV), load_apt(TEST_CSV)


# 3) Mean coordinates per lot and per `addr` value (train set only)
jcols = ['addr','b','b_main','b_sub']

# Only rows that already carry both coordinates contribute to the means.
vt = apt_train.dropna(subset=['orig_lat','orig_lon'])

# Mean coordinate per full lot key (addr + lot-number columns).
grp_jib = (
    vt.groupby(jcols)[['orig_lat','orig_lon']]
      .mean()
      .rename(columns={'orig_lat':'lat_jib','orig_lon':'lon_jib'})
)

# Coarser fallback: mean coordinate per `addr` value.
grp_addr = (
    vt.groupby('addr')[['orig_lat','orig_lon']]
      .mean()
      .rename(columns={'orig_lat':'lat_addr','orig_lon':'lon_addr'})
)

# Pull each mean out as a Series so it can be passed to Series.map.
lat_jib_map, lon_jib_map = grp_jib['lat_jib'], grp_jib['lon_jib']
lat_addr_map, lon_addr_map = grp_addr['lat_addr'], grp_addr['lon_addr']


# 4) Coordinate imputation (original > lot mean > addr mean)
def impute_coords(df):
    """Add ``lat``/``lon`` columns, filling gaps from the train-set means.

    Coordinates are treated as a pair: a row is imputed when *either*
    ``lat`` or ``lon`` is missing, and it is only overwritten when the
    fallback provides *both* values. Fallbacks are tried in order:

    1. the mean for the row's (addr, b, b_main, b_sub) key, then
    2. the mean for the row's ``addr``.

    Rows that match neither fallback keep whatever they had.

    Args:
        df: DataFrame with ``orig_lat``, ``orig_lon``, ``addr``, ``b``,
            ``b_main`` and ``b_sub`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with ``lat`` and ``lon`` added.
    """
    df['lat'] = df['orig_lat']
    df['lon'] = df['orig_lon']

    # Tuple keys line up with the MultiIndex of the lot-level mean Series.
    jib_key = pd.Series(
        list(zip(df['addr'], df['b'], df['b_main'], df['b_sub'])),
        index=df.index,
    )
    fallbacks = (
        (jib_key, lat_jib_map, lon_jib_map),
        (df['addr'], lat_addr_map, lon_addr_map),
    )

    for keys, lat_map, lon_map in fallbacks:
        need = (df['lat'].isna() | df['lon'].isna()).to_numpy()
        if not need.any():
            break

        lat_fill = keys[need].map(lat_map).to_numpy(dtype=float)
        lon_fill = keys[need].map(lon_map).to_numpy(dtype=float)

        # Never replace an existing value with NaN: only rows for which the
        # fallback yields a complete pair are written.
        usable = ~(np.isnan(lat_fill) | np.isnan(lon_fill))
        target = need.copy()
        target[need] = usable

        df.loc[target, 'lat'] = lat_fill[usable]
        df.loc[target, 'lon'] = lon_fill[usable]

    return df

apt_train = impute_coords(apt_train)
apt_test  = impute_coords(apt_test)


# 5) Nearest station + walking distance
def attach_nearest(df):
    """Return a copy of ``df`` with the nearest station and walking distance.

    Two columns are added:

    * ``nearest_idx``  - row *position* in ``subway`` of the station the
      KD-tree reports as nearest (float column; NaN when coordinates are
      missing).
    * ``dist_walk_km`` - geodesic distance in km from the row's coordinates
      to that station (NaN when coordinates are missing).

    Args:
        df: DataFrame with ``lat`` and ``lon`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    out = df.copy()
    out['nearest_idx']  = np.nan
    out['dist_walk_km'] = np.nan

    has_coords = (out['lat'].notna() & out['lon'].notna()).to_numpy()
    if not has_coords.any():
        # Nothing to look up; skip querying the tree with an empty array.
        return out

    coords = out.loc[has_coords, ['lat','lon']].to_numpy(dtype=float)

    # Many rows share the same coordinates, so the tree query and the
    # geodesic computation are done once per distinct (lat, lon) pair.
    uniq, inverse = np.unique(coords, axis=0, return_inverse=True)
    inverse = inverse.ravel()  # some NumPy versions return a 2-D inverse

    _, idx = kdt.query(uniq, k=1)
    idx = idx[:, 0]

    # The tree was built from subway[['lat','lon']].values, so its indices
    # are row positions; look stations up by position, not by label.
    station_xy = subway[['lat','lon']].to_numpy()[idx]
    walk_km = np.array([
        geodesic((a_lat, a_lon), (s_lat, s_lon)).km
        for (a_lat, a_lon), (s_lat, s_lon) in zip(uniq, station_xy)
    ])

    out.loc[has_coords, 'nearest_idx']  = idx[inverse]
    out.loc[has_coords, 'dist_walk_km'] = walk_km[inverse]
    return out

apt_train = attach_nearest(apt_train)
apt_test  = attach_nearest(apt_test)


# 6) Travel-time computation (NaN-safe)
def calc_tt(df):
    """Estimate travel time to Gangnam station in minutes.

    For every row with a walking distance the result is::

        walk_km / WALK_SPEED_KMPH * 60
        + (station_to_gangnam_km / TRAIN_SPEED_KMPH * 60) * track_factor
        + AVG_WAIT_MIN

    where ``track_factor`` is ``TRACK_FACTOR_LINE2`` when the nearest
    station's line label contains '2' and ``TRACK_FACTOR_OTHER`` otherwise.
    Rows without a walking distance get NaN.

    Args:
        df: DataFrame with ``dist_walk_km`` and ``nearest_idx`` columns;
            ``nearest_idx`` holds row positions into ``subway``.

    Returns:
        Float Series aligned to ``df.index``.
    """
    tt = np.full(len(df), np.nan)
    mask = df['dist_walk_km'].notna().to_numpy()
    if not mask.any():
        return pd.Series(tt, index=df.index)

    # Walking leg.
    walk_min = df.loc[mask, 'dist_walk_km'].to_numpy() / WALK_SPEED_KMPH * 60

    # Rail leg. `nearest_idx` is positional, so index the underlying arrays
    # rather than going through label-based `.loc`.
    nearest = df.loc[mask, 'nearest_idx'].to_numpy().astype(int)
    rail_km = subway['dist2gn_km'].to_numpy()[nearest]
    rail_min = rail_km / TRAIN_SPEED_KMPH * 60

    # Track factor, decided once per station and then gathered per row.
    station_is_line2 = subway['line'].astype(str).str.contains('2').to_numpy()
    tf = np.where(station_is_line2[nearest],
                  TRACK_FACTOR_LINE2, TRACK_FACTOR_OTHER)

    # Total.
    tt[mask] = walk_min + rail_min * tf + AVG_WAIT_MIN
    return pd.Series(tt, index=df.index)

apt_train['tt_gangnam'] = calc_tt(apt_train)
apt_test ['tt_gangnam'] = calc_tt(apt_test)


# 7) Save
# Write id, name and travel time for each set, then report how many rows
# received a travel-time value.
outputs = [(apt_train, OUT_TRAIN), (apt_test, OUT_TEST)]
for df, out in outputs:
    df.loc[:, ['apt_id', 'apt_name', 'tt_gangnam']].to_csv(
        out, index=False, encoding='utf-8-sig'
    )
    print(f"{out}: rows={len(df)}, filled={df['tt_gangnam'].notna().sum()}")

# /Users/leejunyeong/Documents/서울시 집값 데이터셋 예측/upstageailab-ml-competition-ml-6/leejy/data/train_time_gangnam.csv: rows=1118822, filled=1035844
# /Users/leejunyeong/Documents/서울시 집값 데이터셋 예측/upstageailab-ml-competition-ml-6/leejy/data/test_time_gangnam.csv: rows=9272, filled=8684
