In [None]:
import pandas as pd
import glob
from scipy.spatial import cKDTree
import os
import requests
import time
import geopandas as gpd
import numpy as np

# Root data directory; the load/save paths in this notebook are joined onto it.
base_path = './data/'
# Folder that holds the raw input files (joined as './data/raw/').
folder_path = os.path.join(base_path, 'raw/')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Energy Data
#
# Read every 'eais_*.txt' file in the raw folder, build a PNU key per record,
# sum usage per (PNU, region, month), and keep only PNUs that have positive
# usage in exactly 12 distinct months.

file_list = glob.glob(os.path.join(folder_path, 'eais_*.txt'))

# Column names applied to the '|'-separated files (they are read without a header row)
cols = [
    'yyyymm', 'num', 'SIGUNGU_ID', 'DONG_ID', 'SIDO_NM', 'SIGUNGU_NM', 'DONG_NM',
    'LANDUSAGE_ID', 'MAIN_NM', 'SUB_NM', 'LOC_ID', 'LOC_NM', 'FLOOR_ID',
    'FLOOR_NM', 'NEW_MAIN_NM', 'NEW_SUB_NM', 'energy_usage'
]
energy_data = []

for file_path in file_list:
    file_name = os.path.basename(file_path)

    # Ignore files whose name (minus '.txt') has fewer than three '_'-separated parts
    if len(file_name.replace('.txt', '').split('_')) < 3:
        continue

    # Try the candidate encodings in order and keep the first one that parses
    temp_df = None
    for enc in ('utf-8', 'cp949', 'utf-8-sig', 'euc-kr'):
        try:
            temp_df = pd.read_csv(file_path, sep='|', names=cols, encoding=enc, dtype=str, index_col=False)
        except (UnicodeDecodeError, pd.errors.ParserError):
            continue
        break

    if temp_df is None:
        # None of the encodings worked: report the file and move on
        print(f"Error: {file_name}")
        continue

    # Land-usage code '0' is rewritten to '1' before it becomes part of the key.
    # NOTE(review): only '0' is remapped — confirm the other codes need no translation.
    temp_df['LANDUSAGE_ID'] = temp_df['LANDUSAGE_ID'].replace('0', '1')

    # PNU = zero-padded SIGUNGU(5) + DONG(5) + LANDUSAGE(1) + MAIN(4) + SUB(4)
    pnu = None
    for field, width in (
        ('SIGUNGU_ID', 5),
        ('DONG_ID', 5),
        ('LANDUSAGE_ID', 1),
        ('MAIN_NM', 4),
        ('SUB_NM', 4),
    ):
        padded = temp_df[field].str.zfill(width)
        pnu = padded if pnu is None else pnu + padded
    temp_df['PNU'] = pnu

    # Unparseable usage values become 0 (and are filtered out further down)
    usage = pd.to_numeric(temp_df['energy_usage'], errors='coerce')
    temp_df['energy_usage'] = usage.fillna(0)

    kept_columns = ['PNU', 'SIGUNGU_ID', 'SIDO_NM', 'SIGUNGU_NM', 'yyyymm', 'energy_usage']
    energy_data.append(temp_df[kept_columns])

if not energy_data:
    print("Error, No buildings to integrate")
else:
    energy_ex_df = pd.concat(energy_data, ignore_index=True)

    # One row per PNU / region / month with the summed usage
    group_keys = ['PNU', 'SIGUNGU_ID', 'SIDO_NM', 'SIGUNGU_NM', 'yyyymm']
    monthly_usage = energy_ex_df.groupby(group_keys)['energy_usage'].sum()
    energy_df = monthly_usage.reset_index()

    # Drop months without positive usage
    energy_df = energy_df[energy_df['energy_usage'] > 0]

    # Keep only PNUs that appear in exactly 12 distinct months
    pnu_counts = energy_df.groupby('PNU')['yyyymm'].nunique()
    valid_pnu = pnu_counts[pnu_counts == 12].index
    energy_df = energy_df[energy_df['PNU'].isin(valid_pnu)]

    energy_df = energy_df.sort_values(['PNU', 'yyyymm']).reset_index(drop=True)

    print(f"Integrated, (Number of buildings : {energy_df['PNU'].nunique():,})")

Integrated, (Number of buildings : 1,320,041)


In [None]:
# Write the filtered monthly energy table to the processed folder.
save_path = os.path.join(base_path, 'processed/energy_df.csv')
# to_csv does not create missing directories, so make sure the folder exists
# (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
energy_df.to_csv(save_path, index=False, encoding='utf-8-sig')

In [None]:
# Reload the processed energy table.
# The identifier columns are forced to str so the codes stay strings, as they
# were before saving. The dtype key must be the real column name 'SIGUNGU_ID';
# a key that matches no column leaves that column to type inference.
load_path = os.path.join(base_path, 'processed/energy_df.csv')
energy_df = pd.read_csv(load_path, dtype={'SIGUNGU_ID': str, 'PNU': str}, encoding='utf-8-sig')

In [None]:
# Build one (lat, lon) per municipality that appears in energy_df by matching
# region names against a 2013 municipal boundary file.
# Note: the boundary GeoJSON is read from a remote URL each time this cell runs.
url = "https://raw.githubusercontent.com/southkorea/southkorea-maps/master/kostat/2013/json/skorea_municipalities_geo_simple.json"
gdf = gpd.read_file(url)

# First two characters of the boundary file's 'code' column -> province name
sido_map = {
    '11': '서울특별시', '21': '부산광역시', '22': '대구광역시', '23': '인천광역시',
    '24': '광주광역시', '25': '대전광역시', '26': '울산광역시', '29': '세종특별자치시',
    '31': '경기도', '32': '강원도', '33': '충청북도', '34': '충청남도',
    '35': '전라북도', '36': '전라남도', '37': '경상북도', '38': '경상남도', '39': '제주특별자치도'
}

# Centroids are computed after reprojecting to EPSG:5179 and then converted to
# EPSG:4326; lat/lon below are read from those converted points (y -> lat, x -> lon).
gdf_projected = gdf.to_crs(epsg=5179)
gdf['centroid'] = gdf_projected.geometry.centroid.to_crs(epsg=4326)

# Lookup table: province name, municipality name without spaces, centroid coordinates
municipal_coords = pd.DataFrame({
    'sido_map': gdf['code'].str[:2].map(sido_map),
    'sigungu_clean': gdf['name'].str.replace(' ', '').str.strip(),
    'lat': gdf['centroid'].y,
    'lon': gdf['centroid'].x
})

# Keep the first row for each (province, municipality) pair so the merges below
# cannot multiply rows
municipal_coords = municipal_coords.drop_duplicates(['sido_map', 'sigungu_clean'])

# Distinct (province, municipality) pairs present in the energy data
region_ref_df = energy_df[['SIDO_NM', 'SIGUNGU_NM']].drop_duplicates().reset_index(drop=True)

# Translate province names in the energy data to the spellings used in sido_map,
# and strip spaces from municipality names to match 'sigungu_clean' above
sido_std = {'강원특별자치도': '강원도', '전북특별자치도': '전라북도', '제주특별자치도': '제주특별자치도'}
region_ref_df['sido_std'] = region_ref_df['SIDO_NM'].astype(str).str.strip().replace(sido_std)
region_ref_df['sigungu_clean'] = region_ref_df['SIGUNGU_NM'].astype(str).str.replace(' ', '').str.strip()

# Rows whose SIGUNGU_NM contains '군위군' are forced to province '경상북도'.
# NOTE(review): presumably the 2013 boundary file lists it under that province — confirm.
region_ref_df.loc[region_ref_df['SIGUNGU_NM'].str.contains('군위군', na=False), 'sido_std'] = '경상북도'

# Hand-written renames from names in the energy data to names to look up in the
# boundary file.
# NOTE(review): presumably these reflect administrative changes since 2013 — confirm each pair.
manual_fix = {
    '미추홀구': '남구', '세종특별자치시': '세종시',
    '여주시': '여주군', '청주시서원구': '청주시흥덕구', '청주시청원구': '청원군'
}
region_ref_df['sigungu_clean'] = region_ref_df['sigungu_clean'].replace(manual_fix)

# First pass: left join on (province, space-stripped municipality name)
region_ref_df = pd.merge(
    region_ref_df,
    municipal_coords,
    left_on=['sido_std', 'sigungu_clean'],
    right_on=['sido_map', 'sigungu_clean'],
    how='left'
)

# Second pass, only if some regions are still without coordinates: retry with
# just the last whitespace-separated token of SIGUNGU_NM
mask = region_ref_df['lat'].isna()
if mask.any():
    region_ref_df.loc[mask, 'sigungu_short'] = region_ref_df.loc[mask, 'SIGUNGU_NM'].apply(lambda x: str(x).split()[-1].strip())

    short_coords = municipal_coords.copy()
    short_coords['sigungu_short'] = short_coords['sigungu_clean']

    # Columns that already exist on the left keep their name; the right-hand
    # copies get the '_new' suffix
    region_ref_df = pd.merge(
        region_ref_df,
        short_coords[['sido_map', 'sigungu_short', 'lat', 'lon']],
        left_on=['sido_std', 'sigungu_short'],
        right_on=['sido_map', 'sigungu_short'],
        how='left', suffixes=('', '_new')
    )

    # Fill only the coordinates that the first pass left empty
    region_ref_df['lat'] = region_ref_df['lat'].fillna(region_ref_df['lat_new'])
    region_ref_df['lon'] = region_ref_df['lon'].fillna(region_ref_df['lon_new'])

# Final table: original region names plus coordinates
final_cols = ['SIDO_NM', 'SIGUNGU_NM', 'lat', 'lon']
region_coords_df = region_ref_df[final_cols].copy()

# Regions that matched in neither pass
missing_regions = region_coords_df[region_coords_df['lat'].isna()]

print(f"Regional coordinates computed (count: {len(region_coords_df)})")
if not missing_regions.empty:
    print(f"Missing regions (count: {len(missing_regions)}):")
    print(missing_regions[['SIDO_NM', 'SIGUNGU_NM']].values)
else:
    print("Every region has coordinates.")

# Last expression: render the table as the cell output
region_coords_df

Regional coordinates computed (count: 252)
Every region has coordinates.


Unnamed: 0,SIDO_NM,SIGUNGU_NM,lat,lon
0,서울특별시,종로구,37.591304,126.978897
1,서울특별시,중구,37.556835,126.995549
2,서울특별시,용산구,37.530702,126.981999
3,서울특별시,성동구,37.548262,127.042314
4,서울특별시,광진구,37.545274,127.089023
...,...,...,...,...
247,전북특별자치도,장수군,35.653840,127.545669
248,전북특별자치도,임실군,35.594924,127.239048
249,전북특별자치도,순창군,35.431342,127.091780
250,전북특별자치도,고창군,35.446464,126.618031


In [None]:
# Load the raw ASOS observation file and derive month / hour helper columns.
load_path = os.path.join(base_path, 'raw/OBS_ASOS_TIM.csv')

# Source (Korean) headers -> short names used in the rest of the notebook
weather_renames = {
    '지점': 'STN_ID',
    '지점명': 'STN_NAME',
    '일시': 'time',
    '기온(°C)': 'temp',
    '기온 QC플래그': 'QC',
}
weather_df = (
    pd.read_csv(load_path, encoding='cp949', index_col=False)
    .rename(columns=weather_renames)
)

# Parse the timestamp once and reuse it for the derived columns
timestamps = pd.to_datetime(weather_df['time'])
weather_df['time'] = timestamps
weather_df['yyyymm'] = timestamps.dt.strftime('%Y%m')
weather_df['hour'] = timestamps.dt.hour
weather_df

Unnamed: 0,STN_ID,STN_NAME,time,temp,QC,yyyymm,hour
0,90,속초,2024-05-01 01:00:00,10.3,,202405,1
1,90,속초,2024-05-01 02:00:00,10.1,,202405,2
2,90,속초,2024-05-01 03:00:00,9.7,,202405,3
3,90,속초,2024-05-01 04:00:00,9.4,,202405,4
4,90,속초,2024-05-01 05:00:00,9.0,,202405,5
...,...,...,...,...,...,...,...
849618,296,북부산,2025-04-30 19:00:00,17.1,,202504,19
849619,296,북부산,2025-04-30 20:00:00,15.7,,202504,20
849620,296,북부산,2025-04-30 21:00:00,14.1,,202504,21
849621,296,북부산,2025-04-30 22:00:00,12.1,,202504,22


In [None]:
# Temperatures whose QC flag equals 9 (error) are blanked and then filled by
# linear interpolation.
#
# The interpolation runs separately for each station, in time order. The frame
# stacks all stations one after another, so interpolating the whole column at
# once would fill a gap at a station's first or last rows from the neighbouring
# station's readings.

weather_df.loc[weather_df['QC'] == 9, 'temp'] = np.nan

print(f"NULL before interpolation: {weather_df['temp'].isnull().sum()}")

# sort_values only orders the rows for the calculation; the result is assigned
# back by index, so the row order of weather_df itself is unchanged.
# limit_direction='both' also fills gaps at the very start or end of a station's
# series (with that station's nearest valid reading).
weather_df['temp'] = (
    weather_df
    .sort_values(['STN_ID', 'time'])
    .groupby('STN_ID')['temp']
    .transform(lambda s: s.interpolate(method='linear', limit_direction='both'))
)

# A non-zero count here means some station has no valid temperature at all.
print(f"NULL after interpolation: {weather_df['temp'].isnull().sum()}")

NULL before interpolation: 1306
NULL after interpolation: 0


In [None]:
# Write the cleaned weather table to the processed folder.
save_path = os.path.join(base_path, 'processed/weather_df.csv')
# to_csv does not create missing directories, so make sure the folder exists
# (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
weather_df.to_csv(save_path, index=False, encoding='utf-8-sig')

In [None]:
# Reload the processed weather table from CSV.
# NOTE(review): no dtype/parse_dates is passed, so 'time' presumably comes back as
# plain strings and 'yyyymm' as integers, unlike the datetime / string columns
# built before saving — confirm downstream code expects that.
load_path = os.path.join(base_path, 'processed/weather_df.csv')
weather_df = pd.read_csv(load_path, encoding='utf-8-sig')

In [None]:
# Load the station metadata file and keep only station id, name and coordinates.
load_path = os.path.join(base_path, 'raw/META_관측지점정보.csv')

# Source (Korean) headers -> names used in the rest of the notebook;
# the key order is also the column order of the result
station_renames = {
    '지점': 'STN_ID',
    '지점명': 'STN_NAME',
    '위도': 'lat',
    '경도': 'lon',
}
temp_loc_df = (
    pd.read_csv(load_path, encoding='cp949', index_col=False)
    .loc[:, list(station_renames)]
    .rename(columns=station_renames)
)
temp_loc_df

Unnamed: 0,STN_ID,STN_NAME,lat,lon
0,90,속초,38.2509,128.5647
1,93,북춘천,37.9474,127.7544
2,95,철원,38.1479,127.3042
3,98,동두천,37.9019,127.0607
4,99,파주,37.8859,126.7665
...,...,...,...,...
141,288,밀양,35.4915,128.7441
142,289,산청,35.4130,127.8791
143,294,거제,34.8882,128.6046
144,295,남해,34.8166,127.9264


In [None]:
def _latlon_to_unit_xyz(frame):
    """Convert the 'lat'/'lon' columns (decimal degrees) of `frame` to unit-sphere x, y, z."""
    lat = np.radians(frame['lat'].to_numpy(dtype=float))
    lon = np.radians(frame['lon'].to_numpy(dtype=float))
    cos_lat = np.cos(lat)
    return np.column_stack((cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat)))


def match_nearest_weather_station(target_df, station_df):
    """Attach the geographically nearest weather station to every row of `target_df`.

    The nearest-neighbour search runs on 3-D unit vectors rather than on raw
    latitude/longitude degrees. Straight-line (chord) distance between points on
    the unit sphere grows with great-circle distance, so the KD-tree result is the
    station that is nearest on the globe. Comparing raw degrees treats one degree
    of longitude like one degree of latitude, which overstates east-west
    separation away from the equator and can select the wrong station.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Needs 'lat' and 'lon' columns in decimal degrees. It is modified in
        place: 'STN_ID', 'STN_NAME' and 'dist_degree' are added or overwritten.
    station_df : pandas.DataFrame
        Needs 'lat', 'lon', 'STN_ID' and 'STN_NAME' columns.

    Returns
    -------
    pandas.DataFrame
        The same object as `target_df`, with the three columns filled in.
        'dist_degree' is the great-circle angle to the matched station, in degrees.

    Raises
    ------
    ValueError
        If `station_df` has no rows.
    """
    if len(station_df) == 0:
        raise ValueError("station_df is empty; there is no weather station to match against")

    tree = cKDTree(_latlon_to_unit_xyz(station_df))
    chord, idx = tree.query(_latlon_to_unit_xyz(target_df), k=1)

    # idx holds row positions within station_df, hence iloc
    nearest = station_df.iloc[idx]
    target_df['STN_ID'] = nearest['STN_ID'].values
    target_df['STN_NAME'] = nearest['STN_NAME'].values

    # chord = 2 * sin(angle / 2); clip guards arcsin against rounding just above 1
    target_df['dist_degree'] = np.degrees(2.0 * np.arcsin(np.clip(chord / 2.0, 0.0, 1.0)))

    return target_df

# Attach the nearest weather station to every region.
# The function adds its columns to the frame it receives and returns that same
# frame, so region_coords_df and region_station_df refer to one object here.
region_station_df = match_nearest_weather_station(region_coords_df, temp_loc_df)

print(f"Weather station and region matched. (Count: {len(region_station_df)})")
# Last expression: render the matched table as the cell output
region_station_df

Weather station and region matched. (Count: 252)


Unnamed: 0,SIDO_NM,SIGUNGU_NM,lat,lon,STN_ID,STN_NAME,dist_degree
0,서울특별시,종로구,37.591304,126.978897,108,서울,0.023827
1,서울특별시,중구,37.556835,126.995549,108,서울,0.033123
2,서울특별시,용산구,37.530702,126.981999,108,서울,0.043803
3,서울특별시,성동구,37.548262,127.042314,108,서울,0.079936
4,서울특별시,광진구,37.545274,127.089023,108,서울,0.125962
...,...,...,...,...,...,...,...
247,전북특별자치도,장수군,35.653840,127.545669,248,장수,0.025565
248,전북특별자치도,임실군,35.594924,127.239048,244,임실,0.049585
249,전북특별자치도,순창군,35.431342,127.091780,254,순창군,0.070432
250,전북특별자치도,고창군,35.446464,126.618031,251,고창군,0.081429


In [None]:
# Write the region -> nearest-station table to the processed folder.
save_path = os.path.join(base_path, 'processed/region_station_df.csv')
# to_csv does not create missing directories, so make sure the folder exists
# (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
region_station_df.to_csv(save_path, index=False, encoding='utf-8-sig')

In [None]:
# Reload the region -> nearest-station table from the processed folder.
load_path = os.path.join(base_path, 'processed/region_station_df.csv')
region_station_df = pd.read_csv(load_path, encoding='utf-8-sig')

In [None]:
# Quick sanity overview: shape and column names of the three working tables.
dfs = dict(
    energy_df=energy_df,
    region_station_df=region_station_df,
    weather_df=weather_df,
)

separator = "-" * 50
for name in dfs:
    df = dfs[name]
    print(f"[{name}] (Shape: {df.shape})")
    print(f" - Columns: {df.columns.tolist()}")
    print(separator)

[energy_df] (Shape: (15840492, 6))
 - Columns: ['PNU', 'SIGUNGU_ID', 'SIDO_NM', 'SIGUNGU_NM', 'yyyymm', 'energy_usage']
--------------------------------------------------
[region_station_df] (Shape: (252, 7))
 - Columns: ['SIDO_NM', 'SIGUNGU_NM', 'lat', 'lon', 'STN_ID', 'STN_NAME', 'dist_degree']
--------------------------------------------------
[weather_df] (Shape: (849623, 7))
 - Columns: ['STN_ID', 'STN_NAME', 'time', 'temp', 'QC', 'yyyymm', 'hour']
--------------------------------------------------
