In [12]:
!pip -q install geopandas pandas pyproj shapely fiona

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!pip -q install geopandas pandas pyproj shapely fiona

from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive
Mounted at /content/drive


In [13]:
import os

# Project root on the mounted Google Drive; every other path is derived from it.
ROOT = '/content/drive/MyDrive/colab/firecast'

# Input: processed fire records (parquet); input: directory of raw weather CSVs;
# output: directory that receives the merged results.
FIRE_PROC_PATH  = f'{ROOT}/data/processed/fires_with_manual_station.parquet'
WEATHER_RAW_DIR = f'{ROOT}/data/raw/weather'
OUT_DIR         = f'{ROOT}/data/processed'

# Create the output directory if needed; exist_ok=True makes re-running safe.
os.makedirs(OUT_DIR, exist_ok=True)

# Echo the resolved paths so a wrong mount or root is visible immediately.
print('ROOT:', ROOT)
print('FIRE_PROC_PATH:', FIRE_PROC_PATH)
print('WEATHER_RAW_DIR:', WEATHER_RAW_DIR)


ROOT: /content/drive/MyDrive/colab/firecast
FIRE_PROC_PATH: /content/drive/MyDrive/colab/firecast/data/processed/fires_with_manual_station.parquet
WEATHER_RAW_DIR: /content/drive/MyDrive/colab/firecast/data/raw/weather


In [14]:
import pandas as pd
from typing import List, Optional

def pick_column(columns: List[str], candidates: List[str]) -> Optional[str]:
    """Return the column matching the highest-priority candidate, ignoring case.

    Args:
        columns: Actual column labels (a list or a pandas Index). Labels that
            are not strings are compared through their ``str()`` form.
        candidates: Names to look for, in priority order.

    Returns:
        The matching label exactly as it appears in ``columns``, or ``None``
        when no candidate matches.
    """
    # Key on the lower-cased text of each label. str() keeps non-string labels
    # (e.g. integer column names) from raising AttributeError on .lower().
    # When two labels differ only by case, the later one wins.
    lower_map = {str(c).lower(): c for c in columns}
    for cand in candidates:
        match_key = str(cand).lower()
        if match_key in lower_map:
            return lower_map[match_key]
    return None


In [15]:
import geopandas as gpd
import pandas as pd

# Load the processed fire records and show their shape and columns.
fires = gpd.read_parquet(FIRE_PROC_PATH)
print('fires shape:', fires.shape)
print('fires columns:', list(fires.columns))

# Build the `fire_date` column.
# Guard: stop when neither supported date column is present.
if 'OCCRR_DATE' not in fires.columns and 'OCCRR_DTM' not in fires.columns:
    raise ValueError('산불 데이터에서 날짜 컬럼(OCCRR_DATE / OCCRR_DTM)을 찾을 수 없습니다.')

if 'OCCRR_DATE' in fires.columns:
    # First choice: a ready-made OCCRR_DATE column.
    fires['fire_date'] = pd.to_datetime(fires['OCCRR_DATE'])
else:
    # Second choice: take the leading 8 characters (YYYYMMDD) of OCCRR_DTM.
    # Values that do not parse become NaT because of errors='coerce'.
    fires['OCCRR_DATE_STR'] = fires['OCCRR_DTM'].astype('string').str[:8]
    fires['fire_date'] = pd.to_datetime(
        fires['OCCRR_DATE_STR'],
        format='%Y%m%d',
        errors='coerce',
    )

# Drop the time-of-day part so every value sits at 00:00:00.
fires['fire_date'] = fires['fire_date'].dt.normalize()
fires[['fire_date']].head()


fires shape: (3663, 21)
fires columns: ['OCCRR_DTM', 'OCUR_DYWK', 'EXTING_DTM', 'RQRMN_TM', 'ARA_NM', 'CTPRV_NM', 'SGNG_NM', 'EMNDN_NM', 'OCCCRR_RI', 'ARA_LTNMB', 'CUSE_NM', 'DMG_AREA', 'DMG_MONEY', 'TM_X', 'TM_Y', 'geometry', 'index_right', 'station_id', 'name_kr', 'name_en', 'dist_m']


Unnamed: 0,fire_date
0,2020-01-01
1,2020-01-03
2,2020-01-03
3,2020-01-04
4,2020-01-04


In [16]:
import glob

# Collect every CSV in the raw weather directory; sorting keeps the order stable.
csv_paths = sorted(glob.glob(os.path.join(WEATHER_RAW_DIR, '*.csv')))
print('weather files:', [os.path.basename(p) for p in csv_paths])

# Fail early with a clear message: pd.concat on an empty list would otherwise
# raise an unhelpful "No objects to concatenate".
if not csv_paths:
    raise FileNotFoundError(f'No weather CSV files found in {WEATHER_RAW_DIR}')

dfs = []
for p in csv_paths:
    try:
        # Try pandas' default decoding first.
        df0 = pd.read_csv(p, low_memory=False)
    except UnicodeDecodeError:
        # Retry with cp949 when the default decoding fails.
        df0 = pd.read_csv(p, encoding='cp949', low_memory=False)
    # Remember which file each row came from.
    df0['__source_file'] = os.path.basename(p)
    dfs.append(df0)

# Stack all files into one frame with a fresh 0..n-1 index.
weather_raw = pd.concat(dfs, ignore_index=True)
print('weather_raw shape:', weather_raw.shape)
print('sample columns:', list(weather_raw.columns)[:20])


weather files: ['SURFACE_ASOS_104_DAY_2020_2020_2021.csv', 'SURFACE_ASOS_104_DAY_2021_2021_2022.csv', 'SURFACE_ASOS_105_DAY_2020_2020_2021.csv', 'SURFACE_ASOS_105_DAY_2021_2021_2022.csv']
weather_raw shape: (1462, 60)
sample columns: ['지점', '일시', '평균기온(°C)', '최저기온(°C)', '최저기온 시각(hhmi)', '최고기온(°C)', '최고기온 시각(hhmi)', '강수 계속시간(hr)', '10분 최다 강수량(mm)', '10분 최다강수량 시각(hhmi)', '1시간 최다강수량(mm)', '1시간 최다 강수량 시각(hhmi)', '일강수량(mm)', '최대 순간 풍속(m/s)', '최대 순간 풍속 풍향(16방위)', '최대 순간풍속 시각(hhmi)', '최대 풍속(m/s)', '최대 풍속 풍향(16방위)', '최대 풍속 시각(hhmi)', '평균 풍속(m/s)']


In [17]:
import pandas as pd
from typing import List, Optional

def pick_column(columns: List[str], candidates: List[str]) -> Optional[str]:
    """Return the column matching the highest-priority candidate, ignoring case.

    NOTE(review): a helper with the same name is also defined in an earlier
    cell; this later definition is the one in effect from here on. Consider
    keeping a single copy.

    Args:
        columns: Actual column labels (a list or a pandas Index). Labels that
            are not strings are compared through their ``str()`` form.
        candidates: Names to look for, in priority order.

    Returns:
        The matching label exactly as it appears in ``columns``, or ``None``
        when no candidate matches.
    """
    # Key on the lower-cased text of each label. str() keeps non-string labels
    # (e.g. integer column names) from raising AttributeError on .lower().
    # When two labels differ only by case, the later one wins.
    lower_map = {str(c).lower(): c for c in columns}
    for cand in candidates:
        match_key = str(cand).lower()
        if match_key in lower_map:
            return lower_map[match_key]
    return None

# Guess which raw columns hold the station id, the date, and each weather variable.
stn_col  = pick_column(weather_raw.columns, ['STN', 'stn', 'stnid', '지점', '지점번호'])
date_col = pick_column(weather_raw.columns, ['TM', '날짜', 'date', 'YYYYMMDD', '일시'])
# Weather variables: extend these candidate lists as needed.
tavg_col = pick_column(weather_raw.columns, ['TA', 'TAVG', '평균기온', 'avgTa', '평균기온(°C)'])
tmin_col = pick_column(weather_raw.columns, ['TMN', '최저기온', '최저기온(°C)'])
tmax_col = pick_column(weather_raw.columns, ['TMX', '최고기온', '최고기온(°C)'])
prcp_col = pick_column(weather_raw.columns, ['RN', 'PRCP', '강수량', '일강수량(mm)'])

print('stn_col :', stn_col)
print('date_col:', date_col)
print('tavg_col:', tavg_col, ', tmin_col:', tmin_col, ', tmax_col:', tmax_col, ', prcp_col:', prcp_col)

# The station id and date are mandatory; the weather variables are optional.
if None in (stn_col, date_col):
    raise ValueError('지점번호 또는 날짜 컬럼을 찾을 수 없습니다. 위에서 pick_column 후보를 조정해 주세요.')

# Keep only the columns that were actually found, plus the source-file tag.
keep_cols = [
    c
    for c in (stn_col, date_col, tavg_col, tmin_col, tmax_col, prcp_col, '__source_file')
    if c is not None
]
print('keep_cols before selection:', keep_cols)
weather = weather_raw.loc[:, keep_cols].copy()
print('weather.columns after selection:', weather.columns.tolist())

# Standardize column names; optional variables are renamed only when present.
rename_map = {stn_col: 'station_id', date_col: 'obs_datetime'}
rename_map.update({
    src: dst
    for src, dst in (
        (tavg_col, 'TA'),
        (tmin_col, 'TMN'),
        (tmax_col, 'TMX'),
        (prcp_col, 'RN'),
    )
    if src
})

weather = weather.rename(columns=rename_map)
print('weather.columns after rename:', weather.columns.tolist())
weather.head()

stn_col : 지점
date_col: 일시
tavg_col: 평균기온(°C) , tmin_col: 최저기온(°C) , tmax_col: 최고기온(°C) , prcp_col: 일강수량(mm)
keep_cols before selection: ['지점', '일시', '평균기온(°C)', '최저기온(°C)', '최고기온(°C)', '일강수량(mm)', '__source_file']
weather.columns after selection: ['지점', '일시', '평균기온(°C)', '최저기온(°C)', '최고기온(°C)', '일강수량(mm)', '__source_file']
weather.columns after rename: ['station_id', 'obs_datetime', 'TA', 'TMN', 'TMX', 'RN', '__source_file']


Unnamed: 0,station_id,obs_datetime,TA,TMN,TMX,RN,__source_file
0,104,2020-01-01,0.1,-5.6,6.9,,SURFACE_ASOS_104_DAY_2020_2020_2021.csv
1,104,2020-01-02,2.7,-2.4,7.6,,SURFACE_ASOS_104_DAY_2020_2020_2021.csv
2,104,2020-01-03,3.9,-0.6,9.6,,SURFACE_ASOS_104_DAY_2020_2020_2021.csv
3,104,2020-01-04,4.0,-0.8,8.8,,SURFACE_ASOS_104_DAY_2020_2020_2021.csv
4,104,2020-01-05,4.0,-1.0,8.6,,SURFACE_ASOS_104_DAY_2020_2020_2021.csv


In [18]:
# Look at one raw obs_datetime value to see its format before parsing.
sample = weather['obs_datetime'].astype('string').iloc[0]
print('sample obs_datetime:', sample)

# KMA daily data usually stores the date as a YYYYMMDD integer/string.
# Values whose first 8 characters are digits are treated as YYYYMMDD.
def parse_obs_date(s: str):
    """Parse one raw observation-date value into a pandas timestamp.

    The value is converted to text first. If it is at least 8 characters long
    and its first 8 characters are all digits, those 8 characters are parsed
    as ``YYYYMMDD``. Anything else goes through pandas' general parser.
    Values that cannot be parsed yield ``NaT``.
    """
    text = str(s)
    leading = text[:8]

    if len(text) < 8 or not leading.isdigit():
        # Not a compact YYYYMMDD value: fall back to the general parser.
        # Extend this branch if other formats need special handling.
        try:
            return pd.to_datetime(text, errors='coerce')
        except Exception:
            return pd.NaT

    return pd.to_datetime(leading, format='%Y%m%d', errors='coerce')

# Parse every raw value, then snap to midnight so only the calendar date remains.
weather['obs_date'] = (
    weather['obs_datetime']
    .astype('string')
    .map(parse_obs_date)
    .dt.normalize()
)
weather[['station_id', 'obs_datetime', 'obs_date']].head()

sample obs_datetime: 2020-01-01


Unnamed: 0,station_id,obs_datetime,obs_date
0,104,2020-01-01,2020-01-01
1,104,2020-01-02,2020-01-02
2,104,2020-01-03,2020-01-03
3,104,2020-01-04,2020-01-04
4,104,2020-01-05,2020-01-05


In [19]:
# Use only Gangneung (105) and Bukgangneung (104); extend the list to add stations.
target_stations = [104, 105]

# Coerce station ids to numbers; values that cannot be parsed become NaN and
# are removed by the isin() filter below.
weather['station_id'] = pd.to_numeric(weather['station_id'], errors='coerce')

# Keep rows for the target stations that also have a parsed date.
w_sub = weather[
    weather['station_id'].isin(target_stations) &
    weather['obs_date'].notna()
].copy()

print('filtered weather shape:', w_sub.shape)
print('w_sub columns:', w_sub.columns.tolist())

# Collapse to one row per (station_id, obs_date) by averaging whichever weather
# variables are present. For daily data there should be one row per key, in
# which case the mean equals the value itself.
agg_dict = {
    col: 'mean'
    for col in ['TA', 'TMN', 'TMX', 'RN']
    if col in w_sub.columns
}

weather_daily = (
    w_sub
    .groupby(['station_id', 'obs_date'])
    .agg(agg_dict)
    .reset_index()
)

print('weather_daily shape:', weather_daily.shape)
weather_daily.head()

filtered weather shape: (1462, 8)
w_sub columns: ['station_id', 'obs_datetime', 'TA', 'TMN', 'TMX', 'RN', '__source_file', 'obs_date']
weather_daily shape: (1462, 6)


Unnamed: 0,station_id,obs_date,TA,TMN,TMX,RN
0,104,2020-01-01,0.1,-5.6,6.9,
1,104,2020-01-02,2.7,-2.4,7.6,
2,104,2020-01-03,3.9,-0.6,9.6,
3,104,2020-01-04,4.0,-0.8,8.8,
4,104,2020-01-05,4.0,-1.0,8.6,


In [20]:
# Coerce the fire-side station ids to numbers so they compare equal to the
# weather-side keys.
fires['station_id'] = pd.to_numeric(fires['station_id'], errors='coerce')

# Both sides must expose the date key under the same name.
# rename() already returns a new frame, so no separate copy is needed.
fires_for_join = fires.rename(columns={'fire_date': 'date'})
weather_for_join = weather_daily.rename(columns={'obs_date': 'date'})

# Left join: every fire row is kept; fires without a matching weather row get
# NaN weather values. validate='many_to_one' makes pandas raise if the weather
# side ever has duplicate (station_id, date) keys, which would otherwise
# silently multiply fire rows.
merged = fires_for_join.merge(
    weather_for_join,
    how='left',
    on=['station_id', 'date'],
    suffixes=('', '_w'),
    validate='many_to_one',
)

print('merged shape:', merged.shape)
merged[['station_id', 'date'] + [c for c in ['TA','TMN','TMX','RN'] if c in merged.columns]].head()

merged shape: (3663, 27)


Unnamed: 0,station_id,date,TA,TMN,TMX,RN
0,105,2020-01-01,1.7,-3.0,6.4,
1,105,2020-01-03,5.6,2.9,11.4,
2,104,2020-01-03,3.9,-0.6,9.6,
3,104,2020-01-04,4.0,-0.8,8.8,
4,104,2020-01-04,4.0,-0.8,8.8,


In [21]:
# Write the merged fire + weather table to the output directory as parquet.
out_path = os.path.join(OUT_DIR, 'fire_weather_merged.parquet')
merged.to_parquet(out_path, index=False)
print('saved ->', out_path)

# Also write a CSV copy, in case one is needed.
merged.to_csv(os.path.join(OUT_DIR, 'fire_weather_merged.csv'), index=False)


saved -> /content/drive/MyDrive/colab/firecast/data/processed/fire_weather_merged.parquet


In [22]:
def build_past_n_days_features(df: pd.DataFrame, n_days: int = 3) -> pd.DataFrame:
    """Attach the previous ``n_days`` days of every feature column to each row.

    Args:
        df: Frame with ``station_id`` and ``date`` key columns plus any number
            of feature columns (e.g. TA, TMN, TMX, RN). ``date`` must support
            adding a ``pd.Timedelta``. ``df`` itself is not modified.
        n_days: Number of past days to attach. Values below 1 attach nothing.

    Returns:
        A frame with one row per input row, holding ``station_id``, ``date``
        and, for each lag ``k`` in ``1..n_days``, one ``<feature>_minus<k>d``
        column per feature. A lag is NaN when that station has no row exactly
        ``k`` calendar days earlier.

    Raises:
        pandas.errors.MergeError: if ``df`` contains duplicate
            ``(station_id, date)`` keys. Without this check each lag merge
            would silently multiply rows.
    """
    base_cols = ['station_id', 'date']
    feature_cols = [c for c in df.columns if c not in base_cols]

    out = df[base_cols].copy()
    # Select the needed columns once instead of copying the whole frame per lag.
    source = df[base_cols + feature_cols]

    for k in range(1, n_days + 1):
        lagged = source.rename(columns={c: f'{c}_minus{k}d' for c in feature_cols})
        # Move each observation k days forward: joining on the shifted date
        # attaches the value from k days in the past to the current date.
        lagged['date'] = lagged['date'] + pd.Timedelta(days=k)
        out = out.merge(lagged, how='left', on=base_cols, validate='many_to_one')

    return out

# Build the lag-window features from the weather table alone.
# Rename weather_daily's 'obs_date' column to 'date' before passing it to the function.
weather_daily_for_features = weather_daily.rename(columns={'obs_date': 'date'})
weather_features_3d = build_past_n_days_features(weather_daily_for_features, n_days=3)
print('weather_features_3d shape:', weather_features_3d.shape)
weather_features_3d.head()

weather_features_3d shape: (1462, 14)


Unnamed: 0,station_id,date,TA_minus1d,TMN_minus1d,TMX_minus1d,RN_minus1d,TA_minus2d,TMN_minus2d,TMX_minus2d,RN_minus2d,TA_minus3d,TMN_minus3d,TMX_minus3d,RN_minus3d
0,104,2020-01-01,,,,,,,,,,,,
1,104,2020-01-02,0.1,-5.6,6.9,,,,,,,,,
2,104,2020-01-03,2.7,-2.4,7.6,,0.1,-5.6,6.9,,,,,
3,104,2020-01-04,3.9,-0.6,9.6,,2.7,-2.4,7.6,,0.1,-5.6,6.9,
4,104,2020-01-05,4.0,-0.8,8.8,,3.9,-0.6,9.6,,2.7,-2.4,7.6,
