## 기상 데이터 처리

3월 7일부터의 데이터 밖에 존재하지 않아, 직접 데이터를 처리하여 삽입.

기상데이터 단기예보 데이터 다운로드: https://data.kma.go.kr/data/rmt/rmtList.do?code=410&pgmNo=571

In [156]:
import os
import pandas as pd
import sqlalchemy as sa
import glob
from datetime import datetime
from dotenv import dotenv_values

env = dotenv_values('.env')

In [157]:
dir_path = '../data/weather_data/before/*.csv'
csv_files = glob.glob(dir_path)

combined_df = pd.read_csv(csv_files[0], skipinitialspace=True)
for f in csv_files[1:]:
    df = pd.read_csv(f, skipinitialspace=True)
    df.columns = [f"{col}_{f.split('/')[-1].split('.')[0]}" if col not in ['format: day', 'hour', 'forecast'] else col for col in df.columns]
    combined_df = pd.merge(combined_df, df, on=['format: day', 'hour', 'forecast'], suffixes=('_left', '_right'))


columns = [
    'day',
    'hour',
    'forecast',
    'rainfall',
    'rainfall_type',
    'temp',
    'south_north_wind_component',
    'lightning_strike',
    'east_west_wind_component',
    'hum',
    'wind_speed',
    'wind_direction',
    'sky_status',
]

combined_df.columns = columns
combined_df

Unnamed: 0,day,hour,forecast,rainfall,rainfall_type,temp,south_north_wind_component,lightning_strike,east_west_wind_component,hum,wind_speed,wind_direction,sky_status
0,1,30.0,1.0,0.0,0.0,-0.9,-1.3,0.0,-2.1,80.0,3.0,57.0,3.0
1,1,30.0,2.0,0.0,0.0,0.0,-1.3,0.0,-1.9,75.0,2.0,55.0,3.0
2,1,30.0,3.0,0.0,0.0,2.0,-1.3,0.0,-1.7,70.0,2.0,51.0,3.0
3,1,30.0,4.0,0.0,0.0,4.0,-1.3,0.0,-1.1,65.0,2.0,41.0,3.0
4,1,30.0,5.0,0.0,0.0,7.0,-1.3,0.0,-0.7,60.0,2.0,32.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4054,28,2330.0,2.0,0.0,0.0,7.0,-0.3,0.0,-2.0,70.0,2.0,81.0,3.0
4055,28,2330.0,3.0,0.0,0.0,9.0,0.0,0.0,-0.9,65.0,1.0,88.0,4.0
4056,28,2330.0,4.0,0.0,0.0,11.0,1.0,0.0,-0.2,60.0,1.0,167.0,4.0
4057,28,2330.0,5.0,0.0,0.0,12.0,1.2,0.0,0.6,55.0,1.0,207.0,4.0


In [158]:
print(combined_df.dtypes)

day                            object
hour                          float64
forecast                      float64
rainfall                      float64
rainfall_type                 float64
temp                          float64
south_north_wind_component    float64
lightning_strike              float64
east_west_wind_component      float64
hum                           float64
wind_speed                    float64
wind_direction                float64
sky_status                    float64
dtype: object


In [159]:
df = combined_df[combined_df['forecast'] == 6].copy()
df

Unnamed: 0,day,hour,forecast,rainfall,rainfall_type,temp,south_north_wind_component,lightning_strike,east_west_wind_component,hum,wind_speed,wind_direction,sky_status
5,1,30.0,6.0,0.0,0.0,7.0,-1.4,0.0,-0.6,55.0,2.0,25.0,3.0
11,1,130.0,6.0,0.0,0.0,7.0,-1.6,0.0,-0.3,55.0,2.0,13.0,3.0
17,1,230.0,6.0,0.0,0.0,7.0,-1.1,0.0,0.1,50.0,1.0,355.0,1.0
23,1,330.0,6.0,0.0,0.0,5.0,-1.5,0.0,-0.1,55.0,2.0,7.0,3.0
29,1,430.0,6.0,0.0,0.0,4.0,-1.3,0.0,-0.4,60.0,1.0,20.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,28,1930.0,6.0,0.0,0.0,7.0,0.0,0.0,-1.2,80.0,1.0,90.0,3.0
4040,28,2030.0,6.0,0.0,0.0,10.0,0.8,0.0,-0.1,65.0,1.0,166.0,3.0
4046,28,2130.0,6.0,0.0,0.0,11.0,1.4,0.0,0.1,60.0,1.0,184.0,4.0
4052,28,2230.0,6.0,0.0,0.0,12.0,1.3,0.0,0.7,55.0,1.0,208.0,4.0


In [160]:
df['hour'] = df['hour'].astype(str)
df['day'] = df['day'].astype(str)
df['rainfall_type'] = df['rainfall_type'].astype(str)
df['sky_status'] = df['sky_status'].astype(str)

df.dtypes

day                            object
hour                           object
forecast                      float64
rainfall                      float64
rainfall_type                  object
temp                          float64
south_north_wind_component    float64
lightning_strike              float64
east_west_wind_component      float64
hum                           float64
wind_speed                    float64
wind_direction                float64
sky_status                     object
dtype: object

## 전처리

In [161]:
df['hour'] = df['hour'].str.replace('.0', '').str.replace('30', '00').str.zfill(4)
df['hour'] = df['hour'].apply(lambda x: x[:2] + ":" + x[2:])
df['day'] = df['day'].str.zfill(2)

df['rainfall_type'] = df['rainfall_type'].str.replace('.0', '')
df['sky_status'] = df['sky_status'].str.replace('.0', '')

df['day'], df['hour']

(5       01
 11      01
 17      01
 23      01
 29      01
         ..
 4034    28
 4040    28
 4046    28
 4052    28
 4058    28
 Name: day, Length: 672, dtype: object,
 5       00:00
 11      01:00
 17      02:00
 23      03:00
 29      04:00
         ...  
 4034    19:00
 4040    20:00
 4046    21:00
 4052    22:00
 4058    23:00
 Name: hour, Length: 672, dtype: object)

In [162]:
df['forecast_time_str'] =  '2025-02-' + df['day'] + ' ' + df['hour'] + ':00'
df['forecast_time'] = pd.to_datetime(df['forecast_time_str'], format='%Y-%m-%d %H:%M:%S')
df.drop(['forecast_time_str', 'day', 'hour', 'forecast'], axis=1, inplace=True)

df

Unnamed: 0,rainfall,rainfall_type,temp,south_north_wind_component,lightning_strike,east_west_wind_component,hum,wind_speed,wind_direction,sky_status,forecast_time
5,0.0,0,7.0,-1.4,0.0,-0.6,55.0,2.0,25.0,3,2025-02-01 00:00:00
11,0.0,0,7.0,-1.6,0.0,-0.3,55.0,2.0,13.0,3,2025-02-01 01:00:00
17,0.0,0,7.0,-1.1,0.0,0.1,50.0,1.0,355.0,1,2025-02-01 02:00:00
23,0.0,0,5.0,-1.5,0.0,-0.1,55.0,2.0,7.0,3,2025-02-01 03:00:00
29,0.0,0,4.0,-1.3,0.0,-0.4,60.0,1.0,20.0,4,2025-02-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...
4034,0.0,0,7.0,0.0,0.0,-1.2,80.0,1.0,90.0,3,2025-02-28 19:00:00
4040,0.0,0,10.0,0.8,0.0,-0.1,65.0,1.0,166.0,3,2025-02-28 20:00:00
4046,0.0,0,11.0,1.4,0.0,0.1,60.0,1.0,184.0,4,2025-02-28 21:00:00
4052,0.0,0,12.0,1.3,0.0,0.7,55.0,1.0,208.0,4,2025-02-28 22:00:00


## 데이터 입력

In [163]:
df['created_at'] = 'now()'
df['updated_at'] = 'now()'

In [164]:
POSTGRES_USER = env.get('POSTGRES_USER')
POSTGRES_PASSWORD = env.get('POSTGRES_PASSWORD')
POSTGRES_HOST = env.get('POSTGRES_HOST')
POSTGRES_PORT = env.get('POSTGRES_PORT')
POSTGRES_DB = env.get('POSTGRES_DB')

print(POSTGRES_DB, POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD)

seoul_real_traffic_db localhost 5441 postgres postgres1234


In [None]:
engine = sa.create_engine(f'postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}')

with engine.begin() as conn:
    upsert_query = sa.text('''
        INSERT INTO public.weather_data
        (created_at, updated_at, forecast_time, "temp", hum, sky_status, lightning_strike, wind_direction, wind_speed, rainfall, rainfall_type, east_west_wind_component, south_north_wind_component)
        VALUES(:created_at, :updated_at, :forecast_time, :temp, :hum, :sky_status, :lightning_strike, :wind_direction, :wind_speed, :rainfall, :rainfall_type, :east_west_wind_component, :south_north_wind_component)
        ON CONFLICT (forecast_time)        
        DO UPDATE 
        SET created_at=EXCLUDED.created_at, updated_at=EXCLUDED.updated_at, forecast_time=EXCLUDED.forecast_time, "temp"=EXCLUDED."temp", hum=EXCLUDED.hum, sky_status=EXCLUDED.sky_status, lightning_strike=EXCLUDED.lightning_strike, wind_direction=EXCLUDED.wind_direction, wind_speed=EXCLUDED.wind_speed, rainfall=EXCLUDED.rainfall, rainfall_type=EXCLUDED.rainfall_type, east_west_wind_component=EXCLUDED.east_west_wind_component, south_north_wind_component=EXCLUDED.south_north_wind_component
    ''')
    conn.execute(upsert_query, df.to_dict(orient='records'))