In [None]:
import requests
import pandas as pd
from io import StringIO
import os
from dotenv import load_dotenv

In [None]:
load_dotenv()  # .env 파일 읽어오기
api_key = os.getenv('AD_API_KEY')

def download_file(file_url, save_path):
    with open(save_path, 'wb') as f: # 저장할 파일을 바이너리 쓰기 모드로 열기
        response = requests.get(file_url) # 파일 URL에 GET 요청 보내기
        f.write(response.content) # 응답의 내용을 파일에 쓰기

# # URL과 저장 경로 변수를 지정합니다.
url = f"https://apihub.kma.go.kr/api/typ01/url/kma_pm10.php?tm1=200804281215&tm2=202505271230&stn=108&authKey={api_key}"

save_file_path = '../datapipeline/data/data2.csv'

# # 파일 다운로드 함수를 호출합니다.
download_file(url, save_file_path)

In [None]:
with open('../datapipeline/data/data2.csv', 'r', encoding='cp949') as f:
    lines = f.readlines()

# 주석(#) 제거
data_lines = [line for line in lines if not line.startswith('#')]

# 데이터 문자열로 합치기
data_str = ''.join(data_lines)

# 쉼표 구분자로 읽기
df = pd.read_csv(StringIO(data_str), sep=',', header=None, on_bad_lines='skip', low_memory=False)

# 날짜(PM10) 컬럼 위치 자동 탐색
if df.shape[1] >= 3:
    df_ad = df.iloc[:, [0, 2]]
    df_ad.columns = ['날짜시간', 'PM10']
else:
    raise ValueError("데이터 형식 확인 필요: 예상 컬럼 3개 이상이어야 함")

# 날짜시간 컬럼: 12자리 숫자(YYYYMMDDHHMM)만 추출 후 날짜 변환
df_ad['날짜시간'] = df_ad['날짜시간'].astype(str).str.extract(r'(\d{12})')[0]
df_ad['날짜'] = pd.to_datetime(df_ad['날짜시간'], format='%Y%m%d%H%M', errors='coerce').dt.date

# PM10 숫자형 변환
df_ad['PM10'] = pd.to_numeric(df_ad['PM10'], errors='coerce')

# 날짜별 최소/최대/평균 구하기
df_pm10 = df_ad.groupby('날짜')['PM10'].agg(['min', 'max', 'mean']).reset_index()
df_pm10.columns = ['날짜', 'PM10_MIN', 'PM10_MAX', 'PM10_AVG']

df_pm10 = df_pm10.round(1)

df_pm10['날짜'] = pd.to_datetime(df_pm10['날짜'].astype(str))

# 저장
df_pm10.to_csv('../datapipeline/data/PM10_data.csv', index=False, encoding='utf-8-sig')

print(df_pm10.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ad['날짜시간'] = df_ad['날짜시간'].astype(str).str.extract(r'(\d{12})')[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ad['날짜'] = pd.to_datetime(df_ad['날짜시간'], format='%Y%m%d%H%M', errors='coerce').dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ad['PM10'] = pd.to_numeric(df_ad['PM10'], 

          날짜  PM10_MIN  PM10_MAX  PM10_AVG
0 2008-04-28      47.0      93.0      75.5
1 2008-04-29      53.0     128.0      85.2
2 2008-04-30      34.0     130.0      73.5
3 2008-05-01      35.0      85.0      55.6
4 2008-05-02      42.0      95.0      59.8


In [82]:
df_pm10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6221 entries, 0 to 6220
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   날짜        6221 non-null   datetime64[ns]
 1   PM10_MIN  6221 non-null   float64       
 2   PM10_MAX  6221 non-null   float64       
 3   PM10_AVG  6221 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 194.5 KB
