# Preprocessing
- 데이터를 전처리하는 모든 과정을 담았다.
- 모든 전처리 과정은 함수로 구성하여 공유하기 용이하도록 하였다.

### 패키지 로딩

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Select the "Malgun Gothic" font family for all matplotlib text.
# NOTE(review): presumably chosen so Korean labels render; confirm the font
# is installed on the machine running this notebook.
plt.rc("font", family="Malgun Gothic")

### 데이터 로딩

In [28]:
# Inputs bound to the *_train names (files tagged 2021).
# The weather export is read as cp949; the other files use the default encoding.
air_train = pd.read_csv("C:/dust/data/air_2021.csv")
weather_train = pd.read_csv("C:/dust/data/weather_2021.csv", encoding="cp949")
sunrise_sunset_train = pd.read_csv("C:/dust/preprocessing/sunrise_sunset_2021.csv")

# Inputs bound to the *_test names, loaded the same way.
air_test = pd.read_csv("C:/dust/data/air_2022.csv")
weather_test = pd.read_csv("C:/dust/data/weather_2022.csv", encoding="cp949")
sunrise_sunset_test = pd.read_csv("C:/dust/preprocessing/sunrise_sunset_test.csv")

### 공기질 데이터 전처리 함수 구축

In [29]:
def make_air_dataset(air):
    """Clean a raw air-quality table and key it by a ``datetime`` column.

    Parameters
    ----------
    air : pandas.DataFrame
        Raw table. It must contain the columns "Unnamed: 0", "측정일시",
        "지역", "망", "측정소코드", "측정소명" and "주소"; every other column
        is passed through unchanged. "측정일시" must render (via ``str``) as
        ``YYYYMMDDHH``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the "Unnamed: 0" and station-metadata columns,
        with "측정일시" parsed to timestamps and renamed to "datetime".
        The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    """
    air = air.drop(columns=["Unnamed: 0"])

    # Append "00" minutes so every stamp is a fixed-width YYYYMMDDHHMM string.
    stamp = air["측정일시"].astype(str) + "00"

    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # (a warning on current Python versions, planned to become an error).
    pat = r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})'

    # Build the timestamp from year/month/day/hour/minute components rather
    # than a strptime format: the hour is added on top of the date, so an
    # hour value of 24 lands on 00:00 of the following day instead of failing.
    air["측정일시"] = pd.to_datetime(stamp.str.extract(pat, expand=True))

    air = air.drop(columns=["지역", "망", "측정소코드", "측정소명", "주소"])
    return air.rename(columns={"측정일시": "datetime"})

### 일출, 일몰 시간 데이터 전처리 함수 구축

In [30]:
def rise_set_clean(sunrise_sunset):
    """Convert a sunrise/sunset table to integer hours and ISO date strings.

    Parameters
    ----------
    sunrise_sunset : pandas.DataFrame
        Table with the columns "date", "sunrise" and "sunset".
        "date" must render (via ``str``) as ``YYYYMMDD`` in its first eight
        characters.
        NOTE(review): the slicing below assumes "sunrise" renders with a
        one-digit hour first (e.g. 747) and "sunset" with a two-digit hour
        first (e.g. 1724) -- confirm against the source files.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which

        * "sunrise" is the first character of the raw value, as an int,
        * "sunset" is the first two characters of the raw value, as an int,
          plus one,
        * "date" is a ``YYYY-MM-DD`` string.

        The frame passed in is not modified, so the raw table stays usable
        and the function can safely be called on it again.

    Raises
    ------
    ValueError
        If a sliced value cannot be converted to an integer or a date does
        not match ``YYYYMMDD``.
    """
    # Work on a copy: assigning into the caller's frame would destroy the raw
    # values and make a second call on the same object shift "sunset" again.
    cleaned = sunrise_sunset.copy()

    cleaned["sunrise"] = cleaned["sunrise"].astype(str).str.slice(0, 1).astype(int)
    # +1 is applied to the two-character sunset prefix.
    cleaned["sunset"] = cleaned["sunset"].astype(str).str.slice(0, 2).astype(int) + 1

    # Parse with an explicit format, then render back as text so the column
    # is a plain "YYYY-MM-DD" string.
    raw_date = cleaned["date"].astype(str).str.slice(0, 8)
    cleaned["date"] = pd.to_datetime(raw_date, format="%Y%m%d").dt.strftime("%Y-%m-%d")
    return cleaned

### 날씨 데이터 전처리 함수 구축

In [31]:
def weather_clean(weather, sunrise_sunset):
    """Clean an hourly weather table and key it by a ``datetime`` column.

    Parameters
    ----------
    weather : pandas.DataFrame
        Raw weather table. It must contain "일시" (parseable by
        ``pandas.to_datetime``), "일조(hr)", "일사(MJ/m2)", "강수량(mm)",
        "적설(cm)", "3시간신적설(cm)" and the columns dropped below.
    sunrise_sunset : pandas.DataFrame
        Table with a "date" column of ``YYYY-MM-DD`` strings and numeric
        "sunrise" / "sunset" columns compared against the hour of "일시".

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * every column whose name contains "QC" and the unused station /
          code columns are removed,
        * "일조(hr)" and "일사(MJ/m2)" are set to 0 for rows whose hour is
          outside ``[sunrise, sunset]`` (inclusive); rows with no matching
          date in ``sunrise_sunset`` therefore also become 0,
        * missing "강수량(mm)", "적설(cm)" and "3시간신적설(cm)" become 0,
        * all remaining gaps are forward-filled from the previous row,
        * "일시" is renamed to "datetime".

        The ``weather`` frame passed in is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Drop all unwanted columns in one call. Doing this first also yields a
    # new frame, so the assignments below never touch the caller's object.
    qc_columns = [name for name in weather.columns if "QC" in name]
    unused_columns = ["지점", "지점명", "지면상태(지면상태코드)", "현상번호(국내식)", "운형(운형약어)", "최저운고(100m )"]
    weather = weather.drop(columns=qc_columns + unused_columns)

    weather["일시"] = pd.to_datetime(weather["일시"])

    # Attach each row's sunrise/sunset through a string date key.
    weather["date"] = weather["일시"].dt.date.astype(str)
    weather = pd.merge(weather, sunrise_sunset, on="date", how="left")
    weather["시각"] = weather["일시"].dt.hour.astype(int)

    # Zero the sunshine/solar values for hours outside [sunrise, sunset].
    is_daylight = weather["시각"].between(weather["sunrise"], weather["sunset"])
    for column in ("일조(hr)", "일사(MJ/m2)"):
        weather[column] = np.where(is_daylight, weather[column], 0)

    # For these columns a missing value is replaced with 0 rather than
    # carried forward from the previous row.
    for column in ("강수량(mm)", "적설(cm)", "3시간신적설(cm)"):
        weather[column] = weather[column].fillna(0)

    # ffill() replaces fillna(method="ffill"), which is deprecated in pandas.
    weather = weather.ffill()

    weather = weather.drop(columns=["date", "sunrise", "sunset", "시각"])
    return weather.rename(columns={"일시": "datetime"})

### 전처리 적용

In [32]:
# Train inputs: clean the air and sunrise/sunset tables, then clean the
# weather table, which needs the cleaned sunrise/sunset table.
air_train = make_air_dataset(air_train)
sunrise_sunset_train = rise_set_clean(sunrise_sunset_train)
weather_train = weather_clean(weather_train, sunrise_sunset_train)

# Test inputs: same steps in the same order.
air_test = make_air_dataset(air_test)
sunrise_sunset_test = rise_set_clean(sunrise_sunset_test)
weather_test = weather_clean(weather_test, sunrise_sunset_test)

### 분석 목적에 맞는 데이터 병합

In [33]:
# Join air-quality and weather rows on their shared "datetime" column.
# The inner join keeps only timestamps present in both tables.
train = air_train.merge(weather_train, on="datetime", how="inner")
test = air_test.merge(weather_test, on="datetime", how="inner")

### 데이터 저장

In [34]:
# Write both merged tables to the current working directory using the
# default to_csv settings.
train.to_csv(path_or_buf="train.csv")
test.to_csv(path_or_buf="test.csv")