# 시작하기 전에

terminal에서 아래 라이브러리를 설치해줍니다.
```
pip install numpy pandas
pip install plotly nbformat
pip install scikit-learn xgboost
```

# 라이브러리

In [2]:
import numpy as np
import pandas as pd

# 데이터 전처리 (Data Preprocessing)

### 데이터 로드

In [3]:
# Load the raw per-report training data; the bare `df` on the last line makes
# the notebook render the frame as the cell output.
df = pd.read_csv("data/train.csv")
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류
0,JGT2IBW4,이동전화,20130101_0001,,안내,
1,74ED11Z4,기타,20130101_0002,,안내,
2,B4I8RIBW,이동전화,20130101_0002,,안내,
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급
4,AR9N3QT4,이동전화,20130101_0004,,안내,
...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,


### 데이터 확인

In [4]:
# Quick data inspection. The commented lines are alternative checks
# (missing-value counts and per-column frequencies); uncomment one to run it.
# df.isna().sum()
# df['접수경로'].value_counts()
# df['시군구'].value_counts()
# df['접수분류'].value_counts()
# Frequency of each value in the emergency-rescue-type column.
df['긴급구조종류'].value_counts()

긴급구조종류
구급    1582608
기타     403642
구조     321461
화재      92466
Name: count, dtype: int64

데이터셋의 일부만 이용해서 코드를 먼저 점검할 수 있습니다.

In [5]:
# Optional: build a small dataset to smoke-test the code below.
# `nrows` stops reading after 50,000 rows instead of loading the whole file
# and slicing it afterwards.
#df = pd.read_csv("data/train.csv", nrows=50000)

## 데이터 가공

제공된 데이터를 주어진 과제에 맞게 가공하는 작업

목표: 건별 데이터 -> 주간 / 야간 데이터

> 1. 시간대 이동
> 2. 시간별 신고량 합계
> 3. 주간/야간 기준으로 데이터 병합


### 1. 시간대 이동

데이터를 편하게 다루기 위해서 6시간 shift (이동)

* 야간 : 18시 ~ 9시 -> (+6h) 00시 ~ 15시  
* 주간 : 9시 ~ 18시 -> (+6h) 15시 ~ 00시


데이터 상에서는 다음과 같습니다.  

* 야간 : 18:00 ~ 08:59 -> (+6h) 00:00 ~ 14:59  
* 주간 : 09:00 ~ 17:59 -> (+6h) 15:00 ~ 23:59

In [6]:
from datetime import datetime, timedelta

# Parse the "YYYYMMDD_HHMM" receipt timestamps with one vectorized call rather
# than a per-row Python strptime; the training frame has millions of rows.
df["dt"] = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")

# Shift every timestamp by +6 hours so the night window (18:00-08:59) maps to
# 00:00-14:59 and the day window (09:00-17:59) maps to 15:00-23:59 of a single
# calendar date.
df["_dt"] = df["dt"] + timedelta(hours=6)

# Shifted date as an ISO "YYYY-MM-DD" string and shifted hour as an integer;
# both are used as grouping keys in the following cells.
df["_date"] = df["_dt"].dt.strftime("%Y-%m-%d")
df["_hour"] = df["_dt"].dt.hour
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류,dt,_dt,_date,_hour
0,JGT2IBW4,이동전화,20130101_0001,,안내,,2013-01-01 00:01:00,2013-01-01 06:01:00,2013-01-01,6
1,74ED11Z4,기타,20130101_0002,,안내,,2013-01-01 00:02:00,2013-01-01 06:02:00,2013-01-01,6
2,B4I8RIBW,이동전화,20130101_0002,,안내,,2013-01-01 00:02:00,2013-01-01 06:02:00,2013-01-01,6
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급,2013-01-01 00:03:00,2013-01-01 06:03:00,2013-01-01,6
4,AR9N3QT4,이동전화,20130101_0004,,안내,,2013-01-01 00:04:00,2013-01-01 06:04:00,2013-01-01,6
...,...,...,...,...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,,2022-06-30 23:54:00,2022-07-01 05:54:00,2022-07-01,5
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급,2022-06-30 23:54:00,2022-07-01 05:54:00,2022-07-01,5
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급,2022-06-30 23:56:00,2022-07-01 05:56:00,2022-07-01,5
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,,2022-06-30 23:58:00,2022-07-01 05:58:00,2022-07-01,5


### 2. 시간별 신고량 합계

건별 신고를 시간별 신고량으로 합쳐줍니다.

In [7]:
# Collapse the per-report rows into one row per (shifted date, shifted hour),
# counting the report IDs that fall into each bucket.
hourly_counts = df.groupby(["_date", "_hour"])[["신고접수번호"]].count()
df = hourly_counts.reset_index()

# Give the aggregated columns short working names; "y" is the hourly call count.
column_map = {"_date": "date", "_hour": "hour", "신고접수번호": "y"}
df = df.rename(columns=column_map)
df.head()

Unnamed: 0,date,hour,y
0,2013-01-01,6,77
1,2013-01-01,7,81
2,2013-01-01,8,72
3,2013-01-01,9,45
4,2013-01-01,10,47


\+ 일별로 24시간의 데이터가 모두 있는지 확인

In [8]:
# Count the hourly rows available for each date; a complete day has 24.
check = df.groupby("date")[["y"]].count()
drop_dates = check.loc[check["y"] != 24].index.tolist()
print(drop_dates)

# Keep only the complete days and renumber the rows from zero.
is_incomplete = df["date"].isin(drop_dates)
df = df.loc[~is_incomplete].reset_index(drop=True)
df.head()

['2013-01-01', '2022-07-01']


Unnamed: 0,date,hour,y
0,2013-01-02,0,97
1,2013-01-02,1,103
2,2013-01-02,2,102
3,2013-01-02,3,102
4,2013-01-02,4,61


### 3. 주간/야간 데이터 병합

시간별 데이터를 구간별 데이터로 병합합니다

* 야간 : 00:00 ~ 14:59  
* 주간 : 15:00 ~ 23:59

In [9]:
# After the +6h shift, hours 0-14 are the night window and 15-23 the day window.
hours = df["hour"]
in_night = hours.ge(0) & hours.lt(15)
in_day = hours.ge(15) & hours.lt(24)

# Total call volume per date for each window.
night = df.loc[in_night, ["date", "y"]].groupby("date").sum()
day = df.loc[in_day, ["date", "y"]].groupby("date").sum()

for window_totals in (night, day):
    display(window_totals.head())

Unnamed: 0_level_0,y
date,Unnamed: 1_level_1
2013-01-02,785
2013-01-03,1013
2013-01-04,1005
2013-01-05,791
2013-01-06,966


Unnamed: 0_level_0,y
date,Unnamed: 1_level_1
2013-01-02,561
2013-01-03,577
2013-01-04,622
2013-01-05,786
2013-01-06,1506


In [10]:
# Place the night and day totals side by side (aligned on the date index),
# turn the date index back into a column, and label the three columns.
data_df = (
    pd.concat([night, day], axis=1)
    .reset_index()
    .set_axis(["date", "night_y", "day_y"], axis="columns")
)
data_df.tail()

Unnamed: 0,date,night_y,day_y
3462,2022-06-26,1356,1029
3463,2022-06-27,1375,1099
3464,2022-06-28,1030,884
3465,2022-06-29,1159,827
3466,2022-06-30,1058,901


### (추가) datetime features

In [11]:
display(data_df.head())

# Derive calendar features from the date string. The "date" column is kept as
# a "YYYY-MM-DD" string so it can serve as the join key against data_df.
parsed_dates = pd.to_datetime(data_df["date"])
dt_df = pd.DataFrame(
    {
        "date": parsed_dates.dt.date.astype(str),
        "doy": parsed_dates.dt.dayofyear,
        "year": parsed_dates.dt.year,
        "month": parsed_dates.dt.month,
        "day": parsed_dates.dt.day,
        "weekday": parsed_dates.dt.weekday,
    }
)

display(dt_df.head())

Unnamed: 0,date,night_y,day_y
0,2013-01-02,785,561
1,2013-01-03,1013,577
2,2013-01-04,1005,622
3,2013-01-05,791,786
4,2013-01-06,966,1506


Unnamed: 0,date,doy,year,month,day,weekday
0,2013-01-02,2,2013,1,2,2
1,2013-01-03,3,2013,1,3,3
2,2013-01-04,4,2013,1,4,4
3,2013-01-05,5,2013,1,5,5
4,2013-01-06,6,2013,1,6,6


data와 datetime features 합치기

In [12]:
# Join the night/day totals with the calendar features on the shared "date"
# key. A key-based merge is used rather than a column-wise concat, which would
# leave two "date" columns in the result.
data = pd.merge(left=data_df, right=dt_df, on="date")
data

Unnamed: 0,date,night_y,day_y,doy,year,month,day,weekday
0,2013-01-02,785,561,2,2013,1,2,2
1,2013-01-03,1013,577,3,2013,1,3,3
2,2013-01-04,1005,622,4,2013,1,4,4
3,2013-01-05,791,786,5,2013,1,5,5
4,2013-01-06,966,1506,6,2013,1,6,6
...,...,...,...,...,...,...,...,...
3462,2022-06-26,1356,1029,177,2022,6,26,6
3463,2022-06-27,1375,1099,178,2022,6,27,0
3464,2022-06-28,1030,884,179,2022,6,28,1
3465,2022-06-29,1159,827,180,2022,6,29,2


### 데이터 시각화

시각화를 위해 `plotly` 라이브러리를 불러옵니다. (설치 방법은 "시작하기 전에" 참고)

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plain Python lists for the x axis (dates) and the two series to plot.
dates = data["date"].to_list()
night_y = data["night_y"].to_list()
day_y = data["day_y"].to_list()

# One figure with a single subplot holding both the night and day series.
fig = make_subplots(rows=1, cols=1)
fig.update_layout(title_text="Data")

for trace_name, values in (("night_y", night_y), ("day_y", day_y)):
    fig.add_trace(go.Scatter(x=dates, y=values, name=trace_name), row=1, col=1)
fig.show()

## Dataset 준비

In [14]:
# List the available columns before choosing the feature and target columns.
data.columns

Index(['date', 'night_y', 'day_y', 'doy', 'year', 'month', 'day', 'weekday'], dtype='object')

In [15]:
from sklearn.model_selection import train_test_split

# Features: calendar fields plus the night-time call volume.
# Target: the day-time call volume of the same (shifted) date.
X_columns = ["doy", "year", "month", "day", "weekday", "night_y"]
y_columns = ["day_y"]

# 80/20 split without shuffling, so the rows stay in their original order and
# the validation set is the last 20% of the frame.
train, val = train_test_split(data, train_size=0.8, shuffle=False)

for split_frame in (train, val):
    display(split_frame)

# NumPy views of the selected columns for the model.
X_train, y_train = train[X_columns].to_numpy(), train[y_columns].to_numpy()
X_val, y_val = val[X_columns].to_numpy(), val[y_columns].to_numpy()

Unnamed: 0,date,night_y,day_y,doy,year,month,day,weekday
0,2013-01-02,785,561,2,2013,1,2,2
1,2013-01-03,1013,577,3,2013,1,3,3
2,2013-01-04,1005,622,4,2013,1,4,4
3,2013-01-05,791,786,5,2013,1,5,5
4,2013-01-06,966,1506,6,2013,1,6,6
...,...,...,...,...,...,...,...,...
2768,2020-08-01,1206,961,214,2020,8,1,5
2769,2020-08-02,1345,1141,215,2020,8,2,6
2770,2020-08-03,1109,873,216,2020,8,3,0
2771,2020-08-04,1142,833,217,2020,8,4,1


Unnamed: 0,date,night_y,day_y,doy,year,month,day,weekday
2773,2020-08-06,1025,826,219,2020,8,6,3
2774,2020-08-07,895,786,220,2020,8,7,4
2775,2020-08-08,1553,893,221,2020,8,8,5
2776,2020-08-09,1167,1059,222,2020,8,9,6
2777,2020-08-10,1091,919,223,2020,8,10,0
...,...,...,...,...,...,...,...,...
3462,2022-06-26,1356,1029,177,2022,6,26,6
3463,2022-06-27,1375,1099,178,2022,6,27,0
3464,2022-06-28,1030,884,179,2022,6,28,1
3465,2022-06-29,1159,827,180,2022,6,29,2


# 학습

예측 모델로 XGBRegressor(XGBoost Regressor)를 사용합니다.  

다음과 같은 순서로 진행합니다.

> 1. 모델 파라미터 정의
> 2. 모델 정의
> 3. 데이터 확인
> 4. 모델 학습
> 5. 모델 평가

## Train

In [16]:
from xgboost.sklearn import XGBRegressor

# Model hyperparameters. The objective is not set, so the library default is
# used (the original note names it as reg:squarederror, i.e. MSE).
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    eval_metric="mape",
    early_stopping_rounds=10,
    random_state=514,
)

# Build the regressor from the parameter dictionary.
model = XGBRegressor(**xgb_params)

# Validation data monitored during fitting (evaluated with the "mape" metric).
eval_set = [(X_val, y_val)]

# Sanity-check the array shapes before training.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

(2773, 6) (2773, 1) (694, 6) (694, 1)
[0]	validation_0-mape:0.69067
[1]	validation_0-mape:0.47552
[2]	validation_0-mape:0.33855
[3]	validation_0-mape:0.25561
[4]	validation_0-mape:0.20682
[5]	validation_0-mape:0.17787
[6]	validation_0-mape:0.16448
[7]	validation_0-mape:0.15622
[8]	validation_0-mape:0.15300
[9]	validation_0-mape:0.15209
[10]	validation_0-mape:0.15120
[11]	validation_0-mape:0.15196
[12]	validation_0-mape:0.15117
[13]	validation_0-mape:0.15092
[14]	validation_0-mape:0.15024
[15]	validation_0-mape:0.14986
[16]	validation_0-mape:0.14886
[17]	validation_0-mape:0.15000
[18]	validation_0-mape:0.14984
[19]	validation_0-mape:0.15023
[20]	validation_0-mape:0.14959
[21]	validation_0-mape:0.14950
[22]	validation_0-mape:0.14880
[23]	validation_0-mape:0.14988
[24]	validation_0-mape:0.15033
[25]	validation_0-mape:0.15082
[26]	validation_0-mape:0.15142
[27]	validation_0-mape:0.15097
[28]	validation_0-mape:0.15094
[29]	validation_0-mape:0.15096
[30]	validation_0-mape:0.15174
[31]	valida

## Validation

In [17]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Predict on the validation features.
preds = model.predict(X_val)

# MAE  = Mean Absolute Error
# MAPE = Mean Absolute Percentage Error
mae, mape = (
    metric(y_val, preds)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)
print(mae, mape)

154.87267635466387 0.14880456532179887


### 시각화

In [18]:
# Same plot as the earlier data-visualization cell, with `data` replaced by
# `val` and the title changed accordingly.
dates = val["date"].to_list()
night_y = val["night_y"].to_list()
day_y = val["day_y"].to_list()

fig = make_subplots(rows=1, cols=1)
fig.update_layout(title_text="Validation Data")

for trace_name, values in (("night_y", night_y), ("day_y", day_y)):
    fig.add_trace(go.Scatter(x=dates, y=values, name=trace_name), row=1, col=1)
fig.show()

# 추론 (Inference)

### 테스트 파일 로드

In [19]:
# Apply the same preprocessing steps used for the training data to the test file.
df = pd.read_csv("data/test.csv")

# 1. Time shift
from datetime import datetime, timedelta
# Vectorized parse of the "YYYYMMDD_HHMM" receipt timestamps (no per-row strptime).
df["dt"] = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")
# +6h shift: night 18:00-08:59 -> 00:00-14:59, day 09:00-17:59 -> 15:00-23:59.
df["_dt"] = df["dt"] + timedelta(hours=6)
df["_date"] = df["_dt"].dt.strftime("%Y-%m-%d")
df["_hour"] = df["_dt"].dt.hour

# 2. Hourly call counts: one row per (shifted date, shifted hour).
df = df[["_date", "_hour", "신고접수번호"]].groupby(["_date", "_hour"]).count().reset_index()
column_map = {"_date": "date", "_hour": "hour", "신고접수번호": "y"}
df = df.rename(column_map, axis="columns")

# 3. Merge into per-date night (hours 0-14) and day (hours 15-23) totals.
night = df[(df["hour"] >= 0) & (df["hour"] < 15)]
night = night[["date", "y"]].groupby("date").sum()
day = df[(df["hour"] >= 15) & (df["hour"] < 24)]
day = day[["date", "y"]].groupby("date").sum()

# Dates missing from one side get NaN there after the column-wise concat.
# NOTE(review): the rendered output shows day_y empty for every test row,
# presumably because the test file only contains night-window reports - confirm.
data_df = pd.concat([night, day], axis="columns")
data_df = data_df.reset_index()
data_df.columns = ["date", "night_y", "day_y"]

# (Extra) datetime features derived from the date string.
dt_df = pd.DataFrame(pd.to_datetime(data_df["date"]))
dt_df["doy"] = dt_df["date"].dt.dayofyear
dt_df["year"] = dt_df["date"].dt.year
dt_df["month"] = dt_df["date"].dt.month
dt_df["day"] = dt_df["date"].dt.day
dt_df["weekday"] = dt_df["date"].dt.weekday

# Back to a "YYYY-MM-DD" string so it matches the join key in data_df.
dt_df["date"] = dt_df["date"].dt.date.astype(str)

# Join the totals with the datetime features on the shared "date" key.
data = pd.merge(left=data_df, right=dt_df, on="date")
data

Unnamed: 0,date,night_y,day_y,doy,year,month,day,weekday
0,2022-07-01,1246,,182,2022,7,1,4
1,2022-07-02,1327,,183,2022,7,2,5
2,2022-07-03,1398,,184,2022,7,3,6
3,2022-07-04,1188,,185,2022,7,4,0
4,2022-07-05,1181,,186,2022,7,5,1
...,...,...,...,...,...,...,...,...
102,2022-10-11,1048,,284,2022,10,11,1
103,2022-10-12,962,,285,2022,10,12,2
104,2022-10-13,940,,286,2022,10,13,3
105,2022-10-14,920,,287,2022,10,14,4


### 모델 추론

In [20]:
# Predict the day-time call volume for the test dates. The features are passed
# as a NumPy array, matching how X_train and X_val were built for fit/validation.
test_preds = model.predict(data[X_columns].to_numpy())

### 제출 파일 로드

In [21]:
# Flatten the predictions to 1-D, place them in the "y" column of the sample
# submission template, and write the result without the row index.
flat_preds = test_preds.reshape(-1)
submission = pd.read_csv("data/sample_submission.csv").assign(y=flat_preds)
submission.to_csv("submission.csv", index=False)