#### 큰 흐름
- 라이브러리 로드
- 데이터 확인
- 모델 학습
- 제출 파일

---

##### 라이브러리 로드

In [1]:
# !pip install scikit-learn xgboost

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Load the raw training log; the aggregation below counts one report per row.
df = pd.read_csv("data/train.csv")

# Parse the "YYYYmmdd_HHMM" reception timestamps in a single vectorized call
# instead of a per-row datetime.strptime. Malformed strings still raise.
# NOTE(review): a missing timestamp now becomes NaT (and drops out of the
# groupby below) instead of raising TypeError -- confirm the column has no
# missing values.
df["dt"] = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")

# Shift the clock forward by 6 hours, so that a "shifted day" covers the
# original 18:00 (previous calendar day) through 17:59.
df["_dt"] = df["dt"] + timedelta(hours=6)
df["_date"] = df["_dt"].dt.date.astype(str)
df["_hour"] = df["_dt"].dt.hour

# Number of reports per (shifted date, shifted hour).
df = (
    df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)

# Shifted hours 0-14 correspond to the original 18:00-08:59 window.
night = df[df["_hour"] < 15]
night = night[["_date", "신고접수번호"]].groupby("_date").sum()

# Shifted hours 15-23 correspond to the original 09:00-17:59 window.
day = df[df["_hour"] >= 15]
day = day[["_date", "신고접수번호"]].groupby("_date").sum()

# Align both windows on the shifted date; a date missing from either window
# gets NaN in that column.
data = pd.concat([night, day], axis="columns")
data = data.reset_index()
data.columns = ["date", "night_y", "day_y"]

# Keep only the dates that have counts for both windows.
drop_date = data.dropna()
print(drop_date.shape)

# Hold out the last 20% of the rows for validation. shuffle=False keeps the
# existing row order, so the split is not randomized.
train, val = train_test_split(drop_date, train_size=0.8, shuffle=False)

# Feature: night-window count. Target: day-window count. The "date" column is
# carried along for reference only and is never passed to the model.
X_train = train[["date", "night_y"]]
y_train = train[["date", "day_y"]]
X_val = val[["date", "night_y"]]
y_val = val[["date", "day_y"]]

xgb_params = {
    # n_estimators is left at the library default; early stopping ends
    # training once the validation MAPE has not improved for 20 rounds.
    "max_depth": 7,
    "eval_metric": "mape",
    "early_stopping_rounds": 20,
    "random_state": 514,
}

model = XGBRegressor(**xgb_params)
eval_set = [(X_val["night_y"], y_val["day_y"])]

# NOTE(review): the single feature is passed as a 1-D Series here and in every
# predict() call; keep the two in the same form if this is switched to a 2-D
# frame.
model.fit(
    X=X_train["night_y"],
    y=y_train["day_y"],
    eval_set=eval_set,
    verbose=True,
)
prediction = model.predict(X_val["night_y"])

# Report the hold-out score instead of computing it and discarding it.
mape = mean_absolute_percentage_error(y_val["day_y"], prediction)
print(f"validation MAPE: {mape:.5f}")

# Build the test-time feature with the same timestamp handling as training.
test_df = pd.read_csv("data/test.csv")

# Vectorized parse of the "YYYYmmdd_HHMM" timestamps (replaces the per-row
# datetime.strptime); malformed strings still raise.
test_df["dt"] = pd.to_datetime(test_df["신고접수일시"], format="%Y%m%d_%H%M")

# Same 6-hour forward shift as applied to the training data.
test_df["_dt"] = test_df["dt"] + timedelta(hours=6)
test_df["_date"] = test_df["_dt"].dt.date.astype(str)
test_df["_hour"] = test_df["_dt"].dt.hour

# Number of reports per (shifted date, shifted hour).
test_df = (
    test_df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)

# NOTE(review): this sums every shifted hour present in test.csv, while the
# training feature only used shifted hours 0-14. Presumably test.csv contains
# only that window -- confirm, otherwise filter on _hour < 15 here.
target_df = test_df[["_date", "신고접수번호"]].groupby("_date").sum()
target_df = target_df.reset_index()
target_df.columns = ["date", "night_y"]

prediction_result = model.predict(target_df["night_y"])

# NOTE(review): assumes sample_submission.csv has one row per date in the same
# order as target_df (sorted by shifted date) -- confirm before submitting.
submission = pd.read_csv('./data/sample_submission.csv')
submission["y"] = prediction_result.reshape(-1)
submission.to_csv("submission.csv", index=False)

(3468, 3)
[0]	validation_0-mape:0.15301
[1]	validation_0-mape:0.14597
[2]	validation_0-mape:0.14956
[3]	validation_0-mape:0.15510
[4]	validation_0-mape:0.16032
[5]	validation_0-mape:0.16483
[6]	validation_0-mape:0.16841
[7]	validation_0-mape:0.17066
[8]	validation_0-mape:0.17278
[9]	validation_0-mape:0.17431
[10]	validation_0-mape:0.17557
[11]	validation_0-mape:0.17643
[12]	validation_0-mape:0.17738
[13]	validation_0-mape:0.17783
[14]	validation_0-mape:0.17828
[15]	validation_0-mape:0.17854
[16]	validation_0-mape:0.17876
[17]	validation_0-mape:0.17888
[18]	validation_0-mape:0.17916
[19]	validation_0-mape:0.17928
[20]	validation_0-mape:0.17934
