#### 큰 흐름
- 라이브러리 로드
- 데이터 확인
- 모델 학습
- 제출 파일

---

##### 라이브러리 로드

In [1]:
# !pip install scikit-learn xgboost

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

# df = data frame
df = pd.read_csv("data/train.csv")

# Parse the "YYYYMMDD_HHMM" receipt timestamps in a single vectorized call
# instead of running datetime.strptime once per row.
received_at = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")

# Shift every timestamp forward by 6 hours before bucketing, so "_date" and
# "_hour" below describe the shifted clock, not the raw receipt time.
shifted_at = received_at + timedelta(hours=6)
df["_date"] = shifted_at.dt.date.astype(str)
df["_hour"] = shifted_at.dt.hour

# Number of reports per (shifted date, shifted hour); after this step the
# "신고접수번호" column holds a count rather than an identifier.
df = (
    df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)

# Split the hourly counts into two windows of the shifted day and total each
# window per date: hours 0-14 form the "night" window, hours 15-23 the "day"
# window.
hour_col = df["_hour"]
in_night_window = hour_col.ge(0) & hour_col.lt(15)
in_day_window = hour_col.ge(15) & hour_col.lt(24)

night = df.loc[in_night_window, ["_date", "신고접수번호"]]
night = night.groupby("_date").sum()
print(night)

day = df.loc[in_day_window, ["_date", "신고접수번호"]]
day = day.groupby("_date").sum()
print(day)

# Align both totals side by side on the date index; a date present in only one
# window gets NaN in the other column.
data = pd.concat([night, day], axis=1).reset_index()
data.columns = ["date", "night_y", "day_y"]

# Keep only the dates that have both a night total and a day total.
drop_date = data.dropna()
print(drop_date.shape)
# check = df[["date", "y"]].groupby("date").count()
# print(check)
# drop_dates = list(check[check["y"]] <= 24)

# Chronological hold-out: the first 80% of rows train the model and the last
# 20% validate it (shuffle=False keeps the original row order).
train, val = train_test_split(drop_date, shuffle=False, train_size=0.8)

feature_cols = ["date", "night_y"]
target_cols = ["date", "day_y"]

X_train, y_train = train[feature_cols], train[target_cols]
X_val, y_val = val[feature_cols], val[target_cols]

# Candidate hyper-parameters for the search: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)

# 5-fold cross-validated grid search scored by negated MAPE; the single input
# feature is the night total and the target is the day total.
model = XGBRegressor()
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
)
grid_search.fit(X_train['night_y'], y_train['day_y'])
best_model = grid_search.best_estimator_

# Score the selected estimator on the held-out validation rows.
prediction = best_model.predict(X_val['night_y'])
val_mape = mean_absolute_percentage_error(y_val['day_y'], prediction)
print(val_mape)

test_df = pd.read_csv("data/test.csv")

# Parse the "YYYYMMDD_HHMM" receipt timestamps in one vectorized call (instead
# of a per-row strptime) and apply the same 6-hour shift used for the training
# data so dates and hours are bucketed the same way.
test_shifted_at = (
    pd.to_datetime(test_df["신고접수일시"], format="%Y%m%d_%H%M")
    + timedelta(hours=6)
)
test_df["_date"] = test_shifted_at.dt.date.astype(str)
test_df["_hour"] = test_shifted_at.dt.hour

# Number of reports per (shifted date, shifted hour).
test_df = (
    test_df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)

# Total reports per shifted date, used as the model's single input feature.
# NOTE(review): the training feature only sums shifted hours 0-14, while this
# sums every hour present in test.csv. Presumably test.csv contains only that
# window -- confirm, otherwise filter on "_hour" < 15 here.
target_df = (
    test_df[["_date", "신고접수번호"]]
    .groupby("_date")
    .sum()
    .reset_index()
)
target_df.columns = ['date', 'night_y']

prediction_result = best_model.predict(target_df['night_y'])

# NOTE(review): predictions are assigned positionally, so this assumes the
# sample submission has one row per test date in the same (sorted) order --
# confirm against the file.
submission = pd.read_csv('./data/sample_submission.csv')
submission["y"] = prediction_result.reshape(-1)
submission.to_csv("submission.csv", index=False)

            신고접수번호
_date             
2013-01-01     613
2013-01-02     785
2013-01-03    1013
2013-01-04    1005
2013-01-05     791
...            ...
2022-06-27    1375
2022-06-28    1030
2022-06-29    1159
2022-06-30    1058
2022-07-01     708

[3469 rows x 1 columns]
            신고접수번호
_date             
2013-01-01    2057
2013-01-02     561
2013-01-03     577
2013-01-04     622
2013-01-05     786
...            ...
2022-06-26    1029
2022-06-27    1099
2022-06-28     884
2022-06-29     827
2022-06-30     901

[3468 rows x 1 columns]
(3468, 3)
0.16446047562782828
