In [None]:
# !pip install scikit-learn xgboost

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error


# Raw training records; every later cell derives its frames from `df`.
df = pd.read_csv(filepath_or_buffer="data/train.csv")

In [None]:
# Exploratory value counts, kept disabled; uncomment one line to inspect a column.
# print(df["신고접수번호"].value_counts())
# print(df["접수경로"].value_counts())
# print(df["신고접수일시"].value_counts())
# print(df["시군구"].value_counts())
# print(df["접수분류"].value_counts().sum())
# print(df["긴급구조종류"].value_counts())

# Number of rows whose "접수분류" value is missing.
print(df["접수분류"].isna().sum())

In [None]:
# Parse the "YYYYMMDD_HHMM" strings in a single vectorised call instead of a
# per-row strptime.
df["dt"] = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")

# Shift every timestamp forward by 6 hours: 18:00 becomes shifted hour 0, so the
# span 18:00-08:59 (shifted hours 0-14) and the following 09:00-17:59 (shifted
# hours 15-23) share one `_date`. Later cells split on exactly those hour ranges.
df["_dt"] = df["dt"] + timedelta(hours=6)
df["_date"] = df["_dt"].dt.date.astype(str)
df["_hour"] = df["_dt"].dt.hour

# Preview only; the full frame is too large to be a useful cell output.
df.head()


In [None]:
# Row count per (shifted date, shifted hour), one column per "접수분류" value.
# (date, hour) pairs where a category never occurs hold NaN.
df_ = (
    df.pivot_table(
        index=["_date", "_hour"],
        columns="접수분류",
        aggfunc="size",
    )
    .reset_index()
)
print(df_.describe())

In [None]:
# Daily totals of the "안내" and "출동" categories, split into two windows of
# the shifted hour: night = hours 0-14, day = hours 15-23.


def _daily_counts(hourly, first_hour, end_hour, value_names):
    """Sum the hourly "안내"/"출동" counts of each `_date` inside one hour window.

    Parameters
    ----------
    hourly : DataFrame with `_date`, `_hour`, "안내" and "출동" columns.
    first_hour, end_hour : half-open window [first_hour, end_hour) on `_hour`.
    value_names : output names for the "안내" and "출동" totals, in that order.

    Returns
    -------
    DataFrame with columns ["date", *value_names], one row per `_date`.
    """
    in_window = (hourly["_hour"] >= first_hour) & (hourly["_hour"] < end_hour)
    # Select the two categories by name so the output never depends on how
    # many columns the pivot produced or on their order.
    daily = (
        hourly.loc[in_window, ["_date", "안내", "출동"]]
        .groupby("_date")
        .sum()
        .reset_index()
    )
    daily.columns = ["date", *value_names]
    return daily


# Same totals under the original category names (handy for inspection).
night_hour = _daily_counts(df_, 0, 15, ["안내", "출동"])
day_hour = _daily_counts(df_, 15, 24, ["안내", "출동"])

# print(night_hour.describe())
# print(day_hour.describe())

print("\n==============\n")

# Model-facing names: suffix 0 is "안내", suffix 1 is "출동".
night_date = _daily_counts(df_, 0, 15, ["night_0", "night_1"])
day_date = _daily_counts(df_, 15, 24, ["day_0", "day_1"])

# print(night_date)
# print(day_date)

In [None]:
# One row per date carrying both the night and the day totals. The default
# inner join drops dates that are missing from either side.
data = pd.merge(left=night_date, right=day_date, on="date")
data

In [None]:
# shuffle=False keeps the row order of `data`: the first 80% of rows train the
# model and the remaining 20% validate it.
train, val = train_test_split(data, train_size=0.8, shuffle=False)

# Category 0: predict the day total (day_0) from the night total (night_0).
X_train_0 = train[["date", "night_0"]]
y_train_0 = train[["date", "day_0"]]
X_val_0 = val[["date", "night_0"]]
y_val_0 = val[["date", "day_0"]]

xgb_params = {
    # XGBRegressor's keyword is `n_estimators` (plural); the singular spelling
    # is not a recognised parameter and is ignored with a warning.
    "n_estimators": 100,
    "max_depth": 5,
    "eval_metric": "mape",
    "early_stopping_rounds": 10,
    "random_state": 514,
}

model_0 = XGBRegressor(**xgb_params)
eval_set = [(X_val_0["night_0"], y_val_0["day_0"])]

# Only the single night_0 column is used as the feature; `date` is carried
# along in the frames above but never fed to the model.
model_0.fit(
    X=X_train_0["night_0"],
    y=y_train_0["day_0"],
    eval_set=eval_set,
    verbose=True,
)
prediction = model_0.predict(X_val_0["night_0"])

# NOTE: the same validation rows drive early stopping and this score, so the
# printed MAPE is an optimistic estimate.
print(mean_absolute_percentage_error(y_val_0["day_0"], prediction))



In [None]:
# Category 1: predict the day total (day_1) from the night total (night_1),
# reusing the `train` / `val` split created for model_0.
X_train_1 = train[["date", "night_1"]]
y_train_1 = train[["date", "day_1"]]
X_val_1 = val[["date", "night_1"]]
y_val_1 = val[["date", "day_1"]]

xgb_params = {
    # XGBRegressor's keyword is `n_estimators` (plural); the singular spelling
    # is not a recognised parameter and is ignored with a warning.
    "n_estimators": 100,
    "max_depth": 5,
    "eval_metric": "mape",
    "early_stopping_rounds": 10,
    "random_state": 514,
}

model_1 = XGBRegressor(**xgb_params)
eval_set = [(X_val_1["night_1"], y_val_1["day_1"])]

# Only the single night_1 column is used as the feature; `date` is carried
# along in the frames above but never fed to the model.
model_1.fit(
    X=X_train_1["night_1"],
    y=y_train_1["day_1"],
    eval_set=eval_set,
    verbose=True,
)
prediction = model_1.predict(X_val_1["night_1"])

# NOTE: the same validation rows drive early stopping and this score, so the
# printed MAPE is an optimistic estimate.
print(mean_absolute_percentage_error(y_val_1["day_1"], prediction))

In [None]:

# Build the night-window features for the test set with the same steps used on
# the training data: parse timestamps, shift by +6h, count rows per
# (date, hour, category), then total shifted hours 0-14 per date.
test_df = pd.read_csv("data/test.csv")
test_df["dt"] = pd.to_datetime(test_df["신고접수일시"], format="%Y%m%d_%H%M")
test_df["_dt"] = test_df["dt"] + timedelta(hours=6)
test_df["_date"] = test_df["_dt"].dt.date.astype(str)
test_df["_hour"] = test_df["_dt"].dt.hour
test_df_ = test_df.pivot_table(
    index=["_date", "_hour"], columns="접수분류", aggfunc="size"
).reset_index()

night_rows = (test_df_["_hour"] >= 0) & (test_df_["_hour"] < 15)
# Pick the two categories by name ("안내" -> night_0, "출동" -> night_1) so the
# features cannot be swapped or misaligned by the pivot's column layout.
target_df = (
    test_df_.loc[night_rows, ["_date", "안내", "출동"]]
    .groupby("_date")
    .sum()
    .reset_index()
)
target_df.columns = ["date", "night_0", "night_1"]

prediction_result_0 = model_0.predict(target_df["night_0"])
prediction_result_1 = model_1.predict(target_df["night_1"])

submission = pd.read_csv('data/sample_submission.csv')
# Predictions are written by position, so the submission must have exactly one
# row per predicted date. NOTE(review): this also assumes the sample file lists
# dates in the same ascending order as `target_df` - confirm against the file.
if len(submission) != len(target_df):
    raise ValueError(
        f"sample_submission has {len(submission)} rows but "
        f"{len(target_df)} dates were predicted"
    )
# Total forecast per date = "안내" forecast + "출동" forecast.
submission["y"] = prediction_result_0.reshape(-1) + prediction_result_1.reshape(-1)
submission.to_csv("submission.csv", index=False)
