#### 큰 흐름
- 라이브러리 로드
- 데이터 확인
- 모델 학습
- 제출 파일

---

##### 라이브러리 로드

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw training log into `df`, the working DataFrame for the next cells.
train_csv = "data/train.csv"
df = pd.read_csv(train_csv)
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류
0,JGT2IBW4,이동전화,20130101_0001,,안내,
1,74ED11Z4,기타,20130101_0002,,안내,
2,B4I8RIBW,이동전화,20130101_0002,,안내,
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급
4,AR9N3QT4,이동전화,20130101_0004,,안내,
...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,


In [3]:
# Number of reports per reception channel (the "접수경로" column).
channel_counts = df["접수경로"].value_counts()
channel_counts

접수경로
이동전화          3559271
기타             908928
일반전화           867867
의료지도연결         834152
IP전화           135052
공중전화            44241
SMS신고           25947
사후각지            25826
영상신고             5603
MMS신고            4843
시스템입력            3029
모바일앱신고            581
WEB신고             362
전통시장신고             19
긴급신고통합앱            12
구급예약                4
승강기신고               1
수출입 위험물 신고          1
Name: count, dtype: int64

In [4]:
from datetime import datetime, timedelta

# Parse the "YYYYMMDD_HHMM" reception timestamp with a single vectorized call
# instead of a per-row `datetime.strptime` through `.apply`.
df["dt"] = pd.to_datetime(df["신고접수일시"], format="%Y%m%d_%H%M")

# Shift every timestamp forward by 6 hours; the date and hour buckets below are
# derived from the shifted clock, not from the original one.
df["_dt"] = df["dt"] + timedelta(hours=6)
df["_date"] = df["_dt"].dt.date.astype(str)
df["_hour"] = df["_dt"].dt.hour

# Collapse to one row per (shifted date, shifted hour). After count(), the
# "신고접수번호" column holds the number of reports in that bucket.
# NOTE: this rebinds `df` to the aggregated frame, which no longer has the
# "신고접수일시" column, so the cell cannot be re-run without reloading the CSV.
df = (
    df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)
df

Unnamed: 0,_date,_hour,신고접수번호
0,2013-01-01,6,77
1,2013-01-01,7,81
2,2013-01-01,8,72
3,2013-01-01,9,45
4,2013-01-01,10,47
...,...,...,...
83227,2022-07-01,1,136
83228,2022-07-01,2,128
83229,2022-07-01,3,90
83230,2022-07-01,4,100


In [5]:
# Split the hourly counts on the shifted hour and total each part per date:
#   night -> shifted hours 0 <= h < 15
#   day   -> shifted hours 15 <= h < 24
hour = df["_hour"]

night_rows = df.loc[hour.ge(0) & hour.lt(15)]
night = night_rows.groupby("_date")[["신고접수번호"]].sum()
print(night)

day_rows = df.loc[hour.ge(15) & hour.lt(24)]
day = day_rows.groupby("_date")[["신고접수번호"]].sum()
print(day)

            신고접수번호
_date             
2013-01-01     613
2013-01-02     785
2013-01-03    1013
2013-01-04    1005
2013-01-05     791
...            ...
2022-06-27    1375
2022-06-28    1030
2022-06-29    1159
2022-06-30    1058
2022-07-01     708

[3469 rows x 1 columns]
            신고접수번호
_date             
2013-01-01    2057
2013-01-02     561
2013-01-03     577
2013-01-04     622
2013-01-05     786
...            ...
2022-06-26    1029
2022-06-27    1099
2022-06-28     884
2022-06-29     827
2022-06-30     901

[3468 rows x 1 columns]


In [6]:
# Place the night and day totals side by side, aligned on the `_date` index,
# then turn `_date` back into an ordinary column.
data = pd.concat([night, day], axis=1).reset_index()
data

Unnamed: 0,_date,신고접수번호,신고접수번호.1
0,2013-01-01,613,2057.0
1,2013-01-02,785,561.0
2,2013-01-03,1013,577.0
3,2013-01-04,1005,622.0
4,2013-01-05,791,786.0
...,...,...,...
3464,2022-06-27,1375,1099.0
3465,2022-06-28,1030,884.0
3466,2022-06-29,1159,827.0
3467,2022-06-30,1058,901.0


In [7]:
# Both count columns share the name "신고접수번호" after the concat, so the
# labels are replaced positionally: date, night total, day total.
data = data.set_axis(["date", "night_y", "day_y"], axis="columns")
data

Unnamed: 0,date,night_y,day_y
0,2013-01-01,613,2057.0
1,2013-01-02,785,561.0
2,2013-01-03,1013,577.0
3,2013-01-04,1005,622.0
4,2013-01-05,791,786.0
...,...,...,...
3464,2022-06-27,1375,1099.0
3465,2022-06-28,1030,884.0
3466,2022-06-29,1159,827.0
3467,2022-06-30,1058,901.0


In [8]:
# Keep only the dates that have both a night total and a day total; rows with
# a missing value in any column are removed.
drop_date = data.dropna(axis="index", how="any")
print(drop_date.shape)


(3468, 3)


In [9]:
# The modelling cells below need scikit-learn and xgboost.
# If they are not installed yet, run once: %pip install scikit-learn xgboost

In [10]:
from sklearn.model_selection import train_test_split

# Order-preserving 80/20 split: with shuffle=False the first 80% of the rows go
# to training and the remaining rows to validation.
train, val = train_test_split(drop_date, train_size=0.8, shuffle=False)

feature_cols = ["date", "night_y"]
target_cols = ["date", "day_y"]

X_train, y_train = train[feature_cols], train[target_cols]
X_val, y_val = val[feature_cols], val[target_cols]

print(X_train["night_y"])

0        613
1        785
2       1013
3       1005
4        791
        ... 
2769    1206
2770    1345
2771    1109
2772    1142
2773    1078
Name: night_y, Length: 2774, dtype: int64


In [11]:
# print(type(X_train_night))

In [12]:
from xgboost.sklearn import XGBRegressor

# Hyper-parameters for the regressor that predicts the day total (`day_y`)
# from the night total (`night_y`).
xgb_params = {
    # Was misspelled "n_estimator"; XGBoost ignored that key and warned
    # 'Parameters: { "n_estimator" } are not used.'
    "n_estimators": 100,
    "max_depth": 5,
    "eval_metric": "mape",
    # Stop once the validation metric has not improved for 10 rounds.
    "early_stopping_rounds": 10,
    "random_state": 514,
}

model = XGBRegressor(**xgb_params)

# The validation split is used both for early stopping and for the per-round
# metric log printed by verbose=True.
eval_set = [(X_val['night_y'], y_val['day_y'])]

model.fit(X=X_train['night_y'],
          y=y_train['day_y'],
          eval_set=eval_set,
          verbose=True)

# Predictions on the validation split, scored in the next cell.
prediction = model.predict(X_val['night_y'])



[0]	validation_0-mape:0.15153
[1]	validation_0-mape:0.14389
[2]	validation_0-mape:0.14805
[3]	validation_0-mape:0.15284
[4]	validation_0-mape:0.15766
[5]	validation_0-mape:0.16137
[6]	validation_0-mape:0.16486
[7]	validation_0-mape:0.16801
[8]	validation_0-mape:0.17015
[9]	validation_0-mape:0.17158
[10]	validation_0-mape:0.17227


Parameters: { "n_estimator" } are not used.



In [13]:
from sklearn.metrics import mean_absolute_percentage_error

# Validation MAPE of the fitted model (day totals vs. predictions).
val_mape = mean_absolute_percentage_error(y_val["day_y"], prediction)
val_mape

0.14389383811855994

In [14]:
# Build the test-set feature with the same steps used for the training data:
# parse the timestamp, shift it by 6 hours, count reports per (date, hour).
test_df = pd.read_csv("data/test.csv")

# Vectorized parse of "YYYYMMDD_HHMM" instead of a per-row `strptime` call.
test_df["dt"] = pd.to_datetime(test_df["신고접수일시"], format="%Y%m%d_%H%M")
test_df["_dt"] = test_df["dt"] + pd.Timedelta(hours=6)
test_df["_date"] = test_df["_dt"].dt.date.astype(str)
test_df["_hour"] = test_df["_dt"].dt.hour

test_df = (
    test_df[["_date", "_hour", "신고접수번호"]]
    .groupby(["_date", "_hour"])
    .count()
    .reset_index()
)

# Total per shifted date.
# NOTE(review): every hour present in test.csv is summed into `night_y`, while
# the training `night_y` only covers shifted hours 0-14. Presumably test.csv
# contains only that window -- TODO confirm.
target_df = test_df.groupby("_date")[["신고접수번호"]].sum().reset_index()

# reset_index() yields the columns "_date" and "신고접수번호"; rename them
# positionally to the names the model cells use.
target_df.columns = ['date', 'night_y']
target_df

Unnamed: 0,date,night_y
0,2022-07-01,1246
1,2022-07-02,1327
2,2022-07-03,1398
3,2022-07-04,1188
4,2022-07-05,1181
...,...,...
102,2022-10-11,1048
103,2022-10-12,962
104,2022-10-13,940
105,2022-10-14,920


In [15]:
# Predict the day total for every test date from its night total.
night_counts = target_df["night_y"]
prediction_result = model.predict(night_counts)

In [19]:
# Fill the "y" column of the sample submission with the predictions (flattened
# to 1-D) and write the result without the index column.
submission = pd.read_csv('./data/sample_submission.csv')
flat_predictions = prediction_result.reshape(-1)
submission = submission.assign(y=flat_predictions)
submission.to_csv("submission.csv", index=False)

In [79]:
# submission = pd.DataFrame(prediction_result)
# # submission.columns = ['dates', 'y']
# result_df = pd.concat([target_df.date, submission], axis=1)
# result_df.columns = ["date", "y"]
# result_df
# # submission.to_csv('submission.csv', index=False)


Unnamed: 0,date,y
0,2022-07-01,1017.954834
1,2022-07-02,1017.954834
2,2022-07-03,1017.954834
3,2022-07-04,974.863281
4,2022-07-05,974.863281
...,...,...
102,2022-10-11,932.417236
103,2022-10-12,912.058411
104,2022-10-13,847.625061
105,2022-10-14,847.625061
