#### 큰 흐름
- 라이브러리 로드
- 데이터 확인
- 모델 학습
- 제출 파일

---

##### 라이브러리 로드

In [90]:
import numpy as np
import pandas as pd

In [91]:
# df = data frame
df = pd.read_csv("data/train.csv")
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류
0,JGT2IBW4,이동전화,20130101_0001,,안내,
1,74ED11Z4,기타,20130101_0002,,안내,
2,B4I8RIBW,이동전화,20130101_0002,,안내,
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급
4,AR9N3QT4,이동전화,20130101_0004,,안내,
...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,


In [92]:
df["접수경로"].value_counts()

접수경로
이동전화          3559271
기타             908928
일반전화           867867
의료지도연결         834152
IP전화           135052
공중전화            44241
SMS신고           25947
사후각지            25826
영상신고             5603
MMS신고            4843
시스템입력            3029
모바일앱신고            581
WEB신고             362
전통시장신고             19
긴급신고통합앱            12
구급예약                4
승강기신고               1
수출입 위험물 신고          1
Name: count, dtype: int64

In [93]:
from datetime import datetime, timedelta

df["dt"] = df["신고접수일시"].apply(lambda x : datetime.strptime(x, "%Y%m%d_%H%M"))
df["_dt"] = df["dt"] + timedelta(hours=6)
df["_date"] = df._dt.dt.date.astype(str)
df["_hour"] = df._dt.dt.hour
df = df[["_date", "_hour", "신고접수번호"]].groupby(["_date", "_hour"]).count().reset_index()
df

Unnamed: 0,_date,_hour,신고접수번호
0,2013-01-01,6,77
1,2013-01-01,7,81
2,2013-01-01,8,72
3,2013-01-01,9,45
4,2013-01-01,10,47
...,...,...,...
83227,2022-07-01,1,136
83228,2022-07-01,2,128
83229,2022-07-01,3,90
83230,2022-07-01,4,100


In [94]:
night = df[(df["_hour"] >= 0) & (df["_hour"] < 15)]
night = night[["_date", "신고접수번호"]].groupby("_date").sum()
print(night)

day = df[(df["_hour"] >= 15) & (df["_hour"] < 24)]
day = day[["_date", "신고접수번호"]].groupby("_date").sum()
print(day)

            신고접수번호
_date             
2013-01-01     613
2013-01-02     785
2013-01-03    1013
2013-01-04    1005
2013-01-05     791
...            ...
2022-06-27    1375
2022-06-28    1030
2022-06-29    1159
2022-06-30    1058
2022-07-01     708

[3469 rows x 1 columns]
            신고접수번호
_date             
2013-01-01    2057
2013-01-02     561
2013-01-03     577
2013-01-04     622
2013-01-05     786
...            ...
2022-06-26    1029
2022-06-27    1099
2022-06-28     884
2022-06-29     827
2022-06-30     901

[3468 rows x 1 columns]


In [95]:
data = pd.concat([night, day], axis="columns")
data = data.reset_index()
data

Unnamed: 0,_date,신고접수번호,신고접수번호.1
0,2013-01-01,613,2057.0
1,2013-01-02,785,561.0
2,2013-01-03,1013,577.0
3,2013-01-04,1005,622.0
4,2013-01-05,791,786.0
...,...,...,...
3464,2022-06-27,1375,1099.0
3465,2022-06-28,1030,884.0
3466,2022-06-29,1159,827.0
3467,2022-06-30,1058,901.0


In [96]:
data.columns = ["date", "night_y", "day_y"]
data

Unnamed: 0,date,night_y,day_y
0,2013-01-01,613,2057.0
1,2013-01-02,785,561.0
2,2013-01-03,1013,577.0
3,2013-01-04,1005,622.0
4,2013-01-05,791,786.0
...,...,...,...
3464,2022-06-27,1375,1099.0
3465,2022-06-28,1030,884.0
3466,2022-06-29,1159,827.0
3467,2022-06-30,1058,901.0


In [97]:
drop_date = data.dropna()
print(drop_date.shape)
# check = df[["date", "y"]].groupby("date").count()
# print(check)
# drop_dates = list(check[check["y"]] <= 24)


(3468, 3)


In [98]:
# pip install scikit-learn xgboost
# !pip install scikit-learn xgboost

In [99]:
X_columns = ["night_y"]
y_columns = ["day_y"]
from sklearn.model_selection import train_test_split, GridSearchCV

train, val = train_test_split(drop_date, train_size=0.8, shuffle=False)

X_train = train[X_columns].to_numpy()
y_train = train[y_columns].to_numpy()

X_val = val[X_columns].to_numpy()
y_val = val[y_columns].to_numpy()


In [110]:
from catboost import CatBoostRegressor

catboost_model = CatBoostRegressor()

# 하이퍼파라미터 그리드 정의
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV를 사용하여 하이퍼파라미터 튜닝
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, 
                           cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1, verbose=1)

# 모델 훈련
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
0:	learn: 396.2113498	total: 676us	remaining: 67ms
0:	learn: 414.6358690	total: 736us	remaining: 72.9ms
1:	learn: 394.2203178	total: 1.01ms	remaining: 49.5ms
1:	learn: 412.7678311	total: 1.09ms	remaining: 53.2ms
2:	learn: 392.2691452	total: 1.3ms	remaining: 42.2ms
2:	learn: 410.9398502	total: 1.4ms	remaining: 45.3ms
3:	learn: 390.3820663	total: 1.62ms	remaining: 38.8ms
3:	learn: 409.1567392	total: 1.7ms	remaining: 40.8ms
4:	learn: 388.5186294	total: 1.9ms	remaining: 36.2ms
4:	learn: 407.3675743	total: 2ms	remaining: 38ms
5:	learn: 386.6416689	total: 2.19ms	remaining: 34.3ms
5:	learn: 405.5917907	total: 2.3ms	remaining: 36.1ms
6:	learn: 384.7456568	total: 2.49ms	remaining: 33.1ms
6:	learn: 403.9000168	total: 2.6ms	remaining: 34.6ms
7:	learn: 382.9734882	total: 2.76ms	remaining: 31.8ms
8:	learn: 381.1362215	total: 3.05ms	remaining: 30.9ms
7:	learn: 402.2438189	total: 2.98ms	remaining: 34.3ms
9:	learn: 379.3427462	total: 3.33ms

In [111]:

# Best Parameters: {'depth': 6, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.05}
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, predictions)
print(mape)


0.1667247679668808


In [103]:
# from catboost import CatBoostRegressor
# model = CatBoostRegressor(depth=6, iterations=100, l2_leaf_reg=5, learning_rate=0.05, verbose=0)
# model.fit(X=X_train, y=y_train)
# y_pred = model.predict(X_val)
# # MAPE 계산
# mape = mean_absolute_percentage_error(y_val, y_pred)


0.1667247679668808


In [104]:

# # 주간 신고 건수(day_y)를 예측하는 함수를 만든다.
# def predict_day_count(night_count):
#     day_count = model.predict(night_count)
#     return pd.Series([day_count])

# # 예측 결과를 저장할 데이터프레임을 생성한다.
# predicted_results = pd.DataFrame(columns=['day_y'])

# # 각 날짜에 대한 주간 신고 건수를 예측한다.
# for date, night_count in X_val.iterrows():
#     day_count = predict_day_count(night_count)
#     predicted_results = predicted_results.append(day_count)

# mape = mean_absolute_percentage_error(y_test, predicted_results)
# print(mape)

In [112]:
test_df = pd.read_csv("data/test.csv")
test_df["dt"] = test_df["신고접수일시"].apply(lambda x : datetime.strptime(x, "%Y%m%d_%H%M"))
test_df["_dt"] = test_df["dt"] + timedelta(hours=6)
test_df["_date"] = test_df._dt.dt.date.astype(str)
test_df["_hour"] = test_df._dt.dt.hour
test_df = test_df[["_date", "_hour", "신고접수번호"]].groupby(["_date", "_hour"]).count().reset_index()
target_df = test_df[["_date", "신고접수번호"]].groupby("_date").sum()
target_df = target_df.reset_index()
target_df = target_df.rename(columns={"index": "date"})
target_df.columns = ['date', 'night_y']
target_df

Unnamed: 0,date,night_y
0,2022-07-01,1246
1,2022-07-02,1327
2,2022-07-03,1398
3,2022-07-04,1188
4,2022-07-05,1181
...,...,...
102,2022-10-11,1048
103,2022-10-12,962
104,2022-10-13,940
105,2022-10-14,920


In [113]:
prediction_result = best_model.predict(target_df[X_columns].to_numpy())
print(prediction_result)

# submission = pd.read_csv('./data/sample_submission.csv')
# submission["y"] = prediction_result.reshape(-1)
# submission.to_csv("submission.csv", index=False)

[1159.55647864 1185.09705337 1337.78693418 1098.07474979 1098.07474979
 1027.25786469 1001.19317768 1093.98101499 1247.57859077 2053.45605383
  913.81043075 1013.74894536 1061.50834778 1019.13016636  866.2264147
 1179.91651756 1696.78964783 1247.57859077 1165.93173084 1035.57657408
 1430.9704078  1247.57859077 1157.01939192 2053.45605383 1512.05933685
 1247.57859077 1161.00135959 1093.98101499 1512.05933685 1512.05933685
 2389.19849427 1696.78964783 1247.57859077 1512.05933685 1247.57859077
 1512.05933685 2389.19849427 2984.29121767 1696.78964783 2389.19849427
 1696.78964783 1247.57859077 1696.78964783 2053.45605383 2984.29121767
 2561.3346402  2389.19849427 1512.05933685 2389.19849427 1512.05933685
 1512.05933685 2389.19849427 2389.19849427 1247.57859077 1159.55647864
 1093.98101499 1247.57859077 1696.78964783 2053.45605383 1337.78693418
 1188.52597744 1165.93173084 1006.95752179 1013.74894536 1247.57859077
 2053.45605383 1696.78964783 2561.3346402  1159.55647864 1185.09705337
 1696.7

In [115]:
submission = pd.read_csv("data/sample_submission.csv")
submission["y"] = prediction_result.reshape(-1)
submission.to_csv("submission.csv", index=False)

# submission.to_csv('submission.csv', index=False)
