In [None]:
import pandas as pd
import numpy as np
import h2o
from h2o.estimators import H2OXGBoostEstimator
import xgboost as xgb
from sklearn.model_selection import train_test_split
import math
from datetime import datetime
import shap
import json
import ubjson

In [2]:
## 데이터 불러오기
dt_train = pd.read_csv('./train.csv')
dt_test = pd.read_csv('./test.csv')

In [3]:
# 2. 데이터 처리

## ECLO 축소
max_v = dt_train[['사망자수', '중상자수', '경상자수', '부상자수']].quantile(0.9995).to_dict()
# print(max_v) # {'사망자수': 1.0, '중상자수': 4.0, '경상자수': 8.0, '부상자수': 4.0}
# 각각 99.95%를 최댓값으로 설정 후 ECLO 재계산

for col in ['사망자수', '중상자수', '경상자수', '부상자수']:
    dt_train[col] = np.minimum(dt_train[col], max_v[col])

dt_train['ECLO'] = 10 * dt_train['사망자수'] + 5 * dt_train['중상자수'] + 3 * dt_train['경상자수'] + dt_train['부상자수']

In [4]:
dt_train

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,상해없음,보행자,여,70세,중상,0.0,1.0,0.0,0.0,5.0
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,상해없음,보행자,남,61세,경상,0.0,0.0,1.0,0.0,3.0
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,상해없음,보행자,남,38세,경상,0.0,0.0,1.0,0.0,3.0
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,36세,중상,0.0,1.0,0.0,0.0,5.0
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,52세,경상,0.0,0.0,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,상해없음,이륜,남,28세,경상,0.0,0.0,1.0,0.0,3.0
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,상해없음,승용,남,52세,경상,0.0,0.0,1.0,0.0,3.0
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,중상,승용,남,73세,중상,0.0,2.0,0.0,0.0,10.0
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,여,57세,경상,0.0,0.0,1.0,0.0,3.0


In [5]:
# 나이 값 전처리: 정수화 및 '미분류' 값 nan처리 
def age_transform(x):
  try:
    ret = int(x.split('세')[0])
  except:
    ret = np.NaN

  return ret

dt_train['가해운전자 연령'] = dt_train['가해운전자 연령'].apply(lambda x:age_transform(x))
dt_train['피해운전자 연령'] = dt_train['피해운전자 연령'].apply(lambda x:age_transform(x))

In [6]:
dt_train.isna().sum()

ID                0
사고일시              0
요일                0
기상상태              0
시군구               0
도로형태              0
노면상태              0
사고유형              0
사고유형 - 세부분류       0
법규위반              0
가해운전자 차종          0
가해운전자 성별          0
가해운전자 연령        994
가해운전자 상해정도        0
피해운전자 차종        991
피해운전자 성별        991
피해운전자 연령       1060
피해운전자 상해정도      991
사망자수              0
중상자수              0
경상자수              0
부상자수              0
ECLO              0
dtype: int64

In [7]:
# 시군구별 집계값 계산
grouped_dt = dt_train.groupby('시군구')
dt_agg = grouped_dt.agg(
   사고건수 = ('ID', 'size'),
   사망자수 = ('사망자수', 'sum'),
   중상자수 = ('중상자수', 'sum'),
   경상자수 = ('경상자수', 'sum'),
   부상자수 = ('부상자수', 'sum'),
   화물건설비중=('가해운전자 차종', lambda x: (x.isin(['화물', '건설기계']).mean())),
   승합비중=('가해운전자 차종', lambda x: (x.isin(['승합']).mean())),
   이륜비중=('가해운전자 차종', lambda x: (x.isin(['이륜']).mean())),
   보행자비중=('피해운전자 차종', lambda x: (x.isin(['보행자']).mean())),
   가해중위연령=('가해운전자 연령', 'median'),
   피해중위연령=('피해운전자 연령', 'median'),
   가해고령비율= ('가해운전자 연령', lambda x: (x >= 65).mean()),
   피해고령비율= ('피해운전자 연령', lambda x: (x >= 65).mean()),   
)
dt_agg['사망중상'] = (dt_agg['사망자수'] + dt_agg['중상자수'])/ dt_agg['사고건수']
dt_agg['경상부상'] = (dt_agg['경상자수'] + dt_agg['부상자수'])/ dt_agg['사고건수']

In [8]:
dt_agg.reset_index(inplace=True)


In [9]:
# 지역구분 열 추가
dt_agg['지역구분'] = np.select(
    [
        dt_agg['사고건수'] >= 400,
        dt_agg['사고건수'] >= 200,
        dt_agg['사고건수'] >= 100,
        dt_agg['사고건수'] >= 50,
        dt_agg['사고건수'] >= 25
    ],
    [
        dt_agg['시군구'].str.replace('대구광역시 ', ''),
        'G200',
        'G100',
        'G050',
        'G025'
    ],
    default='G000'
)

In [10]:
# 평균 피해자수 기준 그룹 설정

# 사망중상구분 열 추가
dt_agg['사망중상구분'] = pd.cut(dt_agg['사망중상'], bins=[-1, 0.25, 0.3, 0.4, 2], labels=['GA1', 'GA2', 'GA3', 'GA4'])

# 경상부상구분 열 추가
dt_agg['경상부상구분'] = pd.cut(dt_agg['경상부상'], bins=[-1, 1.15, 1.3, 100], labels=['GB1', 'GB2', 'GB3'])

# 사망중상, 경상부상 열 삭제 및 필요 없는 열 삭제
dt_agg.drop(columns=['사망중상', '경상부상', '사망자수', '중상자수', '경상자수', '부상자수'], inplace=True)

# 사고건수가 10 미만인 행에 대해 각 열 값을 NA로 변경
dt_agg.loc[dt_agg['사고건수'] < 10, ['가해중위연령', '피해중위연령', '가해고령비율', '피해고령비율', '화물건설비중', '승합비중', '이륜비중', '보행자비중']] = np.nan

In [11]:
dt_agg

Unnamed: 0,시군구,사고건수,화물건설비중,승합비중,이륜비중,보행자비중,가해중위연령,피해중위연령,가해고령비율,피해고령비율,지역구분,사망중상구분,경상부상구분
0,대구광역시 남구 대명동,1749,0.076615,0.032018,0.126358,0.219554,50.0,44.0,0.180103,0.134934,남구 대명동,GA1,GB2
1,대구광역시 남구 봉덕동,625,0.100800,0.040000,0.091200,0.177600,49.0,46.0,0.185600,0.164800,남구 봉덕동,GA2,GB2
2,대구광역시 남구 이천동,185,0.081081,0.016216,0.113514,0.151351,47.5,47.0,0.183784,0.178378,G100,GA2,GB2
3,대구광역시 달서구 갈산동,110,0.218182,0.036364,0.063636,0.081818,50.5,49.0,0.090909,0.045455,G100,GA3,GB1
4,대구광역시 달서구 감삼동,678,0.112094,0.029499,0.112094,0.163717,49.0,43.0,0.131268,0.092920,달서구 감삼동,GA1,GB2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,대구광역시 중구 태평로3가,40,0.150000,0.000000,0.100000,0.075000,55.0,54.5,0.250000,0.275000,G025,GA3,GB2
195,대구광역시 중구 포정동,26,0.115385,0.115385,0.230769,0.346154,56.0,46.5,0.153846,0.230769,G025,GA2,GB1
196,대구광역시 중구 하서동,10,0.100000,0.000000,0.200000,0.000000,48.5,26.0,0.200000,0.200000,G000,GA2,GB3
197,대구광역시 중구 향촌동,5,,,,,,,,,G000,GA1,GB1


In [12]:
dt_train

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,상해없음,보행자,여,70.0,중상,0.0,1.0,0.0,0.0,5.0
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,상해없음,보행자,남,61.0,경상,0.0,0.0,1.0,0.0,3.0
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,상해없음,보행자,남,38.0,경상,0.0,0.0,1.0,0.0,3.0
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,36.0,중상,0.0,1.0,0.0,0.0,5.0
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,52.0,경상,0.0,0.0,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,상해없음,이륜,남,28.0,경상,0.0,0.0,1.0,0.0,3.0
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,상해없음,승용,남,52.0,경상,0.0,0.0,1.0,0.0,3.0
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,중상,승용,남,73.0,중상,0.0,2.0,0.0,0.0,10.0
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,여,57.0,경상,0.0,0.0,1.0,0.0,3.0


In [13]:
# 데이터 결합
dt_train =  pd.merge(dt_train, dt_agg, on='시군구')
dt_test =  pd.merge(dt_test, dt_agg, on='시군구')

In [14]:
dt_train

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,승합비중,이륜비중,보행자비중,가해중위연령,피해중위연령,가해고령비율,피해고령비율,지역구분,사망중상구분,경상부상구분
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,0.022901,0.167939,0.305344,54.5,49.0,0.229008,0.236641,G100,GA2,GB1
1,ACCIDENT_00090,2019-01-04 04,금요일,맑음,대구광역시 중구 대신동,교차로 - 교차로부근,건조,차대차,추돌,안전운전불이행,...,0.022901,0.167939,0.305344,54.5,49.0,0.229008,0.236641,G100,GA2,GB1
2,ACCIDENT_00296,2019-01-10 15,목요일,맑음,대구광역시 중구 대신동,교차로 - 교차로부근,건조,차대사람,횡단중,보행자보호의무위반,...,0.022901,0.167939,0.305344,54.5,49.0,0.229008,0.236641,G100,GA2,GB1
3,ACCIDENT_00459,2019-01-15 17,화요일,맑음,대구광역시 중구 대신동,단일로 - 터널,건조,차대사람,횡단중,보행자보호의무위반,...,0.022901,0.167939,0.305344,54.5,49.0,0.229008,0.236641,G100,GA2,GB1
4,ACCIDENT_00588,2019-01-19 09,토요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,기타,안전운전불이행,...,0.022901,0.167939,0.305344,54.5,49.0,0.229008,0.236641,G100,GA2,GB1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_33623,2021-07-04 13,일요일,맑음,대구광역시 중구 동성로1가,단일로 - 기타,건조,차대사람,기타,기타,...,,,,,,,,G000,GA4,GB1
39605,ACCIDENT_39308,2021-12-22 18,수요일,맑음,대구광역시 중구 동성로1가,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,,,,,,,,G000,GA4,GB1
39606,ACCIDENT_33845,2021-07-11 17,일요일,맑음,대구광역시 동구 평광동,단일로 - 기타,젖음/습기,차대차,측면충돌,안전운전불이행,...,,,,,,,,G000,GA1,GB3
39607,ACCIDENT_36160,2021-09-21 16,화요일,비,대구광역시 북구 도남동,단일로 - 기타,젖음/습기,차량단독,전도전복 - 전도,안전운전불이행,...,,,,,,,,G000,GA1,GB3


In [15]:
# 추가 파생변수 생성
dt_train[['시군구', '읍면동']] = dt_train['시군구'].str.split(' ', n=1, expand=True)
dt_train[['도로형태1', '도로형태2']] = dt_train['도로형태'].str.split(' - ', n=1, expand=True)
dt_train['사고일시'] = pd.to_datetime(dt_train['사고일시'], format='%Y-%m-%d %H')
dt_train['사고월'] = dt_train['사고일시'].dt.month
dt_train['사고시'] = dt_train['사고일시'].dt.hour
dt_train['사고시s'] = np.sin(np.pi * dt_train['사고시'] / 24 * 2)
dt_train['사고시c'] = np.cos(np.pi * dt_train['사고시'] / 24 * 2)
dt_train['사고시간대'] = np.select(
    [(dt_train['사고시'].between(7, 9)),
     (dt_train['사고시'].between(17, 19)),
     (dt_train['사고시'].between(10, 17)),
     (dt_train['사고시'].between(20, 22))],
    ['1_출근', '2_퇴근', '3_주간', '4_야간'],
    default='5_심야'
)
dt_train['요일2'] = np.select(
    [dt_train['요일'].isin(['토요일', '일요일']),
     dt_train['요일'].isin(['월요일', '화요일', '수요일', '목요일', '금요일'])],
    ['1_주말', '2_주중'],
    default='3_기타'
)

dt_test[['시군구', '읍면동']] = dt_test['시군구'].str.split(' ', n=1, expand=True)
dt_test[['도로형태1', '도로형태2']] = dt_test['도로형태'].str.split(
    ' - ', n=1, expand=True)
dt_test['사고일시'] = pd.to_datetime(dt_test['사고일시'], format='%Y-%m-%d %H')
dt_test['사고월'] = dt_test['사고일시'].dt.month
dt_test['사고시'] = dt_test['사고일시'].dt.hour
dt_test['사고시s'] = np.sin(np.pi * dt_test['사고시'] / 24 * 2)
dt_test['사고시c'] = np.cos(np.pi * dt_test['사고시'] / 24 * 2)
dt_test['사고시간대'] = np.select(
    [(dt_test['사고시'].between(7, 9)),
     (dt_test['사고시'].between(17, 19)),
     (dt_test['사고시'].between(10, 17)),
     (dt_test['사고시'].between(20, 22))],
    ['1_출근', '2_퇴근', '3_주간', '4_야간'],
    default='5_심야'
)
dt_test['요일2'] = np.select(
    [dt_test['요일'].isin(['토요일', '일요일']),
     dt_test['요일'].isin(['월요일', '화요일', '수요일', '목요일', '금요일'])],
    ['1_주말', '2_주중'],
    default='3_기타'
)


In [16]:
# 휴일 처리(수작업)
hds = pd.to_datetime(['2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01', '2019-05-05', '2019-05-06', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12', '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25', 
        '2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27', '2020-03-01', '2020-04-15', '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09', '2020-12-25', 
        '2021-01-01', '2021-02-11', '2021-02-12', '2021-02-13', '2021-03-01', '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-08-16', '2021-09-20', '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-04', '2021-10-09', '2021-10-11', '2021-12-25', 
        '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25'])
hds = hds.date
dt_train['휴일구분'] = np.where(dt_train['사고일시'].dt.date.isin(hds), '공휴일', 'N')
dt_test['휴일구분'] = np.where(dt_test['사고일시'].dt.date.isin(hds), '공휴일', 'N')

In [17]:
# 형식 변환
cat_vars = ['요일', '요일2', '지역구분', '사망중상구분', '경상부상구분', '기상상태', '노면상태', '도로형태', '도로형태1', '도로형태2', '사고유형', '시군구', '읍면동', '사고월',
            '사고시간대', '사고시', '휴일구분']
dt_train[cat_vars] = dt_train[cat_vars].astype('category')
dt_test[cat_vars] = dt_test[cat_vars].astype('category')

In [18]:
dt_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 45 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           39609 non-null  object        
 1   사고일시         39609 non-null  datetime64[ns]
 2   요일           39609 non-null  category      
 3   기상상태         39609 non-null  category      
 4   시군구          39609 non-null  category      
 5   도로형태         39609 non-null  category      
 6   노면상태         39609 non-null  category      
 7   사고유형         39609 non-null  category      
 8   사고유형 - 세부분류  39609 non-null  object        
 9   법규위반         39609 non-null  object        
 10  가해운전자 차종     39609 non-null  object        
 11  가해운전자 성별     39609 non-null  object        
 12  가해운전자 연령     38615 non-null  float64       
 13  가해운전자 상해정도   39609 non-null  object        
 14  피해운전자 차종     38618 non-null  object        
 15  피해운전자 성별     38618 non-null  object        
 16  피해운전

In [19]:
# 3. 모형 적합
## 설명 변수 목록
x_names = list(set(dt_test.columns) - set(['ID', 'lECLO', 'ECLO', '사고일시', '도로형태', '읍면동']))
## log(ECLO + 1) 변환: 변환 + RMSE 활용 = RMSLE
dt_train['lECLO'] = np.log1p(dt_train['ECLO'])

# 데이터셋 분할
X_train, X_val, y_train, y_val = train_test_split(dt_train[x_names], dt_train['lECLO'], test_size=0.2, random_state=42)

# ## hex
# h2o.init()
# h2o.no_progress()
# hex_train = h2o.H2OFrame(dt_train[x_names + ['lECLO']])
# hex_test = h2o.H2OFrame(dt_test[x_names])

In [20]:
## 파라미터 설정 - seed 변경 + grid search 등을 활용한 결과

# dt_params = pd.DataFrame({
#     'learn_rate': [0.05, 0.05, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],
#     'max_depth': [6, 6, 5, 6, 6, 5, 6, 6, 6],
#     'sample_rate': [0.75, 0.8, 0.7, 0.65, 0.65, 0.6, 0.8, 0.8, 0.75],
#     'col_sample_rate_per_tree': [0.85, 0.8, 0.8, 0.9, 0.95, 0.9, 0.8, 0.9, 0.8],
#     'col_sample_rate': [1.0, 0.85, 0.95, 0.85, 1, 1, 0.95, 0.95, 0.95],
#     'min_rows': [10, 10, 15, 15, 10, 15, 10, 15, 10],
#     'reg_alpha': [1, 1, 0, 0, 0, 1, 0, 1, 0],
#     'reg_lambda': [1, 10, 0, 0, 1, 1, 10, 1, 10],
#     'ntrees': 120,
#     'seed': [0, 0, 0, 1, 1, 1, 3, 3, 3]
# })

xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.95,
    'min_child_weight': 15,
    'gamma': 0,
    'alpha': 1,
    'lambda': 1,
    'n_estimators': 120,
    'seed': 0,
    'objective':'reg:squaredlogerror',
    'eval_metric': 'rmsle',
}

In [21]:
# 데이터셋 생성
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
# 각 모형 적합

# model = xgb.XGBRegressor(
#     learning_rate = 0.05,
#     max_depth = 6,
#     subsample = 0.8,
#     colsample_bytree = 0.8,
#     colsample_bylevel = 0.95,
#     min_child_weight = 15,
#     reg_alpha = 1,
#     n_estimators = 120,
#     seed = 42,
#     objective = 'reg:squaredlogerror',
#     eval_metric = 'rmsle',
#     enable_categorical = True,
# )

# model.fit(X_train, y_train)


model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=xgb_params['n_estimators'],
    early_stopping_rounds=10,
    evals=[(dval, 'validation')],
    verbose_eval=True
)



# for i in range(9):
#     params = dt_params.iloc[i].to_dict()
#     params['seed'] = int(params['seed'])
#     params['ntrees'] = int(params['ntrees'])
#     params['max_depth'] = int(params['max_depth'])
#     md_xgb = H2OXGBoostEstimator(
#         learn_rate=params['learn_rate'],
#         ntrees=params['ntrees'],
#         max_depth=params['max_depth'],
#         min_rows=params['min_rows'],
#         col_sample_rate_per_tree=params['col_sample_rate_per_tree'],
#         col_sample_rate=params['col_sample_rate'],
#         sample_rate=params['sample_rate'],
#         reg_alpha=params['reg_alpha'],
#         reg_lambda=params['reg_lambda'],
#         stopping_rounds=10,
#         stopping_metric='rmse',
#         stopping_tolerance=0.00001,
#         score_each_iteration=True,
#         seed=params['seed']
#     )
    
#     md_xgb.train(x=x_names, y='lECLO', training_frame=hex_train)

[0]	validation-rmsle:0.56695
[1]	validation-rmsle:0.55044
[2]	validation-rmsle:0.53436
[3]	validation-rmsle:0.51866
[4]	validation-rmsle:0.50336
[5]	validation-rmsle:0.48845
[6]	validation-rmsle:0.47394
[7]	validation-rmsle:0.45986
[8]	validation-rmsle:0.44619
[9]	validation-rmsle:0.43289
[10]	validation-rmsle:0.42001
[11]	validation-rmsle:0.40754
[12]	validation-rmsle:0.39547
[13]	validation-rmsle:0.38379
[14]	validation-rmsle:0.37253
[15]	validation-rmsle:0.36166
[16]	validation-rmsle:0.35119
[17]	validation-rmsle:0.34110
[18]	validation-rmsle:0.33142
[19]	validation-rmsle:0.32211
[20]	validation-rmsle:0.31317
[21]	validation-rmsle:0.30463
[22]	validation-rmsle:0.29648
[23]	validation-rmsle:0.28866
[24]	validation-rmsle:0.28117
[25]	validation-rmsle:0.27406
[26]	validation-rmsle:0.26728
[27]	validation-rmsle:0.26090
[28]	validation-rmsle:0.25481
[29]	validation-rmsle:0.24901
[30]	validation-rmsle:0.24357
[31]	validation-rmsle:0.23841
[32]	validation-rmsle:0.23355
[33]	validation-rmsl

Parameters: { "n_estimators" } are not used.



[61]	validation-rmsle:0.17521
[62]	validation-rmsle:0.17474
[63]	validation-rmsle:0.17427
[64]	validation-rmsle:0.17388
[65]	validation-rmsle:0.17351
[66]	validation-rmsle:0.17317
[67]	validation-rmsle:0.17285
[68]	validation-rmsle:0.17256
[69]	validation-rmsle:0.17231
[70]	validation-rmsle:0.17206
[71]	validation-rmsle:0.17183
[72]	validation-rmsle:0.17163
[73]	validation-rmsle:0.17144
[74]	validation-rmsle:0.17126
[75]	validation-rmsle:0.17111
[76]	validation-rmsle:0.17096
[77]	validation-rmsle:0.17082
[78]	validation-rmsle:0.17072
[79]	validation-rmsle:0.17063
[80]	validation-rmsle:0.17053
[81]	validation-rmsle:0.17044
[82]	validation-rmsle:0.17037
[83]	validation-rmsle:0.17028
[84]	validation-rmsle:0.17020
[85]	validation-rmsle:0.17012
[86]	validation-rmsle:0.17005
[87]	validation-rmsle:0.17000
[88]	validation-rmsle:0.16992
[89]	validation-rmsle:0.16989
[90]	validation-rmsle:0.16984
[91]	validation-rmsle:0.16981
[92]	validation-rmsle:0.16978
[93]	validation-rmsle:0.16974
[94]	valid

In [22]:
# 예측 값 계산 및 exp(pred -1) 변환
# X_test = dt_test[x_names]
# pred2 = model.predict(X_test)
dtest = xgb.DMatrix(dt_test[x_names], enable_categorical=True)
pred = model.predict(dtest)

dt_subm = pd.read_csv('./sample_submission.csv')
dt_subm['ECLO'] = np.expm1(pred)

#결과 저장

dt_subm.to_csv('./subm.csv', index=False)
# pred = md_xgb.predict(hex_test).as_data_frame()
# dt_subm = pd.read_csv('./sample_submission.csv')
# dt_subm['ECLO'] = pred['predict'].apply(lambda x: math.exp(x) - 1)

# 변수 중요도 계산
feature_importances = model.get_score(importance_type='weight')
dt_varimp = pd.DataFrame(list(feature_importances.items()), columns=['variable', 'imp'])
print(dt_varimp)

   variable    imp
0    가해중위연령   39.0
1     도로형태1   74.0
2      휴일구분    9.0
3      이륜비중  102.0
4    경상부상구분   49.0
5     사고시간대   49.0
6      승합비중   97.0
7      노면상태   10.0
8    피해고령비율  111.0
9    피해중위연령   68.0
10     사고건수  118.0
11     사고유형  129.0
12    보행자비중   97.0
13      요일2   21.0
14      사고월  233.0
15       요일  188.0
16    도로형태2   91.0
17      사고시  179.0
18   사망중상구분   53.0
19   가해고령비율  105.0
20   화물건설비중  100.0
21     사고시c   70.0
22     지역구분  159.0
23     기상상태    6.0
24     사고시s   87.0


In [None]:
## shap 추가

## shap를 위해 xgb 모델 json dump
# Convert the XGBoost model to a dictionary
# model_dict = model.get_dump(with_stats=True, dump_format='json')

# # Save the model dictionary to a JSON file
# with open('xgb_model.json', 'w') as json_file:
#     json.dump(model_dict, json_file)

# # Save the model dictionary to a UBJSON file
# with open('xgb_model.ubjson', 'wb') as ubjson_file:
#     ubjson.dump(model_dict, ubjson_file)
    
# explainer = shap.TreeExplainer(model)
# shap_values = explainer.shap_values(dtest)
# shap.summary_plot(shap_values, dtest)

In [None]:
# # 결과 저장
# i = 1
# globals()[f'dt_subm_{i}'] = dt_subm
# globals()[f'dt_varimp_{i}'] = pd.DataFrame(md_xgb._model_json['output']['variable_importances']['variable']).rename(columns={'variable': 'imp'})

# print("Model", i, "RMSLE :", md_xgb.rmse(xval=True))

# 4. 조합 및 결과 생성

# 결과 평균 계산 함수
def agg_mean(x):
    dts = pd.concat(x)
    return dts.groupby('ID')['ECLO'].mean().reset_index()

# 최댓값, 최솟값 제외 평균 계산 함수
def agg_mmean(x):
    dts = pd.concat(x)
    return dts.groupby('ID')['ECLO'].apply(lambda x: (x.sum() - x.min() - x.max()) / (len(x) - 2)).reset_index()

# 예측값 통합
dt_subms = [globals()[var] for var in globals() if var.startswith('dt_subm_')]

# 제출 데이터 생성
agg_mmean(dt_subms).to_csv('subm/subm_fin.csv', index=False)

# 변수 중요도 계산
dt_varimps = [globals()[var] for var in globals() if var.startswith('dt_varimp_')]
dt_varimp = pd.concat(dt_varimps).groupby('variable')['imp'].mean().reset_index()
print(dt_varimp)