In [10]:
import pandas as pd
import geopandas as gpd
import pyproj
from tqdm import tqdm

import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.graph_objs as go
import plotly.offline as offline
from plotly.subplots import make_subplots

import folium
from folium import plugins
from folium.plugins import HeatMap
from folium import FeatureGroup
from folium.plugins import HeatMapWithTime

from branca.colormap import linear
# 모든 열이 생략되지 않도록 설정
pd.set_option('display.max_columns', None)

#### 기상상태별 교통사고 위험도

In [11]:
weather = pd.read_csv('open/external_open/도로교통공단_도로종류별 기상상태별 교통사고 통계_20221231.csv', encoding='cp949')
weather_ag = weather.drop(['도로종류'], axis=1).groupby('기상상태').sum()
weather_ag.index = ['기타', '눈', '맑음', '비', '안개', '흐림']

# 사고건수 대비 ECLO(인명피해 심각도) 계산 후 위험도 정규화(Min-Max) 진행
weather_ag['weather_danger'] = ((10 * weather_ag['사망자수']) + (5 * weather_ag['중상자수']) + (3 * weather_ag['경상자수']) + (1 * weather_ag['부상신고자수'])) / weather_ag['사고건수'] 
weather_ag['weather_danger'] = (weather_ag['weather_danger'] - weather_ag['weather_danger'].min())/ (weather_ag['weather_danger'].max() - weather_ag['weather_danger'].min())

# 날씨별 위험도 딕셔너리 생성
weather_danger_dic = dict(zip(weather_ag.index, weather_ag['weather_danger']))
weather_danger_dic

{'기타': 0.0,
 '눈': 0.630249016035219,
 '맑음': 0.14492007954600153,
 '비': 0.3198248805428553,
 '안개': 1.0,
 '흐림': 0.21384592769016494}

#### 도로형태별 교통사고 위험도

In [12]:
# 데이터로드, 마지막 두 행 선택 및 통일 후 재결합
roadshape2020_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 교통사고(2020).csv', encoding='cp949')
roadshape2020_df.drop(8, inplace=True)
last_two_rows = roadshape2020_df.iloc[8:,:]
last_two_rows.loc[:, '도로형태_대분류'] = last_two_rows.iloc[0]['도로형태_대분류']
last_two_rows.loc[:, '도로형태'] = last_two_rows.iloc[0]['도로형태']
summed_rows = last_two_rows.groupby(['도로형태_대분류', '도로형태']).sum().reset_index()

roadshape2020_df = pd.concat([roadshape2020_df.iloc[:8,:], summed_rows])

# 데이터로드, 마지막 두 행 선택 및 통일 후 재결합
roadshape2021_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 교통사고(2021).csv', encoding='cp949')
roadshape2021_df.drop(8, inplace=True)
last_two_rows = roadshape2021_df.iloc[8:,:]
last_two_rows.loc[:, '도로형태_대분류'] = last_two_rows.iloc[0]['도로형태_대분류']
last_two_rows.loc[:, '도로형태'] = last_two_rows.iloc[0]['도로형태']
summed_rows = last_two_rows.groupby(['도로형태_대분류', '도로형태']).sum().reset_index()

roadshape2021_df = pd.concat([roadshape2020_df.iloc[:8,:], summed_rows])

# 데이터로드, 마지막 행 편집
roadshape2022_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 교통사고(2022).csv', encoding='cp949')
roadshape2022_df.drop(8, inplace=True)
roadshape2022_df.iloc[8:,:2] = ['기타', '기타']

roadshape_df = pd.concat([roadshape2020_df, roadshape2021_df, roadshape2022_df]).groupby(['도로형태_대분류', '도로형태']).sum().reset_index()
roadshape_df['도로형태'] = ['교차로안', '교차로부근', '교차로횡단보도내', '기타', '고가도로위', '교량', '기타', '지하차도(도로)내', '터널']
roadshape_df['roadshape_danger'] = ((10 * roadshape_df['사망자수']) + (5 * roadshape_df['중상자수']) + (3 * roadshape_df['경상자수']) + (1 * roadshape_df['부상신고자수'])) / roadshape_df['사고건수'] 
roadshape_df['roadshape_danger'] = (roadshape_df['roadshape_danger'] - roadshape_df['roadshape_danger'].min())/ (roadshape_df['roadshape_danger'].max() - roadshape_df['roadshape_danger'].min())
roadshape_df['도로형태'] = roadshape_df['도로형태_대분류'] + " - " + roadshape_df['도로형태']
roadshape_df = roadshape_df.iloc[:,1:]

# 날씨별 위험도 딕셔너리 생성
roadshape_danger_dic = dict(zip(roadshape_df['도로형태'], roadshape_df['roadshape_danger']))
roadshape_danger_dic['주차장 - 주차장'] = 0.5 # 없는 데이터 위험도 0.5로 통일
roadshape_danger_dic['미분류 - 미분류'] = 0.5 # 없는 데이터 위험도 0.5로 통일
roadshape_danger_dic

{'교차로 - 교차로안': 0.28371010316150946,
 '교차로 - 교차로부근': 0.21443340820597145,
 '교차로 - 교차로횡단보도내': 0.0,
 '기타 - 기타': 0.07573804112828718,
 '단일로 - 고가도로위': 0.4774196135018285,
 '단일로 - 교량': 0.48446609165016485,
 '단일로 - 기타': 0.2067045082897451,
 '단일로 - 지하차도(도로)내': 0.3439542378490483,
 '단일로 - 터널': 1.0,
 '주차장 - 주차장': 0.5,
 '미분류 - 미분류': 0.5}

#### 시간대별 교통사고 위험도

In [13]:
time2018_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 시간대별 교통사고(2018).csv', encoding='cp949')
time2018_df = time2018_df.iloc[:,2:]
time2018_df = time2018_df.groupby(['발생시간_분류']).sum().reset_index()
time2018_df.rename(columns={'발생시간_분류':'시간대'}, inplace=True)

time2019_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 시간대별 교통사고(2019).csv', encoding='cp949')
time2019_df = time2019_df.iloc[:,2:]
time2019_df = time2019_df.groupby(['발생시간_분류']).sum().reset_index()
time2019_df.rename(columns={'발생시간_분류':'시간대'}, inplace=True)

time2021_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 시간대별 교통사고(2021).csv', encoding='cp949')
time2021_df = time2021_df.iloc[:,2:]
time2021_df = time2021_df.groupby(['시간대']).sum().reset_index()

time2022_df = pd.read_csv('open/external_open/도로교통공단_도로형태별 시간대별 교통사고(2022).csv', encoding='cp949')
time2022_df = time2022_df.iloc[:,2:]
time2022_df = time2022_df.groupby(['시간대']).sum().reset_index()

time_df = pd.concat([time2018_df, time2019_df, time2021_df, time2022_df]).groupby(['시간대']).sum().reset_index()
time_df['time_danger'] = ((10 * time_df['사망자수']) + (5 * time_df['중상자수']) + (3 * time_df['경상자수']) + (1 * time_df['부상신고자수'])) / time_df['사고건수'] 
time_df['time_danger'] = (time_df['time_danger'] - time_df['time_danger'].min())/ (time_df['time_danger'].max() - time_df['time_danger'].min())

time_df_danger_dic = {}
count = 0
for f in range(3):
    for b in range(10):
        if f"{f}{b}" == "24":
            break
        time_df_danger_dic[f"{f}{b}"] = time_df['time_danger'][count//2]
        count += 1

time_df_danger_dic

{'00': 0.9155015885837771,
 '01': 0.9155015885837771,
 '02': 0.9576499818930245,
 '03': 0.9576499818930245,
 '04': 1.0,
 '05': 1.0,
 '06': 0.3539509636954647,
 '07': 0.3539509636954647,
 '08': 0.08551203039087009,
 '09': 0.08551203039087009,
 '10': 0.4064413368531681,
 '11': 0.4064413368531681,
 '12': 0.46508264354822065,
 '13': 0.46508264354822065,
 '14': 0.4517063261223543,
 '15': 0.4517063261223543,
 '16': 0.20146548927365882,
 '17': 0.20146548927365882,
 '18': 0.0,
 '19': 0.0,
 '20': 0.23493920339978955,
 '21': 0.23493920339978955,
 '22': 0.46681555165169286,
 '23': 0.46681555165169286}

#### 사고유형별 교통사고 위험도

In [14]:
case2020_df = pd.read_csv('open/external_open/도로교통공단_사고유형별 교통사고(2020).csv', encoding='cp949')
case2020_df = case2020_df.iloc[:, [0,3,4,5,6,7]]
case2020_df = case2020_df.groupby(['사고유형대분류']).sum().reset_index()
case2020_df.rename(columns={'사고유형대분류':'사고유형'}, inplace=True)
case2020_df = case2020_df.iloc[:3]

case2021_df = pd.read_csv('open/external_open/도로교통공단_사고유형별 교통사고(2021).csv', encoding='cp949')
case2021_df = case2021_df.iloc[:, [0,3,4,5,6,7]]
case2021_df = case2021_df.groupby(['사고유형대분류']).sum().reset_index()
case2021_df.rename(columns={'사고유형대분류':'사고유형'}, inplace=True)
case2021_df = case2021_df.iloc[:3]

case2022_df = pd.read_csv('open/external_open/도로교통공단_사고유형별 교통사고(2022).csv', encoding='cp949')
case2022_df = case2022_df.iloc[:, [0,3,4,5,6,7]]
case2022_df = case2022_df.groupby(['사고유형대분류']).sum().reset_index()
case2022_df.rename(columns={'사고유형대분류':'사고유형'}, inplace=True)
case2022_df = case2022_df.iloc[:3]

case_df = pd.concat([case2020_df, case2021_df, case2022_df]).groupby(['사고유형']).sum().reset_index()
case_df['case_danger'] = ((10 * case_df['사망자수']) + (5 * case_df['중상자수']) + (3 * case_df['경상자수']) + (1 * case_df['부상신고자수'])) / case_df['사고건수'] 
case_df['case_danger'] = (case_df['case_danger'] - case_df['case_danger'].min())/ (case_df['case_danger'].max() - case_df['case_danger'].min())

case_danger_dic = dict(zip(case_df['사고유형'], case_df['case_danger']))
case_danger_dic

{'차대사람': 0.0, '차대차': 1.0, '차량단독': 0.5945617604734095}

#### 요일별 위험도

In [15]:
day2018_df = pd.read_csv('open/external_open/도로교통공단_요일별 시간대별 교통사고(2018).csv', encoding='cp949')
day2018_df.drop(['Unnamed: 7'], axis=1, inplace=True)
day2018_df.dropna(inplace=True)
day2018_df = day2018_df.iloc[:, [0,2,3,4,5,6]]
day2018_df = day2018_df.groupby(['요일']).sum().reset_index()

day2019_df = pd.read_csv('open/external_open/도로교통공단_요일별 시간대별 교통사고(2019).csv', encoding='cp949')
day2019_df.drop(['Unnamed: 7'], axis=1, inplace=True)
day2019_df.dropna(inplace=True)
day2019_df = day2019_df.iloc[:, [0,2,3,4,5,6]]
day2019_df = day2019_df.groupby(['요일']).sum().reset_index()

day2020_df = pd.read_csv('open/external_open/도로교통공단_요일별 시간대별 교통사고(2020).csv', encoding='cp949')
day2020_df = day2020_df.iloc[:, [0,2,3,4,5,6]]
day2020_df = day2020_df.groupby(['요일']).sum().reset_index()

day2021_df = pd.read_csv('open/external_open/도로교통공단_요일별 시간대별 교통사고(2021).csv', encoding='cp949')
day2021_df = day2021_df.iloc[:, [0,2,3,4,5,6]]
day2021_df = day2021_df.groupby(['요일']).sum().reset_index()

day2022_df = pd.read_csv('open/external_open/도로교통공단_요일별 시간대별 교통사고(2022).csv', encoding='cp949')
day2022_df = day2022_df.iloc[:, [0,2,3,4,5,6]]
day2022_df = day2022_df.groupby(['요일']).sum().reset_index()

day_df = pd.concat([day2018_df, day2019_df, day2020_df, day2021_df, day2022_df]).groupby(['요일']).sum().reset_index()
day_df['day_danger'] = ((10 * day_df['사망자수']) + (5 * day_df['중상자수']) + (3 * day_df['경상자수']) + (1 * day_df['부상신고자수'])) / day_df['사고건수'] 
day_df['day_danger'] = (day_df['day_danger'] - day_df['day_danger'].min())/ (day_df['day_danger'].max() - day_df['day_danger'].min())

day_danger_dic = dict(zip(day_df['요일'], day_df['day_danger']))
day_danger_dic

{'금': 0.06940624709800688,
 '목': 0.0,
 '수': 0.021974592426828834,
 '월': 0.07538748611096166,
 '일': 1.0,
 '토': 0.725915673064352,
 '화': 0.03275592362861103}

#### 노면상태별 위험도

In [16]:
roadsurface_df = pd.read_csv('open/external_open/노면상태별 교통사고.csv')
roadsurface_df['roadsurface_danger'] = (roadsurface_df['사망자수'] + roadsurface_df['부상자수']) / roadsurface_df['사고건수'] 
roadsurface_df['roadsurface_danger'] = (roadsurface_df['roadsurface_danger'] - roadsurface_df['roadsurface_danger'].min())/ (roadsurface_df['roadsurface_danger'].max() - roadsurface_df['roadsurface_danger'].min())

roadsurface_danger_dic = dict(zip(roadsurface_df['상태'], roadsurface_df['roadsurface_danger']))
roadsurface_danger_dic

{'건조': 0.22710326004512565,
 '젖음/습기': 0.2631478069391325,
 '서리/결빙': 1.0,
 '적설': 0.8530809155632956,
 '기타': 0.09019011664881801,
 '침수': 0.0}

#### 월별 위험도

In [17]:
month2019_df = pd.read_csv('open/external_open/도로교통공단_월별 요일별 교통사고(2019).csv', encoding='cp949')
month2019_df = month2019_df.iloc[:, [0,2,3,4,5,6]]
month2019_df.rename(columns={'월':'발생월'}, inplace=True)
month2019_df = month2019_df.groupby(['발생월']).sum().reset_index()

month2020_df = pd.read_csv('open/external_open/도로교통공단_월별 요일별 교통사고(2020).csv', encoding='cp949')
month2020_df = month2020_df.iloc[:, [0,2,3,4,5,6]]
month2020_df = month2020_df.groupby(['발생월']).sum().reset_index()

month2021_df = pd.read_csv('open/external_open/도로교통공단_월별 요일별 교통사고(2021).csv', encoding='cp949')
month2021_df = month2021_df.iloc[:, [0,2,3,4,5,6]]
month2021_df = month2021_df.groupby(['발생월']).sum().reset_index()

month2022_df = pd.read_csv('open/external_open/도로교통공단_월별 요일별 교통사고(2022).csv', encoding='cp949')
month2022_df = month2022_df.iloc[:, [0,2,3,4,5,6]]
month2022_df = month2022_df.groupby(['발생월']).sum().reset_index()

month_df = pd.concat([month2019_df, month2020_df, month2021_df, month2022_df]).groupby(['발생월']).sum().reset_index()
month_df['month_danger'] = ((10 * month_df['사망자수']) + (5 * month_df['중상자수']) + (3 * month_df['경상자수']) + (1 * month_df['부상신고자수'])) / month_df['사고건수'] 
month_df['month_danger'] = (month_df['month_danger'] - month_df['month_danger'].min())/ (month_df['month_danger'].max() - month_df['month_danger'].min())

month_danger_dic = dict(zip(month_df['발생월'], month_df['month_danger']))
month_danger_dic

{1: 0.8247558671953833,
 2: 0.7728967332121681,
 3: 0.5153471264144931,
 4: 0.8323063848167668,
 5: 0.7231366243147488,
 6: 0.6083855665889639,
 7: 0.5120610426494504,
 8: 1.0,
 9: 0.36311241627115187,
 10: 0.7449907104629871,
 11: 0.25470992254305075,
 12: 0.0}

#### 전국 사고데이터

In [18]:
# import collections

# collections.Counter(a['ECLO'])

In [19]:
# a = pd.read_csv('open/external_open/countrywide_accident.csv', encoding='utf8')
# a

#### trainset 전처리

In [45]:
train_set = pd.read_csv('open/train.csv')
train_set = pd.concat([train_set.iloc[:,:8], train_set.iloc[:,-1:]], axis=1)
train_set['요일'] = train_set['요일'].apply(lambda x : x[0])
train_set['시간대'] = train_set['사고일시'].apply(lambda x : x[-2:])
train_set['발생월'] = train_set['사고일시'].apply(lambda x : int(x[5:7]))
train_set = train_set.iloc[:, [0,10,9,2,3,4,5,6,7,8]]
train_set

Unnamed: 0,ID,발생월,시간대,요일,기상상태,시군구,도로형태,노면상태,사고유형,ECLO
0,ACCIDENT_00000,1,00,화,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,5
1,ACCIDENT_00001,1,00,화,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,3
2,ACCIDENT_00002,1,01,화,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,3
3,ACCIDENT_00003,1,02,화,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,5
4,ACCIDENT_00004,1,04,화,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,3
...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,12,19,금,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,3
39605,ACCIDENT_39605,12,19,금,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,3
39606,ACCIDENT_39606,12,21,금,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,10
39607,ACCIDENT_39607,12,22,금,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,3


In [72]:
import plotly.express as px
import pandas as pd

# 'ECLO' 컬럼의 값 분포를 histogram으로 시각화
fig = px.histogram(train_set, x='ECLO', nbins=100, labels={'ECLO': 'ECLO 값', 'count': '빈도'})
fig.update_layout(title='ECLO 값 분포 Histogram')

# 하위 99%와 상위 1% 기준선 추가
eclo_99th_percentile = train_set['ECLO'].quantile(0.99) # 이상치 제거를 위한 기준점 도출
fig.add_shape(
    dict(
        type='line',
        x0=eclo_99th_percentile,
        x1=eclo_99th_percentile,
        y0=0,
        y1=15000, #max(fig.data[0].y),
        line=dict(color='red', width=2, dash='dash')
    )
)

# 기준선 어노테이션 추가
fig.add_annotation(
    x=eclo_99th_percentile,
    y=15000,
    text=f'99th Percentile - {eclo_99th_percentile}',
    showarrow=True,
    arrowhead=4,
    ax=0,
    ay=-40
)

fig.show()

In [73]:
eclo_99th_percentile

16.0

In [74]:
train_set = train_set[train_set['ECLO'] < eclo_99th_percentile]

In [None]:
# 지역구별 ECLO를 이용해서 정규화
district_df = train_set[['시군구', 'ECLO']].groupby(['시군구']).mean().reset_index()
district_df['ECLO'] = (district_df['ECLO'] - district_df['ECLO'].min())/ (district_df['ECLO'].max() - district_df['ECLO'].min())
district_danger_dic = dict(zip(district_df['시군구'], district_df['ECLO']))
district_danger_dic

In [86]:
train_set['발생월'] = train_set['발생월'].map(month_danger_dic)
train_set['시간대'] = train_set['시간대'].map(time_df_danger_dic)
train_set['요일'] = train_set['요일'].map(day_danger_dic)
train_set['기상상태'] = train_set['기상상태'].map(weather_danger_dic)
train_set['시군구'] = train_set['시군구'].map(district_danger_dic)
train_set['도로형태'] = train_set['도로형태'].map(roadshape_danger_dic)
train_set['노면상태'] = train_set['노면상태'].map(roadsurface_danger_dic)
train_set['사고유형'] = train_set['사고유형'].map(case_danger_dic)
train_set



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,ID,발생월,시간대,요일,기상상태,시군구,도로형태,노면상태,사고유형,ECLO
0,ACCIDENT_00000,0.824756,0.915502,0.032756,0.144920,0.243639,0.206705,0.227103,0.0,5
1,ACCIDENT_00001,0.824756,0.915502,0.032756,0.213846,0.300701,0.206705,0.227103,0.0,3
2,ACCIDENT_00002,0.824756,0.915502,0.032756,0.144920,0.313673,0.206705,0.227103,0.0,3
3,ACCIDENT_00003,0.824756,0.957650,0.032756,0.144920,0.234448,0.206705,0.227103,1.0,5
4,ACCIDENT_00004,0.824756,1.000000,0.032756,0.144920,0.276970,0.206705,0.227103,1.0,3
...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,0.000000,0.000000,0.069406,0.144920,0.322516,0.283710,0.227103,1.0,3
39605,ACCIDENT_39605,0.000000,0.000000,0.069406,0.144920,0.245079,0.206705,0.227103,1.0,3
39606,ACCIDENT_39606,0.000000,0.234939,0.069406,0.144920,0.289442,0.283710,0.227103,1.0,10
39607,ACCIDENT_39607,0.000000,0.466816,0.069406,0.144920,0.292333,0.075738,0.227103,1.0,3


#### testset 전처리

In [87]:
test_set = pd.read_csv('open/test.csv')
test_set['요일'] = test_set['요일'].apply(lambda x : x[0])
test_set['시간대'] = test_set['사고일시'].apply(lambda x : x[-2:])
test_set['발생월'] = test_set['사고일시'].apply(lambda x : int(x[5:7]))
test_set = test_set.iloc[:, [0,9,8,2,3,4,5,6,7]]
test_set

Unnamed: 0,ID,발생월,시간대,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,ACCIDENT_39609,1,01,토,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람
1,ACCIDENT_39610,1,01,토,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람
2,ACCIDENT_39611,1,04,토,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차
3,ACCIDENT_39612,1,04,토,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차
4,ACCIDENT_39613,1,06,토,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차
...,...,...,...,...,...,...,...,...,...
10958,ACCIDENT_50567,12,18,토,맑음,대구광역시 남구 대명동,단일로 - 터널,건조,차대차
10959,ACCIDENT_50568,12,18,토,맑음,대구광역시 수성구 시지동,단일로 - 기타,건조,차대차
10960,ACCIDENT_50569,12,20,토,맑음,대구광역시 수성구 연호동,단일로 - 기타,건조,차대차
10961,ACCIDENT_50570,12,20,토,맑음,대구광역시 수성구 범물동,교차로 - 교차로부근,건조,차대차


In [88]:
test_set['발생월'] = test_set['발생월'].map(month_danger_dic)
test_set['시간대'] = test_set['시간대'].map(time_df_danger_dic)
test_set['요일'] = test_set['요일'].map(day_danger_dic)
test_set['기상상태'] = test_set['기상상태'].map(weather_danger_dic)
test_set['시군구'] = test_set['시군구'].map(district_danger_dic)
test_set['도로형태'] = test_set['도로형태'].map(roadshape_danger_dic)
test_set['노면상태'] = test_set['노면상태'].map(roadsurface_danger_dic)
test_set['사고유형'] = test_set['사고유형'].map(case_danger_dic)
test_set

Unnamed: 0,ID,발생월,시간대,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,ACCIDENT_39609,0.824756,0.915502,0.725916,0.14492,0.318540,0.283710,0.227103,0.0
1,ACCIDENT_39610,0.824756,0.915502,0.725916,0.14492,0.278709,0.206705,0.227103,0.0
2,ACCIDENT_39611,0.824756,1.000000,0.725916,0.14492,0.326531,0.283710,0.227103,1.0
3,ACCIDENT_39612,0.824756,1.000000,0.725916,0.14492,0.263105,0.206705,0.227103,1.0
4,ACCIDENT_39613,0.824756,0.353951,0.725916,0.14492,0.300701,0.283710,0.227103,1.0
...,...,...,...,...,...,...,...,...,...
10958,ACCIDENT_50567,0.000000,0.000000,0.725916,0.14492,0.269868,1.000000,0.227103,1.0
10959,ACCIDENT_50568,0.000000,0.000000,0.725916,0.14492,0.324351,0.206705,0.227103,1.0
10960,ACCIDENT_50569,0.000000,0.234939,0.725916,0.14492,0.377726,0.206705,0.227103,1.0
10961,ACCIDENT_50570,0.000000,0.234939,0.725916,0.14492,0.343271,0.214433,0.227103,1.0


#### XGBoost 모델

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import numpy as np
import math

# train_set에서 독립 변수와 종속 변수를 나눔
X = train_set.drop(['ID', 'ECLO'], axis=1)
y = train_set['ECLO']

# train_set을 8:2의 비율로 나눔
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost 모델 초기화
model = XGBRegressor(random_state=42)

# 모델 학습
model.fit(X_train, y_train)

# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val).astype(int)

In [None]:
# validation set에 대한 RMSLE 계산
rmsle = np.sqrt(np.mean(np.square(np.log1p(y_val_pred) - np.log1p(y_val))))
print(f'RMSLE(validation): {rmsle}')

RMSLE(validation): 0.46347070832221304


In [None]:
# test_set에서 예측 및 정수로 변환
X_test = test_set.drop('ID', axis=1)
test_set['ECLO'] = model.predict(X_test).astype(int)

# 결과 확인
test_set[['ID', 'ECLO']]

In [None]:
test_set[['ID', 'ECLO']].to_csv('submission_XGBoost.csv', index=False)

#### tensorflow 선형예측

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

# train_set에서 독립 변수와 종속 변수를 나눔
X = train_set.drop(['ID', 'ECLO'], axis=1)
y = train_set['ECLO']

In [76]:
# train_set을 8:2의 비율로 나눔
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
# 텐서플로우 모델 초기화
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

In [31]:
# 모델 컴파일
model.compile(optimizer='adam', loss='mean_squared_error')




In [None]:
# 모델 학습
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

In [33]:
# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val).flatten().astype(int)



In [34]:
# validation set에 대한 RMSLE 계산
rmsle = np.sqrt(np.mean(np.square(np.log1p(y_val_pred) - np.log1p(y_val))))
print(f'RMSLE(validation): {rmsle}')

RMSLE(validation): 0.4492597014687761


In [106]:
test_set = test_set.iloc[:,:9]
test_set

Unnamed: 0,ID,발생월,시간대,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,ACCIDENT_39609,0.824756,0.915502,0.725916,0.14492,0.318540,0.283710,0.227103,0.0
1,ACCIDENT_39610,0.824756,0.915502,0.725916,0.14492,0.278709,0.206705,0.227103,0.0
2,ACCIDENT_39611,0.824756,1.000000,0.725916,0.14492,0.326531,0.283710,0.227103,1.0
3,ACCIDENT_39612,0.824756,1.000000,0.725916,0.14492,0.263105,0.206705,0.227103,1.0
4,ACCIDENT_39613,0.824756,0.353951,0.725916,0.14492,0.300701,0.283710,0.227103,1.0
...,...,...,...,...,...,...,...,...,...
10958,ACCIDENT_50567,0.000000,0.000000,0.725916,0.14492,0.269868,1.000000,0.227103,1.0
10959,ACCIDENT_50568,0.000000,0.000000,0.725916,0.14492,0.324351,0.206705,0.227103,1.0
10960,ACCIDENT_50569,0.000000,0.234939,0.725916,0.14492,0.377726,0.206705,0.227103,1.0
10961,ACCIDENT_50570,0.000000,0.234939,0.725916,0.14492,0.343271,0.214433,0.227103,1.0


In [36]:
# test_set에서 예측 및 정수로 변환
X_test = test_set.drop('ID', axis=1)
test_set['ECLO'] = model.predict(X_test).flatten().astype(int)



In [38]:
test_set[['ID', 'ECLO']]['ECLO'].unique()

array([ 4,  5,  3,  2,  6,  7, 10,  9,  8,  1])

In [None]:
# 결과 확인
test_set[['ID', 'ECLO']].to_csv('submission_tensorflow_sequential.csv', index=False)

#### 변경한 tensorflow 모델

In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import AdamW
import numpy as np

# train_set에서 독립 변수와 종속 변수를 나눔
X = train_set.drop(['ID', 'ECLO'], axis=1)
y = train_set['ECLO']

In [108]:
# train_set을 8:2의 비율로 나눔
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
# 텐서플로우 모델 초기화
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

In [110]:
# 모델 컴파일
optimizer = AdamW(learning_rate=0.001, weight_decay=1e-4)
model.compile(optimizer=optimizer, loss='mean_squared_error')

In [111]:
# 모델 학습
for epoch in range(50):
    model.fit(X_train, y_train, epochs=1, batch_size=128, validation_data=(X_val, y_val), verbose=2)
    
    # 검증 데이터에 대한 예측 및 RMSLE 계산
    y_val_pred = model.predict(X_val).flatten()
    rmsle = np.sqrt(np.mean(np.square(np.log1p(y_val_pred) - np.log1p(y_val))))
    print(f'Epoch {epoch+1}/{50} - RMSLE(validation): {rmsle}')

245/245 - 1s - loss: 9.3546 - val_loss: 6.3999 - 1s/epoch - 5ms/step
Epoch 1/50 - RMSLE(validation): 0.4385386920978764
245/245 - 1s - loss: 6.5626 - val_loss: 6.3160 - 614ms/epoch - 3ms/step
Epoch 2/50 - RMSLE(validation): 0.4271361380005481
245/245 - 1s - loss: 6.5447 - val_loss: 6.3101 - 595ms/epoch - 2ms/step
Epoch 3/50 - RMSLE(validation): 0.42922108137376175
245/245 - 1s - loss: 6.5316 - val_loss: 6.3024 - 598ms/epoch - 2ms/step
Epoch 4/50 - RMSLE(validation): 0.42715004539186974
245/245 - 1s - loss: 6.5318 - val_loss: 6.3175 - 608ms/epoch - 2ms/step
Epoch 5/50 - RMSLE(validation): 0.4228172937730372
245/245 - 1s - loss: 6.5217 - val_loss: 6.3006 - 592ms/epoch - 2ms/step
Epoch 6/50 - RMSLE(validation): 0.4250252882351593
245/245 - 1s - loss: 6.5134 - val_loss: 6.3451 - 600ms/epoch - 2ms/step
Epoch 7/50 - RMSLE(validation): 0.43686687650346256
245/245 - 1s - loss: 6.5299 - val_loss: 6.2959 - 578ms/epoch - 2ms/step
Epoch 8/50 - RMSLE(validation): 0.4281805972627402
245/245 - 1s - l

In [112]:
# 검증 데이터에 대한 예측
y_val_pred = model.predict(X_val).flatten()



In [113]:
# validation set에 대한 RMSLE 계산
rmsle = np.sqrt(np.mean(np.square(np.log1p(y_val_pred) - np.log1p(y_val))))
print(f'RMSLE(validation): {rmsle}')

RMSLE(validation): 0.4309719979389831


In [114]:
# test_set에서 예측 및 정수로 변환
X_test = test_set.drop('ID', axis=1)
test_set['ECLO'] = model.predict(X_test).flatten()



In [115]:
# 결과 확인
test_set[['ID', 'ECLO']]

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.261109
1,ACCIDENT_39610,4.117989
2,ACCIDENT_39611,5.873297
3,ACCIDENT_39612,5.457228
4,ACCIDENT_39613,5.093212
...,...,...
10958,ACCIDENT_50567,6.144843
10959,ACCIDENT_50568,4.969854
10960,ACCIDENT_50569,5.199775
10961,ACCIDENT_50570,5.027367


In [116]:
# 결과 저장
test_set[['ID', 'ECLO']].to_csv('submission_tensorflow_modified2.csv', index=False)