Setting Data

In [None]:
pip install catboost mljar-supervised

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mljar-supervised
  Downloading mljar-supervised-1.1.1.tar.gz (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.9/126.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas>=0.24 (from catboost)
  Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m
Collecting dtreeviz>=2.2.2 (from mljar-supervised)
  Downloading dtreeviz-2.2.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting shap>=0.42.1 (f

In [None]:
  # Install the Nanum font package (a later cell selects 'NanumBarunGothic' for matplotlib).
  !sudo apt-get install -y fonts-nanum
  # Rebuild the system font cache (-f: force, -v: verbose).
  !sudo fc-cache -fv
  # Delete matplotlib's cache directory -- presumably so it rescans the installed fonts.
  !rm ~/.cache/matplotlib -rf


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 15 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (10.9 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 120899 files and direc

In [None]:
# Keep Korean text readable in plots and silence warnings.
import warnings
import matplotlib.pyplot as plt

# Use a font that ships Hangul glyphs so Korean labels are not drawn as boxes.
plt.rc('font', family='NanumBarunGothic')
# Draw negative numbers with the ASCII hyphen: with a Korean font selected, the
# Unicode minus sign (U+2212) may be missing and would render as an empty box.
plt.rc('axes', unicode_minus=False)
# NOTE: this hides every warning category for the rest of the session.
warnings.filterwarnings(action = 'ignore')


In [None]:
# load modules
import numpy as np
import pandas as pd

# split
from sklearn.model_selection import train_test_split

# models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# metrics
from sklearn.metrics import mean_squared_log_error

# GridSearchCV
# 최적의 파라미터 값 찾아보기
from sklearn.model_selection import GridSearchCV

# K-FOLD
from sklearn.model_selection import StratifiedKFold

# Using AutoML
from supervised import AutoML

# One-hot-encoding
from sklearn.preprocessing import OneHotEncoder

# Target-Encoding
from category_encoders.target_encoder import TargetEncoder

In [None]:
# load data
# train = pd.read_csv('train_1.csv',encoding='cp949')
# test = pd.read_csv('test_1.csv')

In [None]:
# Load the train and test tables; both files are decoded as cp949.
df_t, df_te = (
    pd.read_csv(csv_name, encoding='cp949')
    for csv_name in ('train_1.csv', 'test_1.csv')
)

In [None]:
# Add empty string columns that will hold the two address parts of the train set.
for new_col in ('군', '구'):
    df_t[new_col] = ''

In [None]:
# Split each '시군구' address on single spaces and store the 2nd part in '군'
# and the 3rd part in '구'.
# Done with the vectorised .str accessor instead of a per-row Python loop:
# it does not depend on the index being 0..n-1, and a missing address is
# reported as malformed instead of raising AttributeError on .split().
address_parts = df_t['시군구'].str.split(' ')
has_three_parts = address_parts.str.len() == 3

df_t.loc[has_three_parts, '군'] = address_parts[has_three_parts].str[1]
df_t.loc[has_three_parts, '구'] = address_parts[has_three_parts].str[2]

# Rows without exactly three parts are left untouched and reported, in row order.
for address in df_t.loc[~has_three_parts, '시군구']:
    print(f"잘못된 주소 형식입니다: {address}")

In [None]:
# Add empty string columns that will hold the two address parts of the test set.
for new_col in ('군', '구'):
    df_te[new_col] = ''

In [None]:
# Split each test '시군구' address on single spaces and store the 2nd part in
# '군' and the 3rd part in '구'.
# Done with the vectorised .str accessor instead of a per-row Python loop:
# it does not depend on the index being 0..n-1, and a missing address is
# reported as malformed instead of raising AttributeError on .split().
address_parts = df_te['시군구'].str.split(' ')
has_three_parts = address_parts.str.len() == 3

df_te.loc[has_three_parts, '군'] = address_parts[has_three_parts].str[1]
df_te.loc[has_three_parts, '구'] = address_parts[has_three_parts].str[2]

# Rows without exactly three parts are left untouched and reported, in row order.
for address in df_te.loc[~has_three_parts, '시군구']:
    print(f"잘못된 주소 형식입니다: {address}")

In [None]:
# Select the '군' column from the train and test frames as one-column DataFrames;
# these are the inputs handed to the one-hot encoder.
train_categorical_data = df_t.loc[:, ['군']]
test_categorical_data = df_te.loc[:, ['군']]

In [None]:
# Create the one-hot encoder, fit it on the train '군' values and apply the same
# mapping to the test values.
# handle_unknown='ignore' encodes a test category that never occurred in train
# as an all-zero row instead of raising ValueError during transform.
encoder = OneHotEncoder(handle_unknown='ignore')
train_encoded = encoder.fit_transform(train_categorical_data)
test_encoded = encoder.transform(test_categorical_data)

In [None]:
# Build the output column names from the categories the fitted encoder learned,
# using '군' as the input feature name (the frame displayed later shows names like '군_남구').
feature_names = encoder.get_feature_names_out(['군'])

In [None]:
# Convert the encoder output to dense arrays (optional)
train_encoded_dense = train_encoded.toarray()
test_encoded_dense = test_encoded.toarray()

In [None]:
# Wrap the dense arrays in DataFrames (optional), reusing each source frame's
# index so the encoded rows stay aligned with df_t / df_te.
import pandas as pd  # pandas is already imported in the module-loading cell

train_encoded_df = pd.DataFrame(train_encoded_dense, columns=feature_names, index=df_t.index)
test_encoded_df = pd.DataFrame(test_encoded_dense, columns=feature_names, index=df_te.index)

In [None]:
# Remove the original '군' column from both frames.
df_t = df_t.drop(columns='군')
df_te = df_te.drop(columns='군')

In [None]:
# Append the one-hot encoded columns to the right of each frame.
df_t = pd.concat((df_t, train_encoded_df), axis='columns')
df_te = pd.concat((df_te, test_encoded_df), axis='columns')

In [None]:
# Display the train frame to inspect the newly added one-hot columns.
df_t

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,어린이보호구역개수,구,군_남구,군_달서구,군_달성군,군_동구,군_북구,군_서구,군_수성구,군_중구
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,2.0,대신동,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,0.0,감삼동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,5.0,두산동,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,11.0,복현동,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,0.0,신암동,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,1.0,수성동3가,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,0.0,상인동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,0.0,월성동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,0.0,장동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Count the train accidents per '시군구' as a two-column frame: '시군구', '사고횟수'.
accident_counts = (
    df_t['시군구']
    .value_counts()
    .rename_axis('시군구')
    .reset_index(name='사고횟수')
)

In [None]:
# Look up the per-'시군구' accident count for every train row and store it as a feature.
counts_by_district = accident_counts.set_index('시군구')['사고횟수']
df_t['사고발생횟수'] = df_t['시군구'].map(counts_by_district)

In [None]:
# Map the accident counts (computed from the train set) onto the test rows by '시군구'.
# A '시군구' value that has no entry in accident_counts maps to NaN.
counts_by_district = accident_counts.set_index('시군구')['사고횟수']
df_te['사고발생횟수'] = df_te['시군구'].map(counts_by_district)

In [None]:
# Parse the accident timestamp column of both frames, then expand it into
# separate calendar features: year, month, day, hour and weekday.
for frame in (df_t, df_te):
    frame['사고일시'] = pd.to_datetime(frame['사고일시'])
    timestamp_parts = frame['사고일시'].dt
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        frame[part] = getattr(timestamp_parts, part)

In [None]:
# Dates treated as public holidays, as 'YYYY-MM-DD' strings in chronological
# order (2019-01 through 2023-03). The holiday flag is built by matching
# accident dates against this list.
# Fix: the 2022 row contained '2020-10-10' (an ordinary Saturday in 2020);
# the intended entry is '2022-10-10', which follows '2022-10-09' in that row.
holi_weekday = [
    # 2019
    '2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01',
    '2019-05-05', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12',
    '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25',
    # 2020
    '2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-03-01',
    '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17',
    '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09',
    '2020-12-25',
    # 2021
    '2021-01-01', '2021-02-11', '2021-02-12', '2021-02-13', '2021-03-01',
    '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-09-20',
    '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
    # 2022
    '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-03-01',
    '2022-05-05', '2022-05-08', '2022-06-06', '2022-08-15', '2022-09-09',
    '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09',
    '2022-10-10', '2022-12-25',
    # 2023
    '2023-01-01', '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24',
    '2023-03-01',
]

In [None]:
# Flag train rows as holiday = 1 when the accident fell on a Saturday/Sunday
# (dayofweek >= 5) or on a date listed in holi_weekday, otherwise 0.
df_t['사고일시'] = pd.to_datetime(df_t['사고일시'])
df_t['day_of_week'] = df_t['사고일시'].dt.dayofweek

is_weekend = df_t['day_of_week'] >= 5
is_listed_holiday = df_t['사고일시'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
df_t['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)

In [None]:
# Flag test rows as holiday = 1 when the accident fell on a Saturday/Sunday
# (dayofweek >= 5) or on a date listed in holi_weekday, otherwise 0.
df_te['사고일시'] = pd.to_datetime(df_te['사고일시'])
df_te['day_of_week'] = df_te['사고일시'].dt.dayofweek

is_weekend = df_te['day_of_week'] >= 5
is_listed_holiday = df_te['사고일시'].dt.strftime('%Y-%m-%d').isin(holi_weekday)
df_te['holiday'] = np.where(is_weekend | is_listed_holiday, 1, 0)

In [None]:
# Drop the accident-detail and casualty columns from the train frame; none of
# them appears in the test frame's column listing. Each label is listed once.
train_only_columns = [
    '사고유형 - 세부분류', '법규위반',
    '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
    '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도',
    '사망자수', '중상자수', '경상자수', '부상자수',
]
df_t = df_t.drop(columns=train_only_columns)

In [None]:
def group_season(df):
    """Label every row of ``df`` with a season derived from its 'month' column.

    The labels are written into ``df['season']`` in place and that column is
    returned: months 3-5 -> '봄', 6-8 -> '여름', 9-11 -> '가을', 12/1/2 -> '겨울'.
    Rows whose month belongs to none of these groups are not assigned.
    """
    months_by_season = {
        '봄': [3, 4, 5],
        '여름': [6, 7, 8],
        '가을': [9, 10, 11],
        '겨울': [12, 1, 2],
    }
    for label, months in months_by_season.items():
        df.loc[df['month'].isin(months), 'season'] = label
    return df['season']

# Add two derived features to both frames: the cosine of the hour on a 24-hour
# cycle, and the season label computed from the month.
for frame in (df_t, df_te):
    hour_angle = 2 * np.pi * frame['hour'] / 24
    frame['Cosine_Time'] = np.cos(hour_angle)
    frame['season'] = group_season(frame)

In [None]:
# Target encoding: each listed column is replaced by a numeric encoding fitted
# against the train target 'ECLO'; the test column reuses the train-fitted encoder.
# NOTE(review): 'ID' is encoded here too, and the correlation table printed later
# shows the encoded ID correlating 1.0 with ECLO. 'ID' is dropped before training,
# so the model never sees it -- confirm it needs encoding at all.
categorical_features = ['ID','시군구','요일','도로형태','사고유형','구','기상상태', '노면상태','season']

for feature in categorical_features:
    tr_encoder = TargetEncoder(cols=[feature])
    encoded_train = tr_encoder.fit_transform(df_t[feature], df_t['ECLO'])
    df_t[feature] = encoded_train
    encoded_test = tr_encoder.transform(df_te[feature])
    df_te[feature] = encoded_test

In [None]:
# Keep a handle on the train target column.
target = df_t['ECLO']

In [None]:
# Write the saved target back into df_t (no visible cell changes 'ECLO' in between).
df_t['ECLO'] = target

In [None]:
# Pairwise correlations between the columns of the encoded train frame.
df_t.corr()

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,ECLO,보호구역,...,사고발생횟수,year,month,day,hour,weekday,day_of_week,holiday,Cosine_Time,season
ID,1.0,-0.032334,0.061771,0.01475,0.118496,0.086154,0.018617,0.137211,1.0,-0.010639,...,-0.040406,-0.031281,-0.006871,-0.012303,-0.017355,0.039407,0.039407,0.060091,0.037052,0.013063
사고일시,-0.032334,1.0,-0.015394,0.020113,0.016873,-0.008729,0.009637,0.008021,-0.032334,0.005885,...,-0.004246,0.943755,0.312167,0.018916,0.010514,-0.011247,-0.011247,-0.022126,-0.032835,0.052574
요일,0.061771,-0.015394,1.0,0.005726,-0.011276,0.009456,0.003387,0.010758,0.061771,-0.006064,...,-0.001278,-0.015829,-0.000869,-0.001398,-0.028288,0.637956,0.637956,0.861151,0.031918,0.010607
기상상태,0.01475,0.020113,0.005726,1.0,0.00673,0.005441,0.811933,-0.019851,0.01475,-0.000643,...,-0.00308,0.028432,-0.01963,-0.012132,0.025395,-0.007867,-0.007867,0.00448,0.099668,0.019916
시군구,0.118496,0.016873,-0.011276,0.00673,1.0,0.025979,-0.005277,0.093599,0.118496,-0.087755,...,-0.353927,0.013896,0.01113,0.001173,-0.014139,-0.010625,-0.010625,-0.009383,-0.046177,0.007512
도로형태,0.086154,-0.008729,0.009456,0.005441,0.025979,1.0,0.0193,0.238355,0.086154,-0.022488,...,-0.019708,-0.008411,-0.002795,0.006277,-0.01481,0.008161,0.008161,0.007643,0.022999,-0.002123
노면상태,0.018617,0.009637,0.003387,0.811933,-0.005277,0.0193,1.0,-0.008089,0.018617,-0.015506,...,0.002751,0.018757,-0.023078,-0.017716,0.019702,-0.013314,-0.013314,0.000965,0.081766,0.016906
사고유형,0.137211,0.008021,0.010758,-0.019851,0.093599,0.238355,-0.008089,1.0,0.137211,0.000105,...,-0.027941,0.008337,0.0003,-0.000257,0.009275,0.009591,0.009591,0.008729,-0.042967,0.001176
ECLO,1.0,-0.032334,0.061771,0.01475,0.118496,0.086154,0.018617,0.137211,1.0,-0.010639,...,-0.040406,-0.031281,-0.006871,-0.012303,-0.017355,0.039407,0.039407,0.060091,0.037052,0.013063
보호구역,-0.010639,0.005885,-0.006064,-0.000643,-0.087755,-0.022488,-0.015506,0.000105,-0.010639,1.0,...,0.422502,0.007459,-0.004158,0.005322,0.02856,-0.007212,-0.007212,-0.007549,-0.001071,0.004471


In [None]:
# List the train columns (to compare against the test columns).
df_t.columns


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', 'ECLO',
       '보호구역', '불법주정차', '차량전용', '평지도로', '설치개수', '주차장개수', '어린이보호구역개수', '구',
       '군_남구', '군_달서구', '군_달성군', '군_동구', '군_북구', '군_서구', '군_수성구', '군_중구',
       '사고발생횟수', 'year', 'month', 'day', 'hour', 'weekday', 'day_of_week',
       'holiday', 'Cosine_Time', 'season'],
      dtype='object')

In [None]:
# List the test columns (to compare against the train columns).
df_te.columns

Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', '보호구역',
       '불법주정차', '차량전용', '평지도로', '설치개수', '주차장개수', '어린이보호구역개수', '구', '군_남구',
       '군_달서구', '군_달성군', '군_동구', '군_북구', '군_서구', '군_수성구', '군_중구', '사고발생횟수',
       'year', 'month', 'day', 'hour', 'weekday', 'day_of_week', 'holiday',
       'Cosine_Time', 'season'],
      dtype='object')

In [None]:
# Replace missing values in the train frame with 0, then print the frame summary.
df_t = df_t.fillna(0)
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 35 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           39609 non-null  float64       
 1   사고일시         39609 non-null  datetime64[ns]
 2   요일           39609 non-null  float64       
 3   기상상태         39609 non-null  float64       
 4   시군구          39609 non-null  float64       
 5   도로형태         39609 non-null  float64       
 6   노면상태         39609 non-null  float64       
 7   사고유형         39609 non-null  float64       
 8   ECLO         39609 non-null  int64         
 9   보호구역         39609 non-null  float64       
 10  불법주정차        39609 non-null  float64       
 11  차량전용         39609 non-null  float64       
 12  평지도로         39609 non-null  float64       
 13  설치개수         39609 non-null  float64       
 14  주차장개수        39609 non-null  float64       
 15  어린이보호구역개수    39609 non-null  float64       
 16  구   

In [None]:
# Replace missing values in the test frame with 0, then print the frame summary.
df_te = df_te.fillna(0)
df_te.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10963 entries, 0 to 10962
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           10963 non-null  float64       
 1   사고일시         10963 non-null  datetime64[ns]
 2   요일           10963 non-null  float64       
 3   기상상태         10963 non-null  float64       
 4   시군구          10963 non-null  float64       
 5   도로형태         10963 non-null  float64       
 6   노면상태         10963 non-null  float64       
 7   사고유형         10963 non-null  float64       
 8   보호구역         10963 non-null  float64       
 9   불법주정차        10963 non-null  float64       
 10  차량전용         10963 non-null  float64       
 11  평지도로         10963 non-null  float64       
 12  설치개수         10963 non-null  float64       
 13  주차장개수        10963 non-null  float64       
 14  어린이보호구역개수    10963 non-null  float64       
 15  구            10963 non-null  float64       
 16  군_남구

In [None]:
# Build the model inputs: the test features are df_te minus the identifier and
# raw date parts; the train features use exactly the same columns, and the
# train target is 'ECLO'.
non_feature_columns = ['ID', 'year', 'month', 'day', 'hour', 'day_of_week', '사고일시']
test_x = df_te.drop(columns=non_feature_columns).copy()
train_x = df_t.loc[:, test_x.columns].copy()
train_y = df_t['ECLO'].copy()

AUTOML

In [None]:
import matplotlib
# Switch matplotlib to the non-interactive 'Agg' backend -- presumably so the
# figures produced during AutoML training are not rendered inline.
matplotlib.use('Agg')

In [None]:
from supervised.automl import AutoML

# Configure the AutoML search: a regression task scored by RMSE, restricted to
# four tree-based algorithms.
automl = AutoML(
    mode="Compete",
    ml_task="regression",
    eval_metric="rmse",
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
    total_time_limit=43200,  # 12 * 3600, i.e. 12 hours if the limit is in seconds
    n_jobs=-1,
)

In [None]:
# Run the AutoML search on the train features and target (long-running cell).
automl.fit(train_x, train_y)

AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 3.217178 trained in 0.84 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 4 models
1_Default_LightGBM rmse 3.235748 trained in 14.41 seconds
2_Default_Xgboost rmse 3.234628 trained in 13.49 seconds
3_Default_CatBoost rmse 3.231218 trained in 16.66 seconds
4_Default_Rand

In [None]:
# Predict ECLO for the test features with the fitted AutoML object.
pred = automl.predict(test_x)

In [None]:
# Load the submission template.
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Fill the template's 'ECLO' column with the predictions
# (assumes the template rows are in the same order as test_x -- TODO confirm).
submission['ECLO'] = pred

In [None]:
# Replace any negative prediction with 0.0.
submission['ECLO'] = submission['ECLO'].mask(submission['ECLO'] < 0.0, 0.0)

In [None]:
# Sanity check: smallest submitted prediction. Series.min() is vectorised and
# skips NaN, whereas the builtin min() iterates in Python and gives an
# order-dependent answer when NaN is present.
submission['ECLO'].min()

2.558492386272325

In [None]:
# Save the submission file without the index column.
submission.to_csv('submit.csv', index=False)

In [None]:
# Display the final submission frame.
submission