Setting Data

In [1]:
# load modules
import numpy as np
import pandas as pd
import random
import os
import warnings

# models 
from supervised.automl import AutoML

# encoder
from sklearn.preprocessing import OneHotEncoder
from target_encoding import TargetEncoder

# visualization
import matplotlib
import matplotlib.pyplot as plt

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Use a Hangul-capable font so Korean labels are not garbled in plots,
# and silence warning messages.
plt.rcParams['font.family'] = 'NanumSquare'
warnings.filterwarnings('ignore')


In [3]:
# Fix the random seeds
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Integer seed value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [4]:
# load data
# train = pd.read_csv('train_1.csv',encoding='cp949')
# test = pd.read_csv('test_1.csv')

In [5]:
# Load the train and test tables; both files are read with CP949 encoding.
df_t = pd.read_csv('../Data/train_1.csv', encoding='cp949')
df_te = pd.read_csv('../Data/test_1.csv', encoding='cp949')

In [6]:
# Placeholder columns for the second ('군') and third ('구') tokens of the
# '시군구' address; rows whose address cannot be parsed keep the empty string.
df_t['군'] = ''
df_t['구'] = ''

In [7]:
# Split each '시군구' address on single spaces and, when it has exactly three
# tokens, store the second token in '군' and the third token in '구'.
# Vectorized with the .str accessor, so it does not depend on the frame
# having a 0..n-1 index and a non-string address is reported, not raised.
address_parts = df_t['시군구'].str.split(' ')
has_three_parts = address_parts.str.len() == 3
df_t.loc[has_three_parts, '군'] = address_parts[has_three_parts].str[1]
df_t.loc[has_three_parts, '구'] = address_parts[has_three_parts].str[2]

# Report malformed addresses; their '군' / '구' values are left unchanged.
for address in df_t.loc[~has_three_parts, '시군구']:
    print(f"잘못된 주소 형식입니다: {address}")

In [8]:
# Placeholder columns for the second ('군') and third ('구') tokens of the
# '시군구' address; rows whose address cannot be parsed keep the empty string.
df_te['군'] = ''
df_te['구'] = ''

In [9]:
# Split each test '시군구' address on single spaces and, when it has exactly
# three tokens, store the second token in '군' and the third token in '구'.
# Vectorized with the .str accessor, so it does not depend on the frame
# having a 0..n-1 index and a non-string address is reported, not raised.
address_parts = df_te['시군구'].str.split(' ')
has_three_parts = address_parts.str.len() == 3
df_te.loc[has_three_parts, '군'] = address_parts[has_three_parts].str[1]
df_te.loc[has_three_parts, '구'] = address_parts[has_three_parts].str[2]

# Report malformed addresses; their '군' / '구' values are left unchanged.
for address in df_te.loc[~has_three_parts, '시군구']:
    print(f"잘못된 주소 형식입니다: {address}")

In [10]:
# Select the '군' column of the train and test frames (as one-column
# DataFrames) to serve as the categorical input for one-hot encoding.
train_categorical_data = df_t[['군']]
test_categorical_data = df_te[['군']]

In [11]:
# Create the OneHotEncoder, fit it on the training categories, and apply the
# same fitted encoding to the test data.
encoder = OneHotEncoder()
train_encoded = encoder.fit_transform(train_categorical_data)
test_encoded = encoder.transform(test_categorical_data)

In [12]:
# Get the output column names for the categories the encoder learned
# (they appear as '군_<category>' in the frames displayed below).
feature_names = encoder.get_feature_names_out(['군'])

In [13]:
# Convert the encoded matrices to dense arrays (optional)
train_encoded_dense = train_encoded.toarray()
test_encoded_dense = test_encoded.toarray()

In [14]:
# Wrap the dense arrays in DataFrames (optional), reusing each source frame's
# index so the column-wise concat further down lines up row by row.
train_encoded_df = pd.DataFrame(train_encoded_dense, columns=feature_names, index=df_t.index)
test_encoded_df = pd.DataFrame(test_encoded_dense, columns=feature_names, index=df_te.index)

In [15]:
# Remove the original '군' column now that it has been one-hot encoded
df_t = df_t.drop(columns=['군'])
df_te = df_te.drop(columns=['군'])

In [16]:
# Append the one-hot encoded columns to each frame
df_t = pd.concat([df_t, train_encoded_df], axis=1)
df_te = pd.concat([df_te, test_encoded_df], axis=1)

In [17]:
# Display the training frame to check the appended '군_*' columns
df_t

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,어린이보호구역개수,구,군_남구,군_달서구,군_달성군,군_동구,군_북구,군_서구,군_수성구,군_중구
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,2.0,대신동,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,0.0,감삼동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,5.0,두산동,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,11.0,복현동,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,0.0,신암동,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,1.0,수성동3가,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,0.0,상인동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,0.0,월성동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,0.0,장동,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Count training accidents per '시군구' address and name the two resulting
# columns: the address ('시군구') and its accident count ('사고횟수').
accident_counts = df_t['시군구'].value_counts().reset_index()
accident_counts.columns = ['시군구', '사고횟수']

In [19]:
# Map each training row's '시군구' to the accident count for that address.
df_t['사고발생횟수'] = df_t['시군구'].map(accident_counts.set_index('시군구')['사고횟수'])

In [20]:
# Map the accident counts (computed from the training data) onto the test
# rows by '시군구'.
# NOTE(review): addresses missing from the training counts presumably come
# out as NaN here and become 0 in the later fillna(0) - confirm.
df_te['사고발생횟수'] = df_te['시군구'].map(accident_counts.set_index('시군구')['사고횟수'])

In [21]:
# Parse the accident timestamp and derive calendar features from it,
# applying the same steps to the train and test frames.
for df in (df_t, df_te):
    df['사고일시'] = pd.to_datetime(df['사고일시'])
    for part in ('year', 'month', 'day', 'hour', 'weekday'):
        df[part] = getattr(df['사고일시'].dt, part)

In [22]:
# Public-holiday dates (YYYY-MM-DD) used to build the 'holiday' flag, one
# group per year. The 2022 group previously contained '2020-10-10', a typo for
# '2022-10-10' (the substitute holiday for Hangul Day, 2022-10-09).
# NOTE(review): some dates look absent (e.g. election days and the 2021
# substitute holidays) - confirm whether that is intentional.
holi_weekday = [
    # 2019
    '2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01',
    '2019-05-05', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12',
    '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25',
    # 2020
    '2020-01-01', '2020-01-24', '2020-01-25', '2020-01-26', '2020-03-01',
    '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17',
    '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09',
    '2020-12-25',
    # 2021
    '2021-01-01', '2021-02-11', '2021-02-12', '2021-02-13', '2021-03-01',
    '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-09-20',
    '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
    # 2022
    '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-03-01',
    '2022-05-05', '2022-05-08', '2022-06-06', '2022-08-15', '2022-09-09',
    '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09',
    '2022-10-10', '2022-12-25',
    # 2023
    '2023-01-01', '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24',
    '2023-03-01',
]

In [23]:
# Flag training rows that fall on a weekend (dayofweek >= 5) or on a date
# listed in holi_weekday: 1 for holiday, 0 otherwise.
df_t['사고일시'] = pd.to_datetime(df_t['사고일시'])
df_t['day_of_week'] = df_t['사고일시'].dt.dayofweek
df_t['holiday'] = np.where(
    (df_t['day_of_week'] >= 5)
    | (df_t['사고일시'].dt.strftime('%Y-%m-%d').isin(holi_weekday)),
    1,
    0,
)

In [24]:
# Flag test rows that fall on a weekend (dayofweek >= 5) or on a date
# listed in holi_weekday: 1 for holiday, 0 otherwise.
df_te['사고일시'] = pd.to_datetime(df_te['사고일시'])
df_te['day_of_week'] = df_te['사고일시'].dt.dayofweek
df_te['holiday'] = np.where(
    (df_te['day_of_week'] >= 5)
    | (df_te['사고일시'].dt.strftime('%Y-%m-%d').isin(holi_weekday)),
    1,
    0,
)

In [25]:
# Drop the accident-detail columns from the training frame (they are not among
# the test-frame columns listed further down). Each label appears once; the
# original list repeated '사망자수' and '사고유형 - 세부분류'.
df_t = df_t.drop(columns=[
    '사고유형 - 세부분류', '법규위반',
    '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
    '사망자수', '중상자수',
    '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도',
    '경상자수', '부상자수',
])

In [26]:
def group_season(df):
    """Label each row with a season derived from ``df['month']``.

    The labels are written into ``df['season']`` in place (the column is
    created if it does not exist) and that column is returned. Months 3-5 map
    to '봄', 6-8 to '여름', 9-11 to '가을', and 12, 1, 2 to '겨울'. Rows with
    any other month value are not assigned.

    Args:
        df: DataFrame with a numeric 'month' column.

    Returns:
        The 'season' column of ``df``.
    """
    season_months = (
        ('봄', (3, 4, 5)),
        ('여름', (6, 7, 8)),
        ('가을', (9, 10, 11)),
        ('겨울', (12, 1, 2)),
    )
    for label, (first, second, third) in season_months:
        in_season = (
            (df['month'] == first)
            | (df['month'] == second)
            | (df['month'] == third)
        )
        df.loc[in_season, 'season'] = label
    return df['season']

# Add the cosine encoding of the hour of day (24-hour period) and the season
# label to both frames.
for df in (df_t, df_te):
    df['Cosine_Time'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['season'] = group_season(df)

In [27]:
# Keep a reference to the ECLO target column
target = df_t['ECLO']

In [28]:
# Write the target back into the training frame.
# NOTE(review): 'ECLO' is not in the dropped columns above, so this looks
# like a no-op - confirm.
df_t['ECLO'] = target

In [29]:
# List the training-frame columns after feature engineering
df_t.columns


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', 'ECLO',
       '보호구역', '불법주정차', '차량전용', '평지도로', '설치개수', '주차장개수', '어린이보호구역개수', '구',
       '군_남구', '군_달서구', '군_달성군', '군_동구', '군_북구', '군_서구', '군_수성구', '군_중구',
       '사고발생횟수', 'year', 'month', 'day', 'hour', 'weekday', 'day_of_week',
       'holiday', 'Cosine_Time', 'season'],
      dtype='object')

In [30]:
# List the test-frame columns after feature engineering
df_te.columns

Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', '보호구역',
       '불법주정차', '차량전용', '평지도로', '설치개수', '주차장개수', '어린이보호구역개수', '구', '군_남구',
       '군_달서구', '군_달성군', '군_동구', '군_북구', '군_서구', '군_수성구', '군_중구', '사고발생횟수',
       'year', 'month', 'day', 'hour', 'weekday', 'day_of_week', 'holiday',
       'Cosine_Time', 'season'],
      dtype='object')

In [31]:
# Replace every missing value with 0, then print the frame summary
df_t = df_t.fillna(0)
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 35 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           39609 non-null  object        
 1   사고일시         39609 non-null  datetime64[ns]
 2   요일           39609 non-null  object        
 3   기상상태         39609 non-null  object        
 4   시군구          39609 non-null  object        
 5   도로형태         39609 non-null  object        
 6   노면상태         39609 non-null  object        
 7   사고유형         39609 non-null  object        
 8   ECLO         39609 non-null  int64         
 9   보호구역         39609 non-null  float64       
 10  불법주정차        39609 non-null  float64       
 11  차량전용         39609 non-null  float64       
 12  평지도로         39609 non-null  float64       
 13  설치개수         39609 non-null  float64       
 14  주차장개수        39609 non-null  float64       
 15  어린이보호구역개수    39609 non-null  float64       
 16  구   

In [32]:
# Replace every missing value with 0, then print the frame summary
df_te = df_te.fillna(0)
df_te.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10963 entries, 0 to 10962
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ID           10963 non-null  object        
 1   사고일시         10963 non-null  datetime64[ns]
 2   요일           10963 non-null  object        
 3   기상상태         10963 non-null  object        
 4   시군구          10963 non-null  object        
 5   도로형태         10963 non-null  object        
 6   노면상태         10963 non-null  object        
 7   사고유형         10963 non-null  object        
 8   보호구역         10963 non-null  float64       
 9   불법주정차        10963 non-null  float64       
 10  차량전용         10963 non-null  float64       
 11  평지도로         10963 non-null  float64       
 12  설치개수         10963 non-null  float64       
 13  주차장개수        10963 non-null  float64       
 14  어린이보호구역개수    10963 non-null  float64       
 15  구            10963 non-null  object        
 16  군_남구

In [33]:
# Build the model inputs: drop the identifier and raw date parts from the test
# frame, take the same columns from the training frame, and log1p-transform
# the target (predictions are mapped back with expm1 before submission).
test_x = df_te.drop(
    ['ID', 'year', 'month', 'day', 'hour', 'day_of_week', '사고일시'],
    axis=1,
).copy()
train_x = df_t.loc[:, test_x.columns].copy()
train_y = np.log1p(df_t['ECLO'].copy())

In [41]:
# List the feature columns passed to the model (labels cast to str)
train_x.columns.astype(str)

Index(['요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', '보호구역', '불법주정차', '차량전용',
       '평지도로', '설치개수', '주차장개수', '어린이보호구역개수', '구', '군_남구', '군_달서구', '군_달성군',
       '군_동구', '군_북구', '군_서구', '군_수성구', '군_중구', '사고발생횟수', 'weekday', 'holiday',
       'Cosine_Time', 'season'],
      dtype='object')

In [44]:
# Display the training feature frame
train_x

Unnamed: 0,요일,기상상태,시군구,도로형태,노면상태,사고유형,보호구역,불법주정차,차량전용,평지도로,...,군_동구,군_북구,군_서구,군_수성구,군_중구,사고발생횟수,weekday,holiday,Cosine_Time,season
0,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,1.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,131,1,1,1.000000,겨울
1,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,1.0,8.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,678,1,1,1.000000,겨울
2,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,1.0,0.0,604,1,1,0.965926,겨울
3,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,7.0,0.0,0.0,3.0,...,0.0,1.0,0.0,0.0,0.0,426,1,1,0.866025,겨울
4,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,7.0,0.0,0.0,5.0,...,1.0,0.0,0.0,0.0,0.0,825,1,1,0.500000,겨울
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,104,4,0,0.258819,겨울
39605,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,11.0,12.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1143,4,0,0.258819,겨울
39606,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,4.0,13.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,618,4,0,0.707107,겨울
39607,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,125,4,0,0.866025,겨울


AUTOML

In [37]:
# Switch matplotlib to the 'Agg' backend.
# NOTE(review): presumably so AutoML can write its report plots to files
# without needing a display - confirm.
matplotlib.use('Agg')

In [38]:
# Configure the AutoML regressor: RMSE-evaluated search over four tree-based
# algorithms, with results stored under results_path.
automl = AutoML(
    results_path='../Model/AutoML_rmse_best_log1p/',
    mode="Compete",
    ml_task="regression",
    eval_metric="rmse",
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
    total_time_limit=12 * 60 * 60,  # 43200 seconds
    explain_level=2,
    n_jobs=-1,
)

In [39]:
# Train on the log1p-transformed target. The recorded output shows fit()
# reporting "already fitted" when results_path holds a previous run.
automl.fit(train_x, train_y)

This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'.


In [90]:
# Predict on the test features; values are on the log1p scale of train_y
pred = automl.predict(test_x)

In [91]:
# Load the sample submission file to fill in
submission = pd.read_csv('../Data/sample_submission.csv')

In [107]:
# Undo the log1p target transform so ECLO is back on its original scale
submission['ECLO'] = np.expm1(pred)

In [108]:
# Clamp negative ECLO predictions to zero
negative_eclo = submission['ECLO'] < 0.0
submission.loc[negative_eclo, 'ECLO'] = 0.0

In [109]:
# Sanity check: smallest predicted ECLO value
min(submission['ECLO'])

2.747193948121656

In [110]:
# Write the submission file without the index column
submission.to_csv('../Data/AutoML_best_model_log1p.csv', index=False)