# Select Merged data AutoML

## 1. Setting up modules

In [34]:
# load modules
import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import random
import os
import fiona
import sqlite3
from shapely import wkb
from pyproj import Proj, transform
import csv
import chardet

# split
from sklearn.model_selection import train_test_split

# models 
from xgboost import XGBRegressor, DMatrix
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from supervised.automl import AutoML

# tuning
import optuna

# vif
from statsmodels.stats.outliers_influence import variance_inflation_factor

# visualization
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
from shapely.geometry import MultiPolygon
from shapely.wkt import loads

# metrics
from sklearn.metrics import mean_squared_log_error

# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# test
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

In [35]:
# Global matplotlib text settings: 'NanumSquare' font family at size 10.
plt.rcParams.update({'font.family': 'NanumSquare', 'font.size': 10})

In [36]:
# Load the raw CSV inputs into pandas DataFrames.
def _read_cp949(path):
    """Read the CSV at `path` with the cp949 encoding and return the DataFrame."""
    return pd.read_csv(path, encoding='cp949')


cross = _read_cp949('../Data/cross100_address.csv')
park_cctv = _read_cp949('../Data/bigdatamart/주차단속카메라정보.csv')
# This file is the only one read with pandas' default encoding.
rest = pd.read_csv('../Data/bigdatamart/소상공인시장진흥공단_상가(상권)정보_대구_202109.csv')
corner = _read_cp949('../Data/bigdatamart/먹거리골목업소정보.csv')
nadel = _read_cp949('../Data/bigdatamart/소상공인시장진흥공단_전국 나들가게 현황(CSV)_20210512.csv')
baek = _read_cp949('../Data/bigdatamart/소상공인시장진흥공단_전국 백년가게 현황_20220713.csv')
train = _read_cp949('../Data/train_4.csv')
test = _read_cp949('../Data/test_4.csv')
m4 = _read_cp949('../Data/bigdatamart/merge_3_data_by_dongga.csv')
train_merge = _read_cp949('../Data/train_merged_with_parking.csv')
test_merge = _read_cp949('../Data/test_merged_with_parking.csv')

In [37]:
# Fix the random seeds
def seed_everything(seed):
    """Seed the RNGs used in this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value forwarded to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here likely only affects child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix the seed

In [38]:
# Copy every column that exists in train/test but not yet in the merged
# frames over to the merged frames.
train_only_cols = train.columns[~train.columns.isin(train_merge.columns)]
train_merge[train_only_cols] = train[train_only_cols]

test_only_cols = test.columns[~test.columns.isin(test_merge.columns)]
test_merge[test_only_cols] = test[test_only_cols]

In [39]:
# Save the merged frames in advance (currently disabled)
#train_merge.to_csv('../Data/train_4.csv', index = False, encoding = 'cp949')
#test_merge.to_csv('../Data/test_4.csv', index = False, encoding = 'cp949')

## 2. Preprocessing

In [40]:
# Day of week: split into weekend (1) vs weekday (0)
weekend_days = ['토요일', '일요일']
for frame in (train_merge, test_merge):
    frame['주말'] = np.where(frame['요일'].isin(weekend_days), 1, 0)

In [41]:
# Accident hour: '피크타임' is 0 when '시간' is greater than 6, and 1 otherwise
peak_hour_cutoff = 6
for frame in (train_merge, test_merge):
    frame['피크타임'] = np.where(frame['시간'] > peak_hour_cutoff, 0, 1)

In [42]:
# Weather condition: 1 when '기상상태' equals '맑음', otherwise 0
for frame in (train_merge, test_merge):
    is_clear = frame['기상상태'] == '맑음'
    frame['맑음'] = np.where(is_clear, 1, 0)

In [43]:
# Road type: keep '단일로' and '교차로' as-is, bucket every other value into '기타'
kept_road_types = ['단일로', '교차로']
for frame in (train_merge, test_merge):
    road_type = frame['도로형태1']
    frame['도로'] = np.where(road_type.isin(kept_road_types), road_type, '기타')

In [44]:
# Road surface condition: 1 when '노면상태' equals '건조', otherwise 0
for frame in (train_merge, test_merge):
    is_dry = frame['노면상태'] == '건조'
    frame['건조'] = np.where(is_dry, 1, 0)

In [45]:
# Accident type: 1 when '사고유형' equals '차대차', otherwise 0
for frame in (train_merge, test_merge):
    is_car_to_car = frame['사고유형'] == '차대차'
    frame['차대차'] = np.where(is_car_to_car, 1, 0)

In [46]:
# Keep only the train columns that also exist in test, plus the ECLO target.
# The frame is built in a single expression: assigning a new column into a
# column-subset frame (as before) triggers pandas' SettingWithCopyWarning,
# which the global warnings filter in this notebook was hiding.
shared_cols = train_merge.columns[train_merge.columns.isin(test_merge.columns)]
train_merge2 = train_merge[shared_cols].assign(ECLO=train_merge['ECLO'])

### 기존 계획했던 변수 변환은 끝났고 다음 변수 선택 및 변환을 적용하자

In [47]:
# Exclude every column whose name contains '사망자수'
train_has_deaths = train_merge2.columns.str.contains('사망자수')
train_merge3 = train_merge2.loc[:, ~train_has_deaths]

test_has_deaths = test_merge.columns.str.contains('사망자수')
test_merge3 = test_merge.loc[:, ~test_has_deaths]

In [48]:
# Exclude every column whose name contains '부상자수'
train_has_injured = train_merge3.columns.str.contains('부상자수')
train_merge4 = train_merge3.loc[:, ~train_has_injured]

test_has_injured = test_merge3.columns.str.contains('부상자수')
test_merge4 = test_merge3.loc[:, ~test_has_injured]

In [49]:
# Exclude every column whose name contains '경상자수'
train_has_slight = train_merge4.columns.str.contains('경상자수')
train_merge5 = train_merge4.loc[:, ~train_has_slight]

test_has_slight = test_merge4.columns.str.contains('경상자수')
test_merge5 = test_merge4.loc[:, ~test_has_slight]

In [50]:
# Exclude every column whose name contains '중상자수'
train_has_serious = train_merge5.columns.str.contains('중상자수')
train_merge6 = train_merge5.loc[:, ~train_has_serious]

test_has_serious = test_merge5.columns.str.contains('중상자수')
test_merge6 = test_merge5.loc[:, ~test_has_serious]

In [51]:
# Drop the columns that are not used as model features, then fill remaining NaN with 0.
# The same labels are removed from train and test, so the list is defined once.
# (Labels that were listed twice in the original call are written once here;
# the dropped column set is unchanged.)
unused_columns = [
    'ID', '요일', '기상상태', '노면상태', '사고유형', '자전거사고지역수', '음주사고지역수', '결빙사고지역수',
    '연', '월', '일', '시간', 'sin_hour', 'cos_hour', '이륜차사고지역수',
    '도시', '구', '동', '도로형태1', '도로형태2', '보행자사고지역수',
    '보행노인사고사고건수', 'oldman_ECLO', '보행노인사고지역수', '맛집개수',
    '보행어린이사고사고건수', 'child_ECLO', '보행어린이사고지역수', '정원', '펌프', '탱크', '고가굴절', '화학',
    '구조', '구급', '화물차지역수', '소상공인상권정보_개수', '먹거리골목업소정보_개수', '주차장CCTV개수',
    '관광/여가/오락', '부동산', '생활서비스', '소매', '숙박', '스포츠', '음식', '학문/교육', '상권total',
    '보행자무단횡단발생건수', 'jaywalking_ECLO', '보행자무단횡단지역수', 'parking_count', '유료주차', '무료주차',
    '법규위반지역수',
    '스쿨존내어린이사고사고건수', 'schoolzone_ECLO', '스쿨존내어린이사고지역수', '장 비(소방차량)계',
    '연휴기간사고건수', 'tmzon_ECLO', '연휴기간지역수', '보호구역', '불법주정차', '기타', '안전센터', '구조대', '지역대',
    '차량전용', '평지도로', 'cctv 설치개수', '주차장개수', '어린이보호구역개수', '노인요양시설_개수', '경로당현황_개수',
    '도로형태', '시군구', '시', '군구', '동가', '사고일시',
]

train_merge7 = train_merge6.drop(columns=unused_columns).fillna(0)

# The test frame additionally drops '주차단속카메라_개수'.
test_merge7 = test_merge6.drop(columns=unused_columns + ['주차단속카메라_개수']).fillna(0)

In [52]:
# Display the columns that remain in the training frame after the drop
train_merge7.columns

Index(['자전거사고사고건수', 'bicycle_ECLO', '음주사고사고건수', 'drunk_ECLO', '결빙사고사고건수',
       'freezing_ECLO', '이륜차사고사고건수', 'motorcycle_ECLO', '보행자사고사고건수',
       'pedstrians_ECLO', '법규위반사고건수', 'violt_ECLO', '중앙선침범', '신호위반', '화물차사고건수',
       'truck_ECLO', '노상주차면수', '횡단보도개수', '주말', '피크타임', '맑음', '도로', '건조', '차대차',
       'ECLO'],
      dtype='object')

In [53]:
# Target encoding: replace each '도로' category with the mean ECLO of that
# category in the training frame.
# NOTE(review): the means are computed on all training rows, including rows
# that AutoML later uses as validation folds — consider fold-wise encoding.
target_encoded = train_merge7.groupby('도로')['ECLO'].mean()
# Overall train mean, used as a fallback for categories absent from the mapping.
global_eclo_mean = train_merge7['ECLO'].mean()

train_merge7['도로'] = train_merge7['도로'].map(target_encoded)
# Series.map yields NaN for values missing from the mapping; without the
# fallback a test-only category would reach the model as NaN.
test_merge7['도로'] = test_merge7['도로'].map(target_encoded).fillna(global_eclo_mean)

## 3. Modeling

In [54]:
# Separate the training frame into features and target.
# Features: every column except 'ECLO' (Index.difference is used for the
# selection, so the column order is whatever it returns).
feature_cols = train_merge7.columns.difference(['ECLO'])
x_train = train_merge7[feature_cols]

# Target: ECLO on the log1p scale.
eclo_raw = train_merge7['ECLO']
y_train = np.log1p(eclo_raw)

In [55]:
# AutoML configuration
# NOTE(review): eval_metric is 'rmse' but results_path contains 'mae' (the
# submission file written at the end of the notebook uses 'rmse' in its name).
# The path is kept unchanged here — confirm the intended directory name.
automl_settings = dict(
    mode="Compete",
    algorithms=["CatBoost", "Xgboost", "LightGBM"],
    ml_task="regression",
    eval_metric='rmse',
    random_state=42,
    n_jobs=-1,
    total_time_limit=43200,  # presumably seconds (i.e. 12 hours) — confirm against the AutoML docs
    results_path='../Model/AutoML_mae_select_merged_data_log1p/',
    explain_level=2,
)
automl = AutoML(**automl_settings)

In [56]:
# Fit AutoML on the training features and the log1p-transformed ECLO target
automl.fit(x_train, y_train)

AutoML directory: ../Model/AutoML_mae_select_merged_data_log1p/
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'LightGBM']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 0.44564 trained in 26.16 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_LightGBM rmse 0.446132 trained in 85.05 seconds
2_Default_Xgboost rmse 0.446055 trained in 68.42 seconds
3_Default_CatBoost rmse 0.445653 trained in 18.6 sec

In [57]:
# Predict for the test frame; the model was fit on log1p(ECLO), so the
# predictions are converted back with expm1 when the submission is built.
preds = automl.predict(test_merge7)

In [58]:
# Build the submission: start from the sample file and set ECLO to the
# predictions mapped back from the log1p scale with expm1.
sample_submission = pd.read_csv('../Data/sample_submission.csv')
baseline_submission = sample_submission.assign(ECLO=np.expm1(preds))
baseline_submission.to_csv(
    '../Data/AutoML_rmse_select_merged_data_log1p.csv',
    index=False,
    encoding='cp949',
)