# Merged data AutoML

## 1. Setting up modules

In [1]:
# load modules
import numpy as np
import pandas as pd
import geopandas as gpd
import tqdm
import random
import os
import fiona
import sqlite3
from shapely import wkb
from pyproj import Proj, transform
import csv
import chardet

# split
from sklearn.model_selection import train_test_split

# models 
from xgboost import XGBRegressor, DMatrix
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from supervised.automl import AutoML

# tuning
import optuna

# vif
from statsmodels.stats.outliers_influence import variance_inflation_factor

# visualization
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
from shapely.geometry import MultiPolygon
from shapely.wkt import loads

# metrics
from sklearn.metrics import mean_squared_log_error

# Ignore warnings: this silences every warning category for the whole notebook,
# including pandas and deprecation warnings raised by later cells.
import warnings
warnings.filterwarnings('ignore')

# test
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Default plot font: NanumSquare at size 10
# (presumably chosen so Korean labels render — confirm the font is installed)
plt.rcParams.update({'font.family': 'NanumSquare', 'font.size': 10})

In [3]:
# Load the CSV inputs as pandas DataFrames.
# Every file is decoded as cp949 except `rest`, which uses pandas' default encoding.
CP949 = dict(encoding='cp949')

# Auxiliary tables
cross = pd.read_csv('../Data/cross100_address.csv', **CP949)
park_cctv = pd.read_csv('../Data/bigdatamart/주차단속카메라정보.csv', **CP949)
rest = pd.read_csv('../Data/bigdatamart/소상공인시장진흥공단_상가(상권)정보_대구_202109.csv')
corner = pd.read_csv('../Data/bigdatamart/먹거리골목업소정보.csv', **CP949)
nadel = pd.read_csv('../Data/bigdatamart/소상공인시장진흥공단_전국 나들가게 현황(CSV)_20210512.csv', **CP949)
baek = pd.read_csv('../Data/bigdatamart/소상공인시장진흥공단_전국 백년가게 현황_20220713.csv', **CP949)

# Train / test frames
train = pd.read_csv('../Data/train_4.csv', **CP949)
test = pd.read_csv('../Data/test_4.csv', **CP949)

# Previously merged frames
m4 = pd.read_csv('../Data/bigdatamart/merge_3_data_by_dongga.csv', **CP949)
train_merge = pd.read_csv('../Data/train_merged_with_parking.csv', **CP949)
test_merge = pd.read_csv('../Data/test_merged_with_parking.csv', **CP949)

In [4]:
# Fix the random seeds
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Parameters
    ----------
    seed
        Value passed to `random.seed` and `np.random.seed`; its string form is
        stored in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(42) # fix the seed at 42

In [5]:
# Bring over the extra columns: any column present in train/test but missing
# from the merged frames is copied across.
# NOTE(review): the assignment aligns on the row index — assumes both frames share the same rows; confirm.
train_only_cols = train.columns[~train.columns.isin(train_merge.columns)]
test_only_cols = test.columns[~test.columns.isin(test_merge.columns)]
train_merge[train_only_cols] = train[train_only_cols]
test_merge[test_only_cols] = test[test_only_cols]

In [6]:
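# Optional checkpoint (disabled): save the merged frames to train_4.csv / test_4.csv.
# NOTE: these are the same files this notebook reads as `train` / `test`, so enabling this overwrites the inputs.
#train_merge.to_csv('../Data/train_4.csv', index = False, encoding = 'cp949')
#test_merge.to_csv('../Data/test_4.csv', index = False, encoding = 'cp949')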

## 2. Preprocessing

In [7]:
# Day of week: split into weekend (1 for Saturday/Sunday) and weekday (0)
weekend_days = ['토요일', '일요일']
train_merge['주말'] = train_merge['요일'].isin(weekend_days).astype(int)
test_merge['주말'] = test_merge['요일'].isin(weekend_days).astype(int)

In [8]:
# Accident hour: the flag is 0 where the hour is greater than 6, otherwise 1
# NOTE(review): the column is named "peak time" but marks the hours up to 6 — confirm this is intended
train_after_six = train_merge['시간'].gt(6)
test_after_six = test_merge['시간'].gt(6)
train_merge['피크타임'] = np.where(train_after_six, 0, 1)
test_merge['피크타임'] = np.where(test_after_six, 0, 1)

In [9]:
# Weather: 1 when the weather condition is '맑음' (clear), otherwise 0
train_merge['맑음'] = train_merge['기상상태'].eq('맑음').astype(int)
test_merge['맑음'] = test_merge['기상상태'].eq('맑음').astype(int)

In [10]:
# Road type: keep '단일로' and '교차로' as they are and collapse every other
# 도로형태1 value into '기타' (other)
kept_road_types = ['단일로', '교차로']
train_is_kept = train_merge['도로형태1'].isin(kept_road_types)
test_is_kept = test_merge['도로형태1'].isin(kept_road_types)
train_merge['도로'] = np.where(train_is_kept, train_merge['도로형태1'], '기타')
test_merge['도로'] = np.where(test_is_kept, test_merge['도로형태1'], '기타')

In [11]:
# Road surface: 1 when the surface condition is '건조' (dry), otherwise 0
train_is_dry = train_merge['노면상태'].eq('건조')
test_is_dry = test_merge['노면상태'].eq('건조')
train_merge['건조'] = np.where(train_is_dry, 1, 0)
test_merge['건조'] = np.where(test_is_dry, 1, 0)

In [12]:
# Accident type: 1 when the type is '차대차' (vehicle-to-vehicle), otherwise 0
train_merge['차대차'] = train_merge['사고유형'].eq('차대차').astype(int)
test_merge['차대차'] = test_merge['사고유형'].eq('차대차').astype(int)

In [13]:
# Keep only the train columns that also exist in the test frame, then re-attach
# the ECLO target (presumably train-only, so the selection would otherwise drop it).
shared_cols = train_merge.columns[train_merge.columns.isin(test_merge.columns)]
# .copy() makes train_merge2 an explicitly independent frame, so the column
# assignment below is not a chained assignment on a selection of train_merge
# (that pattern raises SettingWithCopyWarning, which this notebook silences).
train_merge2 = train_merge[shared_cols].copy()
train_merge2['ECLO'] = train_merge['ECLO']

### 기존 계획했던 변수 변환은 끝났고 다음 변수 선택 및 변환을 적용하자

In [14]:
# Exclude every column whose name contains '사망자수' (number of deaths)
train_has_deaths = train_merge2.columns.str.contains('사망자수')
test_has_deaths = test_merge.columns.str.contains('사망자수')
train_merge3 = train_merge2.loc[:, ~train_has_deaths]
test_merge3 = test_merge.loc[:, ~test_has_deaths]

In [15]:
# Exclude every column whose name contains '부상자수' (number of injured)
train_has_injured = train_merge3.columns.str.contains('부상자수')
test_has_injured = test_merge3.columns.str.contains('부상자수')
train_merge4 = train_merge3.loc[:, ~train_has_injured]
test_merge4 = test_merge3.loc[:, ~test_has_injured]

In [16]:
# Exclude every column whose name contains '경상자수' (number of minor injuries)
train_has_minor = train_merge4.columns.str.contains('경상자수')
test_has_minor = test_merge4.columns.str.contains('경상자수')
train_merge5 = train_merge4.loc[:, ~train_has_minor]
test_merge5 = test_merge4.loc[:, ~test_has_minor]

In [17]:
# Exclude every column whose name contains '중상자수' (number of serious injuries)
train_has_serious = train_merge5.columns.str.contains('중상자수')
test_has_serious = test_merge5.columns.str.contains('중상자수')
train_merge6 = train_merge5.loc[:, ~train_has_serious]
test_merge6 = test_merge5.loc[:, ~test_has_serious]

In [18]:
# Exclude the columns that are not used as features, then fill the remaining
# missing values with 0.
unused_cols = [
    'ID', '요일', '기상상태', '노면상태', '사고유형',
    '연', '월', '일', '시간', 'sin_hour', 'cos_hour',
    '도시', '구', '동', '도로형태1', '도로형태2', '상권total',
    '도로형태', '시군구', '시', '군구', '동가', '사고일시',
]
train_merge7 = train_merge6.drop(columns=unused_cols).fillna(0)

# The test frame additionally drops '주차단속카메라_개수'.
# NOTE(review): presumably this column exists only in the test frame — confirm,
# otherwise train and test end up with different feature sets.
test_unused_cols = unused_cols + ['주차단속카메라_개수']
test_merge7 = test_merge6.drop(columns=test_unused_cols).fillna(0)

In [19]:
# Checkpoint the preprocessed frames to disk as cp949 CSVs without the index.
# At this point '도로' still holds the road-type labels, not encoded values.
csv_options = dict(index=False, encoding='cp949')
train_merge7.to_csv('../Data/train_5.csv', **csv_options)
test_merge7.to_csv('../Data/test_5.csv', **csv_options)

In [20]:
# Target encoding: replace each '도로' category with the mean ECLO of that
# category, computed on the training frame only and applied to both frames.
# NOTE(review): a category absent from the training frame maps to NaN in the test frame.
road_labels = train_merge7['도로']
target_encoded = train_merge7['ECLO'].groupby(road_labels).mean()
train_merge7['도로'] = road_labels.map(target_encoded)
test_merge7['도로'] = test_merge7['도로'].map(target_encoded)

## 3. Modeling

In [21]:
# Separate the model inputs from the target (no hold-out split is made here)
feature_cols = train_merge7.columns.difference(['ECLO'])  # every column except the target
x_train = train_merge7.loc[:, feature_cols]
y_train = train_merge7.loc[:, 'ECLO']

In [22]:
# Configure the AutoML run: a regression task scored by MAE, restricted to
# three gradient-boosting algorithms, with a 43200-second total time limit.
automl_settings = dict(
    mode="Compete",
    algorithms=["CatBoost", "Xgboost", "LightGBM"],
    ml_task="regression",
    eval_metric='mae',
    random_state=42,
    n_jobs=-1,
    total_time_limit=43200,
    results_path='../Model/AutoML_mae_merged_data/',
    explain_level=2,
)
automl = AutoML(**automl_settings)

In [23]:
# Fit AutoML on the training features (x_train) and the ECLO target (y_train).
# Long-running: bounded by the total_time_limit set on the AutoML object.
automl.fit(x_train, y_train)

AutoML directory: ../Model/AutoML_mae_merged_data/
The task is regression with evaluation metric mae
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'LightGBM']
AutoML will stack models
AutoML will ensemble available models


AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree mae 2.152893 trained in 27.71 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_LightGBM mae 2.041616 trained in 130.1 seconds
2_Default_Xgboost mae 2.041328 trained in 108.86 seconds
3_Default_CatBoost mae 2.037985 trained in 48.97 seconds
* Step not_so_random will try to check up to 27 models
13_LightGBM mae 2.041152 trained in 105.93 seconds
4_Xgboost mae 2.043465 trained in 113.15 seconds
22_CatBoost mae 2.038188 trained in 59.43 seconds
14_LightGBM mae 2.040166 trained 

2023-12-10 03:54:00,735 supervised.exceptions ERROR All features are droppped! Your data looks like random data.
2023-12-10 03:54:00,914 supervised.exceptions ERROR All features are droppped! Your data looks like random data.


There was an error during 8_Xgboost_SelectedFeatures training.
Please check ../Model/AutoML_mae_merged_data/errors.md for details.
There was an error during 15_LightGBM_SelectedFeatures training.
Please check ../Model/AutoML_mae_merged_data/errors.md for details.
* Step hill_climbing_1 will try to check up to 13 models
31_CatBoost_GoldenFeatures mae 1.982057 trained in 109.52 seconds
32_CatBoost mae 1.982881 trained in 146.77 seconds
33_CatBoost mae 1.981416 trained in 150.93 seconds
34_CatBoost mae 1.982992 trained in 79.92 seconds
35_Xgboost mae 2.03871 trained in 150.99 seconds
36_Xgboost mae 2.03861 trained in 150.13 seconds
37_Xgboost mae 2.03963 trained in 143.96 seconds
38_Xgboost mae 2.04045 trained in 152.87 seconds
39_LightGBM mae 2.039195 trained in 152.66 seconds
40_LightGBM mae 2.041098 trained in 157.42 seconds
41_LightGBM mae 2.040673 trained in 144.28 seconds
42_LightGBM mae 2.040919 trained in 203.33 seconds
43_LightGBM mae 2.041449 trained in 212.35 seconds
* Step hil

In [24]:
# Predict ECLO for the test frame with the fitted AutoML object.
# NOTE(review): x_train's columns come from Index.difference while test_merge7 keeps
# its own column order — confirm predict() matches columns by name.
preds = automl.predict(test_merge7)

In [25]:
# Build the submission: take the sample submission file, set its ECLO column
# to the predictions, and write it out as a cp949 CSV without the index.
sample_submission = pd.read_csv('../Data/sample_submission.csv')
baseline_submission = sample_submission.assign(ECLO=preds)
baseline_submission.to_csv(
    '../Data/AutoML_mae_merged_data.csv',
    index=False,
    encoding='cp949',
)