In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder, MultiLabelBinarizer

In [3]:
categorical = ['주야', '요일', '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', 
            '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
numerical = ['사상자수', '사망자수', '중상자수', '경상자수','부상신고자수']

x_train_num = pd.read_csv('./교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv',encoding='cp949', 
                              usecols=numerical)

x_train_cat = pd.read_csv('./교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv',encoding='cp949',
                               usecols=categorical)


In [4]:
x_test_num = pd.read_csv('./test_kor.csv',encoding='cp949', 
                              usecols=numerical)

x_test_cat = pd.read_csv('./test_kor.csv',encoding='cp949',
                               usecols=categorical)

### One Hot Encdoing을 나열해서 만드는 방법

In [5]:
all_data = pd.concat((x_test_cat.dropna(),x_train_cat))
# for col in all_data.select_dtypes(include=[np.object]).columns:
#     print(col, all_data[col].unique())

In [6]:
for column in all_data.select_dtypes(include=[np.object]).columns:
    x_train_cat[column] = x_train_cat[column].astype('category', categories = all_data[column].unique())
    x_test_cat[column] = x_test_cat[column].astype('category', categories = all_data[column].unique())

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
x_train_cat = pd.get_dummies(data=x_train_cat)
x_test_cat = pd.get_dummies(data=x_test_cat)

In [8]:
print(x_train_cat.shape)
print(x_test_cat.shape)

(25037, 328)
(50, 328)


# Deep Learning

In [9]:
import keras
from keras.layers import Dense, Input, LSTM, concatenate, Dropout, Conv2D, MaxPool2D, Embedding, Reshape, Conv1D
from keras.models import Sequential, Model
from keras.utils import np_utils
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras import metrics
from keras import backend as K

Using TensorFlow backend.


In [10]:
print(x_train_cat.shape)
print(x_train_num.shape)
print(x_test_cat.shape)
print(x_test_num.shape)

(25037, 328)
(25037, 5)
(50, 328)
(50, 5)


In [11]:
# Numeric cases
Case1 : ['사망자수','사상자수','경상자수']
Case2 = ['사상자수', '중상자수', '부상신고자수']
Case3 = ['사상자수', '중상자수', '경상자수' ]
Case4 = ['사망자수', '사상자수', '중상자수' ]

# Categorical cases
Case5 = ['사고유형_대분류', '사고유형_중분류', '법규위반']
Case6 = ['도로형태_대분류', '도로형태', '당사자종별_1당_대분류']
Case7 = ['도로형태_대분류', '도로형태', '당사자종별_2당_대분류']
Case8 = ['도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류']
Case11 = ['발생지시도', '발생지시군구']
Case12 = ['요일', '사고유형_대분류', '사고유형_중분류']
Case13 = ['요일', '사고유형_중분류', '법규위반', '도로형태_대분류']

# Mixed cases
Case9 = ['사망자수', '사상자수', '발생지시군구']
Case10 = ['중상자수', '경상자수', '발생지시군구']
Case14 = ['사망자수', '사상자수', '주야', '당사자종별_1당_대분류']
Case15 = ['사상자수', '중상자수', '주야', '도로형태']

Cases = [Case1, Case2, Case3, Case4, Case5, Case6, Case7, Case8, Case11, Case12, Case13,
        Case9, Case10, Case14, Case15]

In [12]:
# 수치형 데이터 Case 함수

def numeric_case(case, start, end):
    
    K.clear_session()
    
    print("Case:", case)
    X = x_train_num.drop(columns=case)
    X_test = x_test_num.drop(columns=case)
    
    case_copy=case.copy()
    if '사상자수' in case:       
        case_copy.remove('사상자수')
    print('사상자제거:', case_copy)
    Y = x_train_num[case_copy].values
    
    # 수치형 데이터와 범주형 데이터 합치기
    X = pd.concat([X, x_train_cat], axis=1).values
    X_test = pd.concat([X_test, x_test_cat],axis=1).values
    
    print(X)
    print(Y)
    
    # 모델 정의
    num_input = Input(shape=(len(X[0]),), name='num_input')
    x = Dense(512, activation='relu')(num_input)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x)
    num_output = Dense(len(Y[0]), name='num_output')(x)

    model = Model(inputs=num_input, outputs=num_output)

    model.compile(optimizer='sgd',
                  loss='mse',
                  metrics=['accuracy'])



    
    
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                        patience=25, 
                                        verbose=1, 
                                        factor=0.5, 
                                        min_lr=0.00000001)

    callbacks = [
    #         learning_rate_reduction, # learning_rate를 점차 감소시킴으로서 최적값에 접근하는 방식
        EarlyStopping('val_loss', patience=5)# val_loss이 최적값에서 멀어지는 순간 epoch가 남아도 학습 중지
    ]
    
    history = model.fit(X, Y, epochs=50, batch_size=128, callbacks=callbacks,validation_split=0.2 )
    
    
    # make a prediction
    Y_test = model.predict(X_test[start:end+1])
    
    # show the inputs and predicted outputs
    print("X=%s, Predicted=%s" % (X_test[range(start, end+1)],  Y_test ))
    del model
    
    return Y_test

In [98]:
# 범주형 데이터 Case 함수

def categorical_case(case, start, end):
    col_name = [] # ex. '사고유형_대분류_차대차', '사고유형_대분류_차대사람', '사고유형_대분류_차량단독'
    label_name = [] #  ex. '차대차', '차대사람', '차량단독
    
    for col in case:
        label_name.extend(all_data[col].unique()) 
        for name in all_data[col].unique():
            col_name.append(col+'_'+name)
    
    print('col_name:',col_name)
    print('label_name:', label_name)
                
    Y = x_train_cat[col_name].values
    X = x_train_cat.drop(columns=col_name)
    X = pd.concat([X, x_train_num], axis=1).values

    X_test = x_test_cat.drop(columns=col_name)
    X_test = pd.concat([X_test, x_test_num],axis=1).values

    print(X.shape)
    print(Y.shape)

    cat_input = Input(shape=(len(X[0]),), name='cat_input')
    x = Dense(512, activation='relu')(cat_input)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x)
    cat_output = Dense(len(Y[0]), activation='sigmoid', name='cat_output')(x)

    model = Model(inputs=cat_input, outputs=cat_output)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                                patience=25, 
                                                verbose=1, 
                                                factor=0.5, 
                                                min_lr=0.00000001)

    callbacks = [
        learning_rate_reduction, # learning_rate를 점차 감소시킴으로서 최적값에 접근하는 방식
        EarlyStopping('val_loss', patience=10), # val_loss이 최적값에서 멀어지는 순간 epoch가 남아도 학습 중지
        ]

    history = model.fit(X, Y, epochs=50, batch_size=128, callbacks=callbacks,validation_split=0.2 )
    
    
    # make a prediction
    Y_test = model.predict(X_test[range(start, end+1)])
    
    # show the inputs and predicted outputs
#     print("X=%s, Predicted=%s" % (X_test[range(start, end+1)],  Y_test ))
    
            
    result = []
    '''
    예시 출력:
        사고유형_대분류 : 차량단독
        사고유형_중분류 : 공작물충돌
        법규위반 : 안전운전 의무 불이행
    '''
    for val in Y_test: 
        x_list = list(val)
        label_name_x = label_name.copy()
        temp = []
        for col in case:
            print(col, ':', label_name_x[x_list.index(max(x_list[0:len(all_data[col].unique())]))] )
            temp.append(label_name_x[x_list.index(max(x_list[0:len(all_data[col].unique())]))])
            del x_list[:len(all_data[col].unique())]
            del label_name_x[:len(all_data[col].unique())]
        result.append(temp)
        print()
    
    return np.array(result)


In [14]:
# 믹스형 데이터 Case 함수

def mix_case(case, n, start, end):
    '''
    case: Case에 해당되는 컬럼이 담긴 배열
    n: 범주형 데이터 수
    start: 해당 Case 테스트의 시작 인덱스
    end: 해당 Case 테스트의 마지막 인덱스
    '''
    
    case_copy=case.copy()
    
    # categorical
    col_name = []
    label_name = []
    cat_name = case_copy[-n:]
    
    for col in case_copy[-n:]:
        label_name.extend(all_data[col].unique()) 
        for name in all_data[col].unique():
            col_name.append(col+'_'+name)
    
    Y_cat = x_train_cat[col_name].values
    X1 = x_train_cat.drop(columns=col_name)
    X_test1 = x_test_cat.drop(columns=col_name)
    
    # categorical columns 삭제
    del case_copy[-n:]
    
    # numerical
    X2 = x_train_num.drop(columns=case_copy)
    X_test2 = x_test_num.drop(columns=case_copy)
    if '사상자수' in case: 
        case_copy.remove('사상자수')
    Y_num = x_train_num[case_copy].values
    
    X = pd.concat([X1, X2], axis=1).values
    X_test = pd.concat([X_test1, X_test2],axis=1).values
    print(Y_num.shape)
    print(len(Y_num[0]))
    
    cat_input = Input(shape=(len(X[0]),), name='cat_input')
    x = Dense(512, activation='relu')(cat_input)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    cat_output = Dense(len(Y_cat[0]), activation='softmax', name='cat_output')(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    num_output = Dense(len(Y_num[0]), name='num_output')(x)

    model = Model(inputs=cat_input, outputs=[cat_output, num_output])

    model.compile(optimizer='adam',
                  loss={'cat_output': 'categorical_crossentropy', 'num_output': 'mse'},
                  metrics=['accuracy'])
    
    
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=25, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00000001)

    callbacks = [
        learning_rate_reduction, # learning_rate를 점차 감소시킴으로서 최적값에 접근하는 방식
        EarlyStopping('val_loss', patience=20), # val_loss이 최적값에서 멀어지는 순간 epoch가 남아도 학습 중지
    ]

    history = model.fit(X, {'cat_output':Y_cat, 'num_output':Y_num}, epochs=50, batch_size=128, callbacks=callbacks,validation_split=0.2 )
    
    
    # make a prediction
    Y_test = model.predict(X_test[start:end+1])
    
    # show the inputs and predicted outputs
    print("X=%s, Predicted=%s" % (X_test[start:end+1],  Y_test ))
    
    
    
    for val in Y_test[0]:
        x_list = list(val)
        label_name_x = label_name.copy()
        for col in cat_name:
            print(col, ':', label_name_x[x_list.index(max(x_list[0:len(all_data[col].unique())]))] )
            # 출력한 Column과 데이터 삭제
            del x_list[:len(all_data[col].unique())]
            del label_name_x[:len(all_data[col].unique())]
        print()
        
    for num in Y_test[1]:
        print(case_copy, ':', num)
    
    

In [18]:
Case1_pre = numeric_case(Case1, 0,1)

Case: ['사망자수', '사상자수', '경상자수']
사상자제거: ['사망자수', '경상자수']
[[0 0 1 ... 0 0 0]
 [2 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [5 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
[[1 0]
 [1 1]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]
Train on 20029 samples, validate on 5008 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
X=[[0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0

In [21]:
Case1_pre

array([[ 0.99262625,  0.35796607],
       [ 0.9594538 , -0.0098708 ]], dtype=float32)

In [43]:
# set result array to each cases
def setResult(arr, predict, case):
    result_arr = arr
    
    
    if case == [0,1]:
        for row in case:
            result_arr[row, 2] = predict[row, 0] 
            result_arr[row, 5] = predict[row, 1] 
            result_arr[:, 3] = result_arr[row, 2].sum() + result_arr[row, 4:7].sum()
            print(result_arr)

In [44]:
x_test = pd.read_csv('./test_kor.csv',encoding='cp949')

# setResult(each case array, predict array, start to end in each case):
setResult(x_test.loc[:1].values, Case1_pre, [0,1])

[['야간' '금' 0.99262625 1.350592315196991 0.0 0.35796607 0.0 '경기' '화성시'
  '차대차' '측면충돌' '중앙선 침범' '단일로' '기타단일로' '승용차' '승합차']
 ['야간' '금' nan 1.350592315196991 0.0 nan 0.0 '전남' '영암군' '차대사람' '차도통행중'
  '과속' '단일로' '기타단일로' '승용차' '보행자']]
[['야간' '금' 0.99262625 0.9495830247178674 0.0 0.35796607 0.0 '경기' '화성시'
  '차대차' '측면충돌' '중앙선 침범' '단일로' '기타단일로' '승용차' '승합차']
 ['야간' '금' 0.9594538 0.9495830247178674 0.0 -0.009870796 0.0 '전남' '영암군'
  '차대사람' '차도통행중' '과속' '단일로' '기타단일로' '승용차' '보행자']]


In [165]:
# set result array to each cases
def setResult(arr, predict, case):
    result_arr = arr
    
    if case == [int(x) for x in range(10,20)]:
        for row in range(len(predict)):
            result_arr[row, 9] = predict[row, 0] 
            result_arr[row, 10] = predict[row, 1] 
            result_arr[row, 11] = predict[row, 2] 
            
    elif case == [int(x) for x in range(20,23)]:
        for row in range(len(predict)):
            result_arr[row, 12] = predict[row, 0] 
            result_arr[row, 13] = predict[row, 1] 
            result_arr[row, 14] = predict[row, 2] 
            
    elif case == [int(x) for x in range(23,26)]:
        for row in range(len(predict)):
            result_arr[row, 12] = predict[row, 0] 
            result_arr[row, 13] = predict[row, 1] 
            result_arr[row, 15] = predict[row, 2] 
    
    elif case == [int(x) for x in range(26,30)]:
        for row in range(len(predict)):
            result_arr[row, 12] = predict[row, 0] 
            result_arr[row, 13] = predict[row, 1] 
            result_arr[row, 14] = predict[row, 2] 
            result_arr[row, 15] = predict[row, 3] 
    
    elif case == [int(x) for x in range(35,40)]:
        for row in range(len(predict)):
            result_arr[row, 7] = predict[row, 0] 
            result_arr[row, 8] = predict[row, 1] 
        
        
    elif case == [int(x) for x in range(40,42)]:
        for row in range(len(predict)):
            result_arr[row, 1] = predict[row, 0] 
            result_arr[row, 9] = predict[row, 1] 
            result_arr[row, 10] = predict[row, 2] 
        
    elif case == [int(x) for x in range(42,45)]:
        for row in range(len(predict)):
            result_arr[row, 1] = predict[row, 0] 
            result_arr[row, 10] = predict[row, 1] 
            result_arr[row, 11] = predict[row, 2] 
            result_arr[row, 12] = predict[row, 3] 
            
            
    print(result_arr)

In [166]:
setResult(x_test.loc[42:44].values, Case13_pre, [int(x) for x in range(42,45)])

[['주간' '수' 1.0 1.0 0.0 0.0 0.0 '경기' '부천시' '차대차' '추돌' '안전운전 의무 불이행' '교차로'
  '교차로내' '승합차' '이륜차']
 ['주간' '월' 1.0 1.0 0.0 0.0 0.0 '대전' '동구' '차량단독' '공작물충돌' '안전운전 의무 불이행'
  '단일로' '기타단일로' '특수차' '없음']
 ['야간' '토' 2.0 2.0 0.0 0.0 0.0 '충남' '보령시' '차대사람' '횡단중' '안전운전 의무 불이행'
  '단일로' '기타단일로' '승용차' '보행자']]
