In [1]:
# 사용할 패키지 불러오기
import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf) # print all numpy values

# For deep learning model 
import keras
from keras.layers import Dense, Input, concatenate, Dropout
from keras.models import Sequential, Model
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras import metrics
from keras import backend as K

Using TensorFlow backend.


In [2]:
# Prepare the data: load the training set and the test set.
# `categorical` columns are one-hot encoded in the next cell;
# `numerical` columns are used as numeric features / regression targets.
categorical = [
    '발생지시도', '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반',
    '도로형태_대분류', '도로형태', '당사자종별_1당_대분류', '당사자종별_2당_대분류', '주야', '요일',
]
numerical = ['사상자수', '사망자수', '중상자수', '경상자수', '부상신고자수']

train_csv_path = './교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv'
test_csv_path = './test_kor.csv'

# Training data, split by column group.
x_train_num = pd.read_csv(train_csv_path, usecols=numerical, encoding='cp949')
x_train_cat = pd.read_csv(train_csv_path, usecols=categorical, encoding='cp949')

# Test data: the full sheet, plus the same two column groups.
x_test = pd.read_csv(test_csv_path, encoding='cp949')
x_test_num = pd.read_csv(test_csv_path, usecols=numerical, encoding='cp949')
x_test_cat = pd.read_csv(test_csv_path, usecols=categorical, encoding='cp949')

In [3]:
# One-hot encoding

# Pool the categorical columns of train and test so both frames are encoded
# against the same category set and end up with identical dummy columns.
all_data = pd.concat((x_test_cat.dropna(), x_train_cat))

# Give every string column an explicit categorical dtype that lists all pooled
# categories, so get_dummies emits a column even for values absent from a frame.
# `Series.astype('category', categories=...)` has been removed from pandas and
# `np.object` from NumPy, hence CategoricalDtype and the 'object' dtype name.
for column in all_data.select_dtypes(include=['object']).columns:
    pooled_dtype = pd.api.types.CategoricalDtype(categories=all_data[column].unique())
    x_train_cat[column] = x_train_cat[column].astype(pooled_dtype)
    x_test_cat[column] = x_test_cat[column].astype(pooled_dtype)

# One-hot encode every column. Newer pandas defaults to boolean dummies, which
# would turn the later `.values` feature matrices into object arrays; pin the
# historical uint8 output instead.
x_train_cat = pd.get_dummies(data=x_train_cat, dtype=np.uint8)
x_test_cat = pd.get_dummies(data=x_test_cat, dtype=np.uint8)

  
  if __name__ == '__main__':


In [4]:
# Sanity-check the shapes: the two train frames first, then the two test frames.
for frame in (x_train_cat, x_train_num, x_test_cat, x_test_num):
    print(frame.shape)

(25037, 328)
(25037, 5)
(50, 328)
(50, 5)


In [18]:
# Copy values from the completed test sheet into the result file

def save_result(sheet_name_csv, result_file_address='./result_kor.csv'):
    """Fill the '값' column of the result file from the completed test sheet.

    The result file lists cells as ('행', '열') pairs: a 1-based row number and
    a column letter. Each listed cell is looked up in ``sheet_name_csv`` and
    written to '값'; the result file is then rewritten in place.

    Parameters
    ----------
    sheet_name_csv : str
        Path of the cp949-encoded CSV holding the completed test sheet. It is
        read without a header, so its first line counts as row 1.
    result_file_address : str, optional
        Path of the cp949-encoded result CSV with the columns '행' and '열'.
        Defaults to the previously hard-coded './result_kor.csv'.
    """
    # Name the sheet columns 'A'..'P' so cells can be addressed by letter.
    column_letters = [chr(code) for code in range(ord('A'), ord('P') + 1)]
    test_file = pd.read_csv(sheet_name_csv, encoding='cp949', names=column_letters)
    result_file = pd.read_csv(result_file_address, encoding='cp949')
    print(sheet_name_csv)

    rows = result_file['행'].astype('str').astype('int').values
    # Object arrays keep np.c_ from coercing the mixed int/str/float columns
    # and can hold any looked-up value regardless of pandas' string dtype.
    cols = np.array(result_file['열'].astype('str'), dtype=object)
    vals = np.array(
        [test_file[col][int(row) - 1] for row, col in zip(rows, cols)],
        dtype=object,
    )

    with open(result_file_address, 'wb') as f:
        np.savetxt(f, np.c_[rows, cols, vals], delimiter=",", fmt='%s',
                   encoding='cp949', header='행,열,값', comments='')
    print('Save Success')

In [6]:
# Hyper Parameters shared by every model.fit / model.compile call in this notebook
epochs = 1  # epochs per fit; also embedded in the output sheet's file name
batch_size = 128  # batch size passed to model.fit
optimizer = 'adam'  # optimizer identifier passed to model.compile

In [7]:
# Train on a numeric case and return the predicted values

def numeric_case(case, start, end):
    """Train a regression network for the missing numeric columns and predict them.

    Features are the numeric columns not listed in ``case`` plus all one-hot
    categorical columns; targets are the columns in ``case`` except '사상자수'.

    Parameters
    ----------
    case : list of str
        Names of the numeric columns that are missing in the test rows.
    start : int
        Position of the first test row to predict.
    end : int
        Position of the last test row to predict (inclusive).

    Returns
    -------
    numpy.ndarray
        Predictions of shape (end - start + 1, number of targets), with the
        targets in the order of ``case`` minus '사상자수'.
    """
    K.clear_session()

    # Show which case is being handled.
    print("Case:", case)

    # '사상자수' is not learned here; it is excluded from the targets.
    targets = [name for name in case if name != '사상자수']
    if '사상자수' in case:
        print('사상자제거:', targets)

    # Train / test features and training labels.
    X = pd.concat([x_train_num.drop(columns=case), x_train_cat], axis=1).values
    X_test = pd.concat([x_test_num.drop(columns=case), x_test_cat], axis=1).values
    Y = x_train_num[targets].values

    if not targets:
        # Only '사상자수' is missing: there is nothing to learn, and an output
        # layer with zero units cannot be trained meaningfully.
        return np.empty((len(X_test[start:end + 1]), 0))

    n_features = X.shape[1]
    n_targets = Y.shape[1]
    # Hidden width: two thirds of (inputs + outputs). The previous expression
    # `len(X[0] + len(Y[0]))` added the output count to the feature *values*
    # rather than to the feature count, so the outputs were never included.
    hidden_units = int((n_features + n_targets) * 2 / 3)

    # Model definition: three identical Dense + Dropout stages.
    num_input = Input(shape=(n_features,), name='num_input')
    x = num_input
    for _ in range(3):
        x = Dense(hidden_units, activation='relu')(x)
        x = Dropout(0.3)(x)
    num_output = Dense(n_targets, name='num_output')(x)

    model = Model(inputs=num_input, outputs=num_output)

    # Accuracy carries no information for a regression; report MAE instead.
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    callbacks = [
        # Halve the learning rate when validation loss plateaus. 'val_loss'
        # exists in every Keras version, unlike the accuracy metric key.
        ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=0.00000001),
        # Stop early once validation loss stops improving.
        EarlyStopping('val_loss', patience=15),
    ]

    model.fit(X, Y, epochs=epochs, batch_size=batch_size, callbacks=callbacks, validation_split=0.2)

    # Predict the requested test rows.
    return model.predict(X_test[start:end + 1])

In [8]:
# Train on a categorical case and return the predicted labels

def categorical_case(case, start, end):
    """Train a classifier for the missing categorical columns and predict them.

    Features are the one-hot columns of every categorical column not listed in
    ``case`` plus all numeric columns; targets are the one-hot columns of the
    columns in ``case``.

    Parameters
    ----------
    case : list of str
        Names of the categorical columns that are missing in the test rows.
    start : int
        Position of the first test row to predict.
    end : int
        Position of the last test row to predict (inclusive).

    Returns
    -------
    numpy.ndarray
        One row per predicted test row holding, for each column in ``case``
        (same order), the label with the highest score, e.g.
        ['차량단독', '공작물충돌', '안전운전 의무 불이행'].
    """
    # Reset the previous Keras session.
    K.clear_session()

    # Show which case is being handled.
    print("Case:", case)

    # Category labels per target column (e.g. '차대차', '차대사람', '차량단독'),
    # computed once, and the matching one-hot column names
    # (e.g. '사고유형_대분류_차대차').
    labels_by_col = [list(all_data[col].unique()) for col in case]
    col_name = [col + '_' + name
                for col, labels in zip(case, labels_by_col)
                for name in labels]

    # Train / test features and training labels.
    X = pd.concat([x_train_cat.drop(columns=col_name), x_train_num], axis=1).values
    X_test = pd.concat([x_test_cat.drop(columns=col_name), x_test_num], axis=1).values
    Y = x_train_cat[col_name].values

    # Model definition: Dense + Dropout stages, then one sigmoid unit per label.
    cat_input = Input(shape=(X.shape[1],), name='cat_input')
    x = cat_input
    for units in (512, 512, 512, 256):
        x = Dense(units, activation='relu')(x)
        x = Dropout(0.3)(x)
    cat_output = Dense(Y.shape[1], activation='sigmoid', name='cat_output')(x)

    model = Model(inputs=cat_input, outputs=cat_output)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Halve the learning rate when validation loss plateaus. The accuracy
        # key is 'val_acc' or 'val_accuracy' depending on the Keras version;
        # 'val_loss' is always present, so the scheduler cannot silently idle.
        ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=0.00000001),
        # Stop early once validation loss stops improving.
        EarlyStopping('val_loss', patience=15),
    ]

    model.fit(X, Y, epochs=epochs, batch_size=batch_size, callbacks=callbacks, validation_split=0.2)

    # Predict the requested test rows.
    Y_test = model.predict(X_test[start:end + 1])

    # The score vector is the concatenation of one segment per target column;
    # pick the best-scoring label inside each segment.
    result = []
    for scores in Y_test:
        decoded = []
        offset = 0
        for labels in labels_by_col:
            segment = scores[offset:offset + len(labels)]
            decoded.append(labels[int(np.argmax(segment))])
            offset += len(labels)
        result.append(decoded)

    return np.array(result)


In [9]:
# Train on a mixed (categorical + numeric) case and return the predicted values

def mix_case(case, n, start, end):
    """Train one two-headed network for missing categorical and numeric columns.

    Parameters
    ----------
    case : list of str
        Names of the columns that are missing in the test rows.
    n : int
        Number of categorical columns in ``case``. ``n == 1`` uses a softmax
        head with categorical cross-entropy; any other value uses a sigmoid
        head with binary cross-entropy.
    start : int
        Position of the first test row to predict.
    end : int
        Position of the last test row to predict (inclusive).

    Returns
    -------
    numpy.ndarray
        One row per predicted test row: the predicted label of each categorical
        column (in ``case`` order) followed by the numeric predictions (in
        ``case`` order, without '사상자수').
    """
    # Reset the previous Keras session.
    K.clear_session()

    # Show which case is being handled.
    print("Case:", case)

    # Categorical part: target columns, their labels and one-hot column names.
    cat_name = [col for col in case if col in categorical]
    labels_by_col = [list(all_data[col].unique()) for col in cat_name]
    col_name = [col + '_' + name
                for col, labels in zip(cat_name, labels_by_col)
                for name in labels]

    Y_cat = x_train_cat[col_name].values
    X1 = x_train_cat.drop(columns=col_name)
    X_test1 = x_test_cat.drop(columns=col_name)

    # Numeric part: every non-categorical column of the case, in case order.
    num_name = [col for col in case if col not in cat_name]
    print('해당 numberical 컬럼 순서', num_name)

    X2 = x_train_num.drop(columns=num_name)
    X_test2 = x_test_num.drop(columns=num_name)

    # '사상자수' is not learned here; it is excluded from the numeric targets.
    # NOTE(review): if '사상자수' is the only missing numeric column, `targets`
    # is empty and the numeric head gets zero units - confirm this cannot occur.
    targets = [col for col in num_name if col != '사상자수']
    if '사상자수' in case:
        print('사상자제거:', targets)
    Y_num = x_train_num[targets].values

    X = pd.concat([X1, X2], axis=1).values
    X_test = pd.concat([X_test1, X_test2], axis=1).values

    # Model definition: shared trunk, then a categorical and a numeric head.
    cat_input = Input(shape=(X.shape[1],), name='cat_input')
    x = cat_input
    for units in (1024, 512, 512, 512):
        x = Dense(units, activation='relu')(x)
        x = Dropout(0.3)(x)

    # One categorical column is a single-label problem (softmax); several
    # columns are multi-label (sigmoid). Previously only n == 1 and n == 2 were
    # handled, so n >= 3 left `cat_output` undefined and the model uncompiled.
    if n == 1:
        cat_activation, cat_loss = 'softmax', 'categorical_crossentropy'
    else:
        cat_activation, cat_loss = 'sigmoid', 'binary_crossentropy'
    cat_output = Dense(Y_cat.shape[1], activation=cat_activation, name='cat_output')(x)

    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    num_output = Dense(Y_num.shape[1], name='num_output')(x)

    model = Model(inputs=cat_input, outputs=[cat_output, num_output])

    model.compile(optimizer=optimizer,
                  loss={'cat_output': cat_loss, 'num_output': 'mse'},
                  metrics=['accuracy'])

    callbacks = [
        # Halve the learning rate when validation loss plateaus. The former
        # 'cat_output_acc' key tracked training accuracy and is named
        # differently across Keras versions; 'val_loss' is always present.
        ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=0.00000001),
        # Stop early once validation loss stops improving.
        EarlyStopping('val_loss', patience=20),
    ]

    model.fit(X, {'cat_output': Y_cat, 'num_output': Y_num},
              epochs=epochs, batch_size=batch_size, callbacks=callbacks, validation_split=0.2)

    # Predict the requested test rows; the model returns [categorical, numeric].
    Y_test = model.predict(X_test[start:end + 1])

    result = []
    for scores, num in zip(Y_test[0], Y_test[1]):
        decoded = []
        offset = 0
        # The score vector is one segment per categorical column; pick the
        # best-scoring label inside each segment.
        for labels in labels_by_col:
            segment = scores[offset:offset + len(labels)]
            decoded.append(labels[int(np.argmax(segment))])
            offset += len(labels)
        decoded.extend(num)
        result.append(decoded)

    return np.array(result)
    

In [10]:
# Write the predictions of one case into its result row
def setResult(arr, predict, extent, case):
    """Write the predictions of one case into a result row.

    Parameters
    ----------
    arr : numpy.ndarray
        The test row to fill; it is modified in place and returned.
    predict : numpy.ndarray
        2-D prediction array; only its first row is used, one value per cell
        in ``extent`` after the cell at column 3 has been skipped.
    extent : list of [row, col]
        Cells that are missing in the row. It is no longer modified.
    case : list of str
        Names of the missing columns.

    Returns
    -------
    numpy.ndarray
        ``arr`` with the predictions written in.
    """
    result_arr = arr
    print('setResult 호출')

    # Column 3 ('사상자수' in the sheet header shown in this notebook) is never
    # predicted directly, so its cell is skipped. Build a filtered list instead
    # of calling extent.remove() inside a loop over extent: that skipped the
    # following element and mutated the caller's list.
    cells = [cell for cell in extent if cell[1] != 3]

    for i, cell in enumerate(cells):
        result_arr[cell[1]] = predict[0, i]

    # The value at column 3 is recomputed as column 2 plus the sum of
    # columns 4-6, i.e. from the other count columns.
    if '사상자수' in case:
        result_arr[2:7] = np.asarray(result_arr[2:7], dtype='float64')
        result_arr[3] = result_arr[2] + result_arr[4:7].sum()

    print('result_arr 결과:', result_arr)

    return result_arr
    

In [11]:
np_x_test = x_test.isnull().values
x_test_col_name = x_test.columns.values

# For each test row collect the names of its missing columns (Case_ex) and the
# [row, col] positions of the missing cells (label_ex).
Case_ex = []
label_ex = []
for row, missing_mask in enumerate(np_x_test):
    missing_cols = [col for col, is_missing in enumerate(missing_mask) if is_missing]
    Case_ex.append([x_test_col_name[col] for col in missing_cols])
    label_ex.append([[row, col] for col in missing_cols])

# Fill in each row: train a model for that row's set of missing columns, predict
# the missing values and write them into a copy of the row. Rows are gathered in
# a list and stacked once at the end (np.append in a loop copies the whole
# array on every iteration).
sheet_rows = [x_test_col_name]
for idx, (case, cell) in enumerate(zip(Case_ex, label_ex)):
    print(idx)
    print(cell)

    if not case:
        # Nothing is missing in this row; keep it unchanged. Previously this
        # fell into the numeric branch and failed on cell[0][0].
        answer = x_test.loc[idx].values
        print()
        sheet_rows.append(answer)
        continue

    num = sum(1 for one_case in case if one_case in numerical)
    cat = sum(1 for one_case in case
              if one_case not in numerical and one_case in categorical)
    first_row, last_row = cell[0][0], cell[-1][0]

    if len(case) == num:
        # Only numeric columns are missing.
        Case_prediction = numeric_case(case, first_row, last_row)
        target_cells = cell
    elif len(case) == cat:
        # Only categorical columns are missing.
        Case_prediction = categorical_case(case, first_row, last_row)
        target_cells = cell
    else:
        print(cell)

        # mix_case returns the categorical predictions first, then the numeric
        # ones; order the target cells the same way.
        temp_cat = [one_cell for one_case, one_cell in zip(case, cell) if one_case in categorical]
        temp_num = [one_cell for one_case, one_cell in zip(case, cell) if one_case in numerical]
        temp_cat.extend(temp_num)
        print(temp_cat)

        Case_prediction = mix_case(case, cat, first_row, last_row)
        target_cells = temp_cat

    print("예측값 출력: ", Case_prediction)
    answer = setResult(x_test.loc[first_row].values, Case_prediction, target_cells, case)

    print()
    sheet_rows.append(answer)

# Header row followed by one completed row per test row.
new_sheet = np.array(sheet_rows, dtype=object)

0
[[0, 2], [0, 3], [0, 5]]
Case: ['사망자수', '사상자수', '경상자수']
사상자제거: ['사망자수', '경상자수']
Train on 20029 samples, validate on 5008 samples
Epoch 1/1
예측값 출력:  [[0.92062664 0.4886908 ]]
setResult 호출
result_arr 결과: ['야간' '금' 0.9206266403198242 1.409317433834076 0.0 0.4886907935142517 0.0
 '경기' '화성시' '차대차' '측면충돌' '중앙선 침범' '단일로' '기타단일로' '승용차' '승합차']

1
[[1, 2], [1, 3], [1, 5]]
Case: ['사망자수', '사상자수', '경상자수']
사상자제거: ['사망자수', '경상자수']
Train on 20029 samples, validate on 5008 samples
Epoch 1/1
예측값 출력:  [[0.90263057 0.06199617]]
setResult 호출
result_arr 결과: ['야간' '금' 0.9026305675506592 0.9646267332136631 0.0 0.06199616566300392
 0.0 '전남' '영암군' '차대사람' '차도통행중' '과속' '단일로' '기타단일로' '승용차' '보행자']

2
[[2, 3], [2, 4], [2, 6]]
Case: ['사상자수', '중상자수', '부상신고자수']
사상자제거: ['중상자수', '부상신고자수']
Train on 20029 samples, validate on 5008 samples
Epoch 1/1
예측값 출력:  [[0.04967534 0.01536297]]
setResult 호출
result_arr 결과: ['야간' '월' 1.0 1.065038312226534 0.049675337970256805 0.0
 0.015362974256277084 '전남' '곡성군' '차량단독' '전도전복' '안전운전 의무

In [12]:
# Display the completed sheet: the header row followed by one filled-in row per test row.
new_sheet

array([['주야', '요일', '사망자수', '사상자수', '중상자수', '경상자수', '부상신고자수', '발생지시도',
        '발생지시군구', '사고유형_대분류', '사고유형_중분류', '법규위반', '도로형태_대분류', '도로형태',
        '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
       ['야간', '금', 0.9206266403198242, 1.409317433834076, 0.0,
        0.4886907935142517, 0.0, '경기', '화성시', '차대차', '측면충돌', '중앙선 침범',
        '단일로', '기타단일로', '승용차', '승합차'],
       ['야간', '금', 0.9026305675506592, 0.9646267332136631, 0.0,
        0.06199616566300392, 0.0, '전남', '영암군', '차대사람', '차도통행중', '과속',
        '단일로', '기타단일로', '승용차', '보행자'],
       ['야간', '월', 1.0, 1.065038312226534, 0.049675337970256805, 0.0,
        0.015362974256277084, '전남', '곡성군', '차량단독', '전도전복', '안전운전 의무 불이행',
        '단일로', '기타단일로', '자전거', '없음'],
       ['야간', '일', 2.0, 4.477096319198608, 1.2662509679794312, 1.0,
        0.21084535121917725, '대구', '달성군', '차대차', '측면충돌', '중앙선 침범', '단일로',
        '기타단일로', '승용차', '승합차'],
       ['주간', '목', 1.0, 1.794167935848236, 0.40548306703567505,
        0.38868486881256104, 0.0, '전남', '고흥군', '차대차'

In [13]:
# Write the completed sheet to a cp949 CSV whose file name records the epoch count.
sheet_name_csv = './test_kor_{}.csv'.format(epochs)
with open(sheet_name_csv, 'wb') as sheet_file:
    np.savetxt(sheet_file, new_sheet, fmt='%s', delimiter=",", encoding='cp949')

In [19]:
# Copy the cells listed in the result file out of the sheet written above.
save_result(sheet_name_csv)

./test_kor_1.csv
Save Success
