## import package

In [None]:
# import package
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, LSTM, Dense, Dropout, LeakyReLU
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
import datetime
import inspect
from tensorflow.keras.layers import ELU

## train, valid 데이터 불러오기

In [40]:
# Load the training table and discard the two leftover index columns
train_raw_df = pd.read_csv('e:/kma/data/TRAIN_nottree_with_파생변수0802.csv', encoding='utf8')
train_raw_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)
col_list = train_raw_df.columns

# Load the test table and keep only the training columns, in the same order
test_raw_df = pd.read_csv('e:/kma/data/TEST_with_파생변수0802.csv', encoding='utf8')
test_raw_df = test_raw_df[col_list]

# Stack train on top of test with a fresh 0..n-1 index, then parse the date
# column (cast to str first so pd.to_datetime receives strings)
full_raw_df = pd.concat([train_raw_df, test_raw_df], ignore_index=True)
full_raw_df['yyyymmdd'] = pd.to_datetime(full_raw_df['yyyymmdd'].astype('str'))
full_raw_df

# Distinct values of the 'add' column (iterated as `sido` further down)
add_list = set(full_raw_df['add'])
# Candidate predictors: every column except the date, the keys and the target
indep_cols = full_raw_df.columns.difference(['yyyymmdd', 'add', 'sex', 'frequency'])

In [None]:
# Inspect the distribution of the target for a single (sido, sex) group
sido = '서울'
sex = 1
condition = (full_raw_df['add'] == sido) & (full_raw_df['sex'] == sex)
tmp = full_raw_df[condition].copy()

# Bar chart of how many rows fall on each frequency value
import seaborn as sns
sns.set_theme(style="darkgrid")
ax = sns.countplot(x="frequency", data=tmp)

# Share of rows per frequency value (row counts divided by the total)
frq_size = tmp.groupby(['frequency']).size()
frq_ratio = frq_size.div(frq_size.sum())
frq_ratio

## def

In [None]:
# Use a Korean-capable font so Hangul in plot titles renders correctly
from matplotlib import font_manager, rc

font_path = "C:/Windows/Fonts/NGULIM.TTF"
# Look up the family name stored in the font file and make it the default
font_props = font_manager.FontProperties(fname=font_path)
font = font_props.get_name()
rc('font', family=font)


# Plot actual vs. predicted values over time and save the figure
def graph(actual, pred, model_nm, val_or_test='val'):
    """Plot `actual` and `pred` against dates, save the figure as PNG, and show it.

    The x axis is taken from the module-level frame `tmp`; the title and the
    output file name also read the module-level `sido`, `sex` and
    `nowDatetime`, so those must be set before calling.

    Args:
        actual: Values drawn as the 'actual' line.
        pred: Values drawn as the 'prediction' line. The line is skipped
            when `pred` equals `actual` element-wise.
        model_nm: Model name used in the title and in the file name.
        val_or_test: 'val' uses the dates of tmp[-731:-366]; 'test' uses
            the dates of tmp[-366:]. Also selects the output sub-directory.

    Raises:
        ValueError: If `val_or_test` is neither 'val' nor 'test'.
    """
    if val_or_test == 'val':
        x_var = tmp[-731:-366].yyyymmdd
    elif val_or_test == 'test':
        x_var = tmp[-366:].yyyymmdd
    else:
        # Fail before any figure is created; previously this branch only
        # printed 'error' and then crashed on the unbound x_var below.
        raise ValueError(
            "val_or_test must be 'val' or 'test', got {0!r}".format(val_or_test))

    plt.figure(figsize=(12, 6))
    plt.xlim([min(x_var), max(x_var)])
    plt.plot(x_var, actual, color="dodgerblue", marker='o', markersize=1.5, label='actual', linewidth=0.3)
    if np.array_equal(actual, pred):
        # Same series passed twice: nothing extra to draw
        print(' ')
    else:
        plt.plot(x_var, pred, color="violet", marker='o', markersize=1, label='prediction', linewidth=0.2)
    plt.grid(color='grey', linestyle='--', linewidth=0.1)
    plt.legend(loc='best', markerscale=3)
    plt.title('{0}, {1}, {2}'.format(sido, sex, model_nm))
    plt.tight_layout()
    plt.savefig('e:/kma/image/{0}/{1}_{2}_{3}_{4}.png'.format(val_or_test, sido, sex, model_nm, nowDatetime),
                facecolor='#eeeeee',
                edgecolor='black',
                format='png', dpi=200)
    plt.show()

In [None]:
# Record the model definition next to the saved model
def record_model_setting(func_nm):
    """Write the source code of `func_nm` to a text file.

    The file name is built from the module-level `model_nm`, `sido`, `sex`
    and `nowDatetime`, so those must be set before calling.

    Args:
        func_nm: The function object (not its name) whose source is saved;
            it is passed to inspect.getsource.
    """
    get_source = inspect.getsource(func_nm)

    path = 'e:/kma/model/{0}_{1}_{2}_{3}.txt'.format(model_nm, sido, sex, nowDatetime)
    # `with` closes the handle even if the write fails; an explicit encoding
    # keeps non-ASCII source text writable regardless of the OS default.
    with open(path, "w", encoding="utf-8") as file:
        file.write(get_source)

In [68]:
# 모델 설정

from tensorflow.keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    Built from Keras backend ops so it can be passed to model.compile as a
    metric.
    """
    squared_diff = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_diff)
    return K.sqrt(mean_squared)


def make_multiclass_nn(x, y):
    """Build an (uncompiled) dense softmax classifier sized from `x` and `y`.

    Args:
        x: Feature matrix; x.shape[1] sets the input width.
        y: Target matrix; y.shape[1] sets the number of softmax outputs.

    Returns:
        A Sequential model; the caller is responsible for compiling it.
    """
    n_features = x.shape[1]
    n_classes = y.shape[1]

    # Same layers, in the same order, handed to Sequential as one list
    layers = [
        Dense(256, input_dim=n_features, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        # Linear Dense followed by a separate ELU activation layer
        Dense(128),
        ELU(alpha=0.05),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ]
    return Sequential(layers)

In [69]:
# Predictors: minimum temperature + minimum pressure + weekday dummies
# (six weekday columns are listed; there is no weekday_Sun entry)
indep_cols = [
    'min_ta_x',
    'min_ta_y',
    'min_ps',
    'weekday_Fri',
    'weekday_Mon',
    'weekday_Sat',
    'weekday_Thu',
    'weekday_Tue',
    'weekday_Wed',
]

## 시도별 반복 수행

In [None]:
# Timestamp shared by every artifact (models, plots, CSVs) written by this run
now = datetime.datetime.now()
nowDatetime = now.strftime('%m%d_%H%M')

# Per-group outputs are collected in lists and concatenated once after the
# loop instead of growing a DataFrame on every iteration.
result_frames = []      # one-row frames: sido, sex, validation rmse
LB_frames = []          # test-period predictions per (sido, sex)

# NOTE: `sido`, `sex`, `tmp`, `model_nm` and `nowDatetime` are read as module
# globals by graph() and record_model_setting(), so they must stay
# module-level names.
for sido in add_list:
    for sex in (1, 2):
        print("============================", sido, sex, "============================")
        condition = (full_raw_df['add'] == sido) & (full_raw_df['sex'] == sex)
        # Filter first, then copy: avoids copying the whole frame per group
        tmp = full_raw_df[condition].copy()
        tmp.reset_index(level=0, inplace=True, drop=True)

        # Encode the target as class indices, then one-hot encode them
        le = preprocessing.LabelEncoder()
        Y = le.fit_transform(tmp['frequency'])
        Y_encoded = tf.keras.utils.to_categorical(Y)

        # Standardise the predictors.
        # NOTE(review): the scaler is fitted on all rows of the group,
        # including the validation and test periods — confirm this is intended.
        scaler_x = StandardScaler()
        scaled_df = scaler_x.fit_transform(tmp[indep_cols])
        scaled_df = pd.DataFrame(scaled_df, columns=indep_cols)

        # Split by position: last 366 rows = test, the 365 rows before
        # that = validation, everything earlier = training
        x_train = scaled_df[0:-731]
        y_train = Y_encoded[0:-731]

        x_valid = scaled_df[-731:-366]
        y_valid = Y_encoded[-731:-366]

        x_test = scaled_df[-366:]

        # Build the model and store its definition alongside it
        model_nm = "multiclass_nn"
        model = make_multiclass_nn(x_train, y_train)
        record_model_setting(make_multiclass_nn)

        # Compile
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=[root_mean_squared_error])

        # Train, stopping once val_loss has not improved for 3 epochs
        early_stop = EarlyStopping(monitor='val_loss', patience=3)
        history = model.fit(x_train, y_train,
                            validation_data=(x_valid, y_valid),
                            epochs=10, batch_size=128,
                            verbose=1,
                            callbacks=[early_stop])

        model.save('e:/kma/model/{0}_{1}_{2}_{3}.h5'.format(model_nm, sido, sex, nowDatetime))

        # evaluate() returns [loss, metric]; the only compiled metric is
        # root_mean_squared_error, so label it as such (it was printed as
        # "Accuracy" before). It is computed over every row of the group.
        print("\n RMSE metric (all rows): %.4f" % (model.evaluate(scaled_df, Y_encoded)[1]))

        # Predict the validation period. argmax yields the class *index*;
        # inverse_transform maps it back to the original frequency value,
        # which differs from the index whenever the labels are not 0..k-1.
        pred_valid = le.inverse_transform(np.argmax(model.predict(x_valid), axis=1))

        # Plot actual vs. predicted for the validation period
        actual_valid = tmp.frequency[-731:-366]
        graph(actual_valid, pred_valid, model_nm)

        # Validation RMSE. np.sqrt(MSE) is used instead of the
        # `squared=False` argument, which newer scikit-learn no longer accepts.
        rmse = np.sqrt(mean_squared_error(actual_valid, pred_valid))
        print(rmse)
        result_frames.append(
            pd.DataFrame([[sido, sex, rmse]], columns=['sido', 'sex', 'rmse']))

        # 2016 test ---------------------------------------------
        pred_test = le.inverse_transform(np.argmax(model.predict(x_test), axis=1))
        graph(tmp.frequency[-366:], pred_test, model_nm + '_LB', 'test')

        LB_frames.append(
            pd.DataFrame({'yyyymmdd': tmp[-366:].yyyymmdd, 'sido': sido, 'sex': sex, 'frequency': pred_test}))
        #-------------------------------------------------------

# Combine the per-group frames (fall back to empty frames with the expected
# columns if the loop did not run)
result = (pd.concat(result_frames) if result_frames
          else pd.DataFrame(columns=['sido', 'sex', 'rmse']))
LB_result = (pd.concat(LB_frames) if LB_frames
             else pd.DataFrame(columns=['yyyymmdd', 'sido', 'sex', 'frequency']))

# Save the validation RMSE per group
result.to_csv('e:/kma/rmse/{0}_{1}.csv'.format(model_nm, nowDatetime), encoding = 'utf-8-sig')


# Save the 2016 test-set predictions
LB_result.to_csv('e:/kma/data/leaderboard/{0}_{1}.csv'.format(model_nm, nowDatetime), encoding = 'utf-8-sig')

In [None]:
# 6 Training-history plot (metric)
# The model is compiled with root_mean_squared_error as its only metric, so
# that is the curve shown here; previously this cell re-plotted the loss and
# duplicated the loss cell. `history` holds the most recently fitted model only.
metric_key = 'root_mean_squared_error'
plt.plot(history.history[metric_key])
plt.plot(history.history['val_' + metric_key])
plt.title('Model RMSE')
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.legend(['Train', 'Valid'], loc='upper left')
plt.show()


In [None]:

# 7 Training-history plot (loss)
# `history` holds the most recently fitted model only.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
# val_loss comes from the validation_data passed to fit(), so the second
# curve is labelled 'Valid' (it was mislabelled 'Test').
plt.legend(['Train', 'Valid'], loc='upper left')
plt.show()

## 리더보드 제출

In [None]:
# Inputs for the leaderboard submission

# Load the evaluation table (cp949-encoded), drop the leftover index column
# and the target column, and parse the date column
raw_df = pd.read_csv('e:/kma/data/TEST_tree_with_파생변수_euc-kr0725.csv', encoding='cp949')
raw_df = raw_df.drop(columns=['Unnamed: 0', 'frequency'])
raw_df['yyyymmdd'] = pd.to_datetime(raw_df['yyyymmdd'].astype('str'))
# Disabled alternatives kept for reference:
# raw_df = pd.get_dummies(raw_df, columns=['weekday', 'season'])
# indep_cols = raw_df.columns.difference(['yyyymmdd', 'add', 'sex', 'frequency'])

# Order of the areas in the evaluation data set
area_list = (
    '경기', '강원', '인천', '충북', '서울', '광주',
    '경북', '대구', '충남', '세종', '전남', '경남',
    '전북', '대전', '울산', '부산', '제주',
)

In [None]:
# Set nowDatetime by hand to reuse models saved by an earlier run, e.g.:
# nowDatetime = '0725_0155'

# Per-group predictions are collected here and concatenated once at the end
# (DataFrame.append is not available in pandas >= 2.0).
prediction_frames = []

# NOTE: `sido`, `sex` and `tmp` are read as module globals by graph().
for sido in area_list:
    for sex in (1, 2):
        print("============================", sido, sex, "============================")
        condition = (raw_df['add'] == sido) & (raw_df['sex'] == sex)
        # Filter first, then copy: avoids copying the whole frame per group
        tmp = raw_df[condition].copy()

        # Standardise the predictors.
        # NOTE(review): this fits a new scaler on the evaluation rows only,
        # not the scaler used when the model was trained — confirm intended.
        scaler_x = StandardScaler()
        scaled_df = scaler_x.fit_transform(tmp[indep_cols])
        scaled_df = pd.DataFrame(scaled_df, columns=indep_cols)

        # Load the model saved for this (sido, sex) and run timestamp
        model_nm = 'multiclass_nn'
        model = load_model('e:/kma/model/{0}_{1}_{2}_{3}.h5'.format(model_nm, sido, sex, nowDatetime))

        # Predicted class index per row.
        # NOTE(review): this is the softmax index; it equals the frequency
        # value only if the training labels were exactly 0..k-1 — confirm.
        pred = np.argmax(model.predict(scaled_df), axis=1)

        # Plot the predictions. 'test' makes graph() use the dates of
        # tmp[-366:]; the default 'val' slice (tmp[-731:-366]) can never have
        # one date per prediction, so the plot call could not succeed.
        # Passing pred twice draws a single line.
        graph(pred, pred, model_nm + '_LB', 'test')

        prediction_frames.append(
            pd.DataFrame({'yyyymmdd': tmp.yyyymmdd, 'sido': sido, 'sex': sex, 'frequency': pred}))

# Combine all groups (empty frame with the expected columns if none ran)
result = (pd.concat(prediction_frames) if prediction_frames
          else pd.DataFrame(columns=['yyyymmdd', 'sido', 'sex', 'frequency']))

# Save the predictions
result.to_csv('e:/kma/data/leaderboard/{0}_{1}.csv'.format(model_nm, nowDatetime), encoding = 'utf-8-sig')

In [None]:
# Re-load a previously saved leaderboard prediction file (the path is
# hard-coded to the run stamped 0727_0035)
multiclass = pd.read_csv('e:/kma/data/leaderboard/multiclass_nn_0727_0035.csv')
# Bare expression: displays the frame when run as a notebook cell
multiclass

In [None]:
# Add a 2016-02-29 row for every (sido, sex) group by copying that group's
# 2016-02-28 row and changing only the date.
leap_day_rows = []
for sido in area_list:
    for sex in (1, 2):
        # Match on the loop's `sex`; the condition used to hard-code sex==1,
        # which duplicated the sex=1 row twice and never produced a sex=2 row.
        condition = (
            (multiclass['sido'] == sido)
            & (multiclass['sex'] == sex)
            & (multiclass['yyyymmdd'] == '2016-02-28')
        )
        mc_tmp = multiclass[condition].copy()
        mc_tmp['yyyymmdd'] = '2016-02-29'
        leap_day_rows.append(mc_tmp)

# Append all new rows in one concat; they are placed after the existing rows
multiclass = pd.concat([multiclass] + leap_day_rows)

In [None]:
# Replace the index with a fresh 0..n-1 range in place. drop=True is not
# passed, so the previous index values are kept as a new column.
# NOTE(review): confirm the extra column is wanted in the submission file.
multiclass.reset_index(level=0, inplace=True)

In [None]:
# Save the frame with the added 02-29 rows. The file name uses the current
# model_nm / nowDatetime values, while the input above was read from a
# hard-coded 0727_0035 file.
# NOTE(review): confirm the two are meant to refer to the same run.
multiclass.to_csv('e:/kma/data/leaderboard/{0}_{1}_0229.csv'.format(model_nm, nowDatetime), encoding = 'utf-8-sig')