In [3]:
N = 3
under = 90
model = "lgb"   #lgb or cnn
gpu_list = ['/gpu:0', '/gpu:1']
# Training params ------------------------------------------
train_epochs = 2**2
ntrials = 2**4
cvs = 10
best_epochs = 2**5
early_stopping = 2**3


# Utils -----------------------
import numpy as np
import scipy.stats as stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os, zipfile, io, re
from PIL import Image, ImageOps
import random
import pickle
import datetime
import gc
from tqdm import tqdm
import warnings
import seaborn as sns
from glob import glob
# Machine Learning ---------------
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from math import sqrt
import optuna
from optuna import integration
import optuna.integration.lightgbm as lgb
# Keras, TensorFlow ---------------
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, GlobalAveragePooling2D, AveragePooling2D, MaxPooling2D, BatchNormalization, Convolution2D, Input
from keras import optimizers
from keras.utils import multi_gpu_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
from keras.backend.tensorflow_backend import set_session
import tifffile
from sklearn.metrics import classification_report

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
warnings.filterwarnings('ignore')
SEED = 31
np.random.seed(SEED)
gpus = len(gpu_list)

Using TensorFlow backend.


In [4]:
# %matplotlib inline
# plt.tick_params(colors='white')
# IO Functions ------------------------------
def pkl_saver(object, pkl_filename):
    with open(pkl_filename, 'wb') as web:
        pickle.dump(object , web)


def pkl_loader(pkl_filename):
    with open(pkl_filename, 'rb') as web:
        data = pickle.load(web)
    return data


# Dir generator ----------------------------
def dir_generator(dir_path):
    if os.path.exists(dir_path) == False:
        os.mkdir(dir_path)


def region_visualizer(df):
    points = df[3]
    lon, lat = [], []
    for point in points:
        lon.append(point[0])
        lat.append(point[1])
    lon = np.array(lon).astype(float).reshape(-1,1)
    lat = np.array(lat).astype(float).reshape(-1,1)
    region = df[4]
    region = np.array(region).astype(int).reshape(-1,1)
    df = pd.DataFrame(np.concatenate([lon, lat, region], axis=1))
    df.columns = ['longitude', 'latitude', 'region_class']
    pivotted = df.pivot('longitude', 'latitude', 'region_class')
    for i in range(pivotted.shape[0]):
        pivotted.iloc[i] = pd.to_numeric(pivotted.iloc[i])
    pivotted.columns = pd.to_numeric(pivotted.columns)
    pivotted.index = pd.to_numeric(pivotted.index)
    pivotted = pivotted.fillna(-1)
    pivotted = pivotted.astype(float).T
    cmap = sns.color_palette("deep", cvs + 1)
    cmap[0] = (0,0,0)
    plt = sns.heatmap(pivotted, cmap = cmap)
    plt.invert_yaxis()
    colorbar = plt.collections[0].colorbar
    r = colorbar.vmax - colorbar.vmin
    colorbar.set_ticks([colorbar.vmin + 0.5 * r / (cvs + 1) + r * i / (cvs + 1) for i in range(cvs + 1)])
    colorbar.set_ticklabels(['background']+list(range(cvs)))
    plt.figure.savefig(os.path.join(result_path, "result_img", "region_map.jpg"))
    del(plt)


def data_splitter_cv(filenames, X, Y, cv, region, point):
    test_index = np.where(region==cv)
    train_index = np.setdiff1d(np.arange(0, X.shape[0], 1), test_index)
    train_files = filenames[train_index]
    test_files = filenames[test_index]
    X_test = X[test_index]
    Y_test = Y[test_index]
    X_train = X[train_index]
    Y_train = Y[train_index]
    train_region = region[train_index]
    train_point = point[train_index]
    return train_files, test_files, X_train, X_test, Y_train, Y_test, train_region, train_point

def lgb_splitter_cv(filenames, X, Y, cv, region, point):
#     from IPython.core.debugger import Pdb; Pdb().set_trace()
    test_index = np.where(region==cv)
    train_index = np.setdiff1d(np.arange(0, X.shape[0], 1), test_index)
    train_files = filenames[train_index]
    test_files = filenames[test_index]
    X_test = np.array(X)[test_index]
    Y_test = np.array(Y)[test_index]
    X_train = np.array(X)[train_index]
    Y_train = np.array(Y)[train_index]
    train_region = region[train_index]
    train_point = point[train_index]
    X_train, X_test, Y_train, Y_test = make_df(X_train), make_df(X_test), make_df(Y_train), make_df(Y_test)
    return train_files, test_files, X_train, X_test, Y_train, Y_test, train_region, train_point


# Loss Definition ----------------------------------
def root_mean_squared_error(Y_true, Y_pred):
    return K.sqrt(K.mean(K.square(Y_pred - Y_true), axis = -1))


def create_model(image_shape, num_layer, padding, dense_num, num_filters, size_filters, dropout_rate_in, dropout_rate_out):
    inputs = Input(image_shape)
    for d in gpu_list:
        with tf.device(d):
            x = Dropout(dropout_rate_in)(inputs)
            x = Convolution2D(filters = 2**num_filters[0], kernel_size = (size_filters[0],size_filters[0]), padding = 'same', activation = 'relu')(x)
            for i in range(1, num_layer):
                x = Convolution2D(filters = 2**num_filters[i],
                                  kernel_size = (size_filters[i], size_filters[i]),
                                  padding = padding,
                                  activation = 'relu')(x)
            x = GlobalAveragePooling2D()(x)
            x = Dropout(dropout_rate_out)(x)
            x = Dense(units = 2**dense_num, activation = 'relu')(x)
            x = Dense(units = num_category, activation = 'softmax')(x)
            model = Model(inputs = inputs, outputs = x)
    return model


def opt_cnn(trial):
    # Opt params -----------------------
    # Categorical parameter
    num_layer = trial.suggest_int('num_layer', 1, 2)
    dense_num = trial.suggest_int('dense_num', 3, 7)
    num_filters = [int(trial.suggest_discrete_uniform(f'num_filter_{i}', 7, 10, 1)) for i in range(num_layer)]
    size_filters = [int(trial.suggest_discrete_uniform(f'size_filter_{i}', 3, 5, 2)) for i in range(num_layer)]
    batch_size = trial.suggest_int('batch_size', 1, 5)
    # Model Compiler -----------------------
    lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    decay = trial.suggest_loguniform('decay', 1e-6, 1e-3)
    # Discrete-uniform parameter
    dropout_rate_in = trial.suggest_discrete_uniform('dropout_rate_in', 0.0, 0.5, 0.1)
    dropout_rate_out = trial.suggest_discrete_uniform('dropout_rate_out', 0.0, 0.5, 0.1)
    momentum = trial.suggest_discrete_uniform('momentum', 0.0, 1.0, 0.1)
    # categorical parameter
#    optimizer = trial.suggest_categorical("optimizer", ["sgd", "momentum", "rmsprop", "adam"])
    padding = trial.suggest_categorical('padding', ['same', 'valid'])
    # compile model-------------------
#     from IPython.core.debugger import Pdb; Pdb().set_trace()
    model = create_model(image_shape, num_layer, padding, dense_num, num_filters, size_filters, dropout_rate_in, dropout_rate_out)
    sgd = optimizers.SGD(lr = lr, decay = decay, momentum = momentum, nesterov = True)
#    sgd = optimizers.SGD(lr = lr, decay = decay, momentum = momentum, nesterov = True, clipvalue = 1.0)
    # For CPU run ------------------
    model.compile(optimizer = sgd, loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])
    # Train Model ----------------------------------
    es_cb = EarlyStopping(monitor = 'val_loss', patience = early_stopping, verbose = 0)
    pr_cb = integration.TFKerasPruningCallback(trial, 'val_loss')
    cbs = [es_cb, pr_cb]
    loss_list, acc_list = [], []
    for inner_cv in range(0, cvs):
        _, _, X_inner_train, X_inner_val, Y_inner_train, Y_inner_val, _, _ = data_splitter_cv(train_files, X_outer_train, Y_outer_train, inner_cv, val_train_region, val_train_point)
        hist = model.fit(
            train_datagen.flow(X_inner_train, Y_inner_train, batch_size = (2**batch_size) * gpus),
            epochs = train_epochs,
            validation_data = (X_inner_val, Y_inner_val),
            callbacks = cbs,
            shuffle = True,
            verbose = 0,
            use_multiprocessing = False)
        loss_list += [model.evaluate(X_inner_val, Y_inner_val)[0]]
        acc_list += [model.evaluate(X_inner_val, Y_inner_val)[1]]
    del model
    keras.backend.clear_session()
    gc.collect()
    eval_loss = np.mean(loss_list)
    eval_acc = np.mean(acc_list)
    return eval_loss

def opt_lgb(trial):
    
    param_grid_lgb = {
        'num_leaves': trial.suggest_int("num_leaves", 3, 30),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1.0),
        'max_depth': trial.suggest_int("max_depth", 3, 20),
        "random_state": SEED
    }

    model = LGBMClassifier(**param_grid_lgb)
    
    # 10-Fold CV / Accuracy でモデルを評価する
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_validate(model, X=X_outer_train, y=Y_outer_train, cv=kf)
    # 最小化なので 1.0 からスコアを引く
    return scores['test_score'].mean()
#     return lgb.score(X_outer_val, Y_outer_val)


def mean_params_calc(param_names):
    dict = {}
    categoricals = ['padding']
    for param_name in param_names:
        data_num = 0
        if param_name not in categoricals:
            for data in best_params:
                try:
                    try:
                        dict[param_name] += data[param_name]
                    except:
                        dict[param_name] = data[param_name]
                    data_num = data_num + 1
                except:
                    pass
            dict[param_name] = dict[param_name]/data_num
        else:
            categorical_list = []
            for data in best_params:
                try:
                    categorical_list = categorical_list + [data[param_name]]
                except:
                    pass
            dict[param_name] = stats.mode(categorical_list)[0][0]
    return dict


def cv_result_imgs_generator(model, history):
    # Visualize Loss Results ----------------------------
    plt.figure(figsize=(18,6))
    plt.plot(history.history["loss"], label="loss", marker="o")
    plt.plot(history.history["val_loss"], label="val_loss", marker="o")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.title("")
    plt.legend(loc="best")
    plt.grid(color='gray', alpha=0.2)
    plt.savefig('./img_loss/' + str(outer_cv) + '_loss.jpg')
    plt.close()


def generalization_result_imgs_generator(name, Y_val_pred, Y_val_all):
    # Evaluate test data -----------------------
    plt.figure()
    plt.scatter(Y_val_all, Y_val_pred, s=3, alpha=0.5)
    plt.xlim(min([np.min(Y_val_all), np.min(Y_val_pred)]), max([np.max(Y_val_all),np.max(Y_val_pred)]))
    plt.xlabel("obs")
    plt.ylabel("pred")
    x = np.linspace(min([np.min(Y_val_all), np.min(Y_val_pred)]), max([np.max(Y_val_all),np.max(Y_val_pred)]),100)
    y = x
    plt.plot(x, y, "r-")
    plt.savefig('./img_loss/' + name + '_scatter_test.jpg')
    plt.close()
    
def region_image_generator(point, region):
    data_num = int(len(imgfiles)/28)
    cmap = plt.get_cmap("tab10")
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.scatter(point[:data_num][:,0],point[:data_num][:,1], marker='o', s=5, color=cmap(region))
    ax.set_title("Region in Japan")
    ax.set_xlabel("longitude")
    ax.set_ylabel("latitude")
    fig.savefig('./region_separate.png')
    
    
def make_df(X):
    return pd.DataFrame(X)

def data_import():
    if os.path.exists(data_path + f'df_{N}x{N}.pkl'):
        df = pkl_loader(data_path + f'df_{N}x{N}.pkl')
    else:
        trial = int(len(imgfiles)/28)
        X = [] # X: 説明変数 = (N*N)*(7*4)のデータ
        Y = [] # Y: 目的変数
        point = [] # point: 緯度経度
        X_28 = []
        Y_28 = 0
        point_28 = []
        filenames = []
        max_light = 0
        print('inputdata_processing...')

        for box in tqdm(range(trial)):
            for imgfile in imgfiles[box*28: (box+1)*28]:
                # ZIPから画像読み込み
                image = tifffile.imread(imgfile)
        #         print(image.shape)
                file = os.path.basename(imgfile)
                file_split = [i for i in file.split('_')]
                X_28.append(image)
            Y_28 = file_split[5].split(".")[0]
            point_28 = [float(file_split[1]), float(file_split[2])]
            filenames.append(f"{file_split[0]}_{file_split[1]}_{file_split[2]}_{Y_28}")
            X.append(X_28[box*28: (box+1)*28])
            Y.append(Y_28)
            point.append(point_28)
        del X_28, Y_28, point_28
        X = np.asarray(X)
        print(X.shape)
        X = X.transpose(0,2,3,1)
        print(X.shape)
        Y = np.array(Y)
        filenames = np.array(filenames)
        point = np.array(point)
        region = KMeans(n_clusters = cvs, random_state=SEED).fit(point).labels_
        # label encorder===========================================
        labels = ['Water', 'Urban and built-up', 'Rice paddy',  'Crops', 'Grassland', 'DBF', 'DNF', 'EBF', 'ENF', 'Bare land' ]
        for i in range(len(labels)):
            Y[Y==labels[i]] = int(i)
        df = [filenames, X, Y, point, region]
        pkl_saver(df, os.path.join(data_path, f'df_{N}x{N}.pkl'))
        
    return df[0], df[1], df[2], df[3], df[4]

# Data Loader ------------------------------
if under==20:
    train_tif_name = f"D:/LULC/features/01_landsat8/train_old/{N}x{N}"
elif under==90:
    train_tif_name = f"D:/LULC/features/01_landsat8/train_new/{N}x{N}"
root_path      = f"C:/Users/GE/Dropbox/Kairo/under{under}_results/"
result_path    = f"C:/Users/GE/Dropbox/Kairo/under{under}_results/{N}x{N}"
data_path      = f"C:/Users/GE/Dropbox/Kairo/under{under}_results/data/"
model_path     = f"C:/Users/GE/Dropbox/Kairo/under{under}_results/model/{N}x{N}/"

imgfiles = glob(train_tif_name + "/*.tif")
imgfiles.sort()

# Data converter ----------------------------------------------
# X->説明変数, Y->目的変数, point->緯度経度, region->領域を10分割した時の分割区間名
filenames, X, Y, point, region = data_import()
image_shape = (X.shape[1], X.shape[2], X.shape[3])
num_category = len(np.unique(Y))

In [28]:
if model == "cnn":
    # Data standardizing ----------------------------------------------
    X_train_mean_lis, X_train_std_lis = [], []
    X_files, Y_files, X_train, X_test, Y_train, Y_test, region_train, _, train_point, _ = train_test_split(filenames, X, Y, region, point, test_size=0.2, random_state=SEED)

    for i in range(X_train.shape[3]):
        X_train_mean_lis.append(X_train[:,:,:,i].mean())
        X_train_std_lis.append(X_train[:,:,:,i].std())
        X_train[:,:,:,i] = (X_train[:,:,:,i] - X_train_mean_lis[i]) / X_train_std_lis[i]
        X_test[:,:,:,i] = (X_test[:,:,:,i] - X_train_mean_lis[i]) / X_train_std_lis[i]
    
    timename       = '{0:%Y_%m%d_%H%M}'.format(datetime.datetime.now())
    time_path      =  os.path.join(result_path, timename, "outer_cv_times")
    # dir generation
    dir_generator(result_path)
    # Chenge current directry
    os.mkdir(os.path.join(result_path, timename))
    os.chdir(os.path.join(result_path, timename))
    dir_generator(model_path)
    dir_generator("./results/")
    dir_generator("./img_loss/")
    dir_generator("./model/")
    dir_generator("./weights/")
    dir_generator("./logs/")
    dir_generator("./outer_cv_times/")


    # Saving region_separate_map
    region_image_generator(point, region)
    train_start = datetime.datetime.now()
    # Train Model ----------------------------------
    # CV start ------------------------------------------------------------
    for outer_cv in range(cvs):
        outer_start = datetime.datetime.now()
        print(f'outer_cv_{outer_cv}_processing....')
        # Data Loader-------------------------------------
        train_files, val_files, X_outer_train, X_outer_val, Y_outer_train, Y_outer_val, val_train_region, val_train_point = data_splitter_cv(X_files, X_train, Y_train, outer_cv, region_train, train_point)
        train_datagen = ImageDataGenerator(
    #         rotation_range = 360,
            horizontal_flip = True,
            vertical_flip = True
        )
        val_train_region = KMeans(n_clusters = cvs, random_state=SEED).fit(val_train_point).labels_
        # Bayesian optimization -------------------------------------
        study = optuna.create_study()
        study.optimize(opt_cnn, n_trials = ntrials)
        # Best_model_training ---------------------------------------
        num_filters = [int(study.best_params[f'num_filter_{i}']) for i in range(int(study.best_params['num_layer']))]
        size_filters = [int(study.best_params[f'size_filter_{i}']) for i in range(int(study.best_params['num_layer']))]
        model = create_model(image_shape, int(study.best_params['num_layer']), study.best_params['padding'], int(study.best_params['dense_num']), num_filters, size_filters, study.best_params['dropout_rate_in'], study.best_params['dropout_rate_out'])
        sgd = optimizers.SGD(lr = study.best_params['learning_rate'], decay = study.best_params['decay'], momentum = study.best_params['momentum'], nesterov = True, clipvalue = 1.0)
        model.compile(optimizer = sgd, loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])
        history = model.fit(
            train_datagen.flow(X_outer_train, Y_outer_train, batch_size = 2**int(study.best_params['batch_size']) * gpus),
            epochs = train_epochs,
            validation_data = (X_outer_val, Y_outer_val),
            shuffle = True,
            verbose = 0,
            use_multiprocessing = False
            )
        try:
            best_params.append(study.best_params)
        except:
            best_params = [study.best_params]
        try:
            val_pred_files = np.concatenate((val_pred_files, val_files), axis=0)
        except:
            val_pred_files = val_files
        try:
            Y_val_pred = np.concatenate((Y_val_pred, model.predict(X_outer_val).argmax(axis=1)), axis=0)
        except:
            Y_val_pred = np.array(model.predict(X_outer_val).argmax(axis=1))
        try:
            Y_val_obs = np.concatenate((Y_val_obs, Y_outer_val), axis=0)
        except:
            Y_val_obs = Y_outer_val
        try:
            Y_val_smx = np.concatenate((Y_val_smx, model.predict(X_outer_val)),axis=0)
        except:
            Y_val_smx = model.predict(X_outer_val)
        cv_result_imgs_generator(model, history)
        print("accuracy is", model.evaluate(X_outer_val, Y_outer_val)[1])
        #compare_TV(history, outer_cv)
        del model
        keras.backend.clear_session()
        gc.collect()

        outer_end = datetime.datetime.now()
        spend_time = f"Outer_cv time is {outer_end - outer_start} seconds."
        pkl_saver(spend_time, os.path.join(time_path, f"outer_cv_{outer_cv}_time.txt"))

    train_end = datetime.datetime.now()
    spend_time = f"Outer_cv time is {train_end - train_start} seconds."
    pkl_saver(spend_time, os.path.join(time_path, "all_time.txt"))


elif model == "lgb":
    timename       = '{0:%Y_%m%d_%H%M}'.format(datetime.datetime.now())
    time_path      =  os.path.join(result_path, timename, "outer_cv_times")
    # dir generation
    dir_generator(result_path)
    # Chenge current directry
    os.mkdir(os.path.join(result_path, timename))
    os.chdir(os.path.join(result_path, timename))
    dir_generator(model_path)
    dir_generator("./results/")
    dir_generator("./img_loss/")
    dir_generator("./model/")
    dir_generator("./weights/")
    dir_generator("./logs/")
    dir_generator("./outer_cv_times/")
#     中心の一点だけ使います
    if os.path.exists(data_path + f'df_{N}x{N}_XY.pkl'):
        df_XY = pkl_loader(data_path + f'df_{N}x{N}_XY.pkl')
    else:
        data_num = int(len(imgfiles)/28)
        df_XY = pd.DataFrame(np.zeros((data_num,29)))
        for i in range(data_num):
            df_XY.iloc[i,0] = Y[i]
            for j in range(28):
                df_XY.iloc[i, j+1] = X[i][1][1][j]
        pkl_saver(df_XY, os.path.join(data_path, f'df_{N}x{N}_XY.pkl'))

    Y, X = df_XY[0], df_XY.iloc[:, 1:]
    X_train_mean_lis = []
    X_train_std_lis = []
    X_files, Y_files, X_train, X_test, Y_train, Y_test, region_train, _, train_point, _ = train_test_split(filenames, X, Y, region, point, test_size=0.2, random_state=SEED)
    
    train_files, val_files, X_outer_train, X_outer_val, Y_outer_train, Y_outer_val, val_train_region, val_train_point = lgb_splitter_cv(X_files, X_train, Y_train, 0, region_train, train_point)
    
    for outer_cv in range(cvs):
        outer_start = datetime.datetime.now()
        print(f'outer_cv_{outer_cv}_processing....')
        # Data Loader-------------------------------------
        train_files, val_files, X_outer_train, X_outer_val, Y_outer_train, Y_outer_val, val_train_region, val_train_point = lgb_splitter_cv(X_files, X_train, Y_train, 0, region_train, train_point)
        val_train_region = KMeans(n_clusters = cvs, random_state=SEED).fit(val_train_point).labels_


    #     for inner_cv in range(0, cvs):
    #         _, _, X_inner_train, X_inner_val, Y_inner_train, Y_inner_val, _, _ = data_splitter_cv(train_files, X_outer_train, Y_outer_train, inner_cv, val_train_region, val_train_point)
    #         lgb = LGBMClassifier(random_state=SEED)
    #         lgb.fit(X_inner_train, _inner_train)
    #         print(f'inner_cv is {inner_cv}')
    #         print(f'accuracy of train set: {lgb.score(X_inner_train, Y_inner_train)}')
    #         print(f'accuracy of train set: {lgb.score(X_inner_val, Y_inner_val)}')
    #         print()

        study = optuna.create_study(direction='maximize')
        study.optimize(opt_lgb, n_trials=ntrials)
        print(study.best_params)
        print(study.best_value)
        lgb_best_param = study.best_params

        lgb_best = LGBMClassifier(**lgb_best_param)
        lgb_best.fit(X_train, Y_train)
        pred = pd.DataFrame(lgb_best.predict(X_test))
        print((np.array(pred[0]).astype(int) == Y_test.values.astype(int)).sum() / len(Y_test))
        
        try:
            Y_val_pred = np.concatenate((Y_val_pred, lgb_best.predict(X_outer_val).argmax(axis=1)), axis=0)
        except:
            Y_val_pred = np.array(lgb_best.predict(X_outer_val).argmax(axis=1))
        try:
            Y_val_obs = np.concatenate((Y_val_obs, Y_outer_val), axis=0)
        except:
            Y_val_obs = Y_outer_val
        try:
            Y_val_smx = np.concatenate((Y_val_smx, lgb_best.predict(X_outer_val)),axis=0)
        except:
            Y_val_smx = lgb_best.predict(X_outer_val)

        outer_end = datetime.datetime.now()
        spend_time = f"Outer_cv time is {outer_end - outer_start} seconds."
        pkl_saver(spend_time, os.path.join(time_path, f"outer_cv_{outer_cv}_time.txt"))

    train_end = datetime.datetime.now()
    spend_time = f"Outer_cv time is {train_end - train_start} seconds."
    pkl_saver(spend_time, os.path.join(time_path, "all_time.txt"))
    
else:
    print("modelを正しく選択してください")

# code save
import shutil
os.mkdir("./code")
shutil.copy(r"C:\Users\GE\Dropbox\Kairo\code\3_study_code\LULC_code\LULC_CNN_lightGBM.ipynb", "./code/LULC_CNN_lightGBM.ipynb")


outer_cv_0_processing....


[32m[I 2020-11-12 16:38:52,136][0m Finished trial#0 resulted in value: 0.2651773048400108. Current best value is 0.2651773048400108 with parameters: {'num_leaves': 9, 'learning_rate': 4.9753119101948975e-08, 'max_depth': 7}.[0m
[32m[I 2020-11-12 16:38:56,681][0m Finished trial#1 resulted in value: 0.7308060748843006. Current best value is 0.7308060748843006 with parameters: {'num_leaves': 11, 'learning_rate': 0.18896033331462503, 'max_depth': 8}.[0m
[32m[I 2020-11-12 16:39:02,728][0m Finished trial#2 resulted in value: 0.2651773048400108. Current best value is 0.7308060748843006 with parameters: {'num_leaves': 11, 'learning_rate': 0.18896033331462503, 'max_depth': 8}.[0m
[32m[I 2020-11-12 16:39:06,290][0m Finished trial#3 resulted in value: 0.2663249054073254. Current best value is 0.7308060748843006 with parameters: {'num_leaves': 11, 'learning_rate': 0.18896033331462503, 'max_depth': 8}.[0m
[32m[I 2020-11-12 16:39:12,290][0m Finished trial#4 resulted in value: 0.2651773

{'num_leaves': 30, 'learning_rate': 0.20365043745312883, 'max_depth': 14}
0.7353956943378133
0.7593135816955119


AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
    
    # Save CV_Result -------------------------------------------------
    # generalization_result_imgs_generator('val', Y_val_pred, Y_val_obs)
    np.savetxt('Y_val_smx.txt', Y_val_smx)
    param_names = best_params[list(map(len, best_params)).index(max(list(map(len, best_params))))].keys()
    best_params_dict = mean_params_calc(param_names)
    pkl_saver(best_params, 'best_params_list.binaryfile')
    pkl_saver(best_params_dict, 'best_params.binaryfile')
    best_params_dict = pkl_loader('best_params.binaryfile')

    # Save CV_Result to csv -------------------------------------------------
    results = [val_pred_files, Y_val_obs, Y_val_pred, Y_val_smx]
    pkl_saver(results, './results/results.pkl')
    results_csv = np.concatenate([pd.DataFrame(val_pred_files),pd.DataFrame(Y_val_obs), pd.DataFrame(Y_val_pred), pd.DataFrame(Y_val_smx)], 1)
    results_csv = pd.DataFrame(results_csv)
    columns = ["name", "obs", "pred", 'Water', 'Urban and built-up', 'Rice paddy',  'Crops', 'Grassland', 'DBF', 'DNF', 'EBF', 'ENF', 'Bare land']
    results_csv.columns=columns
    results_csv.to_csv('./results/results_val.csv')
    labels = ['Water', 'Urban and built-up', 'Rice paddy',  'Crops', 'Grassland', 'DBF', 'DNF', 'EBF', 'ENF', 'Bare land']
    cf_metr = confusion_matrix(Y_val_obs.astype(int), Y_val_pred)
    cf_metr = pd.DataFrame(cf_metr)
    cf_metr.columns=labels
    cf_metr.index=labels
    cf_metr.to_csv("./results/confusion_matrix_val.csv")

    res_smr = classification_report(list(results_csv['obs'].astype(int)), list(results_csv['pred']), target_names = labels, labels = np.array(range(len(labels))))
    with open('./results/result_summary_val.txt','w') as f:
        f.write(res_smr)

    # Best Model Training -----------------------------------------------
    # Int parameter
    num_layer = int(best_params_dict['num_layer'])
    num_filters = [int(best_params_dict['num_filter_' + str(i)]) for i in range(num_layer)]
    size_filters = [int(best_params_dict['size_filter_' + str(i)]) for i in range(num_layer)]
    dense_num = int(best_params_dict['dense_num'])
    batch_size = int(best_params_dict['batch_size'])
    # Uniform parameter
    # Loguniform parameter
    lr = best_params_dict['learning_rate']
    decay = best_params_dict['decay']
    # Discrete-uniform parameter
    dropout_rate_in = best_params_dict['dropout_rate_in']
    dropout_rate_out = best_params_dict['dropout_rate_out']
    momentum = best_params_dict['momentum']
    # Categorical parameter
    padding = best_params_dict['padding']


    # Model Checkpoint ------------------
    cp_cb = ModelCheckpoint(
        './weights/best_weights.hdf5',
        monitor = 'val_loss',
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
        mode = 'auto')
    # Logging ----------------------------------------
    log_dir = os.path.join('./logs/')
    tb_cb = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True)
    es_cb = EarlyStopping(monitor = 'val_loss', patience = int(best_epochs/10), verbose = 1)

    cbs = [cp_cb, tb_cb, es_cb]
    # Train Best_Model ----------------------------------
    # For CPU run ------------------
    best_model = create_model(image_shape, num_layer, padding, dense_num, num_filters, size_filters, dropout_rate_in, dropout_rate_out)
    sgd = optimizers.SGD(lr = lr, decay = decay, momentum = momentum, nesterov = True, clipvalue = 1.0)

    best_model.compile(optimizer = sgd, loss = 'sparse_categorical_crossentropy')
    hist = best_model.fit(
        train_datagen.flow(X_train, Y_train, batch_size = (2**batch_size) * gpus),
        epochs = best_epochs,
        callbacks = cbs,
        shuffle = True,
        verbose = 1,
        initial_epoch = 0,
        use_multiprocessing = False)

    # Save Model -----------------------------------
    best_model.save('./model/best_model.hdf5')

In [26]:
np.array(pred[0]).astype(int), Y_test.values.astype(int)

(array([3, 2, 2, ..., 1, 8, 2]), array([5, 2, 2, ..., 1, 8, 2]))

In [27]:
print((np.array(pred[0]).astype(int) == Y_test.values.astype(int)).sum() / len(Y_test))

0.7374596655910824


In [None]:
if model == "cnn":
    # ロードだけでも動くように変数の再定義=============================================
    # ==============================================================================
    # ==============================================================================
    # ==============================================================================
    # ==============================================================================
    # Data Loader ------------------------------
    best_model = load_model("./model/best_model.hdf5")
    df = pkl_loader(os.path.join(data_path, f'df_{N}x{N}.pkl'))
    # Data converter ----------------------------------------------
    filenames, X, Y, point, region = df[0], df[1], df[2], df[3], df[4]
    image_shape = (X.shape[1], X.shape[2], X.shape[3])
    num_category = len(np.unique(Y))
    # Data splitting ----------------------------------------------
    X_train_mean_lis = []
    X_train_std_lis = []
    X_files, Y_files, X_train, X_test, Y_train, Y_test, region_train, _, train_point, _ = train_test_split(filenames, X, Y, region, point, test_size=0.2, random_state=SEED)

    for i in range(X_train.shape[3]):
        X_train_mean_lis.append(X_train[:,:,:,i].mean())
        X_train_std_lis.append(X_train[:,:,:,i].std())
        X_train[:,:,:,i] = (X_train[:,:,:,i] - X_train_mean_lis[i])/X_train_std_lis[i]
        X_test[:,:,:,i] = (X_test[:,:,:,i] - X_train_mean_lis[i])/X_train_std_lis[i]
    # ==============================================================================
    # ==============================================================================
    # ==============================================================================
    # ==============================================================================


    Y_test_pred = [ np.array(best_model.predict(X_test).argmax(axis=1))]
    np.savetxt('y_test_pred.txt', Y_test_pred)
    with open("best_model_summary.txt", "w") as fp:
        best_model.summary(print_fn=lambda x: fp.write(x + "\r\n"))


    results = [Y_files, Y_test, Y_test_pred]
    pkl_saver(results, './results/results.pkl')
    results_csv = np.concatenate([pd.DataFrame(Y_files),pd.DataFrame(Y_test), pd.DataFrame(Y_test_pred[0])], 1)
    results_csv = pd.DataFrame(results_csv)
    columns = ["name", "obs", "pred"]
    results_csv.columns=columns
    results_csv.to_csv('./results/results_test.csv')
    labels = ['Water', 'Urban and built-up', 'Rice paddy',  'Crops', 'Grassland', 'DBF', 'DNF', 'EBF', 'ENF', 'Bare land']
    cf_metr = confusion_matrix(Y_test.astype(int), Y_test_pred[0])
    cf_metr = pd.DataFrame(cf_metr)
    cf_metr.columns=labels
    cf_metr.index=labels
    cf_metr.to_csv("./results/confusion_matrix_test.csv")
    test_smr = classification_report(list(np.array(Y_test).astype(int)), list(Y_test_pred[0].astype(int)), target_names = labels, labels = np.array(range(len(labels))))
    with open('./results/result_summary_test.txt','w') as f:
        f.write(test_smr)

        
elif model == "lgb":
    if os.path.exists(data_path + f'df_{N}x{N}_XY.pkl'):
        df_XY = pkl_loader(data_path + f'df_{N}x{N}_XY.pkl')
    else:
        data_num = int(len(imgfiles)/28)
        df_XY = pd.DataFrame(np.zeros((data_num,29)))
        for i in range(data_num):
            df_XY.iloc[i,0] = Y[i]
            for j in range(28):
                df_XY.iloc[i, j+1] = X[i][1][1][j]
        pkl_saver(df_XY, os.path.join(data_path, f'df_{N}x{N}_XY.pkl'))

    Y, X = df_XY[0], df_XY.iloc[:, 1:]
    X_train_mean_lis = []
    X_train_std_lis = []
    X_files, Y_files, X_train, X_test, Y_train, Y_test, region_train, _, train_point, _ = train_test_split(filenames, X, Y, region, point, test_size=0.2, random_state=SEED)
    
    train_files, val_files, X_outer_train, X_outer_val, Y_outer_train, Y_outer_val, val_train_region, val_train_point = lgb_splitter_cv(X_files, X_train, Y_train, 0, region_train, train_point)

    
print('finished...')

In [None]:
# code save
import shutil
os.mkdir("./code")
shutil.copy(r"C:\Users\GE\Dropbox\Kairo\code\3_study_code\LULC_code\LULC_CNN_model.ipynb", "./code/LULC_CNN_model.ipynb")

In [None]:
(np.array(Y_test_pred).astype(int) == Y_test.astype(int)).sum() / len(Y_test)