In [1]:
import h5py
from datetime import datetime
import os

#importing tensorflow, check gpu
import tensorflow as tf
# Look up the visible GPUs once, report them together with the TensorFlow
# version, then switch on memory growth for every GPU that was found.
gpus = tf.config.list_physical_devices('GPU')
print(gpus, tf.__version__)
for gpu_device in gpus:
    print(gpu_device)
    tf.config.experimental.set_memory_growth(gpu_device, True)

2023-09-26 12:35:39.058702: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-26 12:35:39.817043: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/albert/miniconda3/envs/Baikal2/lib/:/home/albert/miniconda3/envs/Baikal2/lib/
2023-09-26 12:35:39.817118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/albert/miniconda3/envs/Baikal2/lib/:/h

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')] 2.11.1
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [2]:
# HDF5 file to read; the number of entries in 'all/data_norm' is stored
# as LENGTH_OF_EPOCH.
path_to_h5 = './DATA/h5_files/LC08_L2SP_02_T1_256.h5'
with h5py.File(path_to_h5, 'r') as h5_file:
    LENGTH_OF_EPOCH = len(h5_file['all/data_norm'])
print('Number of small images in h5:', LENGTH_OF_EPOCH)

# Global variables, do not change
WIDTH, HEIGHT = 256, 256
CHANNELS = 7
CLASSES = 10
MAX_SHIFT = 1  # maximum vertical and horizontal shift in the loss function
BATCH_SIZE = 16

Number of small images in h5: 32362


In [3]:
#importing from local scripts
from TrainingNN.DataLoad import BatchLoader, make_train_dataset
from TrainingNN.BuildNN import build_resnet
from TrainingNN.Transform import *
from TrainingNN.Loss import conv_loss
from TrainingNN.Visualize import VisualClass

In [4]:
### Filters and kernel sizes are set when the model is created.
### List 'filters' - number of filters, in the order of the 'encoder' layers.
### List 'conv_kernel' - convolution kernel sizes in 'encoder' and 'decoder',
###     in the order of the 'encoder' layers.
### List 'strides' - stride sizes in 'encoder' and 'decoder', in the order of the
###     'encoder' layers (currently disabled: not part of the signature).
def make_model(filters=[16, 32, 16], conv_kernel=[3, 3, 3]):  # , strides=[2, 2, 2, 2]
    """Build the Keras model, attach its loss, and derive a model name.

    The classifier returned by ``build_resnet`` is applied to the input and
    wrapped in a ``tf.keras.Model``. The loss added to the model is
    ``conv_loss`` evaluated on the classifier output for the plain input and
    the inverse-transformed classifier output for a transformed copy of the
    input.

    Args:
        filters: per-layer filter counts, forwarded to ``build_resnet`` and
            encoded into the model name.
        conv_kernel: per-layer kernel sizes, forwarded to ``build_resnet`` and
            encoded into the model name.

    Returns:
        Tuple ``(model, model_name)``.
    """
    # Base of the model: input with CHANNELS channels and unspecified spatial size.
    inp = tf.keras.layers.Input(shape=(None, None, CHANNELS))
    classifier = build_resnet(filters, conv_kernel, CHANNELS, CLASSES)
    outp = classifier(inp)
    model = tf.keras.Model(inputs=inp, outputs=outp)

    # Generate the model name from the hyperparameters.
    # The trailing '_s' tag is kept even though strides are disabled.
    filters_tag = ''.join('.' + str(count) for count in filters)
    kernels_tag = ''.join('.' + str(size) for size in conv_kernel)
    hyper_tag = 'f' + filters_tag + '_k' + kernels_tag + '_s'
    model_name = f'{classifier.name}_{hyper_tag}_CLASSES.{CLASSES}_BS.{BATCH_SIZE}'

    # Loss computation: transform the input, classify it, map the prediction
    # back with the inverse parameters, and compare with the direct prediction.
    params, inverse_params = RandomAffineTransformParams()(inp, WIDTH)
    transformed_inp = ImageProjectiveTransformLayer()(inp, params, WIDTH, HEIGHT)
    transformed_outp = classifier(transformed_inp)
    # NOTE(review): unlike the forward transform, no WIDTH/HEIGHT are passed
    # here - presumably the layer has defaults; confirm.
    inv_transformed_outp = ImageProjectiveTransformLayer()(transformed_outp, inverse_params)
    model.add_loss(conv_loss(outp, inv_transformed_outp, WIDTH, HEIGHT, BATCH_SIZE))

    return model, model_name

In [None]:
# Build the model together with its hyperparameter-derived name.
model, model_name = make_model()
print(model_name)

# making dir for model if necessary
# Only "directory already exists" is tolerated; any other failure (e.g. a
# permission problem) propagates instead of being reported as "already exists".
# NOTE(review): this directory lives under '../models/' while the tensorboard
# logs below go under './models/' - confirm which root is intended.
try:
    os.makedirs('../models/'+model_name)
    print('directory for the model is created')
except FileExistsError:
    print('directory for the model already exists')
#make a dir for tensorboard logs (timestamped, so a fresh directory per run)
logdir = "./models/logs_tb/"+model_name+"/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
os.makedirs(logdir)
print('directory for tb logs is created')
    
#Make callbacks: draw a test picture every 200 batches, early stopping, model checkpoint, logs to tensorboard
class DrawTestPic(tf.keras.callbacks.Callback):
    """Callback that periodically renders the prediction for one test image.

    Whenever the batch index within an epoch is a multiple of 200, image
    number 72 is fetched through ``VisualClass``, predicted with the model
    being trained, and the per-pixel argmax over the last axis is drawn and
    written as an HTML file under ``./models/<model_name>/figures/fig72/``.

    Relies on the notebook globals ``path_to_h5`` and ``model_name``.

    Args:
        J: starting value of the figure counter; it is incremented before
            every figure and used as the HTML file name.
    """
    def __init__(self,J):
        # Let the Keras base class initialise its own attributes (e.g. self.model).
        super().__init__()
        self.J = J
    def on_batch_end(self, batch, logs=None):
        if batch%200 == 0:
            self.J+=1
            V = VisualClass(path_to_h5)
            no = 72
            # GEO is returned by the helper but not needed here.
            img_norm, GEO = V.get_norm_image(no,no+1)
            # Use the model Keras attached to this callback rather than a global.
            predicted = self.model.predict(img_norm, verbose = False)
            predicted_classes = predicted.argmax(axis = -1)
            fig_dir = "./models/"+model_name+"/figures/fig"+str(no)
            # An existing directory is fine; other errors are no longer silently swallowed.
            os.makedirs(fig_dir, exist_ok=True)
            f = V.draw_layers(no, predicted_classes)
            f.write_html(fig_dir+"/"+str(self.J)+".html")

# Callbacks used during training. All of them watch the training loss, since
# no validation data is passed to fit().
# NOTE(review): the checkpoint is written under '../models/' while the final
# model below is saved under './models/' - confirm which root is intended.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, min_delta=5e-4),
    # save_best_only=True: without it the 'best' checkpoint is overwritten
    # at the end of every epoch regardless of the monitored loss.
    tf.keras.callbacks.ModelCheckpoint(filepath='../models/' + model_name + '/best',
                                       monitor='loss',
                                       save_best_only=True,
                                       save_freq='epoch'),
    tf.keras.callbacks.TensorBoard(log_dir=logdir),
    DrawTestPic(J = 0)
]

# No loss argument: the loss is attached inside make_model via model.add_loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))

train_dataset = make_train_dataset(path_to_h5, BATCH_SIZE, WIDTH, HEIGHT, CHANNELS)
history = model.fit(train_dataset, epochs = 100,
                    steps_per_epoch = LENGTH_OF_EPOCH // BATCH_SIZE,
                    callbacks=callbacks,
                    verbose = 1)
model.save('./models/'+ model_name + '/last')
# Spaces added so the model name is not glued to the surrounding words.
print('Model ' + model_name + ' has been trained.')

2023-09-26 12:35:41.536549: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-26 12:35:42.064752: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10572 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:21:00.0, compute capability: 8.6


ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16
directory for the model already exists
directory for tb logs is created
Epoch 1/100


2023-09-26 12:35:46.441597: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8100
2023-09-26 12:35:47.121169: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-09-26 12:35:47.221945: I tensorflow/core/util/cuda_solvers.cc:179] Creating GpuSolver handles for stream 0x1c295b00
2023-09-26 12:35:53.742239: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x1f2e3ad0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-26 12:35:53.742284: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2023-09-26 12:35:53.747611: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-26 12:35:53.830288: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cann





INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 2/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 3/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 4/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 5/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 6/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 7/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 8/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 9/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 10/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 11/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 12/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 13/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 14/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 15/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 16/100



INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


INFO:tensorflow:Assets written to: ../models/ResNet_f.16.32.16_k.3.3.3_s_CLASSES.10_BS.16/best/assets


Epoch 17/100
 137/2022 [=>............................] - ETA: 6:11 - loss: -1.8513