In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
import keras
import keras.callbacks
import keras.optimizers
import keras.layers as kl

2023-06-18 12:50:01.914465: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-18 12:50:01.943288: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import src.loss_functions as loss

In [4]:
from src.image_loader import read_images, get_image_paths
BASE_PATH = '/app/data/imagenet_data/'

In [5]:
train_slice = 0,2000

train_color_paths = get_image_paths(os.path.join(BASE_PATH, 'train/color'), *train_slice)
train_color_imgs = read_images(train_color_paths, resize_dimensions=(400,400), show_progress=True)
train_gray_paths  = get_image_paths(os.path.join(BASE_PATH, 'train/grayscale'),  *train_slice)
train_gray_imgs  = read_images(train_gray_paths, resize_dimensions=(400,400), show_progress=True)

  0%|          | 0/2000 [00:00<?, ?it/s]2023-06-18 12:50:03.164011: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
  0%|          | 1/2000 [00:00<21:29,  1.55it/s]2023-06-18 12:50:03.166998: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-18 12:50:03.167088: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero

In [6]:
vali_slice = 100,1000

vali_color_paths = get_image_paths(os.path.join(BASE_PATH, 'vali/color'), *vali_slice)
vali_color_imgs = read_images(vali_color_paths, resize_dimensions=(400,400), show_progress=True)
vali_gray_paths  = get_image_paths(os.path.join(BASE_PATH, 'vali/grayscale'),  *vali_slice)
vali_gray_imgs  = read_images(vali_gray_paths, resize_dimensions=(400,400), show_progress=True)

100%|██████████| 900/900 [00:04<00:00, 211.41it/s]
100%|██████████| 900/900 [00:01<00:00, 456.84it/s]


In [7]:
test_color_paths = get_image_paths(os.path.join(BASE_PATH, 'test/color'), 0,100)
test_color_imgs = read_images(test_color_paths, resize_dimensions=(400,400), show_progress=True)
test_gray_paths  = get_image_paths(os.path.join(BASE_PATH, 'test/grayscale'),  0,100)
test_gray_imgs  = read_images(test_gray_paths, resize_dimensions=(400,400), show_progress=True)

100%|██████████| 100/100 [00:00<00:00, 210.15it/s]
100%|██████████| 100/100 [00:00<00:00, 449.22it/s]


In [8]:
input_dims = train_gray_imgs[0].shape

input_layer = kl.Input(shape=input_dims)
x = kl.Conv2D(64, (3,3), activation='relu', padding='same')(input_layer)
x = kl.MaxPooling2D((2,2), padding='same')(x)
x = kl.BatchNormalization()(x)
skip1 = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)

x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(skip1)
x = kl.MaxPooling2D((2,2), padding='same')(x)
x = kl.BatchNormalization()(x)
skip2 = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)
x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(skip2)
x = kl.MaxPooling2D((2,2), padding='same')(x)
x = kl.BatchNormalization()(x)
x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)

# upscale 
x = kl.Conv2DTranspose(32, (3,3), strides=(2,2), activation='relu', padding='same')(x)
x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)
x = kl.Add()([x, skip2])
x = kl.Conv2DTranspose(32, (3,3), strides=(2,2), activation='relu', padding='same')(x)
x = kl.BatchNormalization()(x)
x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)
x = kl.Add()([x, skip1])
x = kl.Conv2DTranspose(32, (3,3), strides=(2,2), activation='relu', padding='same')(x)
x = kl.BatchNormalization()(x)
x = kl.Conv2D(32, (3,3), activation='relu', padding='same')(x)
output_layer = kl.Conv2D(3, (3,3), activation='relu', padding='same')(x)

model = keras.models.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.00001), loss='mse')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 400, 400, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 400, 400, 64  640         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 200, 200, 64  0           ['conv2d[0][0]']                 
                                )                                                             

In [9]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [10]:
hist = model.fit(train_gray_imgs, train_color_imgs, epochs=10, batch_size=100, validation_data=(vali_gray_imgs, vali_color_imgs), callbacks=[early_stopping])

Epoch 1/10


2023-06-18 12:50:34.741489: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-06-18 12:50:36.661872: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.42GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-06-18 12:50:36.661911: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.42GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-06-18 12:50:36.661923: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.42GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if m

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/model/conv2d_9/Conv2D/Conv2DBackpropInput' defined at (most recent call last):
    File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/usr/local/lib/python3.8/dist-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/usr/local/lib/python3.8/dist-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/usr/local/lib/python3.8/dist-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_13/3606337766.py", line 1, in <module>
      hist = model.fit(train_gray_imgs, train_color_imgs, epochs=10, batch_size=100, validation_data=(vali_gray_imgs, vali_color_imgs), callbacks=[early_stopping])
    File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 1054, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/legacy/optimizer_v2.py", line 585, in minimize
      grads_and_vars = self._compute_gradients(
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/legacy/optimizer_v2.py", line 643, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "/usr/local/lib/python3.8/dist-packages/keras/optimizers/legacy/optimizer_v2.py", line 519, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/model/conv2d_9/Conv2D/Conv2DBackpropInput'
OOM when allocating tensor with shape[100,32,400,400] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/model/conv2d_9/Conv2D/Conv2DBackpropInput}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_8388]

In [None]:
df_hist = pd.DataFrame(hist.history)
df_hist.plot()

In [None]:
preds = np.array((model.predict(test_gray_imgs) * 255), dtype=np.uint8)

In [None]:
# show all predictions in grid
fig, axs = plt.subplots(10, 10, figsize=(20,20))
for i in range(10):
    for j in range(10):
        axs[i,j].imshow(preds[i*10+j].astype(int))
        axs[i,j].axis('off')