In [45]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from skimage.io import imread

In [46]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Conv2D, UpSampling2D

base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(640, 800, 3))
classes = 1

x = base_model.output
x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
output = Conv2D(1, (1, 1), activation='sigmoid', padding='same')(x)


model = Model(inputs=base_model.input, outputs=output)



  base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(640, 800, 3))


### Data Augmentation and Training

In [47]:
#get image and mask paths
#convert to dataframe
import pandas as pd
import os
df = pd.read_csv('data/640x800/640x800.csv')

image_paths = np.array(df['snow_path'].values)
mask_paths = np.array(df['snowbinary_path'].values)
image_paths.shape


(131,)

In [48]:

def load_data(image_paths, mask_paths):
    images = [imread(img_path) for img_path in image_paths]
    masks = [imread(mask_path) for mask_path in mask_paths]
    
    #resize all images with 4 channels to 3 channels
    for i in range(len(images)):
        if(len(images[i][0][0])==4):
            images[i] = images[i][:,:,:3]

    #resize all masks to 1120x640
    for i in range(len(masks)):
        masks[i] = masks[i][:800,:640]
    
    
    return np.array(images), np.array(masks)

images, masks = load_data(image_paths, mask_paths)

In [49]:
masks = masks[..., np.newaxis]
masks.shape


(131, 800, 640, 1)

In [50]:
# Assuming your images and masks are numpy arrays
# masks = masks[..., np.newaxis]  # add a channel dimension if necessary

images_dataset = tf.data.Dataset.from_tensor_slices(images)
masks_dataset = tf.data.Dataset.from_tensor_slices(masks)

def augment_data(image, mask):
    # Random rotation
    rotation_angle = tf.random.uniform(shape=[], minval=-10, maxval=10, dtype=tf.float32)
    image = tf.image.rot90(image, tf.cast(rotation_angle / 90, tf.int32))
    mask = tf.image.rot90(mask, tf.cast(rotation_angle / 90, tf.int32))

    # Random horizontal flip
    if tf.random.uniform(shape=[], dtype=tf.float32) > 0.5:
        image = tf.image.flip_left_right(image)
        mask = tf.image.flip_left_right(mask)

    return image, mask

augmented_dataset = tf.data.Dataset.zip((images_dataset, masks_dataset))
augmented_dataset = augmented_dataset.map(augment_data, num_parallel_calls=tf.data.AUTOTUNE)

augmented_dataset = augmented_dataset.shuffle(1000)

BATCH_SIZE = 32
augmented_dataset = augmented_dataset.batch(BATCH_SIZE)
augmented_dataset = augmented_dataset.prefetch(tf.data.AUTOTUNE)

In [51]:
# Define the ModelCheckpoint callback to save weights at each epoch
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='checkpoints/unet_640x1132_epoch_{epoch:02d}.keras',
    save_weights_only=False,
    save_best_only=False,
    save_freq='epoch'
)
os.makedirs('checkpoints', exist_ok=True)


In [52]:
#fit the model
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])


model.fit(augmented_dataset, epochs=50, callbacks=[checkpoint_callback], verbose=1)

Epoch 1/50


2024-06-18 10:37:00.290981: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
2024-06-18 10:37:02.970737: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 516.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-06-18 10:37:03.082666: W tensorflow/core/framework/op_kernel.cc:1839] OP_REQUIRES failed at xla_ops.cc:580 : UNKNOWN: Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.135 = (f32[32,32,400,320]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,3,801,641]{3,2,1,0} %transpose.964, f32[32,3,3,3]{3,2,1,0} %transpose.965), window={size=3x3 stride=2x2}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", metadata={op_type="Conv2D" op_name="functional_13_1/Conv1_1/convolution" source_file="/home/apfox/anaconda3/lib/python3.11/site-packages

UnknownError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 195, in start

  File "/home/apfox/anaconda3/lib/python3.11/asyncio/base_events.py", line 607, in run_forever

  File "/home/apfox/anaconda3/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once

  File "/home/apfox/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/tmp/ipykernel_160003/2695611729.py", line 9, in <module>

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 314, in fit

  File "/home/apfox/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 117, in one_step_on_iterator

Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.135 = (f32[32,32,400,320]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,3,801,641]{3,2,1,0} %transpose.964, f32[32,3,3,3]{3,2,1,0} %transpose.965), window={size=3x3 stride=2x2}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convForward", metadata={op_type="Conv2D" op_name="functional_13_1/Conv1_1/convolution" source_file="/home/apfox/anaconda3/lib/python3.11/site-packages/tensorflow/python/framework/ops.py" source_line=1177}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0}}

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 541065216 bytes.

To ignore this failure and try to use a fallback algorithm (which may have suboptimal performance), use XLA_FLAGS=--xla_gpu_strict_conv_algorithm_picker=false.  Please also file a bug for the root cause of failing autotuning.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_156067]