<a href="https://colab.research.google.com/github/DataScienceAndEngineering/deep-learning-final-project-project-sidewalk/blob/nicholas/notebooks/cityscape_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import PIL
import imageio as iio
import numpy as np
import shutil
from skimage.transform import resize
import subprocess
from google.colab import drive, files
drive.mount('/content/drive')

#Uncompressing the raw data
if not os.path.isdir('./leftImg8bit/'):
  !unzip -n -q ./drive/MyDrive/tensorflow_datasets/downloads/manual/leftImg8bit_trainvaltest.zip
if not os.path.isdir('./gtFine/'):
  !unzip -n -q ./drive/MyDrive/tensorflow_datasets/downloads/manual/gtFine_trainvaltest.zip


Mounted at /content/drive


In [5]:
## Data Extraction
def mask_2_image(path):
  """Convert a label-mask path into the path of the image for the same sample.

  Two rewrites are applied to the '/'-separated path:
    * the 'gtFine' directory component becomes 'leftImg8bit';
    * the file name 'a_b_c_gtFine_<kind>.png' becomes 'a_b_c_leftImg8bit.png'
      (the second-to-last '_' token is replaced and the last token dropped).

  Args:
    path: mask path, e.g.
      '/content/gtFine/train/aachen/aachen_000000_000019_gtFine_labelIds.png'.

  Returns:
    The rewritten path as a string.

  Raises:
    IndexError: if the path has too few components / file-name tokens to rewrite.
  """
  parts = path.split('/')

  # Historical behaviour: the dataset root is component 2 ('/content/gtFine/...').
  # When that slot is not 'gtFine' (relative paths, another mount point), look
  # for the 'gtFine' directory instead of clobbering an unrelated component.
  root_idx = 2
  if len(parts) <= 2 or parts[2] != 'gtFine':
    hits = [i for i, part in enumerate(parts[:-1]) if part == 'gtFine']
    if hits:
      root_idx = hits[-1]
  parts[root_idx] = 'leftImg8bit'

  tokens = parts[-1].split('_')
  # Replace the 'gtFine' token with the image suffix, then drop the trailing
  # '<kind>.png' token so the name ends in '_leftImg8bit.png'.
  tokens[-2] = 'leftImg8bit.png'
  parts[-1] = '_'.join(tokens[:-1])
  return '/'.join(parts)
  
def extract_files(subset='all', paths='old', download=False):
  """Move every sample whose label mask contains a sidewalk into extraction folders.

  Walks '/content/gtFine/<subset>' for files ending in 'labelIds.png', loads
  each mask and, when label id 8 (sidewalk) occurs in it, moves the mask to
  '/content/extract_labels/' and the matching image (located through
  mask_2_image) to '/content/extract_images/'. Moved files are renamed to the
  first three '_'-separated tokens of their original name plus '.png'.

  Args:
    subset: 'all' for train, val and test, or the name of a single split.
    paths: 'old' returns the pre-move paths (which no longer exist once the
      files have been moved); 'new' returns the paths inside the extraction
      folders. Any other value leaves both returned lists empty.
    download: when True, zip both extraction folders and hand the archives to
      the Colab download helper.

  Returns:
    (labels_files, img_files): two parallel lists of path strings.

  Raises:
    FileNotFoundError: if the image belonging to a matching mask is missing.
    subprocess.CalledProcessError: if `zip` fails while download=True.
  """
  # A bare `import PIL` does not guarantee that the Image submodule is loaded,
  # so import it explicitly where it is used.
  from PIL import Image

  labels_out = '/content/extract_labels/'
  images_out = '/content/extract_images/'
  # Create each folder on its own: previously the image folder was only made
  # when the label folder was missing, so a half-created state broke the moves.
  os.makedirs(labels_out, exist_ok=True)
  os.makedirs(images_out, exist_ok=True)

  labels_files = []
  img_files = []
  subsets = ['train', 'val', 'test'] if subset == 'all' else [subset]
  for group in subsets:
    LabelsDir = f'/content/gtFine/{group}'
    print(LabelsDir)
    for root, _dirs, files_list in os.walk(LabelsDir):
      for filename in files_list:
        if not filename.endswith('labelIds.png'):
          continue
        f = os.path.join(root, filename)
        # Close the file handle before the file is renamed.
        with Image.open(f) as mask_file:
          labels = np.array(mask_file)
        if 8 not in labels:  # 8 is the sidewalk label id
          continue

        d = mask_2_image(f)
        # Check the image first so a missing file cannot leave a mask moved
        # without its image.
        if not os.path.isfile(d):
          raise FileNotFoundError(f'image for mask {f} not found at {d}')

        new_label = labels_out + '_'.join(f.split('/')[-1].split('_')[0:3]) + '.png'
        new_image = images_out + '_'.join(d.split('/')[-1].split('_')[0:3]) + '.png'
        os.rename(f, new_label)
        os.rename(d, new_image)

        if paths == 'old':
          labels_files.append(f)
          img_files.append(d)
        elif paths == 'new':
          labels_files.append(new_label)
          img_files.append(new_image)

  if download:
    archives = (
        ('/content/extracted_images.zip', '/content/extract_images'),
        ('/content/extracted_labels.zip', '/content/extract_labels'),
    )
    # Argument lists avoid shell quoting; check=True stops before offering a
    # download of an archive that was never written.
    for archive, folder in archives:
      subprocess.run(['zip', '-r', archive, folder], check=True)
    for archive, _folder in archives:
      files.download(archive)
  return labels_files, img_files

In [6]:
## Data Processing and Exporting
def process_images(shape=(256, 512, 3)):
  """Resize every extracted image to `shape` and store the result as a NumPy array.

  Reads each file in './extract_images/', resizes it with skimage's `resize`
  and writes the array with `np.save` into './processed_images/' (created when
  missing). The output name is the input file name; `np.save` appends '.npy'
  when the name does not already end with it.

  Args:
    shape: target output shape passed to `resize`.
  """
  src_dir = './extract_images/'
  dst_dir = './processed_images/'
  if not os.path.isdir(dst_dir):
    os.mkdir(dst_dir)
  for entry in os.listdir(src_dir):
    resized = resize(iio.imread(src_dir + entry), shape)
    np.save(dst_dir + entry, resized)

def process_masks(shape=(256, 512, 1), id=8, name='sidewalk'):
  """Build a binary mask for one label id from every extracted label file.

  For each file in './extract_labels/' the label image is reduced to a 0/1
  mask (1 where the pixel equals `id`), resized to `shape` and written with
  `np.save` into './processed_<name>/' (created when missing). The output name
  is the input file name; `np.save` appends '.npy' when it is not present.

  Args:
    shape: target output shape passed to `resize`.
    id: label id to keep; every other label becomes 0.
    name: class name used for the output folder.

  Note:
    The saved array stays floating point, but now holds exact 0.0 / 1.0
    values. With `resize` defaults the uint8 mask was rescaled by 1/255 and
    smoothed by interpolation, so the stored masks peaked near 0.0039 and had
    blurred edges.
  """
  LabelDir = './extract_labels/'
  out_dir = f'./processed_{name}/'
  os.makedirs(out_dir, exist_ok=True)
  for fil in os.listdir(LabelDir):
    mask = iio.imread(LabelDir + fil)
    mask = np.array(mask == id).astype('uint8')
    # Nearest-neighbour, no anti-aliasing and no range rescaling: a label mask
    # must stay strictly binary after resizing.
    mask = resize(mask, shape, order=0, preserve_range=True, anti_aliasing=False)
    np.save(out_dir + fil, mask)

def process_files(download=True, purge=True, source='/content/', destination='./drive/MyDrive/tensorflow_datasets/'):
  """Create, zip and optionally download the binary masks for every class in ID_log.

  For each (label id, class name) pair: skip the class when
  '<destination>processed_<class>.zip' already exists; otherwise call
  process_masks, zip '<source>processed_<class>' into that archive, then
  optionally delete the folder and/or trigger a Colab download.

  Args:
    download: hand each freshly created archive to the Colab download helper.
    purge: delete the processed folder once it has been zipped.
    source: prefix of the folder that holds the 'processed_<class>' directories.
      process_masks writes to './processed_<class>/' relative to the working
      directory, so `source` has to point at that same directory.
    destination: prefix under which the zip archives are written.

  Raises:
    subprocess.CalledProcessError: if `zip` exits with a non-zero status. The
      processed folder is then left in place instead of being purged.
  """
  #List of segmentation IDs to be segmented, can be updated
  ID_log = {
      8:'sidewalk',
      11:'building',
      12:'wall',
      13:'fence',
      14:'guard_rail',
      15:'bridge',
      16:'tunnel',
      21:'vegetation',
      17:'pole',
      18:'polegroup'
  }
  # Image processing is currently disabled; the block below is kept for reference.
  #process_images()
  #print('image processing completed')
  #!zip -r ./drive/MyDrive/tensorflow_datasets/processed_images.zip /content/processed_images
  #if purge:
  #  shutil.rmtree(f'{source}processed_images')
  #if download:
  #  files.download(f"{destination}processed_images.zip")
  # NOTE: this message is printed unconditionally, even though no image
  # processing runs while the block above is commented out.
  print('image processing completed')
  for class_id, class_name in ID_log.items():
    archive = f"{destination}processed_{class_name}.zip"
    #Skip ID if processed extraction exists
    if os.path.exists(archive):
      print(f'{class_name} exists as processed zip, skipping....')
      continue
    print(f'processing mask {class_name}')
    process_masks(id=class_id, name=class_name)
    folder = f'{source}processed_{class_name}'
    cmd = ['zip', '-r', archive, folder]
    print(' '.join(cmd))
    # No shell, so paths containing spaces survive; check=True prevents purging
    # the only copy of the data after a failed zip.
    subprocess.run(cmd, check=True)
    if purge:
      shutil.rmtree(folder)
    if download:
      files.download(archive)
  print('mask processing completed')
 

In [7]:
# Move every sample that contains a sidewalk into the extraction folders.
extract_files()
# The remaining raw trees are no longer needed. Guard each removal so that
# re-running this cell after a successful run does not raise FileNotFoundError.
for raw_dir in ('./gtFine', './leftImg8bit'):
  if os.path.isdir(raw_dir):
    shutil.rmtree(raw_dir)


/content/gtFine/train
/content/gtFine/val
/content/gtFine/test


In [8]:
# Build, zip and download the per-class masks, deleting each local folder once
# it has been zipped. Classes whose zip already exists in the destination are
# skipped, which is why the recorded output below only shows "skipping" lines.
process_files(download=True, purge=True)

image processing completed
sidewalk exists as processed zip, skipping....
building exists as processed zip, skipping....
wall exists as processed zip, skipping....
fence exists as processed zip, skipping....
guard_rail exists as processed zip, skipping....
bridge exists as processed zip, skipping....
tunnel exists as processed zip, skipping....
vegetation exists as processed zip, skipping....
pole exists as processed zip, skipping....
polegroup exists as processed zip, skipping....
mask processing completed
