<a href="https://colab.research.google.com/github/DataScienceAndEngineering/deep-learning-final-project-project-sidewalk/blob/nicholas/notebooks/cityscape_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import PIL
import imageio as iio
import numpy as np

# Uncompress the raw Cityscapes archives (images and fine annotations).
# Each archive is extracted only when its target folder is not already present
# in the working directory; `unzip -n` additionally never overwrites files
# that already exist.
# NOTE(review): the `!` lines are IPython shell escapes, so this cell only runs
# inside a notebook; the ./drive/MyDrive path presumably requires Google Drive
# to be mounted first - confirm.
if not os.path.isdir('./leftImg8bit/'):
  !unzip -n ./drive/MyDrive/tensorflow_datasets/downloads/manual/leftImg8bit_trainvaltest.zip
if not os.path.isdir('./gtFine/'):
  !unzip -n ./drive/MyDrive/tensorflow_datasets/downloads/manual/gtFine_trainvaltest.zip


In [None]:
## Data Extraction
def mask_2_image(path):
  # Maps the path of a label mask onto the path of the image of the same sample.
  # The second '/'-separated component is replaced by 'leftImg8bit', and the
  # last two '_'-separated tokens of the file name are collapsed into the
  # single token 'leftImg8bit.png'.
  segments = path.split('/')
  segments[1] = 'leftImg8bit'
  # Read the file name only after the assignment above, as the original does.
  tokens = segments[-1].split('_')
  tokens[-2] = 'leftImg8bit.png'
  # Drop the trailing token (e.g. 'labelIds.png'); the new suffix sits before it.
  segments[-1] = '_'.join(tokens[:-1])
  return '/'.join(segments)
  
def extract_files(subset='all', paths='old'):
  # Finds every sample whose label mask contains a sidewalk segmentation
  # (label id 8) and moves the mask and its image into flat extraction folders.
  #
  # Walks ./gtFine/<subset> for files ending in 'labelIds.png'. Each matching
  # mask is moved to ./extract_labels/ and the image located via mask_2_image()
  # is moved to ./extract_images/. Both are renamed to the first three
  # '_'-separated tokens of their file name plus '.png'.
  #
  # subset: 'all' scans 'train', 'val' and 'test'; any other value is used as
  #         the single sub-folder name under ./gtFine/.
  # paths:  'old' returns the original (pre-move) paths, 'new' returns the
  #         paths inside the extraction folders. Any other value still moves
  #         the files but leaves the returned lists empty.
  # Returns (labels_files, img_files): two parallel lists of path strings.
  # Raises FileNotFoundError when the image belonging to a sidewalk mask does
  # not exist; this is checked before anything is moved, so no orphaned label
  # is left behind.

  # `import PIL` alone does not load the Image submodule, so import it here.
  from PIL import Image

  sidewalk_id = 8
  labels_out = './extract_labels/'
  images_out = './extract_images/'
  # Create each folder independently so a partially completed earlier run
  # (labels folder present, images folder missing) does not break the move.
  os.makedirs(labels_out, exist_ok=True)
  os.makedirs(images_out, exist_ok=True)

  labels_files = []
  img_files = []
  subsets = ['train', 'val', 'test'] if subset == 'all' else [subset]
  for group in subsets:
    LabelsDir = f'./gtFine/{group}'
    print(LabelsDir)
    for root, _dirs, files_list in os.walk(LabelsDir):
      for filename in files_list:
        if not filename.endswith('labelIds.png'):
          continue
        f = os.path.join(root, filename)
        # Close the file handle before the mask is moved.
        with Image.open(f) as mask:
          labels = np.array(mask)
        if sidewalk_id not in labels:
          continue
        d = mask_2_image(f)
        if not os.path.isfile(d):
          raise FileNotFoundError(f'No image found at {d} for label {f}')
        new_label = labels_out + '_'.join(filename.split('_')[0:3]) + '.png'
        new_image = images_out + '_'.join(d.split('/')[-1].split('_')[0:3]) + '.png'
        os.rename(f, new_label)
        os.rename(d, new_image)
        if paths == 'old':
          labels_files.append(f)
          img_files.append(d)
        elif paths == 'new':
          labels_files.append(new_label)
          img_files.append(new_image)
  return labels_files, img_files

In [None]:
## Data Processing and Exporting
def process_images(shape=(256, 512, 3)):
  # Resizes every file found in ./extract_images/ to `shape` and saves the
  # result under the same file name in ./processed_images/ (created if missing).
  # NOTE(review): `resize` is not defined in the visible part of this notebook -
  # presumably skimage.transform.resize; confirm it is imported before running.
  source_dir = './extract_images/'
  target_dir = './processed_images/'
  if not os.path.isdir(target_dir):
    os.mkdir(target_dir)
  for name in os.listdir(source_dir):
    resized = resize(iio.imread(source_dir + name), shape)
    iio.imsave(target_dir + name, resized)

def process_masks(shape=(256, 512, 1)):
  # Reduces every label mask in ./extract_labels/ to a sidewalk-only mask
  # (1 where the label id equals 8, 0 elsewhere), resizes it to `shape` and
  # saves it under the same file name in ./processed_labels/ (created if missing).
  # NOTE(review): `resize` is not defined in the visible part of this notebook -
  # presumably skimage.transform.resize; confirm the import and whether its
  # interpolation keeps the mask binary.
  source_dir = './extract_labels/'
  target_dir = './processed_labels/'
  if not os.path.isdir(target_dir):
    os.mkdir(target_dir)
  for name in os.listdir(source_dir):
    raw = iio.imread(source_dir + name)
    sidewalk = np.array(raw == 8).astype('uint8')
    iio.imsave(target_dir + name, resize(sidewalk, shape))

def process_files(download=True):
  # Processes all extracted files, zips the processed images and labels into
  # ./drive/MyDrive/tensorflow_datasets/, then optionally downloads both archives.
  # download: when truthy, both zip files are passed to files.download().
  # Resize the images and build the sidewalk masks with their default shapes.
  process_images()
  process_masks()
  # IPython shell escapes: recursively zip the processed folders. The absolute
  # /content/... paths presumably match the Colab working directory - confirm.
  !zip -r ./drive/MyDrive/tensorflow_datasets/processed_images.zip /content/processed_images
  !zip -r ./drive/MyDrive/tensorflow_datasets/processed_labels.zip /content/processed_labels
  if download:
    # NOTE(review): `files` is not defined in the visible source - presumably
    # `from google.colab import files`; confirm it is imported before calling
    # this function with download=True.
    files.download("./drive/MyDrive/tensorflow_datasets/processed_images.zip")
    files.download("./drive/MyDrive/tensorflow_datasets/processed_labels.zip")

In [None]:
  # Run the whole pipeline: move all sidewalk samples (train, val and test) into
  # the extraction folders, then resize and zip them without calling files.download().
  extract_files()
  process_files(download=False)