# Augment Dataset with Massachusetts roads dataset

## [Dataset kaggle link](https://www.kaggle.com/datasets/insaff/massachusetts-roads-dataset/download?datasetVersionNumber=1)

The dataset is too big (~5 GB), so we first keep only a subset of it.

Click on the link to download the whole dataset, unzip it and place the folder `road_segmentation_ideal` in the data directory (`../data/` relative to this notebook, i.e. `DATA_DIR` below). Then run the notebook to keep only a subset of the downloaded dataset.

In [None]:
import os
from random import sample
import shutil
from utils import *

# Number of training image/mask pairs randomly drawn from the full dataset.
SUBSET_SIZE = 20
# Root data folder, relative to the working directory. The trailing slash is
# required because the paths below are built by plain string concatenation.
DATA_DIR = '../data/'

In [None]:
# Fail early, with the path that is actually checked, when the raw dataset is
# missing: every later cell reads from this folder, so continuing would only
# produce a less helpful error further down.
dataset_dir = DATA_DIR + 'road_segmentation_ideal/'
if not os.path.isdir(dataset_dir):
    raise FileNotFoundError(
        f"Dataset not found at '{dataset_dir}': please download the dataset, "
        f"unzip it and place the 'road_segmentation_ideal' folder in '{DATA_DIR}'."
    )

In [None]:
# Destination folders of the random training subset (created on demand;
# exist_ok makes the cell safe to re-run).
train_in_dst_dir = DATA_DIR + 'road_segmentation_ideal_subset/training/input/'
train_out_dst_dir = DATA_DIR + 'road_segmentation_ideal_subset/training/output/'
for dst_dir in (train_in_dst_dir, train_out_dst_dir):
    os.makedirs(dst_dir, exist_ok=True)

# Source folders inside the full downloaded dataset.
train_in_src_dir = DATA_DIR + 'road_segmentation_ideal/training/input/'
train_out_src_dir = DATA_DIR + 'road_segmentation_ideal/training/output/'

# Sort the listing so the candidate order does not depend on the filesystem.
outputs = sorted(os.listdir(train_out_src_dir))
# sample() raises ValueError when asked for more items than are available,
# so never request more than the folder contains.
random_elements = sample(outputs, min(SUBSET_SIZE, len(outputs)))
for name in random_elements:
    # The same file name is used for an image (input/) and its mask (output/).
    shutil.copy(train_in_src_dir + name, train_in_dst_dir + name)
    shutil.copy(train_out_src_dir + name, train_out_dst_dir + name)

Copy test directory

In [None]:
test_src_dir = DATA_DIR + 'road_segmentation_ideal/testing'
test_dst_dir = DATA_DIR + 'road_segmentation_ideal_subset/testing'

# distutils was removed from the standard library in Python 3.12 (PEP 632).
# shutil.copytree with dirs_exist_ok=True (Python 3.8+) copies the tree
# recursively, creates the destination when needed and accepts a destination
# that already exists, so no separate mkdir is required.
# The trailing ';' keeps the returned path out of the cell output.
shutil.copytree(test_src_dir, test_dst_dir, dirs_exist_ok=True);

In [None]:
# Load the copied training subset back into memory for inspection.
# NOTE(review): load_all_from_path comes from utils (not shown here); presumably
# it returns an indexable collection of image arrays — confirm.
images = load_all_from_path(train_in_dst_dir)
masks = load_all_from_path(train_out_dst_dir)

In [None]:
# Visual sanity check of the subset.
# NOTE(review): show_first_n is defined in utils; presumably it displays the
# first 3 image/mask pairs — confirm.
show_first_n(images, masks, 3)

In [None]:
# Dimensions of the first loaded image (decides whether cropping is needed).
images[0].shape

### We can see that images from the Massachusetts dataset are 1500x1500, so we randomly crop them to 400x400

In [None]:
import torch
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
from PIL import Image
CROP_SIZE = 400  # side length, in pixels, of the square random crops

In [None]:
# Root folder that receives the cropped subset.
cropped_directory = DATA_DIR + 'road_segmentation_ideal_subset_cropped/'
# exist_ok replaces the separate existence check and keeps the cell re-runnable.
os.makedirs(cropped_directory, exist_ok=True)

Crop training images 

In [None]:
# Destination folders for the cropped training pairs.
train_crop_out_dst_dir = cropped_directory + 'training/output/'
train_crop_in_dst_dir = cropped_directory + 'training/input/'
for crop_dir in (train_crop_out_dst_dir, train_crop_in_dst_dir):
    os.makedirs(crop_dir, exist_ok=True)

# Iterate over the *source* subset masks. Listing the cropped output folder
# instead yields nothing on a first run, because that folder was just created
# empty, and no image would ever be cropped.
for name in os.listdir(train_out_dst_dir):
    # Context managers close the source files once the crops are written.
    with Image.open(train_in_dst_dir + name) as image, \
            Image.open(train_out_dst_dir + name) as mask:
        # Draw a single crop window and apply it to both files so that the
        # image and its mask stay aligned. get_params is a static method, so
        # no RandomCrop instance is needed.
        i, j, h, w = transforms.RandomCrop.get_params(
            image, output_size=(CROP_SIZE, CROP_SIZE))
        TF.crop(image, i, j, h, w).save(train_crop_in_dst_dir + name)
        TF.crop(mask, i, j, h, w).save(train_crop_out_dst_dir + name)

Crop testing images

In [None]:
# Destination folders for the cropped testing pairs.
test_crop_in_dir = cropped_directory + 'testing/input/'
test_crop_out_dir = cropped_directory + 'testing/output/'
for crop_dir in (test_crop_in_dir, test_crop_out_dir):
    os.makedirs(crop_dir, exist_ok=True)

# Read from the uncropped subset copy. Pointing these at the cropped folders
# made the loop list the empty directories created just above, so nothing was
# cropped (and source and destination would have been the same files).
# Assumes testing/ has input/ and output/ subfolders like training/ — confirm.
test_out_dst_dir = DATA_DIR + 'road_segmentation_ideal_subset/testing/output/'
test_in_dst_dir = DATA_DIR + 'road_segmentation_ideal_subset/testing/input/'
for name in os.listdir(test_out_dst_dir):
    # Context managers close the source files once the crops are written.
    with Image.open(test_in_dst_dir + name) as image, \
            Image.open(test_out_dst_dir + name) as mask:
        # One crop window shared by the image and its mask keeps them aligned.
        # get_params is a static method, so no RandomCrop instance is needed.
        i, j, h, w = transforms.RandomCrop.get_params(
            image, output_size=(CROP_SIZE, CROP_SIZE))
        TF.crop(image, i, j, h, w).save(test_crop_in_dir + name)
        TF.crop(mask, i, j, h, w).save(test_crop_out_dir + name)

Once we have created the cropped subset, we can delete the uncropped subset directory.

In [None]:
# Remove the intermediate (uncropped) subset folder.
# NOTE: rmtree deletes the folder and everything in it; run this only after
# the crop cells have written their output.
subset_dir = DATA_DIR + 'road_segmentation_ideal_subset/'
if os.path.isdir(subset_dir):
    shutil.rmtree(subset_dir)

Now, we split the cropped train directory into training and validation

In [None]:
# Number of files in the training ground-truth folder. os.path.join avoids the
# doubled slash that DATA_DIR (which already ends in '/') + '/training/...' produced.
len(os.listdir(os.path.join(DATA_DIR, 'training', 'groundtruth')))

In [None]:
# Number of files in the validation ground-truth folder. os.path.join avoids the
# doubled slash that DATA_DIR (which already ends in '/') + '/validation/...' produced.
len(os.listdir(os.path.join(DATA_DIR, 'validation', 'groundtruth')))