# Import Libraries

In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm

# Load Data from Kaggle

In [2]:
!pip install kaggle
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/ 
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d balraj98/deepglobe-road-extraction-dataset



Saving kaggle.json to kaggle.json
Downloading deepglobe-road-extraction-dataset.zip to /content
100% 3.79G/3.79G [00:48<00:00, 63.4MB/s]



# Extract Data

In [3]:
os.mkdir('/content/dataset')

path = '/content/deepglobe-road-extraction-dataset.zip'
dest = '/content/dataset'

with zipfile.ZipFile(path, 'r') as zip_ref:
  zip_ref.extractall(dest)
  print('Zip is done extracting!')

Zip is done extracting!


# Data Preprocessing

## Observe the dataset from the CSV

In [4]:
path = '/content/dataset/'
csv_path = os.path.join(path, 'metadata.csv')
df = pd.read_csv(csv_path)

In [5]:
df.head()

Unnamed: 0,image_id,split,sat_image_path,mask_path
0,100034,train,train/100034_sat.jpg,train/100034_mask.png
1,100081,train,train/100081_sat.jpg,train/100081_mask.png
2,100129,train,train/100129_sat.jpg,train/100129_mask.png
3,100703,train,train/100703_sat.jpg,train/100703_mask.png
4,100712,train,train/100712_sat.jpg,train/100712_mask.png


In [6]:
df['split'].unique()

array(['train', 'valid', 'test'], dtype=object)

## Split only training data from the dataset


In [7]:
train = df[df['split'] == 'train']
train['split'].unique()

array(['train'], dtype=object)

### We only need to access the image id and the directory to the images, so we can just drop the split column since it's all training images.

In [8]:
train = train[['image_id', 'sat_image_path', 'mask_path']]
train.head()

Unnamed: 0,image_id,sat_image_path,mask_path
0,100034,train/100034_sat.jpg,train/100034_mask.png
1,100081,train/100081_sat.jpg,train/100081_mask.png
2,100129,train/100129_sat.jpg,train/100129_mask.png
3,100703,train/100703_sat.jpg,train/100703_mask.png
4,100712,train/100712_sat.jpg,train/100712_mask.png


## Set directory to the images same as the directory in our colab

In [9]:
train['sat_image_path'] = train['sat_image_path'].apply(lambda img_pth: os.path.join(path, img_pth))
train['mask_path'] = train['mask_path'].apply(lambda img_pth: os.path.join(path, img_pth))

## Randomize our main training dataset

In [10]:
train = train.sample(frac = 1).reset_index(drop = True)

## Set train csv into 1000 training data (optional for optimizing)

In [11]:
small_train = train.sample(1000).reset_index(drop=True)

# Custom Data Generator

## Empty numpy arrays for our training data

In [12]:
images = np.zeros((1000, 256, 256, 3)).astype('float')
masks = np.zeros((1000, 256, 256, 1)).astype('float')

## Convert training images into numpy arrays and assign them into the empty numpy arrays

In [13]:
data_length = len(small_train)

for i in tqdm(range(data_length)):
  img = cv2.imread(small_train['sat_image_path'][i])/255.
  mask = cv2.imread(small_train['mask_path'][i], cv2.IMREAD_GRAYSCALE)/255.
  img =  cv2.resize(img, (256, 256))
  mask = cv2.resize(mask, (256, 256))
  mask = mask.reshape(256, 256, 1)
  images[i] = img
  masks[i] = mask
  del img, mask

100%|██████████| 1000/1000 [00:50<00:00, 19.74it/s]


## Now the data is ready for training

In [14]:
images

array([[[[0.27843137, 0.40784314, 0.6       ],
         [0.29117647, 0.4254902 , 0.61862745],
         [0.26470588, 0.40392157, 0.58039216],
         ...,
         [0.13137255, 0.21372549, 0.17254902],
         [0.15490196, 0.24607843, 0.20784314],
         [0.17647059, 0.27058824, 0.2254902 ]],

        [[0.28039216, 0.41470588, 0.59803922],
         [0.28823529, 0.41862745, 0.60196078],
         [0.30784314, 0.43627451, 0.6       ],
         ...,
         [0.13137255, 0.20882353, 0.16470588],
         [0.13921569, 0.21764706, 0.18235294],
         [0.15686275, 0.22745098, 0.19215686]],

        [[0.31862745, 0.44215686, 0.62254902],
         [0.28137255, 0.39215686, 0.57941176],
         [0.23431373, 0.34019608, 0.49901961],
         ...,
         [0.14607843, 0.2245098 , 0.17745098],
         [0.15      , 0.22352941, 0.18529412],
         [0.15784314, 0.2245098 , 0.18921569]],

        ...,

        [[0.33137255, 0.49117647, 0.64705882],
         [0.30784314, 0.45392157, 0.59215686]

In [15]:
masks

array([[[[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [0. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [0. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [0. ]],

        ...,

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [1. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [1. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [0. ],
         [1. ]]],


       [[[0. ],
         [0. ],
         [0. ],
         ...,
         [1. ],
         [0. ],
         [0. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0.5],
         [0. ],
         [0. ]],

        [[0. ],
         [0. ],
         [0. ],
         ...,
         [0. ],
         [