# Constructing Image Data

In this notebook, we will be be taking the images generated in "Satellite and Formation Image Collection" and taking subimages to be used by the GAN.  These images will also be stored in the format required by the GAN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import contextily as cx
%matplotlib inline

import math

import os
import urllib.request
from PIL import Image
import cv2

import random
import skimage
import skimage.feature
import skimage.viewer
import numpy as np
import sys

  from .core import *


In [2]:
# Seed used for this run for reproduceability
seed = random.randrange(sys.maxsize)
rand = random.Random(seed)
print("Seed used:", seed)

Seed used: 7584410963331951209


## Input Variables

Contains all the variables that you as the user may want to change.

In [3]:
# Name of directory to save everything in
save_path = "Collected_Images"

# Where to pull the GAN images from (GAN refers to Satellite images on the
# left and the corresponding Formation image on the right)
src_path = "Collected_Images/DM_Dataset/GAN_pics/"

# Here is where you can specify what name you want to call it
data_set_name = "Split_Images_DM"

# Percentage of source images to use for the validation images
val_split = 0.08

# Percentage of images that will be used for the testing set
# The split is done after validation images are removed
test_split = 0.20

# Total number of testing and training images
num_tt_imgs = 2160

# Total number of validation images
num_val_imgs = 180

# If you want the resulting formation image to be grayscale
greyscale = True

# If you want the resulting formation image to be run through edge
# detection
edge = False

# If you want the resulting images to have overlap
overlap = False

# The different splits that you can do with the data
# 0 - Images in Test and Train are pulled from the same pool of source Images
#     Validation images are pulled from a completely separate source
# 1 - Images in all folders (Training, Testing, and Validation) are pulled
#     from independent source images.
mode = 1

## Functions

In [4]:
'''
    This function will take the 3 sets (training, testing, and validation)
    and, from the source images, will split them into 256x256 subplots to
    be used for GAN training. These images are saved and subplotted completely
    independent of the other sets. This function will also have the option to
    allow for images to be overlapped or not.
    
    :param train_set: list, The list of all names in the training set
    :param test_set: list, The list of all names in the testing set
    :param val_set: list, The list of all the names in the validation set
    :param n_vals: list, A list containing the number of images desired for each set

'''



def full_split(train_set, test_set, val_set, n_vals):
    # First put all the sets into a 2d array to make it for easier
    # organization
    
    split_sets = [
        train_set,
        test_set,
        val_set
    ]
    
    # As all sets are independent of each other, and all the paths/sets
    # are arranged as following: 1) Training 2) Testing 3) Validation
    # We can just organize this as a loop that loops 3 times.
    for k in range(0, 3):
        
        # List to avoid saving duplicate subplots
        visited = []

        # Image generation for the given set. The imformation for how many
        # images of a set we want is in n_vals
        for i in range(0, n_vals[k]):
            
            # First we get which source image we will be taking from
            src = rand.randrange(0, len(split_sets[k]))
            
            # Depending on whether we want overlapping images or not
            # will dictate how we chunk the source image.
            if(not overlap):
                
                # If we do not want overlap, then split the original image
                # into img_size number of chunks and pick a random number from
                # 0 to that split
                x = rand.randrange(0, input_size / img_size)
                y = rand.randrange(0, input_size / img_size)
                
                # Then save that off in a dictionary to later check for
                # duplicates
                cord = {
                    "source": src,
                    "x_cord": x * img_size,
                    "y_cord": y * img_size,
                }
            else:
                
                # If we don't care about overlapping images, then just pick
                # a random number from 0 to input_size-img_size due to the
                # width of the subimage.
                x = rand.randrange(0, input_size - img_size)
                y = rand.randrange(0, input_size - img_size)
                
                # Then again, save it off in a dictionary for later to check
                # for duplicates
                cord = {
                    "source": src,
                    "x_cord": x,
                    "y_cord": y,
                }
            
            # If we do have a duplicate, then depending on whether you
            # care about duplicates or not, repeat the process until the program
            # has a new subplot
            while(cord in visited):
                
                src = rand.randrange(0, len(split_sets[k]))
                if(not overlap):
                    x = rand.randrange(0, input_size / img_size)
                    y = rand.randrange(0, input_size / img_size)

                    cord = {
                        "source": src,
                        "x_cord": x * img_size,
                        "y_cord": y * img_size,
                    }
                else: 
                    x = rand.randrange(0, input_size - img_size)
                    y = rand.randrange(0, input_size - img_size)

                    cord = {
                        "source": src,
                        "x_cord": x,
                        "y_cord": y,
                    } 
            
            # By this point, the program is dealing with a unique subplot,
            # so append it to the visited to make sure we don't use it again.
            visited.append(cord)
            
            # Next open up the source image selected.
            img = Image.open(
                                src_path + split_sets[k][cord["source"]] 
                                + "-gan.png"
                            )
            
            # Set up the bounding box for the new subplot
            left = cord["x_cord"]
            top = cord["y_cord"]
            right = left + img_size
            bottom = top + img_size
            
            # Crop off the left image (satellite image)
            lft_img = img.crop((
                                left,
                                top,
                                right,
                                bottom
                              ))
            
            # Crop off the corresponding right image (formation image)
            rht_img = img.crop((
                                left+input_size,
                                top,
                                right+input_size,
                                bottom
                              ))
            
            # Next come the special cases.
            
            # Run the Formation image through cv2's Canny to get edge detection
            if(edge):
                rht_img.save(src_path + "temp_rht_img.png")

                img = cv2.imread(src_path + "temp_rht_img.png")
                edges = cv2.Canny(img,100,200)

                rht_img = Image.fromarray(edges)

            # Run the Formation image through skimage to greyscale the image.
            # This is done by reducing the image to it's blue layer.
            if(greyscale):
                rht_img.save(src_path + "temp_rht_img.png")

                temp_rht_img = skimage.io.imread(fname = src_path + "temp_rht_img.png")
                temp_ar = np.copy(temp_rht_img)
                x_ar = temp_ar
                result = x_ar[:, :, 2]
                result = result * (1.0/255.0)
                skimage.io.imsave(
                                  arr = result,
                                  fname = src_path + "temp_rht_g_img.png"
                                 )
                rht_img = Image.open(src_path + "temp_rht_g_img.png")
            
            # Lastly, create an empty image to hold the formation and satellite image
            joint = Image.new(
                               "RGB", 
                               (lft_img.width + rht_img.width,
                                lft_img.height)
                             )
            # Stitch the satellite image and the formation image together.
            joint.paste(lft_img, (0,0))
            joint.paste(rht_img, (lft_img.width, 0))
            
            # Save the image off into the correct folder. This is dictated by
            # which iteration of the outermost loop the program is on.
            joint.save(paths[k] + str(i) + "-" + split_sets[k][cord["source"]] + "-img.png")
    
    # A little bit of clean up depending on whether you opted for either
    # greyscale or edge detection.
    if(greyscale or edge):
        os.remove(src_path + "temp_rht_img.png")
        os.remove(src_path + "temp_rht_g_img.png")
    if((not greyscale) and edge):
        os.remove(src_path + "temp_rht_img.png")

In [5]:
'''
    This function will take the 2 sets (training/testing and validation)
    and, from the source images, will split them into 256x256 subplots to
    be used for GAN training. The subimages created in training/testing
    are completely separate and independent of the subimages created
    in validation. This function will also have the option to allow for 
    images to be overlapped or not.
    
    :param tt_set: list, The list of all names in the training and testing set
    :param val_set: list, The list of all the names in the validation set
    :param n_vals: list, A list containing the number of images desired for each set
'''

def tt_split(tt_set, val_set, n_vals):
    
    # Putting tt_set and val_set into a 2d array for organization
    split_sets = [
        tt_set,
        val_set
    ]
    
    # A list containing all used subimages to avoid duplicates
    visited = []
    
    # Get the total images needed for both the testing images and the
    # training images
    tt_imgs = n_vals[0] + n_vals[1]
    
    # In this flow, the training and testing sets are created before the
    # validation set.
    
    for i in range(0, tt_imgs):
        
        # First we get which source image we will be taking from
        src = rand.randrange(0, len(split_sets[0]))
        
        # Depending on whether we want overlapping images or not
        # will dictate how we chunk the source image.
        if(not overlap):
            
            # If we do not want overlap, then split the original image
            # into img_size number of chunks and pick a random number from
            # 0 to that split
            x = rand.randrange(0, input_size / img_size)
            y = rand.randrange(0, input_size / img_size)
            
            # Then save that off in a dictionary to later check for
            # duplicates
            cord = {
                "source": src,
                "x_cord": x * img_size,
                "y_cord": y * img_size,
            }
        else:
            
            # If we don't care about overlapping images, then just pick
            # a random number from 0 to input_size-img_size due to the
            # width of the subimage.
            x = rand.randrange(0, input_size - img_size)
            y = rand.randrange(0, input_size - img_size)

            cord = {
                "source": src,
                "x_cord": x,
                "y_cord": y,
            }
        
        # If we do have a duplicate, then depending on whether you
        # care about duplicates or not, repeat the process until the program
        # has a new subplot
        while(cord in visited):
            src = rand.randrange(0, len(split_sets[0]))
            if(not overlap):
                x = rand.randrange(0, input_size / img_size)
                y = rand.randrange(0, input_size / img_size)

                cord = {
                    "source": src,
                    "x_cord": x * img_size,
                    "y_cord": y * img_size,
                }
            else:
                x = rand.randrange(0, input_size - img_size)
                y = rand.randrange(0, input_size - img_size)

                cord = {
                    "source": src,
                    "x_cord": x,
                    "y_cord": y,
                }
        

        # At this point we have a new unvisited plot, so we append it to
        # the list of visited plots and continue.
        visited.append(cord)

        # Next, pull the GAN image and split it into it's Satellite and
        # Formation image.
        img = Image.open(src_path + split_sets[0][cord["source"]] + "-gan.png")

        # Next, we make sure to crop out a subplot of the desired 
        # from the original image given the randomized x and y cordinates. 
        # For this we need to make sure we get the subplot of the 
        # satellite and the formation image.
        left = cord["x_cord"]
        top = cord["y_cord"]
        right = left + img_size
        bottom = top + img_size

        # This image is the Satellite image
        lft_img = img.crop((left,
                            top,
                            right,
                            bottom))

        # This image is the Formation image
        rht_img = img.crop((left+input_size,
                            top,
                            right+input_size,
                            bottom))

        # Next come the special cases.

        # Run the Formation image through cv2's Canny to get edge detection
        if(edge):
            rht_img.save(src_path + "temp_rht_img.png")

            img = cv2.imread(src_path + "temp_rht_img.png")
            edges = cv2.Canny(img,100,200)

            rht_img = Image.fromarray(edges)    

        # Run the Formation image through skimage to greyscale the image.
        # This is done by reducing the image to it's blue layer.
        if(greyscale):
            rht_img.save(src_path + "temp_rht_img.png")

            temp_rht_img = skimage.io.imread(fname = src_path + "temp_rht_img.png")
            temp_ar = np.copy(temp_rht_img)
            x_ar = temp_ar  
            result = x_ar[:, :, 2]
            result = result * (1.0/255.0)
            skimage.io.imsave(arr = result, fname = src_path + "temp_rht_g_img.png")

            rht_img = Image.open(src_path + "temp_rht_g_img.png")


        # Lastly, join the satellite subplot image and the formation subplot image
        # and save it off to the correct path.
        joint = Image.new("RGB", (lft_img.width + rht_img.width, lft_img.height))
        joint.paste(lft_img, (0,0))
        joint.paste(rht_img, (lft_img.width, 0))


        # Choose a random number to signify whether the current image will be saved
        # to the training or the testing set.
        r = rand.randrange(0,2)
        choice = n_vals[r]

        # However, if the number in the given set has already reached capacity
        # i.e. We have already generated enough images for the testing set, we
        # need to save it into the training set.
        if(choice == 0):
            r = abs(r - 1)

        n_vals[r] -= 1

        # Save off the image into the correct path.
        joint.save(paths[r] + str(i) + "-" + split_sets[0][cord["source"]] + "-img.png")

    
    # Validation images
    
    # Empty the array to help a little with time
    visited = []
    
    # Next we will construct the validation images
    for i in range(0, n_vals[1]):
        
        # First we get which source image we will be taking from
        src = rand.randrange(0, len(split_sets[1]))
        
        # Depending on whether we want overlapping images or not
        # will dictate how we chunk the source image.
        if(not overlap):
            
            # If we do not want overlap, then split the original image
            # into img_size number of chunks and pick a random number from
            # 0 to that split
            x = rand.randrange(0, input_size / img_size)
            y = rand.randrange(0, input_size / img_size)
            
            # Then save that off in a dictionary to later check for
            # duplicates
            cord = {
                "source": src,
                "x_cord": x * img_size,
                "y_cord": y * img_size,
            }
        else:
            
            # If we don't care about overlapping images, then just pick
            # a random number from 0 to input_size-img_size due to the
            # width of the subimage.
            x = rand.randrange(0, input_size - img_size)
            y = rand.randrange(0, input_size - img_size)

            cord = {
                "source": src,
                "x_cord": x,
                "y_cord": y,
            }
        
        
        # If we do have a duplicate, then depending on whether you
        # care about duplicates or not, repeat the process until the program
        # has a new subplot
        while(cord in visited):
            src = rand.randrange(0, len(split_sets[1]))
            if(not overlap):
                x = rand.randrange(0, input_size / img_size)
                y = rand.randrange(0, input_size / img_size)

                cord = {
                    "source": src,
                    "x_cord": x * img_size,
                    "y_cord": y * img_size,
                }
            else:
                x = rand.randrange(0, input_size - img_size)
                y = rand.randrange(0, input_size - img_size)

                cord = {
                    "source": src,
                    "x_cord": x,
                    "y_cord": y,
                }
        
        # At this point we have a new unvisited plot, so we append it to
        # the list of visited plots and continue.
        visited.append(cord)
        
        # Next, pull the GAN image and split it into it's Satellite and
        # Formation image.
        img = Image.open(src_path + split_sets[1][cord["source"]] + "-gan.png")
        
        
        # Next, we make sure to crop out a subplot of the desired 
        # from the original image given the randomized x and y cordinates. 
        # For this we need to make sure we get the subplot of the 
        # satellite and the formation image.
        left = cord["x_cord"]
        top = cord["y_cord"]
        right = left + img_size
        bottom = top + img_size
        
        # This image is the Satellite image
        lft_img = img.crop((left,
                            top,
                            right,
                            bottom))
        
        # This image is the Formation image
        rht_img = img.crop((left+input_size,
                            top,
                            right+input_size,
                            bottom))
        
        # Next come the special cases.

        # Run the Formation image through cv2's Canny to get edge detection
        if(edge):
            rht_img.save(src_path + "temp_rht_img.png")

            img = cv2.imread(src_path + "temp_rht_img.png")
            edges = cv2.Canny(img,100,200)
            rht_img = Image.fromarray(edges)

        # Run the Formation image through skimage to greyscale the image.
        # This is done by reducing the image to it's blue layer.
        if(greyscale):
            rht_img.save(src_path + "temp_rht_img.png")

            temp_rht_img = skimage.io.imread(fname = src_path + "temp_rht_img.png")
            temp_ar = np.copy(temp_rht_img)
            x_ar = temp_ar  
            result = x_ar[:, :, 2]
            result = result * (1.0/255.0)
            skimage.io.imsave(arr = result, fname = src_path + "temp_rht_g_img.png")

            rht_img = Image.open(src_path + "temp_rht_g_img.png")

        # The only thing that has changed is that there is no need to worry about
        # where to save it, as there is only one validation set.

        joint = Image.new("RGB", (lft_img.width + rht_img.width, lft_img.height))
        joint.paste(lft_img, (0,0))
        joint.paste(rht_img, (lft_img.width, 0))

        joint.save(paths[2] + str(i) + "-" + split_sets[1][cord["source"]] + "-img.png")    
    
    
    # Lastly, if any of the special techniques were used, then clean up any
    # extra images that were generated.
    if(greyscale or edge):
            os.remove(src_path + "temp_rht_img.png")
            os.remove(src_path + "temp_rht_g_img.png")
    if((not greyscale) and edge):
        os.remove(src_path + "temp_rht_img.png")

## Setting Up

Before splitting the images, we need to make sure that directories are created and that the original set of images is split into the necessary sets.

In [6]:
# Setting up the directories

os.makedirs("%s/%s" % (save_path, data_set_name), exist_ok=True)
os.makedirs("%s/%s/test" % (save_path, data_set_name), exist_ok=True)
os.makedirs("%s/%s/val" % (save_path, data_set_name), exist_ok=True)
os.makedirs("%s/%s/train" % (save_path, data_set_name), exist_ok=True)

In [7]:
# Going through all the images given by the src_path and appending them
# to what will be considered as the "full set"

f_set = []

fileList = os.listdir(src_path)
for file in fileList:
    f_set.append(file[0:-8])

In [8]:
# What this notebook will be considering as the "full set"
f_set

['Afganistan',
 'Chihuahua_N',
 'Coahuila',
 'Colorodo',
 'Durango',
 'Guanajuato',
 'Hidalgo',
 'Iran',
 'MidW_Arizona',
 'MidW_Texas',
 'Mid_Arizona',
 'Mid_Idaho',
 'Mid_Iran',
 'Mid_Mexico',
 'Mid_Montana',
 'Mid_Nevada',
 'Mid_New_Mexico',
 'Mid_Texas',
 'Monterrey',
 'Morelos',
 'New_Mexico_Franklin',
 'NE_Afganistan',
 'NE_Iran',
 'NMid_Idaho',
 'NMid_Iran',
 'NW_Canada',
 'NW_Iran',
 'NW_Nevada',
 'N_Arizona',
 'N_Colorodo',
 'N_Idaho',
 'N_Iran',
 'N_Montana',
 'N_Nevada',
 'N_Oregon',
 'N_Sinola',
 'N_Texas',
 'N_Utah',
 'Oaxaca',
 'Pennsylvania',
 'SE_Idaho',
 'SE_Iran',
 'SE_Mid_Montana',
 'SE_Montana',
 'Sonora',
 'SW_Iran',
 'SW_Washington',
 'S_Arizona',
 'S_California',
 'S_Iran',
 'S_Mexico',
 'S_Mid_Idaho',
 'S_New_Mexico',
 'S_Texas',
 'S_Virginia',
 'S_Wyoming',
 'Texas',
 'W_Canada',
 'W_Chihuahua',
 'W_New_Mexico',
 'W_Tamaulipas',
 'W_Texas',
 'W_Virginia',
 'W_Zacatecas',
 'Zacatecas']

In [9]:
# The size of the subplot images that will be used for 
# testing/training/validation
img_size = 256

# The size of the input image
input_size = Image.open(src_path + f_set[0] + "-gan.png").height

In [10]:
# Save paths from the training, testing and validation sets
paths = [
    "Collected_Images/%s/train/" % (data_set_name),
    "Collected_Images/%s/test/" % (data_set_name),
    "Collected_Images/%s/val/" % (data_set_name)
]

# A total count of the number of testing and training images.
n_vals = [
    int(num_tt_imgs * (1 - test_split)),
    int(num_tt_imgs * (test_split)),
    num_val_imgs
]

In [11]:
# An array containing the names of the images used for the validation set
val_set = []
v_size = int(len(f_set) * val_split)
# Picking out val_p random images to hold off from the original set to use
# for the validation set.
for i in range(0, v_size):
    choice = rand.randrange(0, len(f_set))
    val_set.append(f_set[choice])
    f_set.remove(f_set[choice])

# All remaining images are used for the training and testing sets    
tt_set = f_set

In [12]:
# If mode is 1, then we want to make sure that test_set and train_set
# is in global scope.

test_set = []
train_set = []
test_size = int(len(tt_set) * test_split)
if(mode == 1):
    for i in range(0, test_size):
        choice = rand.randrange(0, len(tt_set))
        test_set.append(tt_set[choice])
        tt_set.remove(tt_set[choice])
    train_set = tt_set

In [13]:
# The Images that will make up the validation set in this notebook.
val_set

['Mid_Mexico', 'Mid_Idaho', 'Morelos', 'S_Texas', 'Iran']

In [14]:
# Because the different modes lead to different arrays being used
# This is just to see which images are in which sets.
if mode == 1:
    print("Training set:")
    for name in train_set:
        print(name)
    print("Testing set:")
    for name in test_set:
        print(name)
else:
    print("Testing/Training set:")
    for name in tt_set:
        print(name)

Training set:
Afganistan
Chihuahua_N
Coahuila
Durango
Guanajuato
Hidalgo
MidW_Arizona
MidW_Texas
Mid_Arizona
Mid_Iran
Mid_Nevada
Mid_Texas
NE_Afganistan
NE_Iran
NMid_Idaho
NW_Canada
NW_Iran
NW_Nevada
N_Colorodo
N_Idaho
N_Iran
N_Montana
N_Nevada
N_Oregon
N_Sinola
N_Texas
Oaxaca
SE_Idaho
SE_Iran
SE_Mid_Montana
SE_Montana
Sonora
SW_Washington
S_Arizona
S_California
S_Iran
S_Mexico
S_New_Mexico
S_Virginia
Texas
W_Canada
W_Chihuahua
W_New_Mexico
W_Tamaulipas
W_Texas
W_Virginia
W_Zacatecas
Zacatecas
Testing set:
NMid_Iran
N_Utah
N_Arizona
S_Wyoming
Colorodo
Monterrey
Mid_New_Mexico
Pennsylvania
S_Mid_Idaho
Mid_Montana
SW_Iran
New_Mexico_Franklin


In [15]:
# Lastly, just run the correct function given by the mode
if mode == 0:
    tt_split(tt_set, val_set, n_vals)
elif mode == 1:
    full_split(train_set, test_set, val_set, n_vals)



































































