# Creating training, testing, and validation sets

In this notebook, we create training, testing, and validation sets from the data that has already been provided. An added feature is that the source images used for the training and testing sets are completely separate from the source images used for the validation set.

In [1]:
# Importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import contextily as cx
%matplotlib inline

import math

import os
import urllib.request
from PIL import Image
import cv2

import random
import skimage
import skimage.feature
import skimage.viewer
import numpy as np
import sys

  from .core import *


In [2]:
# Name of the data set that is about to be generated; it is used as the
# folder name under Collected_Images/.
data_set_name = "Split_Images_DM_Edge"

# Make sure the data set folder and its test/val/train sub-folders exist.
for dir_template in (
    "Collected_Images/%s",
    "Collected_Images/%s/test",
    "Collected_Images/%s/val",
    "Collected_Images/%s/train",
):
    os.makedirs(dir_template % (data_set_name), exist_ok=True)

In [3]:
# Seed used for this run, for reproducibility.
# By default a fresh seed is drawn and printed. To reproduce an earlier run,
# set fixed_seed to the value that run printed.
fixed_seed = None

seed = random.randrange(sys.maxsize) if fixed_seed is None else fixed_seed

# Every random choice in this notebook goes through this generator, so the
# seed alone determines which images and sub-images are picked.
rand = random.Random(seed)
print("Seed used:", seed)

Seed used: 6988439079880625549


In [4]:
# Base names of the source images. Each name is later combined with the
# source path and the "-gan.png" suffix to locate the image on disk.
f_set = [
    "Chihuahua_N", "Colorodo", "Idaho", "Mid_Nevada",
    "Mid_New_Mexico", "N_Colorodo", "New_Mexico_Franklin", "S_Arizona",
    "S_California", "S_New_Mexico", "S_Virginia", "Tennessee",
    "Texas", "W_New_Mexico", "W_Texas", "W_Virginia",
]

# How many of the source images are held back for the validation set
# (one quarter of them, rounded down).
val_p = len(f_set) // 4

In [5]:
# Fraction (between 0 and 1) of the train/test images that goes into the
# testing set
test_split = 0.2

# Total number of images generated for the training and testing sets combined
num_tt_imgs = 1200

# Total number of images generated for the validation set
num_val_imgs = 200

# Side length, in pixels, of the square sub-images that are cropped out for
# testing/training/validation
img_size = 256

# Size, in pixels, of the satellite image and of the formation image inside
# one GAN image (the formation image starts input_size pixels to the right)
input_size = 880

# If you want the resulting formation image to be grayscale
greyscale = False

# If you want the resulting formation image to be run through edge
# detection
edge = True

# Where to pull the GAN images from (GAN refers to Satellite images on the
# left and the corresponding Formation image on the right)
src_path = "Collected_Images/DM_Dataset/GAN_pics/"

# Save paths for the training, testing and validation sets, in that order
paths = [
    "Collected_Images/%s/train/" % (data_set_name),
    "Collected_Images/%s/test/" % (data_set_name),
    "Collected_Images/%s/val/" % (data_set_name)
]

# Number of images to generate for [training, testing], in the same order as
# the first two entries of paths. The testing count is rounded and the
# training set takes the remainder, so the two always add up to num_tt_imgs
# (truncating both with int() could silently drop an image).
n_test_imgs = round(num_tt_imgs * test_split)
n_vals = [
    num_tt_imgs - n_test_imgs,
    n_test_imgs
]

In [6]:
# Names of the images that are held out for the validation set
v_set = []

# Draw from a copy, so that f_set keeps the complete list of image names and
# re-running this cell always starts from all of the images again instead of
# shrinking the pool further.
tt_set = list(f_set)

# Move val_p randomly chosen images out of the pool and into the
# validation set.
for _ in range(0, val_p):
    choice = rand.randrange(0, len(tt_set))
    v_set.append(tt_set.pop(choice))

# Whatever is left in tt_set is used for the training and testing sets.

In [7]:
# Display the names of the source images that were held out for the
# validation set in this run
v_set

['W_Texas', 'Mid_Nevada', 'Colorodo', 'S_Virginia']

In [8]:
# Display the names of the source images that the training and testing
# sets are drawn from in this run
tt_set

['Chihuahua_N',
 'Idaho',
 'Mid_New_Mexico',
 'N_Colorodo',
 'New_Mexico_Franklin',
 'S_Arizona',
 'S_California',
 'S_New_Mexico',
 'Tennessee',
 'Texas',
 'W_New_Mexico',
 'W_Virginia']

## Constructing training and testing sets

The order in which the sets are constructed does not matter, because the validation set is drawn from different source images than the training and testing sets. Here, we construct the training and testing sets first.

In [9]:
# Crop origins, stored as (source index, x, y), that have already been used.
# This guarantees that no sub-image is generated twice, but it does not
# prevent two sub-images from overlapping.
visited = set()

# Remaining capacity of [training, testing]. A copy is counted down so that
# n_vals keeps its configured values and this cell can be re-run on its own.
remaining = list(n_vals)

# Scratch files used by the greyscale step below.
temp_rht_path = src_path + "temp_rht_img.png"
temp_rht_g_path = src_path + "temp_rht_g_img.png"

for i in range(0, num_tt_imgs):

    # Pick a source image and the top-left corner of the sub-image, drawing
    # again until the combination has not been used before.
    while True:
        src = rand.randrange(0, len(tt_set))
        x = rand.randrange(0, input_size - img_size)
        y = rand.randrange(0, input_size - img_size)

        cord = (src, x, y)
        if cord not in visited:
            break

    visited.add(cord)

    # Bounding box of the sub-image inside the satellite half.
    left = x
    top = y
    right = left + img_size
    bottom = top + img_size

    # Open the GAN image and crop the same window out of both halves. The
    # formation half starts input_size pixels to the right of the satellite
    # half. The file is closed again as soon as both crops exist.
    with Image.open(src_path + tt_set[src] + "-gan.png") as gan_img:
        # This image is the Satellite image
        lft_img = gan_img.crop((left, top, right, bottom))

        # This image is the Formation image
        rht_img = gan_img.crop((left + input_size,
                                top,
                                right + input_size,
                                bottom))

    # Next come the special cases.

    # Run the Formation image through cv2's Canny to get edge detection.
    # The crop is converted in memory to the 3-channel BGR array that
    # cv2.imread would return, so no temporary file is needed.
    if edge:
        bgr = cv2.cvtColor(np.array(rht_img.convert("RGB")),
                           cv2.COLOR_RGB2BGR)
        edges = cv2.Canny(bgr, 100, 200)

        rht_img = Image.fromarray(edges)

    # Run the Formation image through skimage to greyscale the image. This
    # step still goes through disk because skimage.io.imsave performs the
    # conversion of the scaled float array into an image file.
    if greyscale:
        rht_img.save(temp_rht_path)

        channels = skimage.io.imread(fname=temp_rht_path)
        # An edge map is already single-channel and is used as it is;
        # otherwise the channel at index 2 is kept.
        result = channels if channels.ndim == 2 else channels[:, :, 2]
        result = result * (1.0/255.0)
        skimage.io.imsave(arr=result, fname=temp_rht_g_path)

        # Read the pixels right away so the scratch file can be deleted.
        with Image.open(temp_rht_g_path) as grey_img:
            rht_img = grey_img.copy()

    # Join the satellite sub-image (left) and the formation sub-image (right)
    # into one picture.
    joint = Image.new("RGB", (lft_img.width + rht_img.width, lft_img.height))
    joint.paste(lft_img, (0, 0))
    joint.paste(rht_img, (lft_img.width, 0))

    # Choose at random whether the picture goes to the training set (0) or
    # the testing set (1).
    r = rand.randrange(0, 2)

    # If the chosen set has already reached its capacity, the picture goes
    # to the other set instead.
    if remaining[r] == 0:
        r = 1 - r

    remaining[r] -= 1

    # Save off the image into the correct path.
    joint.save(paths[r] + str(i) + "-" + tt_set[src] + "-img.png")

# Lastly, clean up the scratch files written by the greyscale step.
if greyscale:
    for temp_path in (temp_rht_path, temp_rht_g_path):
        if os.path.exists(temp_path):
            os.remove(temp_path)

## Constructing the validation set

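The next segment of code is almost exactly the same as the code used for constructing the training and testing sets. The main difference is that every image is saved to the single validation folder, so there is no need to choose where each image goes.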

In [10]:
# Crop origins, stored as (source index, x, y), that have already been used
# for the validation set, so that no sub-image is generated twice.
visited = set()

# Scratch files used by the greyscale step below.
temp_rht_path = src_path + "temp_rht_img.png"
temp_rht_g_path = src_path + "temp_rht_g_img.png"

for i in range(0, num_val_imgs):

    # Pick a held-out source image and the top-left corner of the sub-image,
    # drawing again until the combination has not been used before.
    while True:
        src = rand.randrange(0, len(v_set))
        x = rand.randrange(0, input_size - img_size)
        y = rand.randrange(0, input_size - img_size)

        cord = (src, x, y)
        if cord not in visited:
            break

    visited.add(cord)

    # Bounding box of the sub-image inside the satellite half.
    left = x
    top = y
    right = left + img_size
    bottom = top + img_size

    # Crop the same window out of the satellite half (left) and the
    # formation half (right, input_size pixels further along). The file is
    # closed again as soon as both crops exist.
    with Image.open(src_path + v_set[src] + "-gan.png") as gan_img:
        lft_img = gan_img.crop((left, top, right, bottom))

        rht_img = gan_img.crop((left + input_size,
                                top,
                                right + input_size,
                                bottom))

    # Edge detection on the formation image. The crop is converted in memory
    # to the 3-channel BGR array that cv2.imread would return, so no
    # temporary file is needed.
    if edge:
        bgr = cv2.cvtColor(np.array(rht_img.convert("RGB")),
                           cv2.COLOR_RGB2BGR)
        edges = cv2.Canny(bgr, 100, 200)
        rht_img = Image.fromarray(edges)

    # Greyscale the formation image. This step still goes through disk
    # because skimage.io.imsave performs the conversion of the scaled float
    # array into an image file.
    if greyscale:
        rht_img.save(temp_rht_path)

        channels = skimage.io.imread(fname=temp_rht_path)
        # An edge map is already single-channel and is used as it is;
        # otherwise the channel at index 2 is kept.
        result = channels if channels.ndim == 2 else channels[:, :, 2]
        result = result * (1.0/255.0)
        skimage.io.imsave(arr=result, fname=temp_rht_g_path)

        # Read the pixels right away so the scratch file can be deleted.
        with Image.open(temp_rht_g_path) as grey_img:
            rht_img = grey_img.copy()

    # Join the satellite sub-image (left) and the formation sub-image (right)
    # into one picture.
    joint = Image.new("RGB", (lft_img.width + rht_img.width, lft_img.height))
    joint.paste(lft_img, (0, 0))
    joint.paste(rht_img, (lft_img.width, 0))

    # There is only one validation set, so there is no need to choose where
    # the picture is saved.
    joint.save(paths[2] + str(i) + "-" + v_set[src] + "-img.png")

# Clean up the scratch files written by the greyscale step.
if greyscale:
    for temp_path in (temp_rht_path, temp_rht_g_path):
        if os.path.exists(temp_path):
            os.remove(temp_path)