# Data Preparation

Loads the formatted data into training and testing matrices. This had to be done in a notebook; otherwise, my computer would crash.

In [None]:
import numpy as np
import time
from time import time
from torch.autograd import Variable
import os
import sys
sys.path.append('../')
import matplotlib.pyplot as plt
import rasterio as rs
from src.sample_tiles import *
%load_ext autoreload
%autoreload 2
%matplotlib inline

from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

import torch
from img2vec_pytorch import Img2Vec
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Step 1. Sample image names

For each biome, store the names of its images in an array.

### Organise structure for directory iteration

In [None]:
# Maps the position of the winning entry in a one-hot row (0, 1, 2) to the
# class code stored for it (1, 2, 3).
POS_to_CODE = dict(enumerate((1, 2, 3)))

In [None]:
def one_ugly_encoding(hot_arr):
    """Translate one-hot rows into class codes.

    For every row of ``hot_arr`` the index of its largest value is looked up
    in ``POS_to_CODE``.

    Parameters
    ----------
    hot_arr : array-like
        Sequence of rows; it is converted with ``np.array`` before iteration.

    Returns
    -------
    numpy.ndarray
        One code per row, in row order.

    Raises
    ------
    KeyError
        If the index of a row's largest value is not a key of ``POS_to_CODE``.
    """
    rows = np.array(hot_arr)
    return np.array([POS_to_CODE[np.argmax(row)] for row in rows])

In [None]:
def _min_max_normalize(array):
    """Linearly rescale ``array`` so its minimum maps to 0 and its maximum to 1.

    A constant array (maximum equal to minimum) is returned as all zeros
    instead of dividing zero by zero and producing NaNs.
    """
    lowest = array.min()
    span = array.max() - lowest
    # With a zero span every shifted value is already 0, so dividing by 1
    # keeps the zeros and avoids the 0/0 that would otherwise yield NaN.
    if span == 0:
        span = 1
    return (array - lowest) / span


def normalize_red(array):
    """Min-max normalise the red band to the range [0, 1].

    Parameters
    ----------
    array : numpy.ndarray
        Band values; must be non-empty.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with values in [0, 1]; all zeros when the
        input is constant.
    """
    return _min_max_normalize(array)


def normalize_green(array):
    """Min-max normalise the green band to the range [0, 1].

    Same contract as ``normalize_red``.
    """
    return _min_max_normalize(array)


def normalize_blue(array):
    """Min-max normalise the blue band to the range [0, 1].

    Same contract as ``normalize_red``.
    """
    return _min_max_normalize(array)

In [20]:
def load_Processed_Data(DIR):
    """Collate the per-image ``.npy`` arrays found in ``DIR`` into one array.

    Every file directly inside ``DIR`` whose name ends in ``.npy`` is
    considered, except the last entry of the listing (see the review note in
    the body). Each array must have shape ``(4, 51, 51)``; files that cannot
    be loaded, or that have a different shape, are reported on stdout,
    recorded as failures and skipped. Progress is printed every 10 files.

    Parameters
    ----------
    DIR : str
        Directory to scan (not recursive).

    Returns
    -------
    X_data : numpy.ndarray
        Array of shape ``(1 + n_loaded, 4, 51, 51)``. Row 0 is an all-zero
        placeholder, kept so the returned shape stays what existing callers
        receive; the loaded images follow in listing order.
    failed_images : list of str
        For every skipped file, its name up to the first ``.``.
    """
    sample_shape = (4, 51, 51)

    # Match on the file name only: testing the whole path for 'npy' would
    # also accept every file inside a directory whose name contains 'npy'.
    with os.scandir(DIR) as entries:
        image_PATHS = [f.path for f in entries if f.name.endswith('.npy')]

    # NOTE(review): the last listed file has always been left out. The reason
    # is not visible here, and os.scandir yields entries in arbitrary order,
    # so which file is dropped is not well defined. Preserved as-is; confirm.
    to_load = image_PATHS[:-1]
    total = len(to_load)

    # Collect (1, 4, 51, 51) pieces and join them once at the end. Calling
    # np.append per image re-copies everything loaded so far on each step.
    pieces = [np.zeros((1,) + sample_shape)]
    failed_images = []
    previous_progress = 0
    start = time()

    for idx, image_PATH in enumerate(to_load, start=1):
        file_name = os.path.basename(image_PATH)
        image = None

        try:
            image = np.load(image_PATH)
            if not isinstance(image, np.ndarray) or image.shape != sample_shape:
                raise ValueError(
                    "expected an array of shape {}, got {}".format(
                        sample_shape, getattr(image, 'shape', type(image))))
        except Exception as err:
            # Best-effort loader: one unreadable or malformed file must not
            # abort the whole run, so record it and move on.
            image_name = file_name.split('.')[0]
            failed_images.append(image_name)

            print("FAILED: {} \t {}".format(image_name, image_PATH))
            print("REASON: {}".format(err))
            # Only summarise the contents when the file was readable;
            # loading it a second time would just raise the same error.
            if isinstance(image, np.ndarray):
                print(np.sum(image))
            continue

        pieces.append(image[np.newaxis])

        if idx % 10 == 0:
            progress = (idx / total) * 100
            end = time()
            # Extrapolate from the time the latest batch of files took.
            time_remaining = ((end - start) / (progress - previous_progress)) * (100 - progress)

            print("Progress: {:.2f}% Cur Image: {} TIME REMAINING: {:.2f} seconds ".format(progress, file_name, time_remaining))
            previous_progress = progress
            start = time()

    return np.concatenate(pieces, axis=0), failed_images

In [21]:
# Collate every per-image array stored under src_DIR (a hard-coded,
# machine-specific Google Drive mount point).
src_DIR = '/Volumes/GoogleDrive-103278653964135897318/My Drive/TemporalData-Processed-Eval-SNDVI'
# NOTE(review): the second value returned by load_Processed_Data is its list
# of failed image names, not labels, even though it is bound to `y_data` here.
X_data, y_data = load_Processed_Data(src_DIR)
# Keep at most the first four entries of that list.
y_data = y_data[0:4]
print (X_data.shape)
print (len(y_data))

Progress: 0.36% Cur Image: 9.npy TIME REMAINING: 19.13 seconds 
Progress: 0.73% Cur Image: 19.npy TIME REMAINING: 14.12 seconds 
Progress: 1.09% Cur Image: 29.npy TIME REMAINING: 18.82 seconds 
Progress: 1.45% Cur Image: 39.npy TIME REMAINING: 23.13 seconds 
Progress: 1.82% Cur Image: 49.npy TIME REMAINING: 26.08 seconds 
Progress: 2.18% Cur Image: 59.npy TIME REMAINING: 24.07 seconds 
Progress: 2.54% Cur Image: 69.npy TIME REMAINING: 28.71 seconds 
Progress: 2.91% Cur Image: 79.npy TIME REMAINING: 27.03 seconds 
Progress: 3.27% Cur Image: 89.npy TIME REMAINING: 29.49 seconds 
Progress: 3.64% Cur Image: 99.npy TIME REMAINING: 37.06 seconds 
Progress: 4.00% Cur Image: 109.npy TIME REMAINING: 33.72 seconds 
Progress: 4.36% Cur Image: 119.npy TIME REMAINING: 37.42 seconds 
Progress: 4.73% Cur Image: 129.npy TIME REMAINING: 37.18 seconds 
Progress: 5.09% Cur Image: 139.npy TIME REMAINING: 43.32 seconds 
Progress: 5.45% Cur Image: 149.npy TIME REMAINING: 49.03 seconds 
Progress: 5.82% Cur I

In [22]:
# Persist the collated array as 'Collated.npy' inside src_DIR, next to the
# per-image files.
# NOTE(review): if X_data is the unmodified result of load_Processed_Data, its
# row 0 is the all-zero placeholder and is saved too. The saved file's name
# also contains 'npy', so a later load_Processed_Data(src_DIR) scan can pick it
# up as if it were an image - confirm both are acceptable.
np.save(os.path.join(src_DIR, 'Collated.npy'), X_data)

In [27]:
# Load previously collated arrays from disk. Only the evaluation set is loaded;
# the two training-set loads below are left commented out.
# NOTE(review): these paths start with '/Volumes/GoogleDrive/', which differs
# from the mount point used for src_DIR - confirm they point at the intended data.
#X_data = np.load('/Volumes/GoogleDrive/My Drive/TemporalData-Processed-Train/Collated.npy')
#y_data = np.load('/Volumes/GoogleDrive/My Drive/TemporalData-Processed-Train/labels.npy')
X_test = np.load('/Volumes/GoogleDrive/My Drive/TemporalData-Processed-Eval/Collated.npy')
y_test = np.load('/Volumes/GoogleDrive/My Drive/TemporalData-Processed-Eval/labels.npy')

In [18]:
# Hold out 20% of the samples for validation; random_state fixes the shuffle
# so the split is reproducible.
# NOTE(review): X_data[:-1] drops the LAST sample, presumably to make the
# number of samples match y_data. load_Processed_Data puts its all-zero
# placeholder at index 0, not at the end - confirm the rows and labels line up.
X_train, X_val, y_train, y_val = train_test_split(X_data[:-1], y_data, test_size=0.20, random_state=42)

Total Images for Temporal: 43,652

Total Images for Spatial: 35,340

Temporal:

Training: (6528, 4, 51, 51, 3), (6528, 3)

Validation: (1633, 4, 51, 51, 3), (1633, 3)

Testing: (2752, 4, 51, 51, 3), (2751, 3)

Spatial:



In [19]:
# Sanity check: print the shapes of the feature and label arrays of each split
# so mismatched sample counts are easy to spot.
print ("Training: {}, {}".format(X_train.shape, y_train.shape))
print ("Validation: {}, {}".format(X_val.shape, y_val.shape))
print ("Testing: {}, {}".format(X_test.shape, y_test.shape))

Training: (6528, 4, 51, 51, 3), (6528, 3)
Validation: (1633, 4, 51, 51, 3), (1633, 3)
Testing: (2752, 4, 51, 51, 3), (2751, 3)
