In [None]:
import numpy as np
import rasterio
from tqdm import tqdm
import math
from pathlib import Path
from rasterio.plot import show
import matplotlib.pyplot as plt
from os import walk
import shutil
import random
from random import randint
import matplotlib as mpl

# Optional: set the default dpi (dots per inch) of the plots.
# This is a global matplotlib setting, so it applies to every figure drawn after this cell runs.
mpl.rcParams['figure.dpi'] = 300

In [None]:
# Show a tif image. In this case we only show the second band
# (the tuple passed to `show` is (dataset, band index)).
# NOTE(review): the dataset handle `img` is never closed here; it is kept open
# because the following cell reads from it.
fp = "../images/australia_601.tif"
img = rasterio.open(fp)
show((img, 2))

In [None]:
# Read a single band and flatten it to a 1D array.
# rasterio band indexes are 1-based, so band 5 is the layer found at index 4 of
# the full band stack. Asking for it directly avoids loading every other band.
array = img.read(5).flatten()
print(array.shape)

In [None]:
"""
Problems with the data:

- The tif files contain many NaN values, caused by clouds and by satellite data
that is not always clear. We need to clean them before we can use the data.

- The images are not 200x200. We need to crop each one to 200x200 so it fits the model.

- The data is not normalised. We need to normalise it before we can use it.

"""

def count_nan(image):
    """
    Count the NaN values found in each layer of the image.

    image: array whose first axis indexes the layers.

    Returns a float32 array holding one NaN count per layer.
    """
    nan_per_layer = [np.isnan(layer).sum() for layer in image]
    return np.array(nan_per_layer, dtype="float32")
    
def show_uniques(image):
    """
    Return the distinct values of the image and how many times each one occurs.

    The result is a (values, counts) pair; the values are sorted ascending.
    """
    values_and_counts = np.unique(image, return_counts=True)
    return values_and_counts[0], values_and_counts[1]

def split_image(image, img_size):
    """
    Crop the central img_size x img_size window out of every layer of the image.

    image: array of shape (layers, rows, cols).
    img_size: side length, in pixels, of the square crop.

    Returns a view of shape (layers, img_size, img_size). If an axis of the
    image is shorter than img_size, that axis is returned whole (and therefore
    shorter than img_size) instead of a wrapped-around sliver.
    """
    img_size = int(img_size)
    half_img = img_size // 2

    windows = []
    for axis_length in image.shape[1:3]:
        # Clamp at 0: a negative start would be interpreted as "count from the
        # end" and silently select the wrong pixels.
        start = max(axis_length // 2 - half_img, 0)
        # Use start + img_size (not centre + half) so odd sizes are not truncated.
        windows.append(slice(start, start + img_size))

    return image[:, windows[0], windows[1]]

def get_label(image):
    """
    Get the truth label from the tif image. It is the first layer of the tif image.

    We format the label to be 0 for no fire and 1 for fire (any value >= 1).
    We pack the rest of the features into a single array.

    image: array whose first layer is the raw label. It is not modified.

    Returns (features, label): the image without its first layer, and the
    binary label stored with the same dtype as the image.
    """
    # Build the label as a new array so the caller's image keeps its raw first layer.
    # NaN compares False against >= 1, so NaN pixels are labelled 0.
    label = (image[0] >= 1).astype(image.dtype)

    # Remove the first layer
    array_img = np.delete(image, 0, 0)

    return array_img, label

def get_label_two_classes(image, fire_threshold=400, min_fire_pixels=15):
    """
    Get a single image-level truth label from the first layer of the tif image.

    The label is 1 when more than `min_fire_pixels` pixels of the first layer
    have a value strictly greater than `fire_threshold`, and 0 otherwise.

    image: array whose first layer is the raw label.
    fire_threshold: a pixel counts as fire when its value exceeds this.
    min_fire_pixels: the image is labelled 1 when the fire pixel count exceeds this.

    Returns (features, label): the image without its first layer, and the
    integer label (0 or 1).
    """
    # NaN compares False against the threshold, so NaN pixels are never counted.
    fire_pixels = np.count_nonzero(image[0] > fire_threshold)
    label = 1 if fire_pixels > min_fire_pixels else 0

    # Remove the first layer
    array_img = np.delete(image, 0, 0)
    return array_img, label

def convert_landcover(image):
    """
    Convert the landcover feature (the last layer of the image) into 0s and 1s.

    Codes 1-10 and 14 become 1 and codes 11, 12, 13, 15, 16 and 17 become 0;
    this is mostly based on whether the land can be flammable or not. Any
    other value is left untouched.

    The last layer is modified in place and the same image object is returned.
    """
    landcover = image[-1]

    flammable_codes = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 14.]
    non_flammable_codes = [11., 12., 13., 15., 16., 17.]

    # Compute both masks before writing, so they refer to the original codes.
    is_flammable = np.isin(landcover, flammable_codes)
    is_non_flammable = np.isin(landcover, non_flammable_codes)

    landcover[is_flammable] = 1.
    landcover[is_non_flammable] = 0.
    return image

def remove_nan_zeros(image, idx, value=0):
    """
    Replace the NaN values of one layer of the image with a constant.

    image: array whose first axis indexes the layers; it is modified in place.
    idx: index of the layer to clean.
    value: the number written in place of every NaN (0 by default).

    Returns the same image object.
    """
    # `nan` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so a positional value would never be used as the fill.
    image[idx] = np.nan_to_num(image[idx], nan=value)
    return image

def remove_nan_mean(image, idx):
    """
    Replace the NaN values of one layer with the mean of its non-NaN values.

    When the layer has no valid value at all (the mean is itself NaN), 0 is
    used as the fill value instead.

    The layer is overwritten in place and the same image object is returned.
    """
    layer = image[idx]
    layer_mean = np.nanmean(layer)
    fill_value = 0 if math.isnan(layer_mean) else layer_mean
    image[idx] = np.nan_to_num(layer, nan=fill_value)
    return image

def clean_features(image):
    """
    Clean the NaN values out of every layer of the image.

    Layer layout, as handled here:
    - layer 0 (the fires) and layer 1 (the dem) go through remove_nan_zeros;
    - the layers in between go through remove_nan_mean;
    - the second-to-last layer (the history of fires) goes through remove_nan_zeros;
    - the last layer goes through remove_nan_zeros with value=17.

    Every layer is then passed to check_fornan, which reports any NaN left over.
    The image is modified in place and returned.
    """
    n_layers = image.shape[0]
    history_idx = n_layers - 2
    last_idx = n_layers - 1

    # clean the fires, then the dem
    for zero_filled_idx in (0, 1):
        image = remove_nan_zeros(image, zero_filled_idx)

    # clean every layer between the dem and the history of fires
    for layer_idx in range(2, history_idx):
        image = remove_nan_mean(image, layer_idx)

    # clean the history of fires
    image = remove_nan_zeros(image, history_idx)

    # clean last feature (the helper works in place, so the result is not reassigned)
    remove_nan_zeros(image, last_idx, value=17)

    # verify that no layer still holds a NaN
    for layer in image:
        check_fornan(layer)

    return image

def check_fornan(image):
    """
    Print "with null" when the given array contains at least one NaN.

    Nothing is returned; the function only reports.
    """
    if np.isnan(np.asarray(image)).any():
        print("with null")
        
def normalise(image):
    """
    Min-max normalise the array so that its values span the range [0, 1].

    image: numpy array of any shape.

    Returns a new array computed as (image - min) / (max - min). When every
    value is identical (max == min) the result is all zeros rather than the
    NaNs that a 0/0 division would produce.
    """
    img_min = np.min(image)
    value_range = np.max(image) - img_min
    if value_range == 0:
        # Constant input: the numerator is 0 everywhere, so dividing by 1
        # yields zeros while keeping the usual output dtype.
        value_range = 1
    return (image - img_min) / value_range

def normalise_features(dataset, features):
    """
    Normalise the first `features` feature channels of the dataset in place.

    dataset: 4D array indexed as [sample, feature, row, col].
    features: number of leading feature channels to normalise.

    Each channel is normalised on its own, across all samples at once.
    Nothing is returned.
    """
    for channel in range(features):
        scaled_channel = normalise(dataset[:, channel, :, :])
        dataset[:, channel, :, :] = scaled_channel
        
def check_forones(label):
    """
    Tell whether the label sums to something other than zero.

    Returns False when the sum of the label is 0, True otherwise.
    """
    return bool(np.sum(label) != 0)
    
def cut_array_to(arr, des_length):
    """
    Cut the array to the desired length along its first axis.

    arr: numpy array.
    des_length: number of leading entries to keep.

    Returns a new array holding the first `des_length` entries of `arr`.
    If `des_length` is larger than the array, a copy of the whole array is
    returned; a negative `des_length` is treated as 0.
    """
    keep = max(des_length, 0)
    # Copy so the result does not share memory with (and keep alive) `arr`.
    return arr[:keep].copy()

def check_size(img):
    """
    Print 'image is smaller' when either spatial axis (axis 1 or axis 2) of the
    image is shorter than 200.

    Nothing is returned; the function only reports.
    """
    n_rows, n_cols = img.shape[1], img.shape[2]
    if min(n_rows, n_cols) < 200:
        print('image is smaller')

        
def get_tif_in_path(path):
    """
    List the tif files located directly inside `path`.

    Only the top level of the directory is inspected; sub-directories are not
    searched. A file is kept when its name ends with '.tif' or '.tiff'
    (case-insensitive), so sidecar files such as 'x.tif.aux.xml' are ignored.

    path: directory to inspect.

    Returns a list of file names (not full paths). The list is empty when the
    directory does not exist or holds no tif file.
    """
    # The first item yielded by walk describes `path` itself; the default
    # covers the case where walk yields nothing at all.
    _, _, filenames = next(walk(path), (path, [], []))

    return [name for name in filenames if name.lower().endswith(('.tif', '.tiff'))]

def split_and_save_datasets(train_size=900, val_size=150, path='', img_features=20):
    """
    Method to split the dataset into training and validation datasets.

    Understand that this method is not the most efficient, but it is a good starting point.

    The tif files are read from the 'dataset' (training) and 'dataset_val'
    (validation) sub-folders of the base path. Every image is cropped, cleaned
    and separated from its label; images whose label contains no fire are
    skipped. The resulting arrays are written as compressed numpy archives
    inside those same two sub-folders.

    train_size: the maximum size of the training dataset
    val_size: the maximum size of the validation dataset
    path: the path where the dataset is located
    img_features: the amount of features in the dataset
    """

    # We set the desired image size
    IMG_SIZE = int(200)
    base_path = 'your-path/' + path

    # Sub-folder of each split -> maximum number of images taken from it
    split_sizes = {'dataset': train_size, 'dataset_val': val_size}
    total_size = train_size + val_size

    # Create the numpy arrays to store the data
    arr_imgs = np.empty([total_size, img_features, IMG_SIZE, IMG_SIZE], dtype="float32")
    arr_label = np.empty([total_size, IMG_SIZE, IMG_SIZE], dtype="float32")

    # NOTE(review): the NaN counts are collected per image but never saved or returned.
    count_nans = np.empty([total_size, img_features + 1], dtype="float32")

    idx = 0
    # Index right after the last image stored for each split
    split_end = {}

    for datas_key, wanted in split_sizes.items():
        print('Processing ' + datas_key)
        subpath = base_path + '/' + datas_key

        # The limit is relative to where this split starts, so a training folder
        # with too few usable images cannot push validation images into the
        # training slots.
        split_limit = idx + wanted

        # gather the files
        dataset_files = get_tif_in_path(subpath)

        # count the countries
        count_countries = {'africa':0, 'australia':0, 'eurasia':0, 'south_america':0, 'us':0, 'europe':0, 'asia':0}

        for file in tqdm(dataset_files):
            if idx == split_limit:
                break

            file_path = subpath + '/' + file

            # read the image as an array; the context manager closes the file
            with rasterio.open(file_path) as tmp_img:
                array = tmp_img.read()

            split_img = split_image(array, IMG_SIZE)

            # count the amount of nans
            count_nans[idx] = count_nan(split_img)

            # remove nan from features
            cleaned_img = clean_features(split_img)
            cleaned_img, label = get_label(cleaned_img)

            if cleaned_img.shape[0] < 16:
                continue

            # if there are not ones, skip
            if not check_forones(label):
                continue

            # count the instances (the first matching key wins)
            for key in count_countries:
                if key in file:
                    count_countries[key] = count_countries[key] + 1
                    break

            arr_imgs[idx,] = cleaned_img
            arr_label[idx] = label
            idx += 1

        split_end[datas_key] = idx
        print(datas_key, count_countries)

    # Boundary between the training and the validation images actually stored
    train_end = split_end['dataset']

    # cut the arrays to the images that were actually stored
    arr_imgs = cut_array_to(arr_imgs, idx)
    arr_label = cut_array_to(arr_label, idx)
    count_nans = cut_array_to(count_nans, idx)

    # np.savez_compressed appends '.npz' to a file name that lacks it, so the
    # files on disk end in '.npy.npz'; each array is stored under the key 'arr_0'.
    print('Saving the training dataset ...')
    np.savez_compressed(base_path + '/dataset/Xdataset.npy', arr_imgs[0:train_end,])
    np.savez_compressed(base_path + '/dataset/Ydataset.npy', arr_label[0:train_end,])

    print('Saving the validation dataset ...')
    np.savez_compressed(base_path + '/dataset_val/Xdataset_val.npy', arr_imgs[train_end:,])
    np.savez_compressed(base_path + '/dataset_val/Ydataset_val.npy', arr_label[train_end:,])
    print('Saved')

In [None]:
# Build and save the training and validation arrays. train_size is left at its
# default (900). NOTE: with path='' the data is looked up under the placeholder
# base path 'your-path/' set inside the function; adjust it before running.
split_and_save_datasets(path='', val_size=150, img_features=20)