## Imports

In [1]:
import numpy as np
import pandas as pd
import math
import time
from PIL import Image
# from patchify import patchify
from IPython.display import SVG
import matplotlib.pyplot as plt
%matplotlib inline
import os, re, sys, random, shutil #, cv2
from google.cloud import storage

import tensorflow as tf 
from tensorflow import keras

2023-06-12 14:24:11.866415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-12 14:24:16.348511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-12 14:24:16.349452: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

## Key params

In [4]:
# Notebook-wide parameters.
DATA_SIZE=15000
# Number of masks stored in each downloaded land-use array file.
LAND_USE_ARRAY_SIZE=250
TRIAL_SIZE=1000     # after running it once, change to a larger number (up to 15000)
LOAD_CHUNK_SIZE=TRIAL_SIZE
DATA_RUN = 'local_VM' # select one of these options: 'gcp', 'local_VM', 'local'

# Paths used on the GCP Workbench VM when loading data from the VM's local disk;
# change them to whatever they have to be in your environment.
VM_dataset_folder = "/home/jupyter/GlobalGreening"  # dataset root folder; needs to be changed per machine
# Sub-folder of the dataset root that holds the .png images.
VM_images_dir = "zoomed_photos"
# Sub-folder of the dataset root that holds the downloaded, chunked mask arrays.
VM_masks_download_dir = 'ESA_worldcover'
# Sub-folder of the dataset root that receives one .npy file per mask.
VM_masks_upload_dir = 'masks'
VM_output_folder = "/home/jupyter/GlobalGreening/training_outputs" # needs to be changed per machine

## Loading

In [3]:
# Show the current working directory of the notebook kernel.
!pwd

/home/jupyter/GlobalGreening


### Functions to list image and mask files

In [64]:
# functions used to load images

def list_image_filenames(dataset_root_folder, images_dir):
    """Return the sorted names of every ``.png`` file in the images folder.

    Parameters
    ----------
    dataset_root_folder : str
        Root folder of the dataset.
    images_dir : str
        Name of the sub-folder (inside the root folder) that is scanned.

    Returns
    -------
    numpy.ndarray
        File names only (no directory part), ordered by ``np.sort``.
    """
    folder = f'{dataset_root_folder}/{images_dir}'
    print(f'loading file names from {folder}')

    # Keep only the PNG entries; everything else in the folder is ignored.
    png_names = [entry for entry in os.listdir(folder) if entry.endswith('.png')]

    return np.sort(png_names)

def list_mask_filenames(dataset_root_folder, masks_dir):
    """Return the sorted names of every ``.npy`` file in the masks folder.

    Parameters
    ----------
    dataset_root_folder : str
        Root folder of the dataset.
    masks_dir : str
        Name of the sub-folder (inside the root folder) that is scanned.

    Returns
    -------
    numpy.ndarray
        File names only (no directory part), ordered by ``np.sort``.
    """
    folder = f'{dataset_root_folder}/{masks_dir}'
    print(f'loading file names from {folder}')

    # Discard every directory entry that is not a NumPy array file.
    npy_names = list(filter(lambda entry: entry.endswith('.npy'), os.listdir(folder)))

    return np.sort(npy_names)

def get_image_geo_locations(dataset_root_folder, images_dir):
    """Build a table of the two coordinates encoded in every image file name.

    File names are expected to look like ``image<N>_<first>_<second>.png``
    (for example ``image0_-109.0_37.0.png``). Row ``N`` of the result holds the
    coordinates of image number ``N``, so ``.iat[N, 0]`` and ``.iat[N, 1]`` can
    be used to rebuild the file name of image ``N``.

    Parameters
    ----------
    dataset_root_folder : str
        Root folder of the dataset.
    images_dir : str
        Name of the sub-folder (inside the root folder) holding the images.

    Returns
    -------
    pandas.DataFrame
        Two float columns. ``'latitude'`` holds the first coordinate of the
        file name with any ``-`` sign stripped; ``'longitude'`` holds the second
        coordinate unchanged. The frame has a default integer index with at
        least one row per file and always enough rows to reach the highest
        image number. Image numbers for which no file exists keep 0.0 in both
        columns. An empty folder yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If the first token of a file name contains no digit, or a coordinate
        token is not a number.
    IndexError
        If a file name has fewer than three ``_``-separated tokens.
    """
    # NOTE(review): in the sample file names the first coordinate (-109.0) looks
    # like a longitude and the second (37.0) like a latitude, so the column
    # labels may be swapped. They are kept as-is because callers read the frame
    # positionally -- confirm before relying on the labels.
    columns = ['latitude', 'longitude']

    image_files = list_image_filenames(dataset_root_folder, images_dir)

    # Remove the extension as a suffix. A character-set strip such as
    # rstrip('.png') would also eat trailing '.', 'p', 'n' or 'g' characters
    # that belong to the name itself.
    file_names = [os.path.splitext(str(name))[0].split('_') for name in image_files]

    print(f'sample images file names {image_files[:3]} and amount of images in the directory {len(image_files)}')
    print(f'sample split file names {file_names[:3]}, and check that same lenght of lists {len(file_names)}')

    if not file_names:
        return pd.DataFrame(np.zeros((0, 2)), columns=columns)

    # One pass over the files: image number -> (first coordinate, second coordinate).
    coordinates_by_number = {}
    for parts in file_names:
        image_number = int(''.join(ch for ch in parts[0] if ch.isdigit()))
        # The sign of the first coordinate is dropped on purpose: the code that
        # rebuilds file names from this table re-inserts a literal '-'.
        first_coordinate = float(parts[1].strip('-'))
        second_coordinate = float(parts[2])
        coordinates_by_number[image_number] = (first_coordinate, second_coordinate)

    # Size the table up front so that row position always equals image number,
    # even when numbers are missing or exceed the number of files.
    n_rows = max(len(file_names), max(coordinates_by_number) + 1)
    coordinates = np.zeros((n_rows, 2))
    for image_number, pair in coordinates_by_number.items():
        coordinates[image_number] = pair

    return pd.DataFrame(coordinates, columns=columns)

### Loading

In [59]:
# Parse the coordinates out of every image file name and display the resulting table.
image_geo_locations = get_image_geo_locations(VM_dataset_folder, VM_images_dir)
image_geo_locations

loading file names from /home/jupyter/GlobalGreening/zoomed_photos
sample images file names ['image0_-109.0_37.0.png' 'image10000_-104.47_38.54.png'
 'image10001_-104.47_38.58.png'] and amount of images in the directory 15415
sample split file names [list(['image0', '-109.0', '37.0'])
 list(['image10000', '-104.47', '38.54'])
 list(['image10001', '-104.47', '38.58'])], and check that same lenght of lists 15415


Unnamed: 0,latitude,longitude
0,109.0,37.0
1,109.0,37.04
2,109.0,37.09
3,109.0,37.13
4,109.0,37.17
...,...,...
15411,102.04,40.8
15412,102.04,40.84
15413,102.04,40.89
15414,102.04,40.93


In [56]:
# Number of distinct index labels in the coordinates table.
image_geo_locations.index.nunique()

15416

In [58]:
# Number of rows in the coordinates table.
# NOTE(review): the recorded value (15416) is one more than the 15415 image files
# reported when the table was built -- presumably one image number has no file; confirm.
len(image_geo_locations)

15416

In [63]:
# List the downloaded, chunked mask arrays (one .npy file per chunk of masks).
# Uses VM_masks_download_dir and download_mask_files: the previously referenced
# names VM_masks_dir and mask_files are not defined anywhere in this notebook.
download_mask_files = list_mask_filenames(VM_dataset_folder, VM_masks_download_dir)
len(download_mask_files)

loading file names from /home/jupyter/GlobalGreening/ESA_worldcover


62

In [66]:
# Sorted names of all .png files in the images folder.
image_files = list_image_filenames(VM_dataset_folder, VM_images_dir)

loading file names from /home/jupyter/GlobalGreening/zoomed_photos


array(['image0_-109.0_37.0.png', 'image10000_-104.47_38.54.png',
       'image10001_-104.47_38.58.png'], dtype='<U28')

In [70]:
# Number of image files found.
len(image_files)

15415

In [69]:
# Print every image file name that contains the substring "image154".
# This is a substring match, so names such as image154_ and image1540_ are printed as well.
for image in image_files:
    if "image154" in image:
        print(image)

image15400_-102.04_40.33.png
image15401_-102.04_40.37.png
image15402_-102.04_40.42.png
image15403_-102.04_40.46.png
image15404_-102.04_40.5.png
image15405_-102.04_40.54.png
image15406_-102.04_40.59.png
image15407_-102.04_40.63.png
image15408_-102.04_40.67.png
image15409_-102.04_40.71.png
image1540_-108.32_38.54.png
image15410_-102.04_40.76.png
image15411_-102.04_40.8.png
image15412_-102.04_40.84.png
image15413_-102.04_40.89.png
image15414_-102.04_40.93.png
image15415_-102.04_40.97.png
image1541_-108.32_38.58.png
image1542_-108.32_38.62.png
image1543_-108.32_38.67.png
image1544_-108.32_38.71.png
image1545_-108.32_38.75.png
image1546_-108.32_38.79.png
image1547_-108.32_38.84.png
image1548_-108.32_38.88.png
image1549_-108.32_38.92.png
image154_-108.96_39.56.png


In [51]:
# Build one per-mask output file name for every position i, using the
# coordinates stored in row i of image_geo_locations.
mask_upload_file_names = []

for i in range(len(image_files)):
    # The literal '-' before the first coordinate restores the sign that
    # get_image_geo_locations strips when it fills the first column.
    mask_file =f'mask{i}_-{image_geo_locations.iat[i,0]}_{image_geo_locations.iat[i,1]}.npy'
    mask_upload_file_names.append(mask_file)

# Show how many names were generated and the last ten of them.
len(mask_upload_file_names), mask_upload_file_names[-10:]

(15416,
 ['mask15406_-102.04_40.59.npy',
  'mask15407_-102.04_40.63.npy',
  'mask15408_-102.04_40.67.npy',
  'mask15409_-102.04_40.71.npy',
  'mask15410_-102.04_40.76.npy',
  'mask15411_-102.04_40.8.npy',
  'mask15412_-102.04_40.84.npy',
  'mask15413_-102.04_40.89.npy',
  'mask15414_-102.04_40.93.npy',
  'mask15415_-102.04_40.97.npy'])

### Splitting the chunked mask arrays into one file per mask

In [39]:
# Number of downloaded mask array files.
total_mask_files = len(download_mask_files)
total_mask_files

62

In [40]:
# Peek at the first downloaded mask array file name.
download_mask_files[:1]

array(['land_use_data_from_0_to_249.npy'], dtype='<U37')

In [41]:
# Load the first downloaded mask array file into masks_dataset.
# The [:1] slice limits the loop to a single file.

for mask_file in download_mask_files[:1]:
    print(f'loading the masks from these file {mask_file}')
    path_mask = f'{VM_dataset_folder}/{VM_masks_download_dir}/{mask_file}'    
    
    masks_dataset = np.load(path_mask)
    print(f'loading array {mask_file} into mask dataset with shape {masks_dataset.shape}')

loading the masks from these file land_use_data_from_0_to_249.npy
loading array land_use_data_from_0_to_249.npy into mask dataset with shape (250, 512, 512, 3)


In [19]:
# Inspect the type, length and shape of the loaded mask array.
type(masks_dataset), len(masks_dataset), masks_dataset.shape

(numpy.ndarray, 250, (250, 512, 512, 3))

In [97]:
# Test the index-range arithmetic for a single chunk (here chunk i=61).
i=61
# First mask index covered by chunk i.
from_masks_array_file = i*LAND_USE_ARRAY_SIZE
# Last mask index covered by chunk i, capped at the hard-coded value 15415.
to_masks_array_file = min((i+1)*LAND_USE_ARRAY_SIZE-1, 15415)
# Number of masks in the chunk (both bounds are inclusive, hence the +1).
range_mask_uploading = to_masks_array_file - from_masks_array_file + 1


print(f'range is {from_masks_array_file}, {to_masks_array_file}')
print(f'number of files is {range_mask_uploading}') 

range is 15250, 15415
number of files is 166


In [85]:
## Dry run: print the per-mask file names and target paths for the first ten
## masks starting at from_masks_array_file, without writing anything.

for j in range(from_masks_array_file,from_masks_array_file+10):
        # The literal '-' restores the sign stripped from the first coordinate
        # by get_image_geo_locations.
        mask_upload_file =f'mask{j}_-{image_geo_locations.iat[j,0]}_{image_geo_locations.iat[j,1]}.npy'
        print(mask_upload_file)
        path_upload_mask = f'{VM_dataset_folder}/{VM_masks_upload_dir}/{mask_upload_file}'
        print(path_upload_mask)
        # NOTE(review): if the save below is re-enabled, j is a dataset-wide index while
        # masks_dataset holds a single chunk, so the index would need the chunk offset removed.
        #np.save(path_upload_mask, masks_dataset[j])

mask500_-108.79_38.28.npy
/home/jupyter/GlobalGreening/masks/mask500_-108.79_38.28.npy
mask501_-108.79_38.32.npy
/home/jupyter/GlobalGreening/masks/mask501_-108.79_38.32.npy
mask502_-108.79_38.37.npy
/home/jupyter/GlobalGreening/masks/mask502_-108.79_38.37.npy
mask503_-108.79_38.41.npy
/home/jupyter/GlobalGreening/masks/mask503_-108.79_38.41.npy
mask504_-108.79_38.45.npy
/home/jupyter/GlobalGreening/masks/mask504_-108.79_38.45.npy
mask505_-108.79_38.49.npy
/home/jupyter/GlobalGreening/masks/mask505_-108.79_38.49.npy
mask506_-108.79_38.54.npy
/home/jupyter/GlobalGreening/masks/mask506_-108.79_38.54.npy
mask507_-108.79_38.58.npy
/home/jupyter/GlobalGreening/masks/mask507_-108.79_38.58.npy
mask508_-108.79_38.62.npy
/home/jupyter/GlobalGreening/masks/mask508_-108.79_38.62.npy
mask509_-108.79_38.67.npy
/home/jupyter/GlobalGreening/masks/mask509_-108.79_38.67.npy


In [104]:
### Load each downloaded mask array file and save every mask it contains
### as its own .npy file in the upload folder.

# Same value as assigned in the parameters cell; repeated here.
LAND_USE_ARRAY_SIZE=250

# NOTE(review): the loop starts at chunk 39 rather than 0 -- presumably resuming an
# earlier partial run; a fresh kernel would need range(0, 62) to write every mask.
for i in range(39,62): # i indexes the chunk files (e.g. 0-249, 250-499, ..., 15250-15415)
    # Inclusive index range covered by chunk i; the last chunk is capped at the hard-coded 15415.
    from_masks_array_file = i*LAND_USE_ARRAY_SIZE
    to_masks_array_file = min((i+1)*LAND_USE_ARRAY_SIZE-1 , 15415)

    mask_download_file = f'land_use_data_from_{from_masks_array_file}_to_{to_masks_array_file}.npy'

    path_download_mask = f'{VM_dataset_folder}/{VM_masks_download_dir}/{mask_download_file}'

    print(f'loading the masks from these file {path_download_mask}')

    masks_dataset = np.load(path_download_mask)

    # Number of masks in this chunk (both bounds inclusive).
    range_mask_uploading = to_masks_array_file - from_masks_array_file + 1

    for j in range(range_mask_uploading):
#        print(f'range is {from_masks_array_file} - {to_masks_array_file}')
        # Dataset-wide index of the j-th mask of chunk i.
        mask_file_loc = (i*LAND_USE_ARRAY_SIZE) + j
        # The literal '-' restores the sign stripped from the first coordinate
        # by get_image_geo_locations.
        mask_upload_file =f'mask{mask_file_loc}_-{image_geo_locations.iat[mask_file_loc,0]}_{image_geo_locations.iat[mask_file_loc,1]}.npy'
        path_upload_mask = f'{VM_dataset_folder}/{VM_masks_upload_dir}/{mask_upload_file}'
        # masks_dataset is indexed with the chunk-local position j, not the dataset-wide index.
        np.save(path_upload_mask, masks_dataset[j])

    print(f'finished another batch {path_download_mask}')


loading the masks from these file /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_9750_to_9999.npy
finished another batch /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_9750_to_9999.npy
loading the masks from these file /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10000_to_10249.npy
finished another batch /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10000_to_10249.npy
loading the masks from these file /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10250_to_10499.npy
finished another batch /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10250_to_10499.npy
loading the masks from these file /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10500_to_10749.npy
finished another batch /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10500_to_10749.npy
loading the masks from these file /home/jupyter/GlobalGreening/ESA_worldcover/land_use_data_from_10750_to_10999.npy
fini

In [105]:
# Check the result: count the per-mask .npy files now present in the upload folder.
total_uploaded_mask_files = list_mask_filenames(VM_dataset_folder, VM_masks_upload_dir)
len(total_uploaded_mask_files)

loading file names from /home/jupyter/GlobalGreening/masks


15416

In [102]:
# Count the per-mask .npy files present in the upload folder.
# NOTE(review): the execution count (102) precedes the cell above (105), so the
# recorded value comes from an earlier, partial state of the folder.
upload_mask_files = list_mask_filenames(VM_dataset_folder,VM_masks_upload_dir)
len(upload_mask_files)

loading file names from /home/jupyter/GlobalGreening/masks


250

In [33]:
# NOTE(review): the recorded output below ("load range for ...") is not produced by
# these two assignments -- the cell appears to have been edited after it last ran.
start_range = 0
stop_range = 1

load range for images is 1000
load range for masks is 4


### Functions not used

In [71]:
def load_masks(dataset_root_folder, masks_dir, load_range_masks, land_use_array_size, last_index=None):
    """Load consecutive chunked mask arrays and stack them along the first axis.

    Chunk ``i`` is read from the file
    ``land_use_data_from_<i*size>_to_<(i+1)*size-1>.npy`` inside
    ``<dataset_root_folder>/<masks_dir>``.

    Parameters
    ----------
    dataset_root_folder : str
        Root folder of the dataset.
    masks_dir : str
        Name of the sub-folder (inside the root folder) holding the chunk files.
    load_range_masks : int
        Number of chunk files to load, starting with chunk 0. Must be >= 1.
    land_use_array_size : int
        Number of masks per chunk file; used to build the file names.
    last_index : int, optional
        Highest mask index in the dataset. When given, the upper bound in a
        chunk's file name is capped at this value, which is how a shorter final
        chunk is named (e.g. ``..._from_15250_to_15415.npy``). ``None`` (the
        default) applies no cap.

    Returns
    -------
    numpy.ndarray
        The single chunk as loaded when only one is requested, otherwise
        ``np.vstack`` of all loaded chunks.

    Raises
    ------
    ValueError
        If ``load_range_masks`` is smaller than 1.
    """
    if load_range_masks < 1:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(f'load_range_masks must be at least 1, got {load_range_masks}')

    chunks = []
    for i in range(load_range_masks):
        first_mask = i * land_use_array_size
        last_mask = (i + 1) * land_use_array_size - 1
        if last_index is not None:
            last_mask = min(last_mask, last_index)

        mask_file = f'land_use_data_from_{first_mask}_to_{last_mask}.npy'
        path_mask = f'{dataset_root_folder}/{masks_dir}/{mask_file}'
        chunks.append(np.load(path_mask))

    # A lone chunk is returned untouched so that its shape is never altered.
    if len(chunks) == 1:
        return chunks[0]

    # Stack once at the end: stacking inside the loop re-copies everything
    # loaded so far on every iteration.
    return np.vstack(chunks)

In [None]:
def load_images(dataset_root_folder, images_dir, load_range_images):
    """Load the first ``load_range_images`` images, in image-number order, as RGB arrays.

    The file name of image ``i`` is rebuilt from row ``i`` of the table returned
    by ``get_image_geo_locations`` rather than taken from a directory listing,
    so the images come back ordered by number instead of by file name.

    Parameters
    ----------
    dataset_root_folder : str
        Root folder of the dataset.
    images_dir : str
        Name of the sub-folder (inside the root folder) holding the images.
    load_range_images : int
        Number of images to load, starting with image 0.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-image RGB arrays, in image-number order.
    """
    image_geo_locations = get_image_geo_locations(dataset_root_folder, images_dir)

    images_dataset = []

    for i in range(load_range_images):
        # The literal '-' restores the sign that get_image_geo_locations strips
        # from the first coordinate.
        image_file = f'image{i}_-{image_geo_locations.iat[i,0]}_{image_geo_locations.iat[i,1]}.png'
        path_image = f'{dataset_root_folder}/{images_dir}/{image_file}'

        # The context manager closes the file handle once the pixels are copied out.
        with Image.open(path_image) as image:
            # Normalise every non-RGB mode (RGBA, grayscale, palette, ...) to
            # three channels. Checking the mode also works for 2-D images, which
            # have no third axis to inspect.
            rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
            images_dataset.append(np.asarray(rgb_image))

    images_dataset = np.array(images_dataset)

    return images_dataset


