In [None]:
"""
Created on April 2024

Author: Fenia Psomouli
"""

In [None]:
#Import libraries
import os
import tempfile
import numpy as np
import rasterio
import pickle
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib import colors
import matplotlib.colors as clr
import pandas as pd

In [None]:
# Cloud authentication
from google.colab import auth
# Run Colab's user authentication before any cloud resources are accessed.
auth.authenticate_user()

In [None]:
# Mount Google Drive to Google Colab
from google.colab import drive
# Mount only when the mount point is not present yet, so re-running this cell
# does not trigger a second mount.
if not os.path.exists('/content/drive'):
  drive.mount('/content/drive', force_remount=True)

In [None]:
# Load the lists of paths to the normalized data for both draining and refreezing lakes.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary objects; only load
# files that come from a trusted source.
draining_paths = np.load('/content/drive/MyDrive/Thesis/Data/file_pathsD64_normalized.npy', allow_pickle=True)
refreezing_paths = np.load('/content/drive/MyDrive/Thesis/Data/file_pathsR64_normalized.npy', allow_pickle=True)
# Combine draining and refreezing paths into one plain Python list, draining first.
# Per the author's note: all_paths[:92] -> draining, all_paths[92:] -> refreezing (93 files).
all_paths = np.concatenate((draining_paths, refreezing_paths)).tolist()

In [None]:
# Prepare the Antarctic dataset for prediction: load the saved array of file paths.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
antarctica = np.load('/content/drive/MyDrive/Thesis/Data/file_pathsAntarctica_normalized.npy', allow_pickle=True)
print(len(antarctica))  # number of entries in the Antarctic path list

In [None]:
def _bytes_feature(value):
    """Serialize a tensor to bytes and wrap it in a one-element BytesList feature."""
    serialized = tf.io.serialize_tensor(value).numpy()
    bytes_list = tf.train.BytesList(value=[serialized])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a one-element FloatList feature."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def serialize_example(feature, lat, lon):
    """Build a serialized tf.train.Example holding a feature array and its coordinates.

    The feature array is converted to a float32 tensor and stored as serialized
    bytes under 'feature'; lat and lon are stored as float features under
    'lat' and 'lon'.
    """
    feature_tensor = tf.convert_to_tensor(feature, dtype=tf.float32)
    features = tf.train.Features(feature={
        'feature': _bytes_feature(feature_tensor),
        'lat': _float_feature(lat),
        'lon': _float_feature(lon),
    })
    return tf.train.Example(features=features).SerializeToString()

def read_tiff_file(file_path):
    """Read the feature bands of a raster file plus the coordinates of its first pixel.

    Args:
        file_path: Path to a raster readable by rasterio.

    Returns:
        A tuple (features, lat, lon):
        features is a float32 array laid out as height x width x channels, built
        from the first 30 bands of the file;
        lat is the y coordinate and lon the x coordinate of pixel (row 0, col 0),
        expressed in the raster's own CRS (degrees only if that CRS is geographic).
    """
    with rasterio.open(file_path) as src:
        data = src.read()  # Read all bands at once: (bands, height, width)
        features = np.transpose(data[:30, :, :], axes=(1, 2, 0))  # Transpose to height x width x channels
        # rasterio's xy(row, col) returns (x, y), i.e. (lon, lat) order — unpack
        # accordingly so that lat really holds the y value. With the default
        # offset this is the centre of the top-left pixel, not its outer corner.
        lon, lat = src.xy(0, 0)
        return features.astype(np.float32), lat, lon

def write_tfrecord(file_name, file_paths):
    """Write one serialized example per input file into a TFRecord file.

    Each path is read with read_tiff_file and the resulting
    (features, lat, lon) triple is serialized and appended to file_name.
    A summary line is printed once all files have been written.
    """
    with tf.io.TFRecordWriter(file_name) as record_writer:
        for tiff_path in file_paths:
            patch, lat, lon = read_tiff_file(tiff_path)
            record_writer.write(serialize_example(patch, lat, lon))
        print(f"Wrote {len(file_paths)} entries to {file_name}")

# Defining paths
# file_paths is a notebook-level name that a later cell reassigns; at this point
# it refers to the Antarctic path list.
file_paths = antarctica

# Write data to a TFRecord file
# NOTE(review): the target directory presumably has to exist on Drive already — confirm.
tfrecord_filename = '/content/drive/MyDrive/Results/Dataset_Antarctica/antarctica_dataset_with_coords.tfrecords'
write_tfrecord(tfrecord_filename, file_paths)

def _parse_function(proto):
    """Decode one serialized example into (feature, lat, lon).

    The 'feature' entry is parsed back into a float32 tensor whose static shape
    is set to [64, 64, 30]; 'lat' and 'lon' are returned as float32 scalars.
    """
    schema = {
        'feature': tf.io.FixedLenFeature([], tf.string),
        'lat': tf.io.FixedLenFeature([], tf.float32),
        'lon': tf.io.FixedLenFeature([], tf.float32)
    }
    record = tf.io.parse_single_example(proto, schema)
    patch = tf.io.parse_tensor(record['feature'], out_type=tf.float32)
    # parse_tensor loses the static shape, so pin it for downstream batching.
    patch.set_shape([64, 64, 30])
    return patch, record['lat'], record['lon']

def load_dataset(tfrecord_file, batch_size):
    """Open a TFRecord file, parse every record and group the results into batches."""
    return (
        tf.data.TFRecordDataset(tfrecord_file)
        .map(_parse_function)
        .batch(batch_size)
    )

# Number of examples per batch; a later cell passes this name to load_dataset.
batch_size = 8

Wrote 1607 entries to /content/drive/MyDrive/Results/Dataset_Antarctica/antarctica_dataset_with_coords.tfrecords


In [None]:
def get_file_paths(folder_path):
    """
    Walk folder_path recursively and return the paths of all files found,
    as a NumPy array.
    """
    collected = [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(folder_path)
        for name in names
    ]
    return np.array(collected)

def save_file_paths(file_paths, output_file):
    """
    Persist the given collection of file paths to output_file via np.save.
    """
    np.save(file=output_file, arr=file_paths)

def main(folder_path, output_file):
    """Collect every file path under folder_path, save the list to output_file and report it."""
    save_file_paths(get_file_paths(folder_path), output_file)
    print(f"File paths saved to {output_file}")


# Folder whose files are indexed, and the .npy file that receives the resulting path list.
folder_path = '/content/drive/MyDrive/Thesis/Data/test_within_shuffled_pixels_with_index'
output_file = '/content/drive/MyDrive/Thesis/Data/test_dataset_shuffled_pixels_index_final.npy'
main(folder_path, output_file)


File paths saved to /content/drive/MyDrive/Thesis/Data/test_dataset_shuffled_pixels_index_final.npy


In [None]:
#Prepare Greenland dataset for Training
#Define Helper Functions to encode TensorFlow compatible types into TFRecords.
def _bytes_feature(value):
    """Serialize a tensor to bytes and wrap it in a one-element BytesList feature."""
    payload = tf.io.serialize_tensor(value).numpy()
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[payload])
    )

def serialize_example(feature, label):
    """Build a serialized tf.train.Example holding a feature array and its label array.

    The feature is converted to a float32 tensor and the label to an int32
    tensor; both are stored as serialized bytes under 'feature' and 'label'.
    """
    feature_tensor = tf.convert_to_tensor(feature, dtype=tf.float32)
    label_tensor = tf.convert_to_tensor(label, dtype=tf.int32)
    features = tf.train.Features(feature={
        'feature': _bytes_feature(feature_tensor),
        'label': _bytes_feature(label_tensor),
    })
    return tf.train.Example(features=features).SerializeToString()


# #Read TIFF Files
def read_tiff_file(file_path):
    """Read a raster file and split it into a feature array and a label array.

    Returns:
        (features, label): features is float32, height x width x channels, built
        from the first 30 bands; label is the band at index 30 cast to int32.
    """
    with rasterio.open(file_path) as src:
        bands = src.read()  # all bands in one call: (bands, height, width)
        image = bands[:30, :, :].transpose(1, 2, 0)  # -> height x width x channels
        mask = bands[30, :, :]  # band at index 30 carries the label
        return image.astype(np.float32), mask.astype(np.int32)


# #Split data
def stratified_split_data(file_paths, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Split file paths into train/val/test sets, stratified by event class.

    Paths containing "DrainingEvents" and paths containing "RefreezingEvents"
    are shuffled and split separately, then concatenated per split (draining
    first). Paths matching neither substring are left out.

    Each class is shuffled exactly once and then sliced, so the three splits
    are disjoint and together contain every matching path. (Re-shuffling for
    every split would let the same file land in several splits.)

    Args:
        file_paths: Collection of path strings.
        train_ratio: Fraction of each class assigned to the training split.
        val_ratio: Fraction of each class assigned to the validation split.
        test_ratio: Accepted for interface compatibility; the test split always
            receives whatever remains after the train and validation slices.
        seed: Optional integer seed for a reproducible shuffle. When None, the
            global NumPy random state is used.

    Returns:
        dict with keys 'train', 'val' and 'test', each mapping to a list of paths.
    """
    # Fall back to the global generator when no seed is given, so callers that
    # seed NumPy globally keep getting the behaviour they expect.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def split_files(files):
        files = list(files)  # shuffle a private copy, never the caller's data
        rng.shuffle(files)
        n = len(files)
        train_end = int(n * train_ratio)
        val_end = int(n * (train_ratio + val_ratio))
        return {
            'train': files[:train_end],
            'val': files[train_end:val_end],
            'test': files[val_end:]
        }

    # One shuffle + split per class; all three splits are sliced from the same permutation.
    draining = split_files([fp for fp in file_paths if "DrainingEvents" in fp])
    refreezing = split_files([fp for fp in file_paths if "RefreezingEvents" in fp])

    return {
        split_name: draining[split_name] + refreezing[split_name]
        for split_name in ('train', 'val', 'test')
    }

#Write to TFRecord
def write_tfrecord(file_name, file_paths):
    """Write one serialized (features, label) example per input file to a TFRecord file.

    Each path is read with read_tiff_file, serialized and appended to
    file_name. A summary line is printed once all files are written, and the
    input file_paths is returned unchanged.
    """
    with tf.io.TFRecordWriter(file_name) as record_writer:
        for tiff_path in file_paths:
            patch, mask = read_tiff_file(tiff_path)
            record_writer.write(serialize_example(patch, mask))
        print(f"Wrote {len(file_paths)} entries to {file_name}")
    return file_paths

In [None]:
# Implement the conversion
# From here on file_paths refers to the combined draining + refreezing path list.
file_paths = all_paths

# Get stratified split
# NOTE(review): the split relies on random shuffling and no seed is set in the
# visible cells, so the split contents change from run to run — confirm this is intended.
splits = stratified_split_data(file_paths)

# Print file names in the train dataset to check the balance
print("Files in the training dataset:")
for f in splits['train']:
    print(f)

print("Files in the testing dataset:")
for f in splits['test']:
    print(f)

# Write TFRecords
# This first pass writes '<split>.tfrecords' relative to the current working directory.
# write_tfrecord already prints its own "Wrote ..." line, so every file is reported twice.
for split_name, paths in splits.items():
    write_tfrecord(f'{split_name}.tfrecords', paths)
    print(f'Wrote {len(paths)} records to {split_name}.tfrecords')

# Write TFRecords for each split
# Second pass: the same splits are written again, this time to the Drive results folder.
for split_name, paths in splits.items():
    tfrecord_filename = f'/content/drive/MyDrive/Results/Datasets_int/{split_name}.tfrecords'
    write_tfrecord(tfrecord_filename, paths)
    print(f'Wrote {len(paths)} records to {tfrecord_filename}')

In [None]:
# Check the balance of classes (refreezing, draining)
def count_categories(files):
    """Return (draining_count, refreezing_count) for the given paths.

    A path counts as draining when it contains "DrainingEvents" and as
    refreezing when it contains "RefreezingEvents".
    """
    n_draining = sum(1 for path in files if "DrainingEvents" in path)
    n_refreezing = sum(1 for path in files if "RefreezingEvents" in path)
    return n_draining, n_refreezing

# Each *_counts value is a (draining, refreezing) tuple for the corresponding split.
train_counts = count_categories(splits['train'])
val_counts = count_categories(splits['val'])
test_counts = count_categories(splits['test'])

# Unpack each tuple into the two placeholders of the report line.
print("Training set - Draining: {}, Refreezing: {}".format(*train_counts))
print("Validation set - Draining: {}, Refreezing: {}".format(*val_counts))
print("Testing set - Draining: {}, Refreezing: {}".format(*test_counts))


Training set - Draining: 64, Refreezing: 65
Validation set - Draining: 18, Refreezing: 18
Testing set - Draining: 10, Refreezing: 10


In [None]:
# file_paths = np.load('/content/drive/MyDrive/Thesis/Data/test_dataset_corrected_image_removed.npy', allow_pickle = True)
tfrecord_filename = '/content/drive/MyDrive/Results/Datasets_test_various/test_dataset_shuffled_pixels_index.tfrecords'

# NOTE(review): file_paths is not (re)defined in this cell. On a fresh top-to-bottom
# run it still refers to all_paths from the conversion cell, whereas the saved output
# of this cell reports 19 entries — presumably a test path list was loaded out of
# order. Confirm which path list is intended and load it explicitly here.
write_tfrecord(tfrecord_filename, file_paths)
batch_size = 8

#Load and use the data
def _parse_function(proto):
    """Decode one serialized example into (feature, label).

    'feature' is parsed back into a float32 tensor with static shape
    [64, 64, 30]; 'label' into an int32 tensor with static shape [64, 64].
    """
    schema = {
        'feature': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.string),
    }
    record = tf.io.parse_single_example(proto, schema)
    patch = tf.io.parse_tensor(record['feature'], out_type=tf.float32)
    mask = tf.io.parse_tensor(record['label'], out_type=tf.int32)
    # parse_tensor loses the static shapes, so pin them for downstream batching.
    patch.set_shape([64, 64, 30])
    mask.set_shape([64, 64])
    return patch, mask


def load_dataset(tfrecord_file, batch_size):
    """Open a TFRecord file, parse every record and group the results into batches.

    Records keep their on-disk order: no shuffling or repeating is applied.
    """
    records = tf.data.TFRecordDataset(tfrecord_file)
    parsed = records.map(_parse_function)
    return parsed.batch(batch_size)

Wrote 19 entries to /content/drive/MyDrive/Results/Datasets_test_various/test_dataset_shuffled_pixels_index.tfrecords
19


In [None]:
# Sanity check: load the TFRecord file and inspect the shapes/dtypes of the first batch.
dataset = load_dataset(tfrecord_filename, batch_size)
for features, labels in dataset.take(1):
    print("Features batch shape:", features.shape, features.dtype)  # Expected: (BATCH_SIZE, 64, 64, 30)
    print("Labels batch shape:", labels.shape, labels.dtype)      # Expected: (BATCH_SIZE, 64, 64)

Features batch shape: (8, 64, 64, 30) <dtype: 'float32'>
Labels batch shape: (8, 64, 64) <dtype: 'int32'>
