# Imports and Reading Data

In [1]:
import os
import random
import numpy as np

import rasterio
from glob import glob

In [2]:
# Seed the stdlib RNG up front so anything using `random` later is repeatable.
random.seed(42)

# Directories containing the label rasters for each split.
label_train = './labels/train'
label_test = './labels/test'

# Human-readable class names. Order matters: the distribution summary
# further down pairs entries with class statistics by list index.
label_names = [
    "Bad data",
    "Snow and Ice",
    "Wet ice and meltwater",
    "Freshwater",
    "Sediment",
    "Bedrock",
    "Vegetation",
    ]

# Alphabetically sorted GeoTIFF paths for the train and test label folders.
label_files, testlabel_files = (
    sorted(glob(os.path.join(split_dir, '*.tif')))
    for split_dir in (label_train, label_test)
)

In [3]:
def extract_labels(fileList):
    """Read band 1 of every raster in ``fileList`` and stack the results.

    Args:
        fileList: Iterable of raster file paths that ``rasterio.open`` accepts.

    Returns:
        ``np.array`` built from the per-file band arrays, in the same order
        as ``fileList``. An empty ``fileList`` yields an empty array.
    """
    def _read_first_band(path):
        # The context manager closes the dataset as soon as the band is read.
        with rasterio.open(path) as dataset:
            return dataset.read(1)

    return np.array([_read_first_band(path) for path in fileList])

In [4]:
# Load the label rasters of each split into a single array per split.
train_labels = extract_labels(fileList=label_files)
test_labels = extract_labels(fileList=testlabel_files)

# Label Distributions for Train and Test

In [11]:
# Percentage of pixels belonging to each class in the train and test labels.
train_size = train_labels.size
test_size = test_labels.size
train_values, train_counts = np.unique(train_labels, return_counts=True)
test_values, test_counts = np.unique(test_labels, return_counts=True)

# np.unique only reports values that actually occur, so position i of the
# counts array is not guaranteed to belong to class i (a class missing from
# one split shifts every later entry). Key the percentages by the label value
# itself so each name is paired with its own count.
# NOTE(review): assumes raster value i corresponds to label_names[i] - confirm.
train_pct = dict(zip(train_values.tolist(), (train_counts / train_size * 100).tolist()))
test_pct = dict(zip(test_values.tolist(), (test_counts / test_size * 100).tolist()))

for class_id, class_name in enumerate(label_names):
    # A class that never occurs in a split is reported as 0.0 instead of
    # raising IndexError or borrowing another class's count.
    print(f"{class_name:<{23}} Train: {round(train_pct.get(class_id, 0.0), 2):>6}%  Test: {round(test_pct.get(class_id, 0.0), 2):>6}%")

Bad data                Train:  10.08%  Test:   11.6%
Snow and Ice            Train:  25.45%  Test:  18.86%
Wet ice and meltwater   Train:   4.68%  Test:   4.74%
Freshwater              Train:   5.29%  Test:   7.51%
Sediment                Train:   5.08%  Test:    2.8%
Bedrock                 Train:  29.66%  Test:   8.35%
Vegetation              Train:  19.75%  Test:  46.14%
