# Setup

In [11]:
import os
from glob import glob
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from matplotlib import colors

import seaborn as sns
# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set_style('whitegrid')

# Read Data

## Read Submission

In [10]:
# Read one random-forest submission file (the path is relative to the
# notebook's working directory) and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, so the CSV apparently
# stores an unnamed index column - pass index_col=0 if it is not wanted.
df_sub = pd.read_csv("../../hyperview/random_forest/submissions/submission_RandomForest_SIMPLE202204140731_nest=1197_maxd=None_minsl=1.csv")
df_sub.head()

Unnamed: 0,sample_index,P,K,Mg,pH
0,0,68.476023,211.478697,141.7934,6.865004
1,1,68.400251,217.498747,154.350794,6.860835
2,2,65.068421,220.259064,164.963743,6.817895
3,3,83.922306,244.462824,169.843275,6.827018
4,4,74.576942,219.159566,160.51203,6.928521


## Read Train Data

In [12]:
def load_data(directory: str):
    """Read every ``*.npz`` cube of a directory in numeric file-name order.

    The arrays are returned exactly as stored; no reduction or masking is
    applied here.

    Args:
        directory (str): Directory to either train or test set. Every file
            in it must be named ``<integer>.npz``.
    Returns:
        tuple: ``(datalist, masklist)`` - two parallel lists holding the
            ``'data'`` and the ``'mask'`` array of each file.
    """
    def sample_index(path):
        # Numeric key so that "10.npz" comes after "9.npz", not after "1.npz".
        return int(os.path.basename(path).replace(".npz", ""))

    cube_paths = glob(os.path.join(directory, "*.npz"))
    cube_paths.sort(key=sample_index)

    datalist, masklist = [], []
    for cube_path in cube_paths:
        # The context manager closes the archive once both arrays are read.
        with np.load(cube_path) as archive:
            masklist.append(archive['mask'])
            datalist.append(archive['data'])
    return datalist, masklist
            


def load_gt(file_path: str):
    """Read the train labels and scale each soil property by a fixed constant.

    Args:
        file_path (str): Path to the ground truth .csv file. It must contain
            the columns "P", "K", "Mg" and "pH".
    Returns:
        numpy.ndarray: 2D array with one row per sample and the columns
            P, K, Mg, pH divided by 325.0, 625.0, 400.0 and 7.8 respectively.
    """
    # Per-column divisors; presumably the maximum expected value of each
    # property, so that the labels end up in the 0-1 range.
    scale = np.array([325.0, 625.0, 400.0, 7.8])
    targets = pd.read_csv(file_path)[["P", "K", "Mg", "pH"]]
    return targets.values / scale

In [13]:
# Names of the four soil properties.
# NOTE(review): the first entry is "P2O5" while load_gt reads the CSV column
# "P" - presumably this list is only used as display labels; confirm.
cols = ["P2O5", "K", "Mg", "pH"]

In [None]:
# Root folder of the training set: it holds the `train_data/` sub-folder with
# the .npz cubes and the `train_gt.csv` label file.
# NOTE: this is a machine-specific absolute path - adjust it to your checkout.
# An earlier assignment to
# '/p/project/hai_cons_ee/kuzu/ai4eo-hyperview/hyperview/keras/train_data'
# was overwritten on the very next line (dead code); it is kept here only as
# a reference to the alternative location.
raw_data = '/local_home/kuzu_ri/GIT_REPO/ai4eo_hyperview/hyperview/keras/train_data'
train_data = os.path.join(raw_data, 'train_data')

# Load the cubes with their masks, and the scaled ground-truth labels.
X_train, M_train = load_data(train_data)
y_train = load_gt(os.path.join(raw_data, "train_gt.csv"))

print(f"Train data size: {len(X_train)}")