## Data Preparation

- Prepare the dataset for training the DAE
- Remove the Continuum

In [1]:
import os
import pickle
import pandas as pd
import numpy as np

# Directory that receives every artifact written by this notebook.
output_data_path = 'output/'

# Create the directory up front; exist_ok=True keeps the cell safe to re-run.
os.makedirs(name=output_data_path, exist_ok=True)

#### Load Dataset
##### Spectra + TSG Class + Cu analysis

In [2]:
# Load the combined drill-hole table (spectra + TSG class + Cu analysis).
# pandas is already imported in the notebook's first code cell, so it is not
# re-imported here.
# NOTE(security): read_pickle unpickles the file, which can execute arbitrary
# code -- only load pickles that come from a trusted source.
# NOTE(review): the input file is spelled 'prominetHill' while the outputs
# written later use 'prominentHill'; presumably this matches the file on disk --
# confirm before renaming anything.
spec_df_chem = pd.read_pickle('data/prominetHill_spec_chem_final_.pkl')

In [3]:
# Preview the first rows of the loaded table to sanity-check its columns.
spec_df_chem.head()

Unnamed: 0,Sample,Depth (m),Min1 uTSAS,Wt1 uTSAS,Min2 uTSAS,Wt2 uTSAS,Min3 uTSAS,Wt3 uTSAS,Error uTSAS,Min1 ujCLST,...,14300.0,14325.0,14350.0,14375.0,14400.0,14425.0,14450.0,14475.0,14500.0,Depth_idx
0,241877_0001_1,62.903,,,,,,,,,...,0.123612,0.129789,0.135583,0.139656,0.141847,0.142505,0.142782,0.143598,0.145446,62.903017
1,241877_0001_2,62.903,,,,,,,,,...,0.085993,0.089868,0.097571,0.108306,0.120761,0.13294,0.143911,0.152296,0.156787,62.903019
2,241877_0001_3,62.903,,,,,,,,,...,0.103591,0.106344,0.113392,0.123753,0.135379,0.145596,0.153365,0.158294,0.160469,62.903011
3,241877_0001_4,62.903,,,,,,,,,...,0.089949,0.090597,0.094342,0.10133,0.111611,0.124039,0.137684,0.150909,0.161748,62.903013
4,241877_0001_5,62.903,,,,,,,,,...,0.088858,0.087487,0.090884,0.09851,0.10833,0.117291,0.124278,0.129234,0.132313,62.903014


In [4]:
# Pull the Cu analysis column out of the full table; it keeps the table's
# row index so it stays aligned with the spectra selected later.
training_data_cu = spec_df_chem.loc[:, "Cu"]

In [5]:
# Preview the first Cu values.
training_data_cu.head()

0    0.004
1    0.004
2    0.004
3    0.004
4    0.004
Name: Cu, dtype: float64

##### Wavelength Arrays (Feature Names)

In [6]:
# Load the pickled wavelength arrays: element 0 is used for SWIR, element 1 for TIR.
# NOTE(security): pickle.load can execute arbitrary code -- only use trusted files.
with open('data/wvl_arr.pkl', 'rb') as f:
    base_wvl_arr = pickle.load(f)

# SWIR wavelengths are kept as strings (they are used as column labels below).
swir_wvl_base = base_wvl_arr[0].astype(str).values

# TIR wavelengths go through the same string form and are then parsed to floats.
tir_wvl_base = list(map(float, base_wvl_arr[1].astype(str).values))

#### Set Drill Hole Training data

In [7]:
# Keep only the spectral columns: SWIR first, then TIR.
# SWIR columns are looked up by their string labels, TIR columns by the
# integer value of each wavelength.
training_data = spec_df_chem[[*swir_wvl_base, *map(int, tir_wvl_base)]]

# Print a summary (row count, column count, dtypes, memory footprint).
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 219363 entries, 0 to 220159
Columns: 872 entries, 380.0 to 14500.0
dtypes: float64(872)
memory usage: 1.4 GB


In [8]:
# Preview the first rows of the spectra-only table.
training_data.head()

Unnamed: 0,380.0,384.0,388.0,392.0,396.0,400.0,404.0,408.0,412.0,416.0,...,14275.0,14300.0,14325.0,14350.0,14375.0,14400.0,14425.0,14450.0,14475.0,14500.0
0,0.252693,0.252136,0.250987,0.251158,0.2503,0.250713,0.250657,0.250953,0.251826,0.253221,...,0.119495,0.123612,0.129789,0.135583,0.139656,0.141847,0.142505,0.142782,0.143598,0.145446
1,0.16908,0.168682,0.167721,0.167736,0.167315,0.167663,0.167769,0.168206,0.16896,0.170064,...,0.086353,0.085993,0.089868,0.097571,0.108306,0.120761,0.13294,0.143911,0.152296,0.156787
2,0.168449,0.169021,0.169171,0.170367,0.170694,0.172072,0.172899,0.17394,0.175187,0.176431,...,0.105031,0.103591,0.106344,0.113392,0.123753,0.135379,0.145596,0.153365,0.158294,0.160469
3,0.151682,0.15235,0.152605,0.153702,0.154191,0.155479,0.156285,0.157167,0.158204,0.159429,...,0.092608,0.089949,0.090597,0.094342,0.10133,0.111611,0.124039,0.137684,0.150909,0.161748
4,0.13259,0.131751,0.13058,0.130464,0.130035,0.130468,0.130801,0.131667,0.132959,0.134908,...,0.094155,0.088858,0.087487,0.090884,0.09851,0.10833,0.117291,0.124278,0.129234,0.132313


##### Save Drill Hole Training Data

In [9]:
# Checkpoint the spectra table and the Cu series together as one
# (training_data, training_data_cu) tuple inside the output directory.
with open(
    os.path.join(output_data_path, "prominentHill_swir_tir_specs_cu_corr.pkl"), "wb"
) as f:
    pickle.dump((training_data, training_data_cu), f, protocol=pickle.HIGHEST_PROTOCOL)

#### Load Training Data


In [10]:
# Reload the (training_data, training_data_cu) checkpoint written by this
# notebook, so the following cells can start from the saved file.
# NOTE(security): pickle.load can execute arbitrary code -- only use trusted files.
with open(
    os.path.join(output_data_path, "prominentHill_swir_tir_specs_cu_corr.pkl"), "rb"
) as f:

    training_data, training_data_cu = pickle.load(f)

#### Remove Continuum from Drill Hole Data

In [11]:
from spectraZones.spectra.utils import remove_continuum_parallel

In [12]:
# Convert the spectra table to a plain 2-D array (rows = samples).
all_spectra = training_data.to_numpy()

# Cu values as a column vector of shape (n_samples, 1).
all_cu = training_data_cu.to_numpy().reshape(-1, 1)

In [13]:
# Split the columns at the SWIR/TIR boundary: the first len(swir_wvl_base)
# columns are SWIR, everything after them is TIR.
all_swir, all_tir = np.split(all_spectra, [len(swir_wvl_base)], axis=1)

In [14]:
# Sanity check: both blocks must have the same number of rows, and their
# column counts must add up to the width of all_spectra.
print("SWIR DATA SHAPE: ", all_swir.shape)
print("TIR DATA SHAPE: ", all_tir.shape)

SWIR DATA SHAPE:  (219363, 531)
TIR DATA SHAPE:  (219363, 341)


In [15]:
# Continuum-remove the SWIR block with the project helper.
# NOTE(review): swir_wvl_base holds the wavelengths as strings, whereas the
# TIR call is given floats -- confirm that remove_continuum_parallel
# accepts/converts string wavelengths as intended.
all_swir_cr = remove_continuum_parallel(all_swir, swir_wvl_base)

Using 32 cores for parallel processing.


Processing samples:   0%|          | 0/219363 [00:00<?, ?it/s]

In [16]:
# Continuum-remove the TIR block with the project helper, using the float
# TIR wavelengths.
all_tir_cr = remove_continuum_parallel(all_tir, tir_wvl_base)

Using 32 cores for parallel processing.


Processing samples:   0%|          | 0/219363 [00:00<?, ?it/s]

In [17]:
# Stitch the continuum-removed SWIR and TIR blocks back together column-wise,
# then flatten any trailing axes so each sample is a single feature row.
all_spectra_cr = np.concatenate([all_swir_cr, all_tir_cr], axis=1).reshape(
    len(all_swir_cr), -1
)

In [18]:
# Sanity check: the row count should match the input spectra.
print("ALL SPECTRA CR SHAPE: ", all_spectra_cr.shape)

ALL SPECTRA CR SHAPE:  (219363, 872)


##### Save CR Drill Hole Data

In [19]:
# Save the continuum-removed spectra and the Cu column vector together as one
# (all_spectra_cr, all_cu) tuple.
with open(
    os.path.join(output_data_path, "prominentHill_all_spectra_cr_cu.pkl"), "wb"
) as f:

    pickle.dump((all_spectra_cr, all_cu), f, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Save the row index of training_data separately: the arrays saved above are
# plain ndarrays and no longer carry it, so this lets rows be mapped back to
# the source table.
with open(
    os.path.join(output_data_path, "prominentHill_all_spectra_cr_cu_idxs.pkl"), "wb"
) as f:

    pickle.dump(training_data.index, f, protocol=pickle.HIGHEST_PROTOCOL)