## Cluster Segmentation

In [2]:
import os
import pickle
import pandas as pd
import numpy as np

# Directory holding the intermediate pickles read below and the results pickle written at the end
output_data_path = 'output/'

#### Load Data

In [3]:
# Source table; rows are later selected from it by label with `.loc[all_idxs, :]`.
# NOTE: read_pickle unpickles arbitrary objects - only load files from a trusted source.
spec_df_chem = pd.read_pickle('data/prominetHill_spec_chem_final_.pkl')

In [4]:
# Row labels used with `.loc` further down to select and re-order the samples
idxs_path = os.path.join(
    output_data_path, "prominentHill_all_spectra_cr_cu_idxs.pkl"
)
with open(idxs_path, "rb") as handle:
    all_idxs = pickle.load(handle)

In [5]:
# The pickle stores a pair that is unpacked into two variables;
# the second one is written to the "Cu" column of the results further down
training_path = os.path.join(
    output_data_path, "prominentHill_all_spectra_cr_cu.pkl"
)
with open(training_path, "rb") as f:
    trainig_data, trainig_data_cu = pickle.load(f)

In [6]:
# Cluster labels; attached to the samples as the "clusters_dae_som" column further down
clusters_path = os.path.join(
    output_data_path, "promintHill_all_spectra_cr_encoded_mdvpt_clusters.pkl"
)
with open(clusters_path, "rb") as handle:
    all_spectra_cr_clusters = pickle.load(handle)

In [7]:
# Load the wavelength container and keep its first two entries as string arrays
with open('data/wvl_arr.pkl', 'rb') as f:
    base_wvl_arr = pickle.load(f)
swir_wvl_base, tir_wvl_base = (
    base_wvl_arr[pos].astype(str).values for pos in (0, 1)
)

In [8]:
# SWIR entries are kept as strings, TIR entries are converted to float
all_wvl_base = [*swir_wvl_base, *(float(w) for w in tir_wvl_base)]

### Get Metadata

In [9]:
# Get the spectra for the selected indexes
# (label-based selection: the result has one row per entry of all_idxs, in that order)
all_df = spec_df_chem.loc[all_idxs, :] 

In [10]:
# Preview the first rows of the selected samples
all_df.head()

Unnamed: 0,Sample,Depth (m),Min1 uTSAS,Wt1 uTSAS,Min2 uTSAS,Wt2 uTSAS,Min3 uTSAS,Wt3 uTSAS,Error uTSAS,Min1 ujCLST,...,14300.0,14325.0,14350.0,14375.0,14400.0,14425.0,14450.0,14475.0,14500.0,Depth_idx
0,241877_0001_1,62.903,,,,,,,,,...,0.123612,0.129789,0.135583,0.139656,0.141847,0.142505,0.142782,0.143598,0.145446,62.903017
1,241877_0001_2,62.903,,,,,,,,,...,0.085993,0.089868,0.097571,0.108306,0.120761,0.13294,0.143911,0.152296,0.156787,62.903019
2,241877_0001_3,62.903,,,,,,,,,...,0.103591,0.106344,0.113392,0.123753,0.135379,0.145596,0.153365,0.158294,0.160469,62.903011
3,241877_0001_4,62.903,,,,,,,,,...,0.089949,0.090597,0.094342,0.10133,0.111611,0.124039,0.137684,0.150909,0.161748,62.903013
4,241877_0001_5,62.903,,,,,,,,,...,0.088858,0.087487,0.090884,0.09851,0.10833,0.117291,0.124278,0.129234,0.132313,62.903014


In [11]:
# Create From and To columns (for plotting points as intervals)
def fromToCols(df, depth_col="Depth (m)"):
    """Add ``FROM_comp`` / ``TO_comp`` interval bounds around every sample depth.

    Each sample's interval runs from the midpoint with the next-shallower
    sample to the midpoint with the next-deeper sample. The shallowest sample
    starts at its own depth and the deepest sample ends at its own depth.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples of one drill hole. Must contain ``"DH_NAME"`` and `depth_col`.
        Rows may be in any order. The frame is modified in place.
    depth_col : str, default "Depth (m)"
        Name of the column holding the sample depth.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the two new columns added.
    """
    print(df["DH_NAME"].unique())

    depths = df[depth_col].to_numpy(dtype=float)
    # Stable sort: rows that are already in depth order keep their relative
    # position, so ties resolve in row order.
    order = np.argsort(depths, kind="stable")
    sorted_depths = depths[order]

    # Midpoints between consecutive depths; empty when there are < 2 samples,
    # which makes a single-sample hole collapse to FROM == TO == depth.
    midpoints = (sorted_depths[:-1] + sorted_depths[1:]) / 2
    from_sorted = np.concatenate((sorted_depths[:1], midpoints))
    to_sorted = np.concatenate((midpoints, sorted_depths[-1:]))

    # Scatter the bounds back to the original row positions so every row
    # receives the interval computed for its own depth.
    from_ = np.empty_like(sorted_depths)
    to_ = np.empty_like(sorted_depths)
    from_[order] = from_sorted
    to_[order] = to_sorted

    df["FROM_comp"] = from_
    df["TO_comp"] = to_
    return df

In [12]:
# First 25 columns of the samples, ordered by depth and grouped per drill hole
tsg_cols_DH_g = (
    all_df.iloc[:, :25]
    .sort_values(by="Depth (m)")
    .groupby("DH_NAME")
)

In [13]:
# Run fromToCols once per drill hole group (adds the FROM_comp / TO_comp columns)
dh_results_df = tsg_cols_DH_g.apply(fromToCols)

['141786']
['241877']
['241878']
['269223']


  dh_results_df = tsg_cols_DH_g.apply(fromToCols)


In [14]:
# Drop DH group idx
# Only when the group level is actually present: re-running this cell, or a
# pandas version that does not prepend the group key, would otherwise throw
# away the sample index and make the `.loc` below pick the wrong rows.
if dh_results_df.index.nlevels > all_df.index.nlevels:
    dh_results_df = dh_results_df.reset_index(level=0, drop=True)
# Reset order of samples
dh_results_df = dh_results_df.loc[all_idxs, :]

In [15]:
# Preview the per-sample table including the new FROM_comp / TO_comp columns
dh_results_df.head()

Unnamed: 0,Sample,Depth (m),Min1 uTSAS,Wt1 uTSAS,Min2 uTSAS,Wt2 uTSAS,Min3 uTSAS,Wt3 uTSAS,Error uTSAS,Min1 ujCLST,...,Min1 uTSAT,Wt1 uTSAT,Min2 uTSAT,Wt2 uTSAT,Min3 uTSAT,Wt3 uTSAT,Error uTSAT,DH_NAME,FROM_comp,TO_comp
0,241877_0001_1,62.903,,,,,,,,,...,,,,,,,,241877,62.903,62.903
1,241877_0001_2,62.903,,,,,,,,,...,,,,,,,,241877,62.903,62.903
2,241877_0001_3,62.903,,,,,,,,,...,,,,,,,,241877,62.903,62.903
3,241877_0001_4,62.903,,,,,,,,,...,,,,,,,,241877,62.903,62.903
4,241877_0001_5,62.903,,,,,,,,,...,,,,,,,,241877,62.903,62.903


In [16]:
# Attach the cluster label of every sample as a string column.
# NOTE(review): assumes all_spectra_cr_clusters is ordered like all_idxs - confirm
dh_results_df["clusters_dae_som"]  = all_spectra_cr_clusters.astype(str)

#### Cluster Segmentation

In [17]:
from spectraZones.tools.cluster_segmentation import ClusterSegmentation

In [18]:
def computeSegmentation(
    df,
    depth_col="Depth (m)",
    cluster_col="clusters_dae_som",
    output_col="clusters_segmented",
    grid_spacing=2,
):
    """Segment the cluster labels of one drill hole along depth.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples of a single drill hole. Must contain ``"DH_NAME"``,
        `depth_col` and `cluster_col`. The frame is modified in place.
    depth_col : str
        Column with the sample depths handed to ``ClusterSegmentation``.
    cluster_col : str
        Column with the per-sample cluster labels to segment.
    output_col : str
        Name of the column that receives the segmented labels.
    grid_spacing : optional
        Forwarded unchanged to ``ClusterSegmentation``; defaults to 2, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with `output_col` added.
    """
    print("Drill Hole: ", df["DH_NAME"].unique())
    # NOTE(review): rows are passed in the frame's current order - confirm
    # whether ClusterSegmentation expects depth-sorted input.
    cluster_segmentation = ClusterSegmentation(
        df[depth_col].values,
        df[cluster_col].values,
        grid_spacing=grid_spacing,
    )
    new_clusters = cluster_segmentation.compute_new_clusters()

    # Stored as integer-valued strings ("3", not "3.0")
    df[output_col] = new_clusters.astype(int).astype(str)
    return df

In [19]:
# Run the segmentation separately for every drill hole
g = dh_results_df.groupby("DH_NAME")
dh_results_df_segmented = g.apply(
    computeSegmentation,
    depth_col="Depth (m)",
    cluster_col="clusters_dae_som",
    output_col="clusters_segmented",
)

Drill Hole:  ['141786']
Drill Hole:  ['241877']
Drill Hole:  ['241878']
Drill Hole:  ['269223']


  dh_results_df_segmented = g.apply(


In [20]:
# Drop DH group idx
# Only when the group level is actually present: re-running this cell, or a
# pandas version that does not prepend the group key, would otherwise throw
# away the sample index and make the `.loc` below pick the wrong rows.
if dh_results_df_segmented.index.nlevels > dh_results_df.index.nlevels:
    dh_results_df_segmented = dh_results_df_segmented.reset_index(level=0, drop=True)
# Reset order of samples
dh_results_df_segmented = dh_results_df_segmented.loc[all_idxs, :]

In [21]:
# Assign Cu values to the segmented clusters
# NOTE(review): assumes trainig_data_cu is ordered like all_idxs - confirm
dh_results_df_segmented["Cu"] = trainig_data_cu

##### Add logged lithology to result DataFrame

In [22]:
# Load metadata df
# The pickle holds an indexable collection; its element 1 is used below as the
# lithology table (DRILLHOLE_NO / DEPTH_FROM_M / DEPTH_TO_M plus lithology columns).
# NOTE: read_pickle unpickles arbitrary objects - only load files from a trusted source.
meta_df_arr = pd.read_pickle("data/prominentHill_meta_.pkl")
litho_meta_df = meta_df_arr[1]

  setstate(state)


In [23]:
# Get the metadata df index for each sample
# using the DH_NAME and Depth From/To

# Create array to map the metadata index to the segmented clusters
# (-1 marks samples that fall inside no logged interval)
meta_idx_map = np.full(dh_results_df_segmented.shape[0], -1.)
# Get the DH number and FROM/TO from each sample
lito_samples = litho_meta_df[['DRILLHOLE_NO','DEPTH_FROM_M','DEPTH_TO_M']]
# Split the logged intervals per drill hole once, so each sample is compared
# only against the intervals of its own hole instead of the whole table.
# Row order inside each hole is preserved, so "first match" is unchanged.
lito_by_dh = {
    dh_no: (
        dh_intervals.index.values,
        dh_intervals['DEPTH_FROM_M'].to_numpy(),
        dh_intervals['DEPTH_TO_M'].to_numpy(),
    )
    for dh_no, dh_intervals in lito_samples.groupby('DRILLHOLE_NO')
}
# Read both sample columns once instead of one `.loc` lookup per row
sample_dh_depth = zip(
    dh_results_df_segmented['DH_NAME'].to_numpy(),
    dh_results_df_segmented['Depth (m)'].to_numpy(),
)
# Iterate over the segmented clusters
for j, (dh_name, dep) in enumerate(sample_dh_depth):
    # Intervals logged for the drill hole of the current sample (if any)
    intervals = lito_by_dh.get(int(dh_name))
    if intervals is None:
        continue
    interval_idx, depth_from, depth_to = intervals
    # Get the index of the metadata df for the current sample
    # comparing the sample depth to the half-open interval [from, to)
    lito_sample = interval_idx[(dep >= depth_from) & (dep < depth_to)]

    # If the sample is found, assign the index to the map
    if lito_sample.shape[0] > 0:
        meta_idx_map[j] = lito_sample[0]

In [24]:
# Columns to extract from the metadata df
# (copied from litho_meta_df onto every sample that matched a logged interval)
meta_cols_to_extract = [
    "MAJOR_LITHOLOGY_CODE",
    "MAJOR_LITHOLOGY_CONF",
    "MAJOR_LITHOLOGY",
    "MINOR_LITHOLOGY_CODE",
    "MINOR_LITHOLOGY_CONF",
    "MINOR_LITHOLOGY",
    "DESCRIPTION",
]

In [25]:
# Extract the metadata for each sample
# Samples without a logged interval (-1 in meta_idx_map) keep NaN in every
# extracted column. A list that mixes row Series with a scalar NaN is ragged
# and cannot be assigned to several columns at once, so the values are
# collected in a NaN-prefilled 2-D array instead.
found_mask = meta_idx_map > -1
meta_values = np.full(
    (meta_idx_map.shape[0], len(meta_cols_to_extract)), np.nan, dtype=object
)
meta_values[found_mask] = litho_meta_df.loc[
    meta_idx_map[found_mask], meta_cols_to_extract
].to_numpy(dtype=object)
dh_results_df_segmented[meta_cols_to_extract] = meta_values

#### Save Cluster Results

In [26]:
# Persist the per-sample results table with the highest available pickle protocol
results_path = os.path.join(
    output_data_path, "prominentHill_swir_tir_cluster_results.pkl"
)
with open(results_path, "wb") as f:
    pickle.dump(dh_results_df_segmented, f, protocol=pickle.HIGHEST_PROTOCOL)