# 1: Import libraries and set paths

In [1]:
# Models and utils
import json
from pathlib import Path
from datetime import datetime
from beak.models import magmatic_nickel_nat
from beak.utilities.io import load_model, check_path

# SOM specific
import beak.methods.som.argsSOM as asom
from beak.methods.som.nextsomcore.nextsomcore import NxtSomCore
# Settings container for the SOM run; its attributes are filled in below and
# the object is later handed to the SOM runner
args = asom.Args()

# Files
import sys
if sys.version_info < (3, 9):
    from importlib_resources import files
else:
    from importlib.resources import files

# Model definition: which set of evidence layers to use
MODEL = "BASELINE_FINAL"
model = magmatic_nickel_nat.models[MODEL]

# Root of the packaged data plus the sub-folder names shared by all datasets
BASE_PATH = files("beak.data")
BASE_SPATIAL = "EPSG_4326_RES_0_025"
BASE_EXTENT = "CLIPPED_USC"

# Dataset folders that are searched for the evidence layers
PATH_MCCAFFERTY = (
    BASE_PATH / "MCCAFFERTY23" / "PROCESSED"
    / BASE_SPATIAL / BASE_EXTENT / "GEOPHYSICS_SCALED_STD"
)
PATH_ISOGRAVITY = (
    BASE_PATH / "GEOPHYSICS_ISOGRAVITY" / "PROCESSED"
    / BASE_SPATIAL / BASE_EXTENT / "UNIFIED_SCALED_STD"
)
PATH_MAGNETOTELLURIC = (
    BASE_PATH / "GEOPHYSICS_MAGNETOTELLURICS" / "PROCESSED" / "CONUS_2023"
    / BASE_SPATIAL / BASE_EXTENT / "UNIFIED_SCALED_STD"
)
PATH_ULTRAMAFICS = (
    BASE_PATH / "GEOLOGY_US_ULTRAMAFICS" / "PROCESSED"
    / BASE_SPATIAL / BASE_EXTENT / "UNIFIED"
)
PATH_LAWLEY_NUMERICAL = (
    BASE_PATH / "LAWLEY22" / "EXPORT"
    / BASE_SPATIAL / BASE_EXTENT / "NUMERICAL_SCALED_STANDARD"
)
PATH_LAWLEY_CATEGORICAL = (
    BASE_PATH / "LAWLEY22" / "EXPORT"
    / BASE_SPATIAL / BASE_EXTENT / "CATEGORICAL"
)

# Raster holding the training labels
PATH_LABELS = (
    BASE_PATH
    / "TRAINING_LABELS"
    / "MAGMATIC_NICKEL_NAT"
    / "MAGMATIC_NICKEL_TA2_EPSG_4326_RES_0_025_CLIPPED_USC.tif"
)

# Resolve every evidence layer of the model to a raster file in the folders below.
# The folder order is kept as-is since it may influence the resulting file order.
model_dict, file_list, counts = load_model(
    model=model,
    folders=[
        PATH_MAGNETOTELLURIC,
        PATH_ISOGRAVITY,
        PATH_MCCAFFERTY,
        PATH_LAWLEY_NUMERICAL,
        PATH_ULTRAMAFICS,
        PATH_LAWLEY_CATEGORICAL,
    ],
    file_extensions=[".tif", ".tiff"],
    verbose=0,
)

# SOM specific: label rasters are passed around as a list of path strings
label_data_file_list = [str(PATH_LABELS)]


Loading model definition...
Selected 14 evidence layers.

Searching for files and folders in provided paths...
Found 6 folders, 43 subfolders and 837 files.

Searching for corresponding files...
Found 'US_IsostaticGravity_WGS84' in 1 file(s).
Found 'Gravity_Up30km_HGM' in 1 file(s).
Found 'Gravity_Bouguer_UpCont30km_HGM_Worms_Proximity' in 1 file(s).
Found 'CONUS_MT2023_9km_cog' in 1 file(s).
Found 'CONUS_MT2023_15km_cog' in 1 file(s).
Found 'CONUS_MT2023_30km_cog' in 1 file(s).
Found 'MagRTP' in 1 file(s).
Found 'MagRTP_HGMDeepSources' in 1 file(s).
Found 'Magnetic_LongWavelength_HGM_Worms_Proximity' in 1 file(s).
Found 'Mag_AnalyticSignal_cog' in 1 file(s).
Found 'Moho' in 1 file(s).
Found 'LAB' in 1 file(s).
Found 'LAB_HGM_cog' in 1 file(s).
Found 'LAB_Worms_Proximity' in 1 file(s).

Ensuring that all layers have matching files...
All layers have matching files.

Checking files for multiple occurences...
No duplicates found. All filenames occur only once.
Number of files in file lis

In [2]:
# Swap the standard-scaled magnetic analytic signal raster for its log-scaled
# counterpart by redirecting the first matching entry to the LOG folder
LOG_MOD = True

if LOG_MOD is True:
    for index, layer_path in enumerate(file_list):
        if "Mag_AnalyticSignal_cog.tif" not in str(layer_path):
            continue
        print(f"{layer_path} found at index {index}.")
        file_list[index] = Path(
            str(layer_path).replace("GEOPHYSICS_SCALED_STD", "GEOPHYSICS_SCALED_LOG")
        )
        # Only the first match is replaced
        break

s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\MCCAFFERTY23\PROCESSED\EPSG_4326_RES_0_025\CLIPPED_USC\GEOPHYSICS_SCALED_STD\Mag_AnalyticSignal_cog.tif found at index 9.


# 2: Set SOM parameters and model **name** and **configuration**

In [3]:
# --- SOM grid ---
args.som_x = 50                    # X dimension of generated SOM
args.som_y = 50                    # Y dimension of generated SOM
args.gridtype = "rectangular"      # Type of SOM grid (hexagonal, rectangular)
args.maptype = "toroid"            # Type of SOM (sheet, toroid)

# --- Initialization ---
args.initialization = "random"     # Type of SOM initialization (random, pca)
args.initialcodebook = None        # File path of initial codebook, 2D numpy.array of float32.

# --- Training schedule ---
args.epochs = 10                   # Number of epochs to run
args.scale0 = 0.1                  # Initial learning rate
args.scaleN = 0.01                 # Final learning rate
args.scalecooling = "linear"       # How the learning scale decreases during training (linear, exponential)

# --- Neighborhood ---
args.neighborhood = "gaussian"     # Shape of the neighborhood function (gaussian, bubble)
args.std_coeff = 0.5               # Coefficient in the Gaussian neighborhood function
args.radius0 = 0                   # Initial size of the neighborhood
args.radiusN = 1                   # Final size of the neighborhood
args.radiuscooling = "linear"      # How the neighborhood size decreases during training (linear, exponential)

# --- k-means clustering of the trained SOM ---
args.kmeans = "true"               # Run k-means clustering (true, false)
args.kmeans_init = 5               # Number of initializations
args.kmeans_min = 20               # Minimum number of k-means clusters
args.kmeans_max = 50               # Maximum number of k-means clusters

# --- Labels ---
args.label = False                 # Whether data contains label column, True or False


In [4]:
# Build a run name from the layer count, SOM size, maximum cluster count
# and the current timestamp, then derive the model folder from it
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
MODEL_NAME = (
    f"SOM_F{len(file_list)}"
    f"_X{args.som_x}_Y{args.som_y}"
    f"_CMAX{args.kmeans_max}"
    f"_{RUN_TIMESTAMP}"
)
MODEL_FOLDER = Path.cwd() / "models" / MODEL_NAME

# Kept as the last expression so the returned path is shown as cell output.
# NOTE(review): presumably check_path creates the folder when missing — confirm.
check_path(MODEL_FOLDER)


WindowsPath('s:/Projekte/20230082_DARPA_CriticalMAAS_TA3/Bearbeitung/GitHub/beak-ta3/experiments/mag_nickel_nat/som/baseline_final/models/SOM_F14_X50_Y50_CMAX50_20240307-110259')

# 3: Input data, file lists etc.

In [5]:
# Text files that list the evidence-layer rasters and the label rasters
file_path = MODEL_FOLDER / "input_file_list.txt"
label_data_file_path = MODEL_FOLDER / "label_file_list.txt"

# Write one path per line. Each `with` block closes its file on exit,
# so no explicit close() call is required.
with open(file_path, 'w') as file:
    file.writelines(f"{string}\n" for string in file_list)

with open(label_data_file_path, 'w') as file:
    file.writelines(f"{string}\n" for string in label_data_file_list)

def write_args_to_file(file_path, **kwargs):
    """Write the given keyword arguments to ``file_path`` as indented JSON.

    Args:
        file_path: Destination file; it is opened in write mode, so existing
            content is replaced.
        **kwargs: Settings to store. They are dumped as a single JSON object
            in the order they were passed.

    Raises:
        TypeError: If a value cannot be serialized by ``json.dump``.
    """
    # The `with` block closes the handle on exit; no explicit close() is needed.
    with open(file_path, "w") as f:
        json.dump(kwargs, f, indent=4)

# Store the SOM settings as JSON inside the model folder
args_path = MODEL_FOLDER / "args.json"

# NOTE(review): the key "k_means_init" deviates from the "kmeans_*" spelling of the
# neighbouring keys. It is kept unchanged; confirm nothing reads this JSON before renaming.
som_settings = {
    "som_x": args.som_x,
    "som_y": args.som_y,
    "epochs": args.epochs,
    "kmeans": args.kmeans,
    "k_means_init": args.kmeans_init,
    "kmeans_min": args.kmeans_min,
    "kmeans_max": args.kmeans_max,
    "neighborhood": args.neighborhood,
    "std_coeff": args.std_coeff,
    "maptype": args.maptype,
    "initialcodebook": args.initialcodebook,
    "radius0": args.radius0,
    "radiusN": args.radiusN,
    "radiuscooling": args.radiuscooling,
    "scalecooling": args.scalecooling,
    "scale0": args.scale0,
    "scaleN": args.scaleN,
    "initialization": args.initialization,
    "gridtype": args.gridtype,
    "label": args.label,
}
write_args_to_file(file_path=args_path, **som_settings)


In [6]:
# Folder to save the SOM dictionary, the cluster dictionary and the result files
args.output_folder = f"{MODEL_FOLDER}/exports"
check_path(args.output_folder)

# SOM-space results: som_x som_y b_data1 b_data2 b_dataN umatrix cluster. DO NOT CHANGE!
args.output_file_somspace = f"{args.output_folder}/result_som.txt"

# Geospace results: {X Y Z} data1 data2 dataN som_x som_y cluster b_data1 b_data2 b_dataN. DO NOT CHANGE!
args.outgeofile = f"{args.output_folder}/result_geo.txt"
args.output_file_geospace = args.outgeofile

# GeoTIFF label input file (None).
# NOTE(review): this points at "input_file_list.txt" inside the exports folder, while the
# list files are written to MODEL_FOLDER — confirm this is the intended file.
args.label_geotiff_file = f"{args.output_folder}/input_file_list.txt"

In [7]:
# Build the SOM input from the evidence-layer list file written to the model folder
args.input_file = args.create_list_from_file(file_path)
args.geotiff_input=args.input_file                                # GeoTIFF input files, separated by comma, used to write the GeoTIFF output
                                                                  # (only the first line is used to get the geotransform and projection information
                                                                  # to set the output GeoTIFF geotransform and projection)

Number of files added: 14
Files:
s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\GEOPHYSICS_ISOGRAVITY\PROCESSED\EPSG_4326_RES_0_025\CLIPPED_USC\UNIFIED_SCALED_STD\US_IsostaticGravity_WGS84.tif
s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\MCCAFFERTY23\PROCESSED\EPSG_4326_RES_0_025\CLIPPED_USC\GEOPHYSICS_SCALED_STD\Gravity_Up30km_HGM.tif
s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\LAWLEY22\EXPORT\EPSG_4326_RES_0_025\CLIPPED_USC\NUMERICAL_SCALED_STANDARD\Gravity_Bouguer_UpCont30km_HGM_Worms_Proximity.tif
s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\GEOPHYSICS_MAGNETOTELLURICS\PROCESSED\CONUS_2023\EPSG_4326_RES_0_025\CLIPPED_USC\UNIFIED_SCALED_STD\CONUS_MT2023_9km_cog.tif
s:\projekte\20230082_darpa_criticalmaas_ta3\bearbeitung\github\beak-ta3\src\beak\data\GEOPHYSICS_MAGNETOTELLURICS\PROCESSED\CONUS_2023\EPSG_4326_RES_0_025\CLIPP

# 4: Run SOM 

Run the SOM with the parameters specified above and save the results. The NxtSomCore package does the actual work.

Before running the SOM, result files from previous runs are cleaned up (removed, or optionally moved to a subfolder).

In [8]:
import beak.methods.som.do_nextsomcore_save_results as dnsr
import beak.methods.som.move_to_subfolder as mts
import warnings

# Clean up SOM output files left over from previous runs.
# Alternative that keeps them in a subfolder:
#   mts.move_som_results(args.output_folder, "old_results")
mts.remove_som_results(args.output_folder)

# Run the SOM with the settings collected in `args`; every warning raised
# during the run is silenced for the duration of this block
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    dnsr.run_SOM(args)
    

Load data
    Execution time: 0.8900392055511475 seconds
Run SOM
    Execution time: 43.31419610977173 seconds
Clustering progress:
0.00%
9.68%
19.35%
29.03%
38.71%
48.39%
58.06%
67.74%
77.42%
87.10%
96.77%
100% Clustering completed.
    Execution time: 21.632780075073242 seconds
Save geo space results
         combine data colums for output geo file (for large data arrays memory usage might be a concern)
         savetxt
    Execution time: 59.918529748916626 seconds
Save SOM space results
    Execution time: 1.1636793613433838 seconds
Save SOM object to file
    Execution time: 0.006000041961669922 seconds
Write GeoTIFF file
     read_csv som_data
     read_csv geo_data
     Iterate over each geoTIF file:
          cluster
          b_US_IsostaticGravity_WGS84
          b_Gravity_Up30km_HGM
          b_Gravity_Bouguer_UpCont30km_HGM_Worms_Proximity
          b_CONUS_MT2023_9km_cog
          b_CONUS_MT2023_15km_cog
          b_CONUS_MT2023_30km_cog
          b_MagRTP
          b_MagRT

In [9]:
import beak.methods.som.plot_som_results as plot
from IPython.display import Image, display, clear_output

# Load the cluster dictionary from the SOM output folder
loaded_cluster_list = plot.load_cluster_dictionary(args.output_folder)

# Plot and save the Davies-Bouldin index against the number of clusters
plot.plot_davies_bouldin(loaded_cluster_list, args.output_folder)

# 5: Plot results.

Specify the parameters used to plot the results and create the figures. The Python script "plot_som_results.py" creates .png files of the results in SOM space and geospace, and also creates boxplots.

Move figures into a sub folder. If the destination folder does not exist, it is created here. All file names are stored in a list that is used in the next step to show all output figures.

In [10]:
import beak.methods.som.argsPlot
import beak.methods.som.plot_som_results as plot
import beak.methods.som.move_to_subfolder as mts

# Plot settings; values shared with the SOM run are taken from `args`
# so that training and plotting cannot drift apart
argsP = beak.methods.som.argsPlot.Args()

argsP.outsomfile = args.output_file_somspace            # SOM-space result text file of the SOM run
argsP.outgeofile = args.output_file_geospace            # geospace result text file of the SOM run
argsP.som_x = args.som_x                                # SOM x dimension
argsP.som_y = args.som_y                                # SOM y dimension
argsP.input_file = args.input_file                      # input file (*.lrn)
argsP.dir = args.output_folder                          # input file (*.lrn) or directory the som.dictionary was saved to (/output/som.dictionary)
argsP.grid_type = args.gridtype                         # grid type (rectangular or hexagonal), reused from the SOM settings
argsP.redraw = 'true'                                   # true: draw all plots. false: draw only the plots required for clustering.
argsP.dataType = 'grid'                                 # data type (scatter or grid)
# NOTE(review): the nodata value is a string with a leading space, and the captured run
# log shows a warning at `z[z == noDataValue] = np.nan` — confirm the expected type/format.
argsP.noDataValue= ' -9999'                             # nodata value

plot.run_plotting_script(argsP)

# Collect the generated figures in a subfolder; the returned image paths and
# labels are used afterwards to display the figures
subfolder_name = "plots"
images, labels = mts.move_figures(args.output_folder, subfolder_name)

Setup figures
        Read som data execution time: 0.046001434326171875 seconds
        Read geo data execution time: 7.537466049194336 seconds
    Execution time: 7.59146785736084 seconds
Plot geo space results
    geospace plot no. 2 from 16

  z[z == noDataValue] = np.nan


    geospace plot no. 16 from 16
    q-error plot
    Execution time: 41.866708517074585 seconds
Plot Cluster result SOM space
    Plot Davies Bouldin index
    Plot cluster hit count
    Execution time: 1.5974392890930176 seconds
Plot SOM space results
    somspace plot no. 14 from 14
    Execution time: 6.7386438846588135 seconds
Plot Boxplots
    boxplot no. 14 from 14
    Execution time: 15.42821455001831 seconds


Show all figures that were produced by "plot_som_results.py" and moved into a subfolder.

Boxplot:
> x axis is Cluster ID <br>
> Z axis is original data value <br>
> legend: number of data points in each cluster <br>

In [11]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
import ipyplot

# Close all figures Matplotlib still holds open
plt.close('all')

# Clear the cell output, waiting until new output is available to replace it
clear_output(wait=True)

# Tab name of each figure: the second-to-last "_"-separated token of its label
tabs = [label.split('_')[-2] for label in labels]

# Show the figures grouped into tabs, then an overview keyed by label
ipyplot.plot_class_tabs(images, tabs, max_imgs_per_tab=50, img_width=400)
ipyplot.plot_class_representations(images, labels, img_width=200, show_url=False)
