In [1]:
%matplotlib qt5

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

import glob

# handling XRF data and datatypes
import h5py # interface to HDF5 data format and allows manipulation using numpy
import hyperspy.api as hs # data analysis of multidimensional datasets for analytical procedure

#Dimension reduction and clustering
import umap
import hdbscan
import sklearn as skl
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import sklearn.cluster as cluster
# from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Convert XRF data to Hyperspy standard

In [2]:
!cd '/Users/user/Documents/Projects/XRF_machine_learning/data'
!ls '/Users/user/Documents/Projects/XRF_machine_learning/data'
!pwd

[31mISE_500sqaures_A21-016_Map1_001.h5[m[m
[31mISE_500sqaures_A21_054_botom_right_map_center_001.h5[m[m
map.hspy
map1.hspy
/Users/user/Documents/GitHub/melt_maps


In [3]:
def h5printR(item, leading = ''):
    for key in item:
        if isinstance(item[key], h5py.Dataset):
            print(leading + key + ': ' + str(item[key].shape))
        else:
            print(leading + key)
            h5printR(item[key], leading + '  ')

# Print structure of a `.h5` file            
def h5print(filename):
    with h5py.File(filename, 'r') as h:
        print(filename)
        h5printR(h, '  ')

In [4]:
xrf_data = h5py.File('/Users/user/Documents/Projects/XRF_machine_learning/data/ISE_500sqaures_A21_054_botom_right_map_center_001.h5','r')
h5print('/Users/user/Documents/Projects/XRF_machine_learning/data/ISE_500sqaures_A21_054_botom_right_map_center_001.h5')

/Users/user/Documents/Projects/XRF_machine_learning/data/ISE_500sqaures_A21_054_botom_right_map_center_001.h5
  xrmmap
    areas
      A21-054_Br_Xanes_spot: (201, 201)
      A21-054_Br_Xanes_spot_2: (201, 201)
      A21-054_I_Xanes_spot: (201, 201)
      A21-054_I_Xanes_spot_2: (201, 201)
      area_003: (201, 201)
      area_004: (201, 201)
    config
      environ
        address: (65,)
        name: (65,)
        value: (65,)
      general
        basedir: ()
        envfile: ()
      mca_calib
        offset: (7,)
        quad: (7,)
        slope: (7,)
      mca_settings
      motor_controller
        group: ()
        host: ()
        mode: ()
        passwd: ()
        positioners: ()
        type: ()
        user: ()
      notes
      positioners
        13IDE:En:Energy: ()
        13IDE:m19: ()
        13IDE:m25: ()
        13IDE:m28: ()
        13IDE:m31: ()
        13IDE:m32: ()
        13IDE:m34: ()
        13IDE:m35: ()
        13IDE:m36: ()
        13IDE:m39: ()
        1

In [97]:
# investigates h5 file
#row_1 = xrf_data['xrmmap']['mcasum']['counts']
#row_1.shape

(201, 201, 4096)

## Physical parameters

these are needed to scale the data and pixelscorrectly etc.

In [5]:
for att in xrf_data['xrmmap']['mcasum']['counts'].attrs:
    print(att,xrf_data['xrmmap']['mcasum']['counts'].attrs[att])

In [6]:
xrf_data['xrmmap']['mcasum']['counts'].shape

(201, 201, 4096)

In [8]:
pix_x, pix_y, num_ch = xrf_data['xrmmap']['mcasum']['counts'].shape

print(pix_x, pix_y, num_ch)

201 201 4096


In [119]:
pix_x_size= pix_x - 1

pix_y_size= pix_x - 1

print(pix_x_size,pix_y_size)

200 200


# Map and axes management

In [7]:
xrf_map = hs.signals.Signal1D(xrf_data['xrmmap']['mcasum']['counts'])
xrf_map
xrf_map.axes_manager

Navigation axis name,size,index,offset,scale,units
,201,0,0.0,1.0,
,201,0,0.0,1.0,

Signal axis name,size,Unnamed: 2,offset,scale,units
,4096,,0.0,1.0,


In [8]:
dict0 = {'name': 'X', 'offset': 1, 'scale': 1, 'size': 500} # x axis
dict1 = {'name': 'Y', 'offset': 1, 'scale': 1, 'size': 500} # y axis
dict2 = {'name': 'energy', 'offset': 300, 'scale': 1, 'size': 500} # energy axis

xrf_map = hs.signals.Signal1D(xrf_data['xrmmap']['mcasum']['counts'], axes = [dict0][dict1][dict2])


#xrf_map.crop(axis=0,start=1)
#xrf_map.axes_manager[0].name = 'X'
#xrf_map.axes_manager['X'].units = '\u03BCm'

#xrf_map.crop(axis=1,start=1)
#xrf_map.axes_manager[1].name = 'Y'
#xrf_map.axes_manager['X'].units = '\u03BCm'

#xrf_map.axes_manager[2].name= 'Energy'
#xrf_map.axes_manager['Energy'].units = 'kev'

#xrf_map = hs.signals.Signal1D(xrf_data['xrmmap']['mcasum']['counts'], axes = xrf_map.axes_manager)

TypeError: list indices must be integers or slices, not dict

In [50]:
save_path='/Users/user/Documents/Projects/XRF_machine_learning/data/'

In [60]:
xrf_map.save(save_path+'map'.format(map))
xrf_map.plot()

Overwrite '/Users/user/Documents/Projects/XRF_machine_learning/data/map.hspy' (y/n)?
y


In [52]:
plt.close('all')

# working...

## Dimensions of the map
also sets save paths...

In [54]:
xrf_map.change_dtype('float32')
xrf_map.save(save_path+'map1')

xrf_stack = xrf_map

In [56]:
xrf_stack

<Signal1D, title: , dimensions: (201, 201|4096)>

In [57]:
xrf_stack.plot()

In [83]:
xrf_stack.change_dtype('float32')


xrf_stack.save(save_path+'at16_map2_002_mapped_crop')

Overwrite '/Users/joshuashea/melt_mapsat16_map2_002_mapped_crop.hspy' (y/n)?
y


In [84]:
plt.close('all')

In [127]:

xrf_stack.decomposition(normalize_poissonian_noise=False, algorithm="sklearn_pca", output_dimension=20)

xrf_stack.plot_explained_variance_ratio(log=True, vline=True)

xrf_stack.plot_decomposition_results()

Decomposition info:
  normalize_poissonian_noise=False
  algorithm=sklearn_pca
  output_dimension=20
  centre=None
scikit-learn estimator:
PCA(n_components=20)


VBox(children=(HBox(children=(Label(value='Decomposition component index', layout=Layout(width='15%')), IntSli…

In [128]:
xrf_stack.plot_cumulative_explained_variance_ratio()

<AxesSubplot:xlabel='Principal component', ylabel='Cumulative explained variance ratio'>

**note** iterative cropping above showed that most data/ analysis was better when cropping down to the shape (157, 63|2001)

In [118]:
xrf_stack.save(save_path+'lisheen_low_res_map_data_Crop')

## re-explore

get 6 good factors... need to label peaks etc but starting to pull out data.



In [120]:
xrf_stack.decomposition(normalize_poissonian_noise=False, algorithm='NMF', output_dimension=5)


xrf_stack.plot_decomposition_results()



Decomposition info:
  normalize_poissonian_noise=False
  algorithm=NMF
  output_dimension=5
  centre=None
scikit-learn estimator:
NMF(n_components=5)




VBox(children=(HBox(children=(Label(value='Decomposition component index', layout=Layout(width='15%')), IntSli…

In [None]:
xrf_stack.decomposition(normalize_poissonian_noise=True, algorithm='NMF', output_dimension=5)


xrf_stack.plot_decomposition_results()

## examine with factor analysis

In [122]:
pipeline = Pipeline([("FA", FactorAnalysis(n_components=5,rotation = 'varimax'))])

xrf_stack.decomposition(normalize_poissonian_noise=False, algorithm=pipeline, return_info=True,output_dimension=11)

xrf_stack.plot_decomposition_results()

Decomposition info:
  normalize_poissonian_noise=False
  algorithm=Pipeline(steps=[('FA', FactorAnalysis(n_components=5, rotation='varimax'))])
  output_dimension=11
  centre=None
scikit-learn estimator:
Pipeline(steps=[('FA', FactorAnalysis(n_components=5, rotation='varimax'))])


VBox(children=(HBox(children=(Label(value='Decomposition component index', layout=Layout(width='15%')), IntSli…

In [18]:
xrf_stack.plot_decomposition_results()

VBox(children=(HBox(children=(Label(value='Decomposition component index', layout=Layout(width='15%')), IntSli…

In [123]:
fa_load= xrf_stack.get_decomposition_loadings()
#fa_load.save('snv_eds_analysis/plage_TD_eds_FA_4_load')

fa_fact= xrf_stack.get_decomposition_factors()
#fa_fact.set_elements(elements)


#fa_fact.save('snv_eds_analysis/plage_TD_eds_FA_4_fact')

In [None]:
fa_facts, ydim,xdim=fa_load.data.shape

fact_load_vect= pd.DataFrame((fa_load.data.reshape(fa_facts, ydim*xdim).T), columns = ['Factor 1','Factor 2','Factor 3','Factor 4'])


In [126]:
fact_load_vect

NameError: name 'fact_load_vect' is not defined

In [None]:
fa_facts, ydim,xdim=fa_load.data.shape
fa_vect=fa_load.data.reshape(fa_facts,ydim*xdim).transpose()
fa_vect.shape

In [None]:
loadings_to_cluster=fa_vect[:,:fa_facts]
loadings_to_cluster.shape

In [None]:
class PGK(Probabilistic, GustafsonKesselMixin):
    pass

In [None]:
num_clus=10
pgk = PGK(n_clusters =num_clus, n_init=10).fit(loadings_to_cluster)
# Process results for visualisation
print(pgk.memberships_)
labels_ = np.argmax(pgk.memberships_, axis=1)
memberships_ = pgk.memberships_[range(len(pgk.memberships_)), labels_] 

labels = labels_.reshape([ydim,xdim])
labels=hs.signals.Signal2D(labels)
labels.plot(cmap='tab10')

mems=pgk.memberships_.reshape(ydim,xdim,num_clus)
print(mems.shape)

mem_maps=hs.signals.Signal2D(mems)
mem_maps=mem_maps.transpose(signal_axes=(2,0))
mem_maps.change_dtype('float32')
mem_maps.plot(cmap='viridis')

In [None]:
plt.close('all')

mem_maps.plot(cmap='viridis')

## next steps

1. makes sense to calcualte an elbow test or examine the BIC to determine the 'optimal' number of clusters
2. explore applying HDBSCAN on the FA/ PCA factors
3. would also to explore the UMAP pretreatment then HDBSCAN
4. maybe blind source separation (ie ICA) after PCA.

for 2 and 3 need a different virtual enviroment.