In [5]:
# author: LijiongChen
# contact: s15010125@s.upc.edu.cn
# time:19/06/2021
# update:06/11/2021
# file: hv_cluster.ipynb
#       This code is designead to sort the HVSR using keans and plot the result with plotly.

In [6]:
import os
import tkinter as tk
from tkinter import filedialog
from pathlib import Path
from glob import glob
import pandas as pd
import numpy as np
from scipy.cluster.vq import kmeans, vq
import plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
import time
from tqdm import trange

In [7]:
print('Start of the program :)')
# Load HV
root = tk.Tk()
root.withdraw()
Folderpath = filedialog.askdirectory()
print('Folderpath:', Folderpath)
root.destroy()
# find the .hv data
fs = glob(Folderpath + '/*.hv')
# file type ifo in the selected file path
all_files = os.listdir(Folderpath)
type_dict = dict()
for each_file in all_files:
    if os.path.isdir(each_file):
        type_dict.setdefault('Folders', 0)
        type_dict['Folders'] += 1
    else:
        ext = os.path.splitext(each_file)[1]
        type_dict.setdefault(ext, 0)
        type_dict[ext] += 1
for each_type in type_dict.keys():
    print('FolderType: This folder has a total of [%s] file %d ' % (each_type, type_dict[each_type]))
# define the range of Freq
freq = np.geomspace(0.1, 100.1, 400)
# define the min and max Freq you wanted
min_freq = float(input('Min freq from 1Hz to 100Hz: \n'))
max_freq = float(input('Max freq from 1Hz to 100Hz: \n'))
# read the HVSR data
start_time_01 = time.time()
sel = (freq >= min_freq) & (freq <= max_freq)
freq_sel = freq[sel]
data = []
data_sel = np.zeros(shape=(len(fs), len(freq_sel)))
data2 = np.zeros(shape=(len(fs), len(freq)))
print('----------------------Loading...---------------------')
for i in trange(len(fs)):
    time.sleep(0.1)
    # here skiprows=9 is the headers of .hv data
    data.append(pd.read_table(filepath_or_buffer=fs[i], sep='\t', skiprows=9, names=['Freq', 'Aver', 'max', 'min']))
    data2[i] = np.interp(freq, data[i].Freq, data[i].Aver)  # interpolation for the same data dimension
    data_sel[i] = data2[i][sel]
# change numpy array to pandas dataframe
data2 = pd.DataFrame(data2).T
# get the name of dataframe.columns
hvname = [f'hv{n}' for n in range(len(fs))]
# change the dataframe.columns name to what you create
data2.columns = hvname
# change numpy array to pandas dataframe
freq = pd.DataFrame(freq)
# change the dataframe.columns name to what you want
freq.columns = ['freq']
# get the .hv file name
names = []
for i in range(len(fs)):
    names.append(Path(fs[i]).stem)
# define the number which you want to sort
n_clusters = int(input('Please input the number of clusters: from 1 to ' + str(len(fs)) + '\n'))
centroid, _ = kmeans(data_sel, n_clusters)
result1, _ = vq(data_sel, centroid)
result1 = result1.tolist()
# print(result1)
print('The min frequency is ' + str(min_freq) + 'Hz')
print('The max frequency is ' + str(max_freq) + 'Hz')
print('The cluster number is ' + str(n_clusters))
end_time_01 = time.time()
print('Time cost',end_time_01-start_time_01,'s')
print('------------------------Done!------------------------')

Start of the program :)
Folderpath: D:/ProjectMaterials/Chengdu_Line13/Part_01/DataProcess/20211125/B/HV
FolderType: This folder has a total of [.hv] file 72 


  1%|▏         | 1/72 [00:00<00:08,  8.64it/s]

----------------------Loading...---------------------


100%|██████████| 72/72 [00:08<00:00,  8.35it/s]


The min frequency is 5.0Hz
The max frequency is 20.0Hz
The cluster number is 6
Time cost 11.208673238754272 s
------------------------Done!------------------------


In [8]:
# define the plotly subplot
start_time_02 = time.time()
pio.templates.default = "plotly_white"  # set the plotly templates
if n_clusters == 1:
    fig = make_subplots(rows=1, cols=1, print_grid=False)
    count = 0
    for k in range(len(fs)):
        count = count + 1
        fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                      row=1, col=1)
elif n_clusters == 2:
    fig = make_subplots(rows=1, cols=2, print_grid=False)
    count = 0
    for k in range(len(fs)):
        count = count + 1
        if result1[k] == 0:
            fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                          row=1, col=1)
        else:
            fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                          row=1, col=2)
elif n_clusters == 3:
    fig = make_subplots(rows=1, cols=3, print_grid=False)
    count = 0
    for k in range(len(fs)):
        count = count + 1
        if result1[k] == 0:
            fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                          row=1, col=1)
        elif result1[k] == 1:
            fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                          row=1, col=2)
        else:
            fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                          row=1, col=3)
else:
    numRow = 2
    numCol = int(np.ceil(n_clusters / 2))
    fig = make_subplots(rows=numRow, cols=numCol, print_grid=False)
    # plot the result of classification
    for z in range(n_clusters):  # z is the number of classification
        count = 0
        for k in range(len(fs)):
            if result1[k] == z:
                count = count + 1
                if z < numCol:
                    fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                                  row=1, col=z+1)
                else:
                    fig.add_trace(go.Scatter(x=freq['freq'], y=data2[hvname[k]], name=names[k]),
                                  row=2, col=z+1-numCol)
# update the plot frame
fig.update_xaxes(type="log")
fig.update_xaxes(range=[0.3, 2])
fig.update_yaxes(range=[0.0, 7.0], tick0=0.0, dtick=1.0)
fig.update_xaxes(title_text="Frequency (Hz)")
fig.update_yaxes(title_text="Amplitude")
fig.update_layout(title='HV cluster', showlegend=False)
# print the Classification result of HVSR
for z in range(n_clusters):
    print('Group {:.0f} '.format(z + 1))
    for i in range(len(fs)):
        if result1[i] == z:
            print(names[i])
# save the .html file
htmlFileName = Folderpath.split("/")[-1] + '_HV_Cluster' + '.html'
plotly.offline.plot(fig, filename=Folderpath + '/' + htmlFileName)
end_time_02 = time.time()
print('Time cost',end_time_02-start_time_02,'s')
print('------------------------Done!------------------------')


Group 1 
B0-125.B.3
B0-125.E.2
B0-127.5.E.1
B0-127.5.E.2
B2.5-125.B.3
B2.5-125.E.2
B2.5-127.5.E.1
B2.5-127.5.E.2
Group 2 
B0-135.E.2
B0-137.5.E.2
B2.5-135.E.2
B2.5-137.5.E.2
Group 3 
B0-130.B.3
B0-130.E.2
B0-132.5.E.1
B0-132.5.E.2
B2.5-130.B.2
B2.5-130.B.3
B2.5-130.D.5
B2.5-130.E.2
B2.5-132.5.D.1
B2.5-132.5.D.5
B2.5-132.5.E.1
B2.5-132.5.E.2
B5-130.B.1
B5-130.B.2
B5-130.B.5
B5-130.D.5
B5-132.5.B.5
B5-132.5.D.1
B5-132.5.D.5
B5-132.5.E.3
Group 4 
B2.5-125.B.2
B2.5-125.D.5
B2.5-127.5.D.1
B2.5-127.5.D.5
B5-125.B.1
B5-125.B.2
B5-125.B.5
B5-125.D.5
B5-127.5.B.5
B5-127.5.D.1
B5-127.5.D.5
B5-127.5.E.3
Group 5 
B0-135.B.3
B0-137.5.E.1
B2.5-135.B.2
B2.5-135.B.3
B2.5-135.D.5
B2.5-137.5.D.1
B2.5-137.5.D.5
B2.5-137.5.E.1
B5-135.B.1
B5-135.B.2
B5-135.B.5
B5-135.D.5
B5-137.5.B.5
B5-137.5.D.1
B5-137.5.D.5
B5-137.5.E.3
Group 6 
B0-125.B.4
B0-125.D.4
B0-127.5.D.3
B0-127.5.D.4
B0-130.B.4
B0-130.D.4
B0-132.5.D.3
B0-132.5.D.4
B0-135.B.4
B0-135.D.4
B0-137.5.D.3
B0-137.5.D.4
Time cost 0.4857518672943115 s
---