# GET AND PREPARE DATA FOR LOCALIZED-POD-DL-ROM
This code performs the clustering needed to build the n_clusters POD bases used by the localized POD-DL-ROM algorithm.
Note that this algorithm reduces to the standard POD-DL-ROM when n_clusters = 1. The POD-DL-ROM architecture is explained here: https://www.sciencedirect.com/science/article/pii/S0045782521005120. The clustering part can also be used on its own to study the classifier. Run the blocks in the order indicated by their numbers. Where two alternative routes exist, the blocks are marked a or b: follow only one of them!

In [None]:
#1
import numpy as np
import matplotlib.pyplot as plt
import time
import scipy.interpolate
import os
import csv
import copy
import pandas as pd
from scipy.interpolate import griddata
from numpy import savetxt
import sklearn.utils.extmath
import progressbar
from fcmeans import FCM
from time import sleep

# FUNCTIONS

In [None]:
#2
def get_field(path,simulations,seconds,mesh,field):
    """Stack one field from every (simulation, time) pair into a matrix.

    For each entry ``s`` of ``simulations`` and each entry ``t`` of
    ``seconds`` the file ``<path>/sim_<s>/<t>/<field>`` is parsed with
    ``get_data`` and stored as one row. Rows are ordered simulation-major:
    all times of the first simulation, then all times of the second, etc.

    Parameters
    ----------
    path : str
        Root directory containing the ``sim_<s>`` folders.
    simulations : sequence
        Simulation identifiers (converted with ``str``).
    seconds : sequence
        Names of the time folders (converted with ``str``).
    mesh : int
        Number of values per snapshot, i.e. number of columns.
    field : str
        File name of the field to read.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(simulations) * len(seconds), mesh)``.
    """
    snapshots = np.zeros((len(simulations) * len(seconds), mesh))
    progress = progressbar.ProgressBar(
        maxval=len(simulations),
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()],
    )
    progress.start()
    next_row = 0
    for done, sim in enumerate(simulations, start=1):
        progress.update(done)
        for instant in seconds:
            field_file = '/'.join([path, 'sim_' + str(sim), str(instant), field])
            snapshots[next_row, :] = get_data(field_file, mesh)
            next_row += 1
        # short pause after every simulation, as in the original loop
        sleep(0.1)
    progress.finish()
    return snapshots

def get_data(path,mesh):
    """Read the values listed between a "(" line and a ")" line of a text file.

    Lines are scanned in order. Reading starts after the first line whose
    first character is "(" and stops at the first line whose first character
    is ")". Every line in between is converted with ``float``.

    Parameters
    ----------
    path : str
        File to read.
    mesh : int
        Number of values the file is expected to contain.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``mesh``.

    Raises
    ------
    ValueError
        If a line inside the list is not a single number, or if the number
        of values found differs from ``mesh`` (previously a short file
        silently produced an array padded with ``None`` and a long file
        failed with an unexplained IndexError).
    """
    values = []
    inside_list = False
    with open(path, 'r') as file:
        for line in file:
            if line[0] == ')':
                break
            if inside_list:
                values.append(float(line))
            if line[0] == "(":
                inside_list = True
    if len(values) != mesh:
        raise ValueError(
            'expected ' + str(mesh) + ' values in ' + str(path)
            + ', found ' + str(len(values))
        )
    return np.array(values, dtype=float)

def read_files_centers(file_path):
    """Return the lines found between the first "(" line and the first ")" line.

    Paths whose last character is ``'U'`` are delegated to ``read_U``
    (not defined in the visible part of this file). For every other path the
    file is split on newlines and the entries strictly between the first
    line equal to "(" and the first line equal to ")" are returned.

    Parameters
    ----------
    file_path : str
        File to read.

    Returns
    -------
    numpy.ndarray or list
        Array of strings, one per line of the list; an empty list when the
        file contains no "(" line. For ``'U'`` paths, whatever ``read_U``
        returns.
    """
    with open(file_path, 'r') as handle:
        if file_path[-1] == 'U':
            return read_U(file_path)
        rows = np.array(handle.read().split("\n"))
        opening = np.flatnonzero(rows == "(")
        closing = np.flatnonzero(rows == ")")
        if opening.size == 0:
            return []
        return np.array(rows[opening[0] + 1:closing[0]])

def M_matrix(simulations,time,parameters,path):
    """Build the parameter matrix: one row per (simulation, time) pair.

    The first ``simulations`` rows of ``<path>/X_LHS_Uniform.csv`` are read
    (no header row is skipped). Each CSV row is repeated once per entry of
    ``time``, with the time value placed in the first column. Rows are
    ordered simulation-major.

    Parameters
    ----------
    simulations : int
        Number of CSV rows (simulations) to use.
    time : sequence of numbers
        Time values.
    parameters : int
        Number of columns of the result: 1 (time) + number of CSV columns.
    path : str
        Directory containing ``X_LHS_Uniform.csv``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(simulations * len(time), parameters)``. If the CSV
        holds fewer than ``simulations`` rows, the trailing rows stay zero.

    Raises
    ------
    ValueError
        If a CSV entry is not a number or a row does not have
        ``parameters - 1`` columns.
    """
    # the context manager closes the file even when reading raises
    with open(path+'/X_LHS_Uniform.csv', newline='') as file:
        reader = csv.reader(file)
        rows = [row for _, row in zip(range(simulations), reader)]
    M = np.zeros((simulations*len(time),parameters))
    count = 0
    for row in rows:
        # csv yields strings: convert explicitly instead of relying on a
        # string/float concatenate followed by an implicit cast
        values = [float(entry) for entry in row]
        for t in time:
            M[count,:] = [t] + values
            count += 1
    return M

def normalize_M(M,path):
    """Min-max normalize every column of ``M`` and save the column extrema.

    The per-column maxima and minima are written to
    ``<path>/normalization.csv`` (columns ``max_par`` and ``min_par``, with
    the pandas index). The path is built with ``os.path.join`` so the file
    lands inside ``path`` whether or not ``path`` ends with a separator;
    this is the same file ``normalize_Input`` later reads.

    Parameters
    ----------
    M : numpy.ndarray
        2-D array, one column per feature.
    path : str
        Directory in which ``normalization.csv`` is written.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``M`` holding
        ``(M - min) / (max - min)`` column by column. A constant column
        gives a zero range and therefore non-finite values.
    """
    M = np.asarray(M)
    M_max = np.amax(M, axis=0)
    M_min = np.amin(M, axis=0)
    maxmin = {'max_par':M_max,'min_par':M_min}
    df = pd.DataFrame(maxmin,dtype = 'float')
    df.to_csv(os.path.join(path, 'normalization.csv'))
    # broadcasting applies the column-wise extrema to every row at once
    return np.asarray((M - M_min)/(M_max - M_min), dtype=float)

def normalize_S(S):
    """Min-max normalize ``S`` using its global maximum and minimum.

    The maximum and then the minimum are printed before the rescaled array
    ``(S - min) / (max - min)`` is returned.
    """
    upper = np.max(S)
    lower = -np.max(-S)
    for bound in (upper, lower):
        print(bound)
    return (S - lower) / (upper - lower)

def normalize_Input(Input,path,maxima,minima,n_clusters,labels):
    """Min-max normalize each row with the extrema of its cluster and log them.

    Row ``k`` of ``Input`` is rescaled with ``minima['V_<label>']`` and
    ``maxima['V_<label>']`` where ``<label>`` is ``labels[k]``. The extrema
    of clusters ``0 .. n_clusters-1`` are then appended to
    ``<path>/normalization.csv`` as columns ``max V_<i>`` and ``min V_<i>``
    (the file must already exist).

    Parameters
    ----------
    Input : numpy.ndarray
        2-D array of coefficients, one row per snapshot.
    path : str
        Directory containing ``normalization.csv``.
    maxima, minima : dict
        Per-cluster extrema keyed by ``'V_<i>'``.
    n_clusters : int
        Number of clusters whose extrema are saved.
    labels : sequence
        Cluster label of every row of ``Input``.

    Returns
    -------
    numpy.ndarray
        Normalized coefficients, same shape as ``Input``.
    """
    Input_norm = np.zeros((np.shape(Input)[0], np.shape(Input)[1]))
    for count, coefficients in enumerate(Input):
        key = 'V_'+str(labels[count])
        Input_norm[count,:] = (coefficients - minima[key])/(maxima[key] - minima[key])
    csv_path = os.path.join(path, 'normalization.csv')
    df = pd.read_csv(csv_path)
    for i in range(n_clusters):
        df['max V_'+str(i)] = maxima['V_'+str(i)]
        df['min V_'+str(i)] = minima['V_'+str(i)]
    if n_clusters > 0:
        # one write after all columns are added, instead of rewriting the
        # whole file twice per cluster
        df.to_csv(csv_path, index=False)
    return Input_norm

def variance_simulations(S,t,seconds,sim):
    """Standard deviation, across simulations, of the rows taken at one time.

    Row ``(t - 2) + k * seconds`` of ``S`` is picked for
    ``k = 0 .. sim-1`` and the column-wise standard deviation of the picked
    rows is returned.

    Parameters
    ----------
    S : numpy.ndarray
        2-D array.
    t : int
        Time value; the row offset inside each group is ``t - 2``.
    seconds : int
        Number of rows per simulation (stride between picked rows).
    sim : int
        Number of simulations (rows picked).

    Returns
    -------
    numpy.ndarray
        1-D array with one standard deviation per column of ``S``.
    """
    _, n_columns = np.shape(S)
    picked_rows = (t - 2) + seconds * np.arange(sim)
    selection = np.zeros((sim, n_columns))
    selection[...] = S[picked_rows, :]
    return np.std(selection, axis=0)
    

def formatNumber(num):
    """Return a list copy of ``num`` with whole-valued entries cast to ``int``.

    Entries with a fractional part are kept unchanged.
    """
    return [int(value) if value % 1 == 0 else value for value in num]

def interpolate(x,y,z,step,method = 'cubic'):
    """Interpolate scattered values onto a regular grid and blank two regions.

    The grid spans ``[0, 0.0006)`` in x and ``[0, 0.0035)`` in y with spacing
    ``step``. After interpolation with ``scipy.interpolate.griddata`` the
    grid points inside two hard-coded rectangular regions are set to NaN
    (presumably the parts of the bounding box outside the physical domain).

    Parameters
    ----------
    x, y : array-like
        Coordinates of the scattered points.
    z : array-like
        Values at the scattered points.
    step : float
        Grid spacing in both directions.
    method : str, optional
        Interpolation method passed to ``griddata``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xi, yi, zi)``: the two mesh-grid coordinate arrays and the
        interpolated values.
    """
    grid_x, grid_y = np.meshgrid(np.arange(0,0.0006,step), np.arange(0,0.0035,step))
    first_region = (grid_x > 0.0001) & (grid_y < 0.00118)
    second_region = (grid_x > 7.9e-5) & (grid_y < 0.001784) & (grid_y > 0.001604)
    blanked = first_region | second_region
    values = griddata((x,y), z, (grid_x,grid_y), method=method)
    values[blanked] = np.nan
    return grid_x, grid_y, values

# DATA ACQUISITION AND PREPARATION OF INPUT MATRICES
You can follow either direction a or direction b. In the first case you load the blood-clot data; in the second case you use whatever data you have already prepared. The matrices S and M are assembled according to the paper cited at the beginning. Note that here we use the transpose of the matrices defined in the paper, since this layout is easier to handle in Python.

In [None]:
#3a
# Settings that drive the acquisition and preparation of the data.
specie = 'vWFs'   # field to read; also used to build the output paths
N_cluster = 5     # size of the POD basis used for clustering
n_clusters = 3    # number of clusters requested from the clustering step
n_param = 6       # number of columns of M, including time
mesh = 68650      # mesh size (values per snapshot)

# Simulations and time instants to load.
which_sim = np.arange(1, 101)
range_time = np.arange(2, 42)
rang_time = formatNumber(range_time)

print('Simulation taken = ' + str(which_sim))
print('Number of simulation taken = ' + str(which_sim.size))
print('Time interval = ' + str(range_time))
print('Length simulation = ' + str(range_time.size))

In [None]:
#4a
# Load the snapshots of the chosen field, the parameter matrix (time in the
# first column) and the mesh cell centers.
# Check file names and paths: adapt them to where your files are!

S = get_field('../DoE', which_sim, rang_time, mesh, specie)
M = M_matrix(len(which_sim), rang_time, n_param, '../')
cell_centers = read_files_centers('../cellCenters')
# Each entry is a string whose first and last characters are dropped
# (presumably the brackets of "(x y z)") before splitting it into floats.
cell_true = np.array([
    np.array(entry[1:-1].split()).astype(np.float64) for entry in cell_centers
])
print('Shape S = ' + str(np.shape(S)))
print('Shape M = ' + str(np.shape(M)))

In [None]:
#5a
# Create the output directory for the clustering/classification data.
# The path embeds the field name (specie) and the clustering basis size
# (N_cluster); exist_ok=True makes the cell safe to re-run.
path_cluster = '../'+specie+'/CLASSIFICATION/MODELS/'+str(N_cluster)+'/Data/'
os.makedirs(path_cluster,exist_ok=True)

In [None]:
#3b
# Use only if you have NOT used direction a!
# This cell is a template: fill in the right-hand sides before running it.

# Define S and M with the same layout as in direction a
S = # snapshot matrix: one row per snapshot, one column per mesh value
M = # parameter matrix: one row per snapshot, time in the first column

In [None]:
#4b #use only if you have not used direction a!

In [None]:
#5b
# Use only if you have NOT used direction a!

# Create the output directory for the clustering data.
path_cluster = 'whatever you want'
# A single call with exist_ok=True is enough and keeps the cell re-runnable:
# a plain os.makedirs(path_cluster) raises FileExistsError as soon as the
# directory already exists.
os.makedirs(path_cluster,exist_ok=True)

# CLUSTERING SECTION

In [None]:
#6
# Compute a POD basis of S by randomized SVD. The basis size is N_cluster;
# keeping it small is recommended.

_,s,v = sklearn.utils.extmath.randomized_svd(S,N_cluster,random_state = 8) # fixed random_state for reproducibility

V_transp = v # basis matrix; applied as V_transp @ snapshot in cell #7
Input = np.zeros((np.shape(S)[0],N_cluster)) # one row of N_cluster coefficients per snapshot, filled in cell #7
print('Shape V transposed = '+ str(np.shape(V_transp)))
print('Shape Input coefficients = '+ str(np.shape(Input)))



In [None]:
#7
# Project every snapshot (row of S) onto the basis and store the resulting
# coefficients in the matching row of Input.

for row_index, snapshot in enumerate(S):
    Input[row_index, :] = V_transp @ snapshot
print('Shape Input coefficients = ' + str(Input.shape))

In [None]:
#8
# Rescale the coefficients of every snapshot to [0, 1] using the minimum and
# maximum of that snapshot's own coefficients (row-wise min-max).

row_min = np.amin(Input, axis=1, keepdims=True)
row_max = np.amax(Input, axis=1, keepdims=True)
Input_norm = np.zeros((np.shape(S)[0], N_cluster))
Input_norm[:, :] = (Input - row_min) / (row_max - row_min)
Input = Input_norm.copy()


In [None]:
#9
# Save the normalized clustering coefficients (one row per snapshot) as CSV.
savetxt(path_cluster+'/coefficients.csv', Input, delimiter=',')

In [None]:
#10
# Cluster the normalized coefficients with fuzzy c-means.

X = Input # kept under the name X: a later cell reads it again
fcm = FCM(n_clusters=n_clusters)
fcm.fit(X)
fcm_centers = fcm.centers # cluster centers in the coefficient space
fcm_labels = fcm.predict(X) # one label per row of X, i.e. per snapshot

In [None]:
#11
# Split parameters and cluster labels into training and test sets for the
# classifier: the first 3800 rows are used for training, the last 200 for
# testing.
# NOTE(review): the sizes are hard-coded for 4000 snapshots (100 simulations
# x 40 time steps, as configured in cell #3a); adapt them for other data.

param_train, labels_train = copy.copy(M[:3800]), copy.copy(fcm_labels[:3800])
param_test, labels_test = copy.copy(M[-200:]), copy.copy(fcm_labels[-200:])
print('Shape param_train = ' + str(np.shape(param_train)))
print('Shape param_test = ' + str(np.shape(param_test)))
print('Shape labels_train = ' + str(np.shape(labels_train)))
print('Shape labels_test = ' + str(np.shape(labels_test)))

In [None]:
#12
# Min-max normalize the training parameters column by column; normalize_M
# also writes the per-column maxima and minima to normalization.csv.
# Only the training parameters are normalized here: param_test is saved
# as-is in cell #14.

param_train = normalize_M(param_train,path_cluster)

In [None]:
#13
# Save the training data for the classifier: normalized parameters and the
# matching cluster labels.

savetxt(path_cluster+'/param.csv', param_train, delimiter=',')
savetxt(path_cluster+'/labels.csv', labels_train, delimiter=',')

In [None]:
#14
# Save the test data for the classifier: parameters (not normalized in this
# notebook) and the matching cluster labels.

savetxt(path_cluster+'/param_test.csv', param_test, delimiter=',')
savetxt(path_cluster+'/label_test.csv', labels_test, delimiter=',')

END OF CLUSTERING SECTION. NOW YOU SHOULD RUN THE CLASSIFIER TRAINING IN ../code_CLASSIFICATION/main_training.py. After that you will have a trained classifier. You can then proceed to the next section to prepare the data for the POD-DL-ROM based on clustering.


# CREATE THE N_CLUSTERS POD BASIS BASED ON FCM_LABELS

In [None]:
#15a
# Create the output directory for POD-DL-ROM if you use the blood-clot data
# (direction a). N is the size of each cluster's POD basis (see cell #18).
N = 64
path_pod_dl_rom = '../'+specie+'/MODELS/'+str(N)+'/Data/'
os.makedirs(path_pod_dl_rom,exist_ok=True)

In [None]:
#15b
# Create the output directory for POD-DL-ROM if you do NOT use the
# blood-clot data (direction b).
# NOTE(review): N (POD basis size, set in cell #15a) is not defined in this
# cell but is needed by the following cells - define it here as well.

path_pod_dl_rom = 'whatever you want'
os.makedirs(path_pod_dl_rom,exist_ok=True)

In [None]:
#16
# Split snapshots (not coefficients!) and parameter vectors into training
# and test sets: first 3800 rows for training, last 200 rows for testing.
# NOTE(review): the sizes are hard-coded and must match the split of the
# labels made in cell #11.

S_train, S_test = copy.copy(S[:3800]), copy.copy(S[-200:])
M_train, M_test = copy.copy(M[:3800]), copy.copy(M[-200:])
print('Shape S_train = ' + str(np.shape(S_train)))
print('Shape S_test = ' + str(np.shape(S_test)))
print('Shape M_train = ' + str(np.shape(M_train)))
print('Shape M_test = ' + str(np.shape(M_test)))

In [None]:
#17
# Group the training snapshots by cluster: clusters['V_<i>'] collects every
# snapshot whose label (labels_train, from the clustering) is i.

clusters = {'V_' + str(i): [] for i in range(n_clusters)}
for position, snapshot in enumerate(S_train):
    clusters['V_' + str(labels_train[position])].append(snapshot)

In [None]:
#18
# Build one POD basis of size N per cluster by randomized SVD of that
# cluster's snapshots. pod_basis['V_<i>'] is the tuple (v, s): the basis
# matrix and the singular values.
# NOTE(review): a cluster that received no snapshot yields an empty array
# here - confirm that every cluster is populated.

pod_basis = {}
for i in range(n_clusters):
    _,s,v = sklearn.utils.extmath.randomized_svd(np.array(clusters['V_'+str(i)]),N,random_state = 8)
    pod_basis['V_'+str(i)] = v,s

In [None]:
#19
# Build the coefficient matrix Input by projecting every training snapshot
# onto the POD basis of its own cluster. At the same time track, for every
# cluster, the largest and smallest coefficient: they are used for the
# normalization.

maxima = {}
minima = {}

# Start from -inf/+inf so the stored values are the true extrema even when
# all the coefficients of a cluster have the same sign (starting from 0
# would cap the minimum at 0 for all-positive data, and likewise the maximum).
for i in range(n_clusters):
    maxima['V_'+str(i)] = -np.inf
    minima['V_'+str(i)] = np.inf

Input = np.zeros((np.shape(S_train)[0],N))
for count,data in enumerate(S_train):
    key = 'V_'+str(labels_train[count])
    projection = np.matmul(pod_basis[key][0],data)
    Input[count,:] = projection
    maxima[key] = max(maxima[key], np.amax(projection))
    minima[key] = min(minima[key], np.amin(projection))

print('Input matrix shape = '+str(np.shape(Input)))

In [None]:
#19 (continued)
# Normalize the data and save the maxima and minima in normalization.csv:
# normalize_M handles the parameter matrix (column-wise extrema),
# normalize_Input the POD coefficients (per-cluster extrema).

M_train = normalize_M(M_train,path_pod_dl_rom)
Input = normalize_Input(Input,path_pod_dl_rom,maxima,minima,n_clusters,labels_train)

In [None]:
#20
# Save the training matrices: normalized POD coefficients and normalized
# parameters.

savetxt(path_pod_dl_rom+'/Input.csv', Input, delimiter=',')
savetxt(path_pod_dl_rom+'/M_train.csv',M_train,delimiter = ',')


In [None]:
#21
# Save the test matrices: raw snapshots and parameters (neither is
# normalized in this notebook).
savetxt(path_pod_dl_rom+'/S_test.csv', S_test, delimiter=',')
savetxt(path_pod_dl_rom+'/M_test.csv', M_test, delimiter=',')


In [None]:
#22
# Save the cluster labels of the test snapshots (computed in cell #11).
savetxt(path_pod_dl_rom+'/label_test.csv', labels_test, delimiter=',')

In [None]:
#23
# Save the POD matrices, one file V_<i>.csv per cluster. Each basis is
# transposed before being written.

for i in range(n_clusters):
    savetxt(path_pod_dl_rom+'/V_'+str(i)+'.csv', pod_basis['V_'+str(i)][0].transpose(), delimiter=',')

Now you should send the directory path_pod_dl_rom to the cluster and run main_training.py of the directory LOCALIZED-POD-DL-ROM/code_LOC-POD-DL-ROM.

# END OF DATA ACQUISITION, NOW YOU SHOULD CHECK THE DATA

In [None]:
#visualize some modes
# Scatter plot of the first row of the POD basis of three clusters, coloured
# on the cell centers (direction a only: cell_true comes from cell #4a).
fig,ax = plt.subplots(1,3,figsize = (20,10))
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cm = plt.get_cmap('RdYlBu_r')
#ax = fig.gca(projection='3d')
# left to right: clusters 0, 2 and 1
for axis, key in zip(ax, ('V_0', 'V_2', 'V_1')):
    scatter_plot = axis.scatter(cell_true[:,0], cell_true[:,1], c = pod_basis[key][0][0,:], lw=0, s=20,cmap=cm)
    plt.colorbar(scatter_plot, ax=axis)

In [None]:
# Standard deviation across simulations of every clustering coefficient,
# one row per time step, saved next to the other clustering data.
n_times = 40   # time steps per simulation (length of range_time in cell #3a)
n_sims = 100   # number of simulations (length of which_sim in cell #3a)
# One column per coefficient of X; a fixed width of 256 does not match X,
# which has N_cluster columns.
V = np.zeros((n_times, np.shape(X)[1]))
for i in range(n_times):
    # variance_simulations expects the time value, which starts at 2
    V[i,:] = variance_simulations(X,i+2,n_times,n_sims)
# path_cluster is the .../CLASSIFICATION/MODELS/<N_cluster>/Data/ directory;
# the names `path` and `dir_cst` are never defined in this notebook.
savetxt(path_cluster+'/std.csv', V, delimiter=',')

In [None]:
# Cluster label of the first 40 snapshots (with the settings of cell #3a,
# presumably the 40 time steps of the first simulation).
plt.plot(fcm_labels[0:40],'o')


In [None]:
# Display the parameter matrix M for a visual check.
print(M)