In [1]:
import numpy as np 
import pandas as pd
import os
import re

In [2]:
# Name of the dataset sub-folder to process (keeps its trailing slash because
# the paths below are built by plain string concatenation).
data_folder = "output_31_03_2025/" #"output_19_01_2025/" ; "output_21_03_2025/", "output_31_03_2025/" ; 
# Folder that is scanned for the per-run output folders.
folder_path_outputs = '/app/nse/outputs/' + data_folder
# Folder where features.csv is read from and all result CSV files are written to.
file_path_results = '/app/nse/results/' + data_folder

In [4]:
# Grid dimensions used to reshape the flat value column of each data file
# (see the get_matrix calls in get_moments_from_paths_2d / get_moments_from_paths_3d).
x_size = 128
y_size = 128
z_size = 64
# NOTE(review): the next three values are not referenced in the visible cells;
# presumably the physical extents of the domain along x, y and z — confirm
# units and usage before relying on them.
x_lenght = 2000
y_width = 2000
z_height = 1000
# Only x coordinates strictly greater than x_start are processed, and the
# reported distances are measured from this value (coord_x - x_start).
x_start = 500

In [5]:
def read_data(filename):
    """Load the numeric table stored in a text file, ignoring its header.

    The first three lines of the file are consumed and discarded; everything
    after them is parsed with ``np.loadtxt``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        The parsed numeric content of the file.
    """
    with open(filename, 'r') as handle:
        # Consume the three header lines one by one.
        skipped = 0
        while skipped < 3:
            next(handle)
            skipped += 1

        # Whatever is left in the stream is the numeric payload.
        table = np.loadtxt(handle)

    return table

In [6]:
def get_folder_paths(folder_path, folder_pattern):
    """Return the sub-folders of ``folder_path`` whose name matches a regex.

    Parameters
    ----------
    folder_path : str
        Directory to scan. A trailing path separator is optional.
    folder_pattern : str
        Regular expression applied to each entry name with ``re.search``
        (so it may match anywhere in the name; it is not a shell glob).

    Returns
    -------
    list of str
        Full paths of the matching sub-folders, sorted in ascending order of
        ``os.path.getctime``.
    """
    folder_list = []
    for name in os.listdir(folder_path):
        # Build the path once with os.path.join so the returned value is valid
        # whether or not folder_path ends with a separator.
        full_path = os.path.join(folder_path, name)
        if os.path.isdir(full_path) and re.search(folder_pattern, name):
            folder_list.append(full_path)
    folder_list.sort(key=os.path.getctime)
    return folder_list

In [7]:
def extract_number(filename):
    """Return the integer found between the brackets of a ``C[<n>]-avg-.plt`` name.

    Names that do not contain that pattern yield ``float('inf')``, which places
    them after every numbered file when the value is used as a sort key.
    """
    found = re.search(r'C\[(\d+)\]-avg-\.plt', filename)
    if found is None:
        return float('inf')
    return int(found.group(1))

In [8]:
def sort_filenames(filenames):
    """Return a new list with ``filenames`` ordered by ``extract_number``.

    The input is not modified. Entries with equal keys keep their relative
    order because the sort is stable.
    """
    ordered = list(filenames)
    ordered.sort(key=extract_number)
    return ordered

In [9]:
def get_filenames(file_path, file_pattern):
    """List the regular files in ``file_path`` whose name matches a regex.

    Parameters
    ----------
    file_path : str
        Directory to scan.
    file_pattern : str
        Regular expression applied to each entry name with ``re.search``.

    Returns
    -------
    list of str
        Bare file names (without the directory part), ordered by
        ``sort_filenames``.
    """
    matching = []
    for entry in os.listdir(file_path):
        # Skip anything that is not a regular file before testing the name.
        if not os.path.isfile(os.path.join(file_path, entry)):
            continue
        if re.search(file_pattern, entry):
            matching.append(entry)
    return sort_filenames(matching)

In [10]:
def get_coords_from_np(data, num_coords):
    """Return the sorted unique values of the first ``num_coords`` columns.

    Parameters
    ----------
    data : numpy.ndarray
        Two-dimensional table; it is indexed as ``data[:, i]``.
    num_coords : int
        Number of leading columns to process.

    Returns
    -------
    list of numpy.ndarray
        One array of unique values (as returned by ``np.unique``) per column.
    """
    return [np.unique(data[:, column]) for column in range(num_coords)]

In [11]:
def get_matrix(data, coords_size):
    """Reshape ``data`` to the shape given by ``coords_size``.

    This is a thin wrapper around ``np.reshape``; ``coords_size`` is passed
    through unchanged as the target shape.
    """
    reshaped = np.reshape(data, coords_size)
    return reshaped

In [12]:
def get_moments_from_1d(slice, coord):
    """Weighted mean and standard deviation of ``coord`` using ``slice`` as weights.

    Negative weights are treated as zero. When the remaining weights do not sum
    to zero they are normalised to sum to one; otherwise they are used as they
    are, which yields a mean and standard deviation of zero.

    The input is never modified: the clipping is done on a private copy, so
    views into a larger array can be passed safely. Plain sequences (lists,
    tuples) are accepted as well as arrays.

    Parameters
    ----------
    slice : array-like
        One-dimensional weights, one per entry of ``coord``.
    coord : array-like
        Coordinate values.

    Returns
    -------
    tuple
        ``(mean, std)`` of the weighted distribution.
    """
    # np.array copies by default, leaving the caller's data untouched.
    weights = np.array(slice)
    weights[weights < 0] = 0
    coord = np.asarray(coord)

    total = np.sum(weights)
    if total != 0:
        weights = weights / total

    mean = np.sum(weights * coord)
    variance = np.sum((coord - mean) ** 2 * weights)
    return mean, np.sqrt(variance)

In [13]:
def get_moments_from_2d(slice, coord_1, coord_2):
    """Moments of the two marginal profiles of a 2-D array of weights.

    Negative entries are treated as zero. The array is then summed along each
    axis and the two resulting 1-D profiles are passed to
    ``get_moments_from_1d``:

    * the column sums (one value per column) are paired with ``coord_1``;
    * the row sums (one value per row) are paired with ``coord_2``.

    The input is never modified: the clipping is done on a private copy, so a
    view into a larger array can be passed safely.

    Parameters
    ----------
    slice : array-like
        Two-dimensional weights, indexed as ``[coord_2 index, coord_1 index]``.
    coord_1 : array-like
        Coordinates along the second axis (columns).
    coord_2 : array-like
        Coordinates along the first axis (rows).

    Returns
    -------
    tuple
        ``(mean_1, std_1, mean_2, std_2)``.
    """
    # np.array copies by default, leaving the caller's data untouched.
    weights = np.array(slice)
    weights[weights < 0] = 0

    profile_2 = np.sum(weights, axis=1)  # one value per row
    profile_1 = np.sum(weights, axis=0)  # one value per column

    mean_1, std_1 = get_moments_from_1d(profile_1, coord_1)
    mean_2, std_2 = get_moments_from_1d(profile_2, coord_2)
    return mean_1, std_1, mean_2, std_2

In [14]:
def get_slice_from_3d(data, distance, coord_name):
    """Cut a plane out of a 3-D array at a fixed index along one axis.

    Parameters
    ----------
    data : numpy.ndarray
        Array indexed with three subscripts.
    distance : int
        Index at which the plane is taken.
    coord_name : str
        ``"x"`` fixes the last axis, ``"y"`` the middle axis and ``"z"`` the
        first axis.

    Returns
    -------
    numpy.ndarray or None
        The selected plane, or ``None`` when ``coord_name`` is none of the
        three names above.
    """
    everything = slice(None)
    if coord_name == "x":
        index = (everything, everything, distance)
    elif coord_name == "y":
        index = (everything, distance, everything)
    elif coord_name == "z":
        index = (distance, everything, everything)
    else:
        return None
    return data[index]

In [15]:
def get_slice_from_2d(data, distance, coord_name):
    """Cut a line out of a 2-D array at a fixed index along one axis.

    Parameters
    ----------
    data : numpy.ndarray
        Array indexed with two subscripts.
    distance : int
        Index at which the line is taken.
    coord_name : int
        ``1`` returns column ``distance`` (``data[:, distance]``);
        ``2`` returns row ``distance`` (``data[distance, :]``).

    Returns
    -------
    numpy.ndarray or None
        The selected line, or ``None`` for any other ``coord_name``.
    """
    if coord_name == 1:
        return data[:, distance]
    if coord_name == 2:
        return data[distance, :]
    return None

In [16]:
def get_moments_from_paths_2d(stat_xy_path, stat_xz_path, xy_files, xz_files):
    """Compute y and z moments at every x position from paired XY / XZ files.

    The files are paired by position: ``xy_files[i]`` goes with ``xz_files[i]``.
    For each pair, the last column of both tables is reshaped to
    ``[y_size, x_size]`` and ``[z_size, x_size]`` respectively. For every x
    index whose coordinate is strictly greater than ``x_start``, the
    corresponding column of each grid is reduced with ``get_moments_from_1d``.

    The module-level constants ``x_size``, ``y_size``, ``z_size`` and
    ``x_start`` are read but never assigned here.

    Parameters
    ----------
    stat_xy_path, stat_xz_path : str
        Folder prefixes that are concatenated directly with the file names.
    xy_files, xz_files : list of str
        File names inside the respective folders.

    Returns
    -------
    tuple of lists
        ``(dist_list, mean_y_list, std_y_list, mean_z_list, std_z_list)``, one
        entry per (file pair, x position); distances are ``coord_x - x_start``.
    """
    distances, y_means, y_stds, z_means, z_stds = [], [], [], [], []

    for file_no, xy_name in enumerate(xy_files):
        table_xy = read_data(stat_xy_path + xy_name)
        table_xz = read_data(stat_xz_path + xz_files[file_no])

        # The x coordinates from the XZ file are the ones used below.
        _, coord_y = get_coords_from_np(table_xy, 2)
        coord_x, coord_z = get_coords_from_np(table_xz, 2)

        grid_xy = get_matrix(table_xy[:, -1], [y_size, x_size])  # rows: y, columns: x
        grid_xz = get_matrix(table_xz[:, -1], [z_size, x_size])  # rows: z, columns: x

        # First x index lying strictly beyond x_start.
        first_idx = np.where(coord_x > x_start)[0][0]
        for column in range(first_idx, len(coord_x)):
            distances.append(coord_x[column] - x_start)

            profile_y = get_slice_from_2d(grid_xy, column, 1)
            profile_z = get_slice_from_2d(grid_xz, column, 1)

            mean_y, std_y = get_moments_from_1d(profile_y, coord_y)
            mean_z, std_z = get_moments_from_1d(profile_z, coord_z)

            y_means.append(mean_y)
            y_stds.append(std_y)
            z_means.append(mean_z)
            z_stds.append(std_z)

    return distances, y_means, y_stds, z_means, z_stds

In [17]:
def get_moments_from_paths_3d(stat_path, files):
    """Compute y and z moments at every x position from 3-D data files.

    For each file, the last column of the table is reshaped to
    ``[z_size, y_size, x_size]``. For every x index whose coordinate is
    strictly greater than ``x_start``, the (z, y) plane at that index is
    reduced with ``get_moments_from_2d``.

    The module-level constants ``x_size``, ``y_size``, ``z_size`` and
    ``x_start`` are read but never assigned here.

    Parameters
    ----------
    stat_path : str
        Folder prefix that is concatenated directly with each file name.
    files : list of str
        File names inside ``stat_path``.

    Returns
    -------
    tuple of lists
        ``(dist_list, mean_y_list, std_y_list, mean_z_list, std_z_list)``, one
        entry per (file, x position); distances are ``coord_x - x_start``.
    """
    distances, y_means, y_stds, z_means, z_stds = [], [], [], [], []

    for name in files:
        table = read_data(stat_path + name)

        coord_x, coord_y, coord_z = get_coords_from_np(table, 3)

        volume = get_matrix(table[:, -1], [z_size, y_size, x_size])  # axes: z, y, x

        # First x index lying strictly beyond x_start.
        first_idx = np.where(coord_x > x_start)[0][0]
        for x_idx in range(first_idx, len(coord_x)):
            distances.append(coord_x[x_idx] - x_start)
            plane_zy = get_slice_from_3d(volume, x_idx, "x")  # axes: z, y

            mean_y, std_y, mean_z, std_z = get_moments_from_2d(plane_zy, coord_y, coord_z)

            y_means.append(mean_y)
            y_stds.append(std_y)
            z_means.append(mean_z)
            z_stds.append(std_z)

    return distances, y_means, y_stds, z_means, z_stds

In [18]:
# Collect the run folders to process, ordered by os.path.getctime.
# NOTE(review): get_folder_paths applies the pattern with re.search, so
# "output*" is a regular expression ("outpu" followed by zero or more "t",
# anywhere in the name), not a shell glob — confirm this is what is intended.
outputs_folder_names = get_folder_paths(folder_path_outputs, "output*")

In [19]:
# Sanity check: number of run folders that were found.
len(outputs_folder_names)

50

In [20]:
# Accumulators filled by the aggregation loop: one entry per
# (folder, file, x position) sample. Re-run this cell before re-running the
# loop, otherwise the new values are appended to the old ones.
mean_y_list, mean_z_list = [], []
std_y_list, std_z_list = [], []
distances_list = []

In [None]:
# for folder in outputs_folder_names:
#     stat3d_path = folder + "/stat-3d/"
#     stat3d_files = get_filenames(stat3d_path, r'^C\[\d+\]-avg-\.plt$')
#     dist_tmp, mean_y_tmp, std_y_tmp, mean_z_tmp, std_z_tmp = get_moments_from_paths_3d(stat3d_path, stat3d_files)
#     distances_list.extend(dist_tmp)
#     mean_y_list.extend(mean_y_tmp)
#     mean_z_list.extend(mean_z_tmp)
#     std_y_list.extend(std_y_tmp)
#     std_z_list.extend(std_z_tmp)

In [21]:
%%time
# Walk over every run folder, compute the y / z moments from its 2-D
# statistics files and append them to the module-level accumulator lists.
# The lists are extended in place, so re-running this cell without resetting
# them first duplicates the data.
for i, folder in enumerate(outputs_folder_names, start=1):
    stat2d_xy_path = folder + "/stat-2d-XY/"
    stat2d_xz_path = folder + "/stat-2d-XZ/"
    stat2d_xy_files = get_filenames(stat2d_xy_path, r'^C\[\d+\]-avg-\.plt$')
    # List the XZ files from the XZ folder itself. Listing the XY folder here
    # only works when both folders happen to contain identically named files.
    stat2d_xz_files = get_filenames(stat2d_xz_path, r'^C\[\d+\]-avg-\.plt$')
    dist_tmp, mean_y_tmp, std_y_tmp, mean_z_tmp, std_z_tmp = get_moments_from_paths_2d(
        stat2d_xy_path, stat2d_xz_path, stat2d_xy_files, stat2d_xz_files
    )
    distances_list.extend(dist_tmp)
    mean_y_list.extend(mean_y_tmp)
    mean_z_list.extend(mean_z_tmp)
    std_y_list.extend(std_y_tmp)
    std_z_list.extend(std_z_tmp)
    # Progress indicator: number of folders processed so far.
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
CPU times: user 1min 44s, sys: 5.33 s, total: 1min 50s
Wall time: 8min 21s


In [None]:
# # features_pd = pd.read_csv(file_path_results + "features_compare.csv")
# features_pd = pd.read_csv(file_path_results + "features.csv")
# # features_pd["distances"] = distances_list
# features_pd["Tracer"] = features_pd["Tracer"].astype(int) 
# print(features_pd.shape)

(5800, 9)


In [None]:
# features_pd.drop(features_pd.tail(100*64).index, inplace = True)
# features_pd.insert(
#     loc = 0,
#     column = "Tracer",
#     value = list(np.arange(1, 65).astype(int)) * 400  
# )
# features_pd.to_csv(file_path_results + "features.csv", index=False)

In [25]:
# Gather the accumulated moments into one table (one row per sample, in the
# order in which they were computed) and save it as target_full.csv.
target_columns = {
    "c_mean_z": mean_z_list,
    "c_mean_y": mean_y_list,
    "c_std_z": std_z_list,
    "c_std_y": std_y_list,
}
target_pd = pd.DataFrame(target_columns)
target_pd.to_csv(file_path_results + "target_full.csv", index=False)

In [26]:
# Number of times each features.csv row is repeated when features_full is built.
# NOTE(review): presumably the number of x positions beyond x_start in each
# file — confirm it matches the data, since distances_list must have exactly
# len(features.csv) * num_distances entries.
num_distances = 96

In [None]:
# features_tmp_pd = pd.read_csv(file_path_results + "features_full.csv")
# distances_list = features_tmp_pd["distances"]

In [28]:
# Expand the feature table so that every original row appears num_distances
# times in a row, attach the distance of each sample and save the result.
features_pd = pd.read_csv(file_path_results + "features.csv")
repeated_index = features_pd.index.repeat(num_distances)
features_full = features_pd.loc[repeated_index].reset_index(drop=True)
# Raises if distances_list does not have exactly one entry per expanded row.
features_full["distances"] = distances_list
features_full.to_csv(file_path_results + "features_full.csv", index=False)

In [27]:
# Attach the distance of every sample, then order the table by distance while
# keeping the original row order inside each distance (the helper column
# "_tmp_index" acts as the tie-breaker and is removed from the sorted copy
# only; it stays in target_pd).
target_pd["distances"] = distances_list
target_pd["_tmp_index"] = target_pd.index
target_pd_sort = (
    target_pd
    .sort_values(by=["distances", "_tmp_index"])
    .drop(columns=["_tmp_index"])
)

In [29]:
# Write one CSV per distinct distance: rows with missing values are dropped and
# the "distances" column is removed. The file name embeds the distance with
# "." replaced by "_".
for dist in np.unique(distances_list):
    rows_at_dist = target_pd_sort[target_pd_sort.distances == dist]
    tmp_table = rows_at_dist.dropna().drop(columns=["distances"])
    out_name = "target-" + str(dist).replace(".", "_") + ".csv"
    tmp_table.to_csv(file_path_results + out_name, index=False)