# Reduce training data extracted from simulations

The *extractTrainingData* utility currently extracts all the available data, but a lot of the data is redundant. For example, in the interface region between $\varphi=0^\circ$ and $\varphi=120^\circ$ the boundary layer develops quickly (after about $\tilde{t}\approx0.3$). In the rear part, however, it takes until about $\tilde{t}\approx1$ before the concentration field close to the interface is developed. Moreover, the changes along $\varphi$ are small, but the number of faces along the surface is enormous.

## Loading the raw data

In [1]:
import helper_module as hm
import pandas as pd
import numpy as np
import gc

# Directory holding the raw training data files and receiving the reduced ones.
data_path = "../data/sgs_data/"

# Column layouts of the raw CSV files, one per reaction type. Every layout
# starts with the same eight columns; the reactive cases append further
# species columns and end with the reaction-rate columns.
names_phys = ["t", "x", "y", "dist", "A_av", "A_f", "gradA_f", "gradA_s"]
_cols_B = ["B_av", "B_f", "gradB_f"]
_cols_P = ["P_av", "P_f", "gradP_f"]
_cols_S = ["S_av", "S_f", "gradS_f"]
names_decay = names_phys + _cols_P
names_single = names_phys + _cols_B + _cols_P + ["rAB_av"]
names_cons = names_phys + _cols_B + _cols_P + _cols_S + ["rAB_av", "rAP_av"]

# Same order as the reaction list used when processing the files.
all_names = [names_phys, names_decay, names_single, names_cons]

# Columns removed before the reduced data is written ("phi" is added during
# the reduction and is not part of the raw files).
names_drop = ["t", "x", "y", "phi"]

In [2]:
def read_raw_data(path, names):
    """Load one raw training data file and print a short summary of it.

    Parameters
    ----------
    path : str
        Location of the CSV file. It is formatted with ``{:s}``, so it has
        to be a string.
    names : list of str
        Column labels assigned to the data. Because ``header=0`` is passed
        as well, the first line of the file is consumed as a header and its
        labels are replaced by ``names``.

    Returns
    -------
    pandas.DataFrame
        The complete content of the file.

    """
    frame = pd.read_csv(path, header=0, names=names)
    n_rows, n_cols = frame.shape
    summary = "The file {:s} has {:d} rows and {:d} columns.".format(path, n_rows, n_cols)
    print(summary)
    print("")
    # Preview of the first rows as a quick sanity check of the column layout.
    print(frame.head(5))
    return frame
    
def find_closest_entry(df, column, values):
    """Snap requested values to the nearest values present in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to search.
    column : str
        Name of the column whose distinct entries are the candidates.
    values : numpy.ndarray
        Requested values; one result is produced per entry along the
        first axis.

    Returns
    -------
    numpy.ndarray
        Float array with ``values.shape[0]`` entries. Entry ``i`` is the
        distinct value of ``df[column]`` with the smallest absolute
        difference to ``values[i]``; for equal distances the candidate that
        comes first in ``df[column].unique()`` is taken.

    """
    candidates = df[column].unique()
    snapped = np.zeros(values.shape[0])
    snapped[:] = [
        candidates[np.argmin(np.abs(candidates - target))] for target in values
    ]
    return snapped
    
def reduce_data(path, names, time_list, phi_list):
    """Read a raw data file and keep only selected times and polar angles.

    The requested times and angles do not have to match the data exactly:
    each request is first snapped to the closest value that occurs in the
    file, and only rows carrying one of these snapped values are kept.

    Parameters
    ----------
    path : str
        Location of the raw CSV file.
    names : list of str
        Column labels of the file; must contain "t", "x" and "y".
    time_list : numpy.ndarray
        Times to keep (snapped to the closest values of column "t").
    phi_list : numpy.ndarray
        Polar angles to keep (snapped to the closest values of "phi").
        NOTE(review): presumably in radians, since the angles come from
        ``hm.transform_polar_2D`` - confirm against helper_module.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with their original index labels and an
        additional column "phi".

    Notes
    -----
    Rows are selected with boolean ``isin`` masks instead of collecting the
    index of every unwanted value one by one. This also works when nothing
    has to be dropped (the previous implementation raised an IndexError in
    that case) and needs one pass over the column instead of one pass per
    dropped value. Rows whose "t" or "phi" is NaN match no selected value
    and are therefore removed.

    """
    print("")
    raw = read_raw_data(path, names)

    # Step 1: keep the rows belonging to the selected time steps.
    times = find_closest_entry(raw, "t", time_list)
    # Copy so that adding the "phi" column below does not write to a slice.
    raw = raw[raw.t.isin(times)].copy()
    print("Shape after dropping times: ", raw.shape)

    # Step 2: compute the polar angle of every row and keep selected angles.
    # Only the second return value of the helper is needed here.
    _, phi = hm.transform_polar_2D(raw.x.values, raw.y.values)
    raw["phi"] = phi
    keep_phi = find_closest_entry(raw, "phi", phi_list)
    red = raw[raw.phi.isin(keep_phi)].copy()
    print("Shape after dropping phis: ", red.shape)
    return red

In [3]:
# Reaction types; each has its own raw file, column layout and time list.
reactions = ["phys", "decay", "single", "cons"]

# Fine sampling of the early phase, shared by all reaction types. The last
# entry is approximately 0.1, which the coarse ranges below request again.
early_times = np.arange(0.005, 0.101, 0.005)
# list of times for physisorption
time_list_phys = np.concatenate((early_times, np.arange(0.1, 1.05, 0.1)))
# list of times for all reactions
time_list = np.concatenate((early_times, np.arange(0.1, 3.05, 0.1)))
all_time_lists = [time_list_phys, time_list, time_list, time_list]
# list of polar angles to consider
phi_list = np.linspace(0.0, np.pi, 20)

# Reduce every raw file and store the result next to it as "<reaction>_red.csv".
for r, col_names, sel_times in zip(reactions, all_names, all_time_lists):
    path = data_path + "training_data_{:s}.csv".format(r)
    red = reduce_data(path, col_names, sel_times, phi_list)
    # Remove the helper columns listed in names_drop before writing.
    red = red.drop(columns=names_drop)
    out_path = data_path + "{:s}_red.csv".format(r)
    red.to_csv(out_path, index=False)
    # Trigger a collection after each file; the raw frames are large.
    gc.collect()


The file ../data/sgs_data/training_data_phys.csv has 2724000 rows and 8 columns.

       t         x        y      dist      A_av       A_f      gradA_f  \
0  0.005  0.051564 -0.49733  0.000147  0.905043  0.813109 -1240.535451   
1  0.005  0.051564 -0.49733  0.000308  0.803876  0.621159 -1115.389324   
2  0.005  0.051564 -0.49733  0.000485  0.700497  0.439614  -921.386642   
3  0.005  0.051564 -0.49733  0.000680  0.599819  0.284136  -686.945425   
4  0.005  0.051564 -0.49733  0.000895  0.506663  0.165407  -456.028159   

      gradA_s  
0 -1296.06242  
1 -1296.06242  
2 -1296.06242  
3 -1296.06242  
4 -1296.06242  
Shape after dropping times:  (394980, 8)
Shape after dropping phis:  (17400, 9)

The file ../data/sgs_data/training_data_decay.csv has 8172000 rows and 11 columns.

       t         x        y      dist      A_av       A_f      gradA_f  \
0  0.005  0.051564 -0.49733  0.000147  0.904789  0.812738 -1242.502000   
1  0.005  0.051564 -0.49733  0.000308  0.803493  0.620635 -1115