# Training ML models to predict CFD 


In [None]:
import os, glob, re, time
import copy as cp
import numpy as np
import statistics, scipy
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
import matplotlib
# global plot settings: 200 dpi figures with a small (5 pt) default font
matplotlib.rcParams['figure.dpi']= 200
matplotlib.rcParams.update({'font.size': 5})

## <span style='background :yellow' > Import Shape Modelling PCA Data </span> 

In [None]:
# path to .csv file with shape mode scores
# (empty placeholder: set this before running the loading cell below)
ssm_pca_path = ""

In [None]:
# Load the shape-model scores into a 2-D NumPy array.
# header=None: every row of the file is read as data, none as column names.
# The context manager closes the file handle as soon as pandas has parsed it.
with open(ssm_pca_path, "r") as ssm_file:
    ssm_df = pd.read_csv(ssm_file, header=None, index_col=False)
ssm_pca_u = ssm_df.to_numpy()
print(np.shape(ssm_pca_u))

**Keep only the first X modes (enough to reach 99% cumulative variance)**

In [None]:
# number of leading shape modes (columns of ssm_pca_u) that are kept
ssm_modes_n = 35

In [None]:
# drop every column beyond the first ssm_modes_n and report the new shape
ssm_pca_u = ssm_pca_u[:, 0:ssm_modes_n]
print(ssm_pca_u.shape)

## <span style='background :yellow' > Import CFD Data </span> 

In [None]:
# human sorting function

def tryint(s):
    """Return ``int(s)`` when ``s`` can be converted, otherwise ``s`` itself.

    Used as the per-chunk converter for natural ("human") sorting, so that
    digit runs compare numerically and everything else compares as text.

    Args:
        s: value to convert (normally a string chunk).

    Returns:
        The integer value of ``s``, or ``s`` unchanged if it is not a
        valid integer literal (or not convertible at all).
    """
    try:
        return int(s)
    # only conversion failures are expected here; a bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit
    except (ValueError, TypeError):
        return s

def alphanum_key(s):
    """Build a natural-sort key for the string ``s``.

    The capturing group in the pattern makes ``re.split`` keep the digit
    runs, and each chunk is passed through ``tryint``; for example
    ``"file10.csv"`` becomes ``["file", 10, ".csv"]``.
    """
    return list(map(tryint, re.split('([0-9]+)', s)))

def sort_nicely(l):
    """Sort the list ``l`` in place in natural order and return it.

    Ordering is defined by ``alphanum_key``, so embedded numbers compare
    numerically (e.g. ``"a2"`` before ``"a10"``).  The same list object that
    was passed in is returned, which allows use inside an expression.
    """
    l.sort(key=alphanum_key)
    return l

In [None]:
# path to folder containing CFD simulations in total point correspondence
# (empty placeholder: set this before running the import cells below)
cfd_path = ""

In [None]:
# names of all entries in cfd_path, in natural (human) sort order
fnames = sort_nicely(os.listdir(cfd_path))

**import all csv flow fields (keep columns 1: pressure, 3: velocity)**

In [None]:
# build array of csv files
# NOTE: every entry returned by os.listdir is read as a CSV, so the folder
# must contain only simulation files.
cfd_files = sort_nicely(os.listdir(cfd_path))
cfd_data = []
cfd_empty = [] # only x,y,z

for fn in cfd_files:
    # os.path.join works whether or not cfd_path ends with a separator;
    # the context manager closes each file once it has been parsed.
    with open(os.path.join(cfd_path, fn), "r") as file:
        df = pd.read_csv(file, index_col=False)
    # strip spaces from the column names
    df.columns = df.columns.str.replace(' ', '')
    # pressure/velocity df: drop columns 2, 3, 4 by position, keeping 0 and 1
    # (assumes a five-column layout: pressure, velocity, x, y, z - TODO confirm)
    df_cfd = df.drop(df.columns[[2, 3, 4]], axis=1)
    # points xyz df: drop columns 0 and 1 by position, keeping the rest
    df_empty = df.drop(df.columns[[0, 1]], axis=1)
    cfd_data.append(df_cfd.values)
    cfd_empty.append(df_empty.values)

In [None]:
# cfd_data[subject][node][pressure/velocity]
# stacks the per-file arrays; assumes every file has the same number of rows
# (point correspondence) so the result is a regular 3-D array
cfd_data = np.array(cfd_data)

# p_data[subject][node_pressure]
# last-axis column 0 -> p_data, remaining column(s) -> v_data;
# np.squeeze then removes every length-1 axis.
# NOTE(review): without an explicit axis, squeeze would also drop the subject
# axis if only one file were loaded - confirm that case cannot occur.
p_data = np.squeeze(cfd_data[:, :, :1])
v_data = np.squeeze(cfd_data[:, :, 1:])

In [None]:
# 594 subjects, 27420 pressure nodes (features)
# NOTE(review): the split below uses train_n = 2800, which exceeds 594 -
# confirm whether the subject count quoted here is still current.
np.shape(p_data)

### <span style='background :yellow' > SPLIT DATASET </span> 

In [None]:
# number of leading subjects used for training (including validation)
train_n = 2800
# size of the fixed validation fold carved out of the training set for the
# grid search; the held-out test set itself is whatever remains after train_n
test_n = 200

In [None]:
# shuffle=False: the first train_n rows become the training set and all
# remaining rows the test set, so inputs and targets stay aligned row by row
# (assumes the three arrays list the same subjects in the same order - TODO confirm)
X_train, X_test = train_test_split(ssm_pca_u, train_size=train_n, shuffle=False)

p_data_train, p_data_test = train_test_split(p_data, train_size=train_n, shuffle=False)

v_data_train, v_data_test = train_test_split(v_data, train_size=train_n, shuffle=False)

# <span style='background :pink' > PCA ON CFD *TRAINING* DATA </span> 

#### rows=training subjects, columns=features

### Pressure

In [None]:
# standardise each feature: zero mean and, because with_std=True, unit variance
scaler_p = StandardScaler(with_std=True)

# fit scaler on TRAINING (including VALIDATION) set
p_data_train_scaled = scaler_p.fit_transform(p_data_train)

# the test set is transformed with the training statistics (no refit)
p_data_test_scaled = scaler_p.transform(p_data_test)

In [None]:
# compute PCA via a reduced SVD of the standardised training matrix:
# U_p holds the per-subject scores (left singular vectors), S_p the singular
# values and the rows of Vt_p the modes in feature (node) space
U_p, S_p, Vt_p = np.linalg.svd(p_data_train_scaled, full_matrices=False)

In [None]:
# Cumulative fraction of variance captured by the leading modes, counted
# from mode 0 (variance of a mode is its squared singular value).
# Modes are added until 99% of the variance is captured.
cumvar_p = np.cumsum(S_p**2) / sum(S_p**2)
cumvar_p[:12]

In [None]:
# number of pressure PCA modes retained (chosen from cumvar_p above)
n_modes_p = 12

In [None]:
# U_p is laid out as [subjects, scores (one column per mode)].
# The truncated score matrix is computed for inspection only; it is not
# needed in training.
p_data_pca_train = U_p[:, 0:n_modes_p]
p_data_pca_train.shape

### Velocity

In [None]:
# standardise each velocity feature (zero mean, unit variance)
scaler_v = StandardScaler(with_std=True)

# statistics are fitted on the training rows only ...
v_data_train_scaled = scaler_v.fit_transform(v_data_train)

# ... and re-used unchanged for the test rows
v_data_test_scaled = scaler_v.transform(v_data_test)

In [None]:
# compute PCA via a reduced SVD of the standardised velocity training matrix
# (U_v: per-subject scores, S_v: singular values, rows of Vt_v: modes)
U_v, S_v, Vt_v = np.linalg.svd(v_data_train_scaled, full_matrices=False)

In [None]:
# Cumulative fraction of variance captured by the leading velocity modes.
# Capturing variance is much more difficult for velocity than for pressure,
# so many more modes are inspected here.
cumvar_v = np.cumsum(S_v**2) / sum(S_v**2)
cumvar_v[:55]

In [None]:
# number of velocity PCA modes retained (chosen from cumvar_v above)
n_modes_v = 55

### Finalise train and test datasets

In [None]:
# network targets are the standardised full fields; these are aliases of the
# scaled arrays, not copies
P_train = p_data_train_scaled
P_test = p_data_test_scaled
V_train = v_data_train_scaled
V_test = v_data_test_scaled

In [None]:
# sanity check: inputs and both targets must agree on the number of rows
print(f"X_train: {X_train.shape} X_test: {X_test.shape}")
print("")
print(f"P_train: {P_train.shape} P_test: {P_test.shape}")
print(f"V_train: {V_train.shape} V_test: {V_test.shape}")

# <span style='background :pink' > DEEP LEARNING </span> 

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras.backend as K
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# report the visible GPUs and the library versions in use
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(f"tensorflow version: {tf.__version__}")
print(f"keras version: {keras.__version__}")

**define lambda layer for inverse PCA as a DNN layer**

In [None]:
# get S and V matrices in tensor type
# (constants used by the inverse-PCA Lambda layers; the dtype follows the
# NumPy arrays returned by the SVD)
S_p_tensor = tf.convert_to_tensor(S_p)
Vt_p_tensor = tf.convert_to_tensor(Vt_p)
S_v_tensor = tf.convert_to_tensor(S_v)
Vt_v_tensor = tf.convert_to_tensor(Vt_v)

In [None]:
def inv_pressure_pca(x):
    """Expand pressure PCA scores back to the full (standardised) field.

    Computes ``x @ diag(S_p[:k]) @ Vt_p[:k, :]``, where ``k`` is the size of
    axis 1 of ``x``, using the module-level ``S_p_tensor`` and
    ``Vt_p_tensor``.

    Args:
        x: 2-D tensor of scores, one row per sample and one column per mode.

    Returns:
        float64 tensor with one row per sample and one column per feature
        of ``Vt_p_tensor``.
    """
    # cast first; presumably so the dtype matches the SVD tensors - TODO confirm
    scores = tf.cast(x, tf.float64)
    k = scores.get_shape().as_list()[1]
    weights = tf.linalg.diag(S_p_tensor[:k])
    modes = Vt_p_tensor[:k, :]
    return tf.matmul(tf.matmul(scores, weights), modes)

def inv_velocity_pca(x):
    """Expand velocity PCA scores back to the full (standardised) field.

    Computes ``x @ diag(S_v[:k]) @ Vt_v[:k, :]``, where ``k`` is the size of
    axis 1 of ``x``, using the module-level ``S_v_tensor`` and
    ``Vt_v_tensor``.

    Args:
        x: 2-D tensor of scores, one row per sample and one column per mode.

    Returns:
        float64 tensor with one row per sample and one column per feature
        of ``Vt_v_tensor``.
    """
    # cast first; presumably so the dtype matches the SVD tensors - TODO confirm
    scores = tf.cast(x, tf.float64)
    k = scores.get_shape().as_list()[1]
    weights = tf.linalg.diag(S_v_tensor[:k])
    modes = Vt_v_tensor[:k, :]
    return tf.matmul(tf.matmul(scores, weights), modes)

# <span style='background :lightgreen' > HYPERPARAMETER OPTIMISATION </span> 

In [None]:
import optuna
import plotly
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

### <span style='background :lightgreen' > Initial Search </span>

Conduct initial coarse grid search for pressure and velocity in order to narrow down search space.

Assign fixed train/validation folds for sklearn.  

Take out a portion of the training set to make a fixed validation set.

In [None]:
# PredefinedSplit convention: -1 marks samples that are never used for
# validation, 0 marks the members of validation fold 0.
# The last test_n rows of the training set form the validation fold.
train_indices = np.full(train_n - test_n, -1, dtype=int)
val_indices = np.zeros(test_n, dtype=int)
train_val_fold = np.concatenate((train_indices, val_indices))

print(train_val_fold)
print(train_val_fold.shape)

In [None]:
# single fixed train/validation split for the grid search
ps = PredefinedSplit(train_val_fold)
# displays the number of splits (one per distinct non-negative fold id)
ps.get_n_splits()

Build models

In [None]:
def build_model_p(NUM_LAYERS, NUM_UNITS, LEARN_RATE):
    """Build and compile the pressure network used by the grid search.

    Architecture: one ReLU dense layer of width ``ssm_modes_n``, then
    ``NUM_LAYERS`` ReLU dense layers of ``NUM_UNITS`` units, a linear dense
    layer emitting ``n_modes_p`` PCA scores, and a Lambda layer
    (``inv_pressure_pca``) that expands the scores to the full field, so
    the MAE loss is evaluated on the reconstructed field.

    Args:
        NUM_LAYERS: number of hidden dense layers.
        NUM_UNITS: units in each hidden dense layer.
        LEARN_RATE: learning rate handed to the Adam optimiser.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_p
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # inverse pca
    model.add(keras.layers.Lambda(inv_pressure_pca))
    # optimiser and loss: LEARN_RATE must be passed explicitly - the string
    # 'adam' would silently use Adam's default learning rate instead
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

In [None]:
def build_model_v(NUM_LAYERS, NUM_UNITS, LEARN_RATE):
    """Build and compile the velocity network used by the grid search.

    Architecture: one ReLU dense layer of width ``ssm_modes_n``, then
    ``NUM_LAYERS`` ReLU dense layers of ``NUM_UNITS`` units, a linear dense
    layer emitting ``n_modes_v`` PCA scores, and a Lambda layer
    (``inv_velocity_pca``) that expands the scores to the full field, so
    the MAE loss is evaluated on the reconstructed field.

    Args:
        NUM_LAYERS: number of hidden dense layers.
        NUM_UNITS: units in each hidden dense layer.
        LEARN_RATE: learning rate handed to the Adam optimiser.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_v
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # inverse pca
    model.add(keras.layers.Lambda(inv_velocity_pca))
    # optimiser and loss: LEARN_RATE must be passed explicitly - the string
    # 'adam' would silently use Adam's default learning rate instead
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

In [None]:
# scikit-learn compatible wrappers so GridSearchCV can drive the Keras
# models; each candidate is trained for 100 epochs
model_p_grid = KerasRegressor(build_fn=build_model_p, epochs=100)
model_v_grid = KerasRegressor(build_fn=build_model_v, epochs=100)

In [None]:
# coarse search space: 3 x 3 x 3 combinations at a fixed batch size.
# The upper-case keys match the parameter names of the build functions;
# batch_size is a training argument rather than a build argument.
param_grid = [
  {'NUM_LAYERS': [4, 5, 6], 
   'NUM_UNITS' : [200, 400, 600],
   'LEARN_RATE': [1e-4, 5e-4, 1e-3],
   'batch_size': [32]
  }]

Fit Pressure Models

In [None]:
# cv=ps: every parameter combination is scored on the single predefined
# validation fold rather than with k-fold cross validation
gs_p = GridSearchCV(estimator=model_p_grid, param_grid=param_grid, cv=ps, verbose=1)

# fit() returns the fitted search object itself
gs_p = gs_p.fit(X_train, P_train)

In [None]:
# parameter combination with the best validation score (pressure)
gs_p.best_params_

In [None]:
# validation score reached by the best pressure candidate
gs_p.best_score_

Fit Velocity Models

In [None]:
# cv=ps: every parameter combination is scored on the single predefined
# validation fold rather than with k-fold cross validation
gs_v = GridSearchCV(estimator=model_v_grid, param_grid=param_grid, cv=ps, verbose=1)

# fit() returns the fitted search object itself
gs_v = gs_v.fit(X_train, V_train)

In [None]:
# parameter combination with the best validation score (velocity)
gs_v.best_params_

In [None]:
# validation score reached by the best velocity candidate
gs_v.best_score_

### <span style='background :lightgreen' > Cross Validation </span>

Second round of hyperparameter search.  

Refine the search space around the initial grid-search result and evaluate each candidate with 5-fold cross-validation.

### Models CV

In [None]:
def build_model_p_cv(trial):
    """Build and compile a pressure network from Optuna-sampled hyperparameters.

    Samples ``n_layers`` (4-8), ``n_units`` (300-500 in steps of 25) and
    ``learn_rate`` (log-uniform in [1e-5, 5e-3]) from ``trial`` and builds
    the same architecture as the grid-search model: a dense stack ending in
    ``n_modes_p`` linear PCA scores followed by the inverse-PCA Lambda layer.

    Args:
        trial: the ``optuna`` trial to draw hyperparameters from.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_p
    # hyperparams
    NUM_LAYERS = trial.suggest_int("n_layers", 4, 8, step=1)
    NUM_UNITS = trial.suggest_int("n_units", 300, 500, step=25)
    LEARN_RATE = trial.suggest_loguniform("learn_rate", 1e-5, 5e-3)
    # model
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # inverse pca
    model.add(keras.layers.Lambda(inv_pressure_pca))
    # optimiser and loss: the sampled learning rate has to reach the
    # optimiser, otherwise tuning "learn_rate" has no effect on training
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

In [None]:
def build_model_v_cv(trial):
    """Build and compile a velocity network from Optuna-sampled hyperparameters.

    Samples ``n_layers`` (5-8), ``n_units`` (100-200 in steps of 10) and
    ``learn_rate`` (log-uniform in [5e-4, 5e-3]) from ``trial`` and builds
    the same architecture as the grid-search model: a dense stack ending in
    ``n_modes_v`` linear PCA scores followed by the inverse-PCA Lambda layer.

    Args:
        trial: the ``optuna`` trial to draw hyperparameters from.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_v
    # hyperparams
    NUM_LAYERS = trial.suggest_int("n_layers", 5, 8, step=1)
    NUM_UNITS = trial.suggest_int("n_units", 100, 200, step=10)
    LEARN_RATE = trial.suggest_loguniform("learn_rate", 5e-4, 5e-3)
    # model
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # inverse pca
    model.add(keras.layers.Lambda(inv_velocity_pca))
    # optimiser and loss: the sampled learning rate has to reach the
    # optimiser, otherwise tuning "learn_rate" has no effect on training
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

### Objectives CV

In [None]:
def objective_p_cv(trial):
    """Optuna objective: mean 5-fold validation loss of a pressure model.

    For every (unshuffled) K-fold split of ``X_train``/``P_train`` a new
    model is built from ``trial``, trained for at most 50 epochs with
    early stopping and Optuna pruning on ``val_loss``, and evaluated on the
    held-out fold.

    Args:
        trial: the ``optuna`` trial supplying the hyperparameters.

    Returns:
        The mean of the per-fold evaluation losses.
    """
    NUM_FOLDS = 5
    EPOCHS = 50
    BATCH_SIZE = 32

    fold_losses = []
    splitter = KFold(n_splits=NUM_FOLDS)
    for fit_idx, held_out_idx in splitter.split(X_train):
        # pressure data
        x_fit, x_held_out = X_train[fit_idx], X_train[held_out_idx]
        y_fit, y_held_out = P_train[fit_idx], P_train[held_out_idx]

        # a fresh, untrained network for every fold
        model = build_model_p_cv(trial)

        # callbacks are re-created per fold
        fold_callbacks = [
            optuna.integration.TFKerasPruningCallback(trial, "val_loss"),
            tf.keras.callbacks.EarlyStopping(patience=10),
        ]
        model.fit(
            x_fit,
            y_fit,
            batch_size=BATCH_SIZE,
            validation_data=(x_held_out, y_held_out),
            epochs=EPOCHS,
            callbacks=fold_callbacks,
            verbose=1,
        )

        fold_losses.append(model.evaluate(x_held_out, y_held_out, verbose=1))

    return np.mean(fold_losses)

In [None]:
def objective_v_cv(trial):
    """Optuna objective: mean 5-fold validation loss of a velocity model.

    For every (unshuffled) K-fold split of ``X_train``/``V_train`` a new
    model is built from ``trial``, trained for at most 50 epochs with
    early stopping and Optuna pruning on ``val_loss``, and evaluated on the
    held-out fold.

    Args:
        trial: the ``optuna`` trial supplying the hyperparameters.

    Returns:
        The mean of the per-fold evaluation losses.
    """
    NUM_FOLDS = 5
    EPOCHS = 50
    BATCH_SIZE = 32

    fold_losses = []
    splitter = KFold(n_splits=NUM_FOLDS)
    for fit_idx, held_out_idx in splitter.split(X_train):
        # velocity data
        x_fit, x_held_out = X_train[fit_idx], X_train[held_out_idx]
        y_fit, y_held_out = V_train[fit_idx], V_train[held_out_idx]

        # a fresh, untrained network for every fold
        model = build_model_v_cv(trial)

        # callbacks are re-created per fold
        fold_callbacks = [
            optuna.integration.TFKerasPruningCallback(trial, "val_loss"),
            tf.keras.callbacks.EarlyStopping(patience=10),
        ]
        model.fit(
            x_fit,
            y_fit,
            batch_size=BATCH_SIZE,
            validation_data=(x_held_out, y_held_out),
            epochs=EPOCHS,
            callbacks=fold_callbacks,
            verbose=1,
        )

        fold_losses.append(model.evaluate(x_held_out, y_held_out, verbose=1))

    return np.mean(fold_losses)

### Pressure CV

In [None]:
# minimise the mean CV loss with TPE sampling and Hyperband pruning
study_p_cv = optuna.create_study(direction="minimize", 
                              sampler=optuna.samplers.TPESampler(), 
                              pruner=optuna.pruners.HyperbandPruner())

# 250 trials; gc_after_trial=True runs garbage collection after each trial
study_p_cv.optimize(objective_p_cv, n_trials=250, gc_after_trial=True)

In [None]:
# split the pressure trials by final state: pruned early vs. run to completion
pruned_trials_p_cv = study_p_cv.get_trials(deepcopy=False, states=[optuna.trial.TrialState.PRUNED])
complete_trials_p_cv = study_p_cv.get_trials(deepcopy=False, states=[optuna.trial.TrialState.COMPLETE])

In [None]:
# summary of the pressure study: trial counts, then the best trial
print("Study statistics: ")
for label, count in (
    ("finished", len(study_p_cv.trials)),
    ("pruned", len(pruned_trials_p_cv)),
    ("complete", len(complete_trials_p_cv)),
):
    print(f"  Number of {label} trials: ", count)

print("Best trial:")
trial = study_p_cv.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# intermediate values reported during each pressure trial
optuna.visualization.plot_intermediate_values(study_p_cv)

In [None]:
# objective value against each hyperparameter (pressure)
optuna.visualization.plot_slice(study_p_cv)

In [None]:
# pairwise hyperparameter contour plots (pressure)
optuna.visualization.plot_contour(study_p_cv)

In [None]:
# estimated importance of each hyperparameter (pressure)
optuna.visualization.plot_param_importances(study_p_cv)

### Velocity CV

In [None]:
# minimise the mean CV loss with TPE sampling and Hyperband pruning
study_v_cv = optuna.create_study(direction="minimize", 
                              sampler=optuna.samplers.TPESampler(), 
                              pruner=optuna.pruners.HyperbandPruner())

# 250 trials; gc_after_trial=True runs garbage collection after each trial
study_v_cv.optimize(objective_v_cv, n_trials=250, gc_after_trial=True)

In [None]:
# split the velocity trials by final state: pruned early vs. run to completion
pruned_trials_v_cv = study_v_cv.get_trials(deepcopy=False, states=[optuna.trial.TrialState.PRUNED])
complete_trials_v_cv = study_v_cv.get_trials(deepcopy=False, states=[optuna.trial.TrialState.COMPLETE])

In [None]:
# summary of the velocity study: trial counts, then the best trial
print("Study statistics: ")
for label, count in (
    ("finished", len(study_v_cv.trials)),
    ("pruned", len(pruned_trials_v_cv)),
    ("complete", len(complete_trials_v_cv)),
):
    print(f"  Number of {label} trials: ", count)

print("Best trial:")
trial = study_v_cv.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# intermediate values reported during each velocity trial
optuna.visualization.plot_intermediate_values(study_v_cv)

In [None]:
# objective value against each hyperparameter (velocity)
optuna.visualization.plot_slice(study_v_cv)

In [None]:
# pairwise hyperparameter contour plots (velocity)
optuna.visualization.plot_contour(study_v_cv)

In [None]:
# estimated importance of each hyperparameter (velocity)
optuna.visualization.plot_param_importances(study_v_cv)

# <span style='background :red' > TRAINING TUNED MODELS </span> 

In [None]:
# run counter: embedded in the TensorBoard run id and in the model directory
# name ("Run_<n>"), so bump it to keep the outputs of separate runs apart
run_attempt = 1

**TensorBoard Path**

In [None]:
# TensorBoard log root, located under the current working directory
root_logdir = os.getcwd() + "/Tensorflow/my_logs/"

# make a path using date/time
def get_run_logdir(metric):
    """Return the TensorBoard log directory for one training run.

    The run id is ``"run <run_attempt>__run_<date>__<time>__<metric>"``,
    built from the module-level ``run_attempt`` and the current local time
    (minute resolution), and is placed under ``root_logdir``.

    Args:
        metric: label appended to the run id (a string, e.g. ``"Pressure"``).

    Returns:
        The joined path as a string.
    """
    timestamp = time.strftime("run_%Y_%m_%d__%H-%M")
    run_id = "__".join(["run " + str(run_attempt), timestamp, metric])
    return os.path.join(root_logdir, run_id)

In [None]:
# separate log directories so the two models show up as distinct runs
tensorboard_p_logdir = get_run_logdir("Pressure")
tensorboard_v_logdir = get_run_logdir("Velocity")

**DNN Model Path**

In [None]:
# output directory for this run's saved models and PCA artefacts
# (note the trailing slash: later cells append file names directly)
model_dir = os.getcwd() + "/Run_" + str(run_attempt) + "/model/"

# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test
os.makedirs(model_dir, exist_ok=True)

**Define and Train Models**

In [None]:
def build_tuned_pressure_model():
    """Build and compile the final pressure network from the best CV trial.

    Reads ``n_layers``, ``n_units`` and ``learn_rate`` from
    ``study_p_cv.best_trial.params`` and builds the dense stack ending in
    ``n_modes_p`` linear PCA scores followed by the inverse-PCA Lambda
    layer (``inv_pressure_pca``).

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_p
    # hyperparameters from tuning
    best_params = study_p_cv.best_trial.params
    NUM_LAYERS = best_params["n_layers"]
    NUM_UNITS = best_params["n_units"]
    LEARN_RATE = best_params["learn_rate"]
    # model
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer - cfd pca
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # pca conversion
    model.add(keras.layers.Lambda(inv_pressure_pca))
    # compile: train with the tuned learning rate - the string "adam" would
    # silently use Adam's default learning rate instead
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

In [None]:
def build_tuned_velocity_model():
    """Build and compile the final velocity network from the best CV trial.

    Reads ``n_layers``, ``n_units`` and ``learn_rate`` from
    ``study_v_cv.best_trial.params`` and builds the dense stack ending in
    ``n_modes_v`` linear PCA scores followed by the inverse-PCA Lambda
    layer (``inv_velocity_pca``).

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    INPUT = ssm_modes_n
    OUTPUT = n_modes_v
    # hyperparameters from tuning
    best_params = study_v_cv.best_trial.params
    NUM_LAYERS = best_params["n_layers"]
    NUM_UNITS = best_params["n_units"]
    LEARN_RATE = best_params["learn_rate"]
    # model
    model = keras.models.Sequential()
    # first dense layer (width = number of shape modes); no explicit Input
    # layer is declared, so the input shape is inferred on the first call
    model.add(keras.layers.Dense(INPUT, activation="relu"))
    # hidden layers
    for _ in range(NUM_LAYERS):
        model.add(keras.layers.Dense(NUM_UNITS, activation="relu"))
    # output layer - cfd pca
    model.add(keras.layers.Dense(OUTPUT, activation="linear"))
    # pca conversion
    model.add(keras.layers.Lambda(inv_velocity_pca))
    # compile: train with the tuned learning rate - the string "adam" would
    # silently use Adam's default learning rate instead
    model.compile(loss="mae",
                  optimizer=keras.optimizers.Adam(learning_rate=LEARN_RATE),
                  run_eagerly=True)

    return model

In [None]:
# train pressure NN
dnn_p = build_tuned_pressure_model()

# NOTE(review): patience (1000) equals the epoch budget, so EarlyStopping can
# never trigger here; the best weights are kept by ModelCheckpoint
# (save_best_only=True on val_loss) instead.
# validation_split=0.1 holds out 10% of X_train/P_train for validation.
# model_dir already ends with "/", so the checkpoint path contains "//".
dnn_p.fit(X_train, 
          P_train, 
          epochs=1000, 
          batch_size=32,
          validation_split=0.1, 
          callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", patience=1000),
                     keras.callbacks.ModelCheckpoint(model_dir+"/dnn_p.h5", 
                                                     monitor="val_loss", 
                                                     save_best_only=True),
                     keras.callbacks.TensorBoard(log_dir=tensorboard_p_logdir, histogram_freq=1)])

In [None]:
# train velocity NN
dnn_v = build_tuned_velocity_model()

# NOTE(review): patience (1000) equals the epoch budget, so EarlyStopping can
# never trigger here; the best weights are kept by ModelCheckpoint
# (save_best_only=True on val_loss) instead.
# validation_split=0.1 holds out 10% of X_train/V_train for validation.
# model_dir already ends with "/", so the checkpoint path contains "//".
dnn_v.fit(X_train, 
          V_train, 
          epochs=1000,
          batch_size=32,           
          validation_split=0.1, 
          callbacks=[keras.callbacks.EarlyStopping(monitor="val_loss", patience=1000),                                                    
                     keras.callbacks.ModelCheckpoint(model_dir+"/dnn_v.h5", 
                                                     monitor="val_loss", 
                                                     save_best_only=True),
                     keras.callbacks.TensorBoard(log_dir=tensorboard_v_logdir, histogram_freq=1)])                                                 

**Tensorboard Visualisation**

In [None]:
# remember, set smoothing to 0
# (IPython magics: (re)load the TensorBoard extension, then open it on the
# log root used by the training cells, served on port 6007)
%reload_ext tensorboard
%tensorboard --logdir=./Tensorflow/my_logs/ --port=6007

**Saving Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
import joblib

# save scalers in model dir
# NOTE(review): the Pipeline is used here only as a container so that both
# fitted scalers land in one pickle; its steps are independent output scalers
# and are not meant to be chained - retrieve them via pipeline.named_steps.
pipeline = Pipeline([
    # output
    ('scaler_p', scaler_p),
    ('scaler_v', scaler_v)
])
joblib.dump(pipeline, model_dir + 'pipeline.pkl')

# save U, S and Vt matrices in model dir
# (np.save appends the ".npy" extension to each file name)
np.save(model_dir+'U_p', U_p, allow_pickle=True)
np.save(model_dir+'S_p', S_p, allow_pickle=True)
np.save(model_dir+'Vt_p', Vt_p, allow_pickle=True)
np.save(model_dir+'U_v', U_v, allow_pickle=True)
np.save(model_dir+'S_v', S_v, allow_pickle=True)
np.save(model_dir+'Vt_v', Vt_v, allow_pickle=True)