In [1]:
pip install numpy pandas scikit-learn tensorflow keras scikeras

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import re
import glob
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier, KerasRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the filtered measurements; the path is relative to the notebook's working directory.
df = pd.read_csv('processed/00_all_tests_filtered.csv')

In [4]:
# Count missing values per column to see which fields need cleaning.
df.isnull().sum()

time         933270
p1                0
p2                0
p3                0
p4                0
p5                0
p6                0
p7                0
p8                0
p9                0
p10               0
p11               0
p12               0
p13               0
p14               0
p15               0
p16               0
X                 0
Z                 0
flow-time      3003
dtype: int64

In [5]:
# Build a single time axis: use 'time' where it is present and fall back to
# 'flow-time' otherwise. Both source columns are removed and the merged series
# is appended as the last column under the name 'time'.
merged_time = df['time'].fillna(df['flow-time'])
df = (
    df.drop(columns = ['time', 'flow-time'])
      .assign(time = merged_time)
      # rows for which neither source column had a value are discarded
      .dropna(subset = ['time'])
)
print(df.isnull().sum())

df.to_csv('processed/00_all_tests_filtered_clean.csv', index = False)

p1      0
p2      0
p3      0
p4      0
p5      0
p6      0
p7      0
p8      0
p9      0
p10     0
p11     0
p12     0
p13     0
p14     0
p15     0
p16     0
X       0
Z       0
time    0
dtype: int64


In [7]:
# Display the cleaned frame (the rendered output elides the middle rows).
df

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,X,Z,time
0,-107.526882,-226.000000,222.222222,175.627240,268.817204,-14.336918,86.021505,-57.347670,-78.853047,107.526882,-688.172043,-4505.376344,-4856.630824,258.064516,-408.602150,-25.089606,-4.856481,-4.322118,0.000
1,-107.526882,-151.000000,71.684588,175.627240,193.548387,60.931900,10.752688,-57.347670,-78.853047,32.258065,-612.903226,-4430.107527,-4781.362007,258.064516,-333.333333,-100.358423,-4.856481,-4.322118,1.000
2,-107.526882,-75.300000,71.684588,250.896057,193.548387,60.931900,10.752688,-57.347670,-78.853047,107.526882,-537.634409,-4354.838710,-4630.824373,182.795699,-333.333333,-25.089606,-4.856481,-4.322118,2.000
3,-107.526882,-150.537634,222.222222,326.164875,268.817204,-14.336918,161.290323,-57.347670,-78.853047,107.526882,-537.634409,-4354.838710,-4706.093190,182.795699,-258.064516,-25.089606,-4.856481,-4.322118,3.000
4,-32.258065,-75.268817,71.684588,326.164875,193.548387,60.931900,311.827957,-57.347670,-154.121864,107.526882,-537.634409,-4279.569892,-4630.824373,107.526882,-182.795699,50.179211,-4.856481,-4.322118,4.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936268,0.672068,0.000000,1121.263428,1115.966431,1162.688965,1044.396729,1187.366455,1170.466309,4.814718,0.036940,810.533569,3249.333984,2454.925781,3635.814209,4138.591309,4521.295410,-4.856481,-4.322118,999.958
936269,0.672087,0.000000,1121.266357,1115.972534,1162.691162,1044.400146,1187.375244,1170.469360,4.814291,0.036951,810.519714,3249.360840,2454.945068,3635.873291,4138.637207,4521.345215,-4.856481,-4.322118,999.968
936270,0.672072,0.000000,1121.269653,1115.978271,1162.693726,1044.403564,1187.384155,1170.472168,4.813863,0.036962,810.505737,3249.387451,2454.964355,3635.932617,4138.683594,4521.395020,-4.856481,-4.322118,999.978
936271,0.672060,0.000000,1121.272827,1115.984375,1162.696411,1044.406982,1187.393311,1170.474854,4.813434,0.036973,810.491699,3249.414062,2454.983398,3635.991699,4138.729980,4521.444336,-4.856481,-4.322118,999.988


In [8]:
# Feature columns are the sixteen channels p1..p16; the targets are the X and Z columns.
inputs = ['p' + str(idx) for idx in range(1, 17)]
X = df.loc[:, inputs].to_numpy()
y = df.loc[:, ['X', 'Z']].to_numpy()

In [9]:
# Hold out 20% of the rows as the final test set (X_test, y_test); the other
# 80% is used for training and hyperparameter tuning. The seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [10]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# === PENALTY-AWARE CUSTOM LOSS FUNCTION WITH LOGGING ===
import tensorflow as tf
import keras.backend as K

# Weight applied to the Euclidean-distance term that penalty_aware_loss adds on
# top of the per-sample MSE.
penalty_weight = 0.1  # You can tune this value



# Running mean of the per-sample Euclidean penalty. It is updated inside
# penalty_aware_loss and read/reset by PenaltyLoggingCallback after each epoch.
penalty_metric = tf.keras.metrics.Mean(name="penalty_mean")

def penalty_aware_loss(y_true, y_pred):
    """Per-sample MSE plus a weighted Euclidean-distance penalty.

    Args:
        y_true: Ground-truth targets; the last axis holds the target components.
        y_pred: Model predictions with the same shape as ``y_true``.

    Returns:
        A tensor with one value per sample:
        ``mean(err**2) + penalty_weight * sqrt(sum(err**2))`` where
        ``err = y_true - y_pred`` and both reductions run over the last axis.

    Side effects:
        Feeds the per-sample distance into the module-level ``penalty_metric``.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes explicitly so the function also works when it is called
    # directly with float64 NumPy targets.
    y_true = tf.cast(y_true, y_pred.dtype)

    squared_error = tf.square(y_true - y_pred)
    # Mean squared error per sample (over the target components)
    mse = tf.reduce_mean(squared_error, axis=-1)

    # Euclidean distance per sample between prediction and target.
    # A plain norm has an undefined (NaN) gradient when the error is exactly
    # zero, because sqrt is not differentiable at 0. The tiny constant keeps
    # the gradient finite and is far below the resolution of realistic errors.
    penalty = tf.sqrt(tf.reduce_sum(squared_error, axis=-1) + 1e-12)

    # Record the penalty so it can be reported once per epoch
    penalty_metric.update_state(penalty)

    # MSE plus the weighted penalty, one value per sample
    return mse + penalty_weight * penalty

class PenaltyLoggingCallback(tf.keras.callbacks.Callback):
    """Report the mean Euclidean penalty once per epoch.

    At the end of every epoch the value accumulated in ``penalty_metric`` is
    printed and the metric is cleared so the next epoch starts from zero.
    """

    def on_epoch_end(self, epoch, logs=None):
        epoch_penalty = penalty_metric.result().numpy()
        message = f" Epoch {epoch+1}: Penalty (Euclidean) = {epoch_penalty:.4f}"
        print(message)
        penalty_metric.reset_state()

# Stop a fit once the monitored training loss ('loss') has gone 5 epochs without
# improving, and restore the best weights seen during that fit.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor = 'loss', patience=5, restore_best_weights = True)

In [None]:
def create_model(units_layer1 = 16, units_layer2 = 16, learning_rate = 1e-3,
                 meta = None, params = None):
    """Build and compile the two-hidden-layer regression network.

    Args:
        units_layer1: Width of the first hidden ReLU layer.
        units_layer2: Width of the second hidden ReLU layer.
        learning_rate: Learning rate for the Adam optimizer.
        meta: Optional metadata dict that SciKeras passes to build functions
            declaring this argument. When it contains ``n_features_in_`` that
            value sets the input width; otherwise the width of the
            notebook-level ``X_train_scaled`` is used.
        params: Optional dict of wrapper parameters that SciKeras passes to
            build functions declaring this argument. If it carries
            ``optimizer__learning_rate``, that value replaces ``learning_rate``.

    Returns:
        A compiled ``tf.keras.Sequential`` with two linear outputs, using
        ``penalty_aware_loss`` as loss and reporting MAE under the name "mae".
    """
    # Prefer the feature count of the data actually being fitted; fall back to
    # the notebook-level training matrix when the function is called directly.
    if meta is not None and "n_features_in_" in meta:
        n_features = meta["n_features_in_"]
    else:
        n_features = X_train_scaled.shape[1]

    # The hyperparameter search in this notebook varies `optimizer__learning_rate`.
    # Because the model is compiled right here, the wrapper does not build an
    # optimizer from that key, so it is picked up explicitly; otherwise every
    # candidate would train with the same learning rate.
    if params is not None:
        routed_lr = params.get("optimizer__learning_rate")
        if routed_lr is not None:
            learning_rate = routed_lr

    model = tf.keras.Sequential([
        layers.Dense(units_layer1, activation = "relu", input_shape = (n_features,)),
        layers.Dense(units_layer2, activation = "relu"),
        layers.Dense(2, activation="linear")
    ])

    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
        loss = penalty_aware_loss,
        metrics = [tf.keras.metrics.MeanAbsoluteError(name = "mae")],
    )

    return model

In [None]:
# Scikit-learn-compatible regressor that builds its network through create_model.
# The values below are defaults; RandomizedSearchCV overrides the ones that
# appear in param_dist.
keras_reg = KerasRegressor(
    model = create_model,
    units_layer1 = 16,
    units_layer2 = 16, 
    learning_rate = 1e-3, 
    batch_size = 32,
    epochs = 50, 
    verbose = 0
)

# Earlier, finer-grained search space (kept for reference):
# param_dist = {
#     "model__units_layer1": [8, 16, 24, 32, 40, 48, 56, 64],
#     "model__units_layer2": [8, 16, 24, 32, 40, 48, 56, 64],
#     "optimizer__learning_rate": [1e-2, 1e-3, 1e-4],
#     "batch_size": [16, 32, 48, 64, 80, 96, 112, 128],
#     "epochs": [50, 100, 150]
# }

# Search space sampled by RandomizedSearchCV.
# NOTE(review): create_model compiles its own Adam optimizer from its
# `learning_rate` argument, while this search varies `optimizer__learning_rate`.
# Confirm against the SciKeras parameter-routing docs that this key actually
# reaches that optimizer.
param_dist = {
    "model__units_layer1": [8, 32, 64, 128],
    "model__units_layer2": [8, 32, 64, 128],
    "optimizer__learning_rate": [1e-2, 1e-3, 5e-3, 1e-4],
    "batch_size": [32, 64, 96, 128],
    "epochs": [150]
}

In [None]:
# Shuffled 5-fold cross-validation is used to score every sampled configuration.
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

random_search = RandomizedSearchCV(
    estimator = keras_reg,
    param_distributions = param_dist,
    n_iter = 20,                          # number of random configurations drawn from param_dist
    scoring = "neg_mean_absolute_error",  # negated MAE, so a higher score is better
    cv = cv,
    n_jobs = -1,
    random_state = 42,
)

In [None]:
# Run the search on the scaled training split; the early-stopping and
# penalty-logging callbacks are passed along with the fit call.
random_search.fit(X_train_scaled, y_train, callbacks=[early_stop, PenaltyLoggingCallback()])

In [None]:
# Take the underlying Keras model (model_) from the search's best estimator.
best_model = random_search.best_estimator_.model_

In [None]:
# Persist the model to 'best_model.h5'; it is reloaded from this file further down.
best_model.save('best_model.h5')

In [None]:
# Write every sampled configuration with its cross-validation results to disk
# so the search can be analysed without re-running it.
cv_df = pd.DataFrame.from_dict(random_search.cv_results_)
cv_df.to_csv(
    'random_search_cv_results_v2.csv',
    index = False,
)

In [None]:
# evaluate() returns [loss, mae]: the model was compiled with one extra metric named "mae".
loss_h, mae_h = best_model.evaluate(X_test_scaled, y_test, verbose = 0)
# "\T" is not a valid escape sequence; the leading newline was intended.
print(f"\nTest MAE = {mae_h:.4f}\n")

In [None]:
# Collect the per-test range files and order them by the integer after "test-"
# (numeric order, so test-10 comes after test-9).
test_files = glob.glob('processed/*test-*_range.csv')
test_files.sort(
    key = lambda path: int(re.search(r'test-(\d+)_range', path).group(1))
)

In [None]:
# MAE of the best model on each individual test file, keyed by file name.
results = {}
for csv_path in test_files:
    frame = pd.read_csv(csv_path)
    # reuse the scaler fitted on the training split
    features = scaler.transform(frame[inputs].values)
    targets = frame[['X', 'Z']].values
    # evaluate() yields [loss, mae]; only the MAE is recorded
    file_loss, file_mae = best_model.evaluate(features, targets, verbose = 0)
    results[os.path.basename(csv_path)] = file_mae

In [None]:
# One line per test file, in insertion (file-number) order.
for fname in results:
    print(f"{fname}: MAE = {results[fname]:.4f}")

In [None]:
# File with the smallest MAE (the first one wins on ties).
best_file, lowest_mae = min(results.items(), key = lambda item: item[1])
print(f"\n {best_file} had the lowest MAE: {lowest_mae:.4f}")

In [None]:
# results_df = pd.DataFrame.from_records(
#     results.items(),
#     columns = ['test_file', 'mae']
# )

# results_df.to_csv('test_file_mae_results.csv', index=False)

# RELOAD MODEL

In [12]:
from tensorflow.keras.models import load_model
# Reload the saved model. The custom loss function is handed to the loader
# through custom_objects under the name 'penalty_aware_loss'.
model1 = load_model(
    'best_model.h5',
    custom_objects = {'penalty_aware_loss': penalty_aware_loss}
)



In [13]:
# Evaluate the reloaded model on the held-out split; evaluate() returns [loss, mae].
loss_h, mae_h = model1.evaluate(X_test_scaled, y_test, verbose = 0)
# "\T" is an invalid escape sequence (it triggered a warning and printed a
# literal backslash); the leading newline was intended.
print(f"\nTest MAE = {mae_h:.4f}\n")

  print(f"\Test MAE = {mae_h:.4f}\n")


\Test MAE = 0.3596



In [14]:
# Collect the per-test range files and order them by the integer after "test-"
# (numeric order, so test-10 comes after test-9).
test_files = glob.glob('processed/*test-*_range.csv')
test_files.sort(
    key = lambda path: int(re.search(r'test-(\d+)_range', path).group(1))
)

In [15]:
# MAE of the reloaded model on each individual test file, keyed by file name.
results = {}
for csv_path in test_files:
    frame = pd.read_csv(csv_path)
    # reuse the scaler fitted on the training split
    features = scaler.transform(frame[inputs].values)
    targets = frame[['X', 'Z']].values
    # evaluate() yields [loss, mae]; only the MAE is recorded
    file_loss, file_mae = model1.evaluate(features, targets, verbose = 0)
    results[os.path.basename(csv_path)] = file_mae

In [16]:
# One line per test file, in insertion (file-number) order.
for fname in results:
    print(f"{fname}: MAE = {results[fname]:.4f}")

02_test-2_range.csv: MAE = 1.3146
03_test-3_range.csv: MAE = 0.5142
04_test-4_range.csv: MAE = 0.8113
05_test-5_range.csv: MAE = 0.6109
06_test-6_range.csv: MAE = 0.5118
07_test-7_range.csv: MAE = 0.1926
08_test-8_range.csv: MAE = 0.5627
09_test-9_range.csv: MAE = 0.3983
10_test-10_range.csv: MAE = 0.4155
11_test-11_range.csv: MAE = 0.1365
12_test-12_range.csv: MAE = 0.1096


In [17]:
# File with the smallest MAE (the first one wins on ties).
best_file, lowest_mae = min(results.items(), key = lambda item: item[1])
print(f"\n {best_file} had the lowest MAE: {lowest_mae:.4f}")


 12_test-12_range.csv had the lowest MAE: 0.1096


# Making bar graph for Hyperparameter Optimization

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import ast 

In [None]:
# Load the saved search results.
# NOTE(review): this rebinds `df`, which held the measurement data earlier in the
# notebook; cells above that use `df` need the data reloaded before being re-run.
df = pd.read_csv('random_search_cv_results_v2.csv')

In [None]:
# Display the loaded cross-validation results table.
df

In [None]:
# The 'params' column is read back from CSV as text; literal_eval parses each
# entry into a Python object so individual settings can be looked up by key.
df['params_dict'] = df['params'].apply(ast.literal_eval)

def make_label(d):
    """Format one hyperparameter combination as a compact, single-line label.

    Args:
        d: Mapping with the keys ``optimizer__learning_rate``,
            ``model__units_layer1``, ``model__units_layer2``, ``epochs`` and
            ``batch_size``.

    Returns:
        A string such as ``"LR = 0.001, HL1 = 8, HL2 = 32, EP = 150, BS = 64"``.

    Raises:
        KeyError: If any of the expected keys is missing from ``d``.
    """
    # Join the parts into one string. A parenthesised, comma-separated list of
    # f-strings is a tuple, which would show up on the axis as a tuple repr.
    parts = [
        f"LR = {d['optimizer__learning_rate']}",
        f"HL1 = {d['model__units_layer1']}",
        f"HL2 = {d['model__units_layer2']}",
        f"EP = {d['epochs']}",
        f"BS = {d['batch_size']}",
    ]
    return ", ".join(parts)

# One horizontal bar per sampled hyperparameter combination, labelled with its
# settings and annotated with its mean cross-validation score.
df['short_label'] = df['params_dict'].apply(make_label)

labels = df['short_label']
scores = df['mean_test_score']
y_pos = range(len(scores))

# Grow the figure with the number of combinations so the tick labels stay legible.
fig, ax = plt.subplots(figsize = (9, max(4, 0.4 * len(scores))))
bars = ax.barh(y_pos, scores, color = 'blue', height = 0.3)

ax.set_yticks(y_pos)
ax.set_yticklabels(labels, fontsize = 10)

# The scores come from a negated-MAE scorer, so they are <= 0. The axis runs from
# 0 on the left to a negative limit on the right. The 0..-1 window is kept and
# only widened when a score (plus room for its annotation) would not fit.
x_far = min(-1.0, float(scores.min()) - 0.2)
ax.set_xlim(0, x_far)

ax.set_xlabel('Mean Test Score', color = 'black', fontstyle = 'italic', fontweight = 'bold')
ax.set_ylabel('Different Hyperparameter Combinations', color = 'black', fontstyle = 'italic', fontweight = 'bold')
ax.set_title('Hyperparameter Optimization - Mean Test Score', fontsize = 14, fontweight='bold')

# Write each score just past the end of its bar.
label_offset = -0.08
for bar in bars:
    bar_width = bar.get_width()
    # A dedicated name is used here: plain `y` holds the target array earlier in the notebook.
    bar_mid = bar.get_y() + bar.get_height() / 2
    ax.text(
        bar_width + label_offset,
        bar_mid,
        f"{bar_width:.3f}",
        va = 'center',
        ha = 'center',
        fontsize = 10,
        color = 'red'
    )

plt.tight_layout()
plt.show()

In [None]:
# df['params_str'] = df['params'].astype(str)
# params = df['params_str']
# scores = df['mean_test_score']

# fig, ax = plt.subplots(figsize = (8, 10))
# y_pos = range(len(scores))
# ax.barh(y_pos, scores)
# # fig, ax = plt.subplots(figsize = (12, 6))
# # ax.bar(range(len(scores)), scores)
# # ax.set_xticks(range(len(scores)))
# # ax.set_xticklabels(params, rotation = 90, fontsize = 8)

# ax.set_yticks(y_pos)
# ax.set_yticklabels(params, fontsize = 8)

# ax.set_xlim(0, -0.6)

# ax.set_xlabel('Mean Test Score')
# ax.set_ylabel('Different Hyperparameter Combinations')
# ax.set_title('Hyperparameter Optimization vs. Mean Test Score')

# plt.tight_layout()
# plt.show()