IMPORTS

In [1]:
# Clone the Git repository and navigate to the project directory
#!git clone https://github.com/DanteMillerDS/MPA_Predictor.git
#%cd MPA_Predictor

# Install required packages
!pip install xgboost

# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.io
import uuid
import torch
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf

# Set random seeds for reproducibility.
# A single constant seeds every framework imported above; torch was previously
# imported but left unseeded.
SEED = 1000
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


PREPROCESS

In [2]:
# Input files keyed by an integer index; each index maps to a list of paths.
file_paths = {0: ["data/s11_dB_freq.txt"]}

def process_file(file_path):
    """Parse a parameter-sweep text export into one DataFrame per section.

    The file is read line by line:

    * a line starting with ``#Parameters`` opens a new section; the text
      between ``{`` and ``}`` is split on ``;`` into ``name=value`` pairs,
    * any other line starting with ``#`` is ignored,
    * blank lines are ignored,
    * every remaining line is split on tabs and stored as one data row.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one DataFrame per section, in file order. Each frame has a
        ``Frequency`` column, a value column named after the part of
        ``file_path`` that precedes ``"_parameter_data_for_"`` (the whole path
        when that marker is absent), an ``ID`` column holding a random
        8-character tag shared by all rows of the section, and one column per
        parameter (spaces removed from the parameter name). All values are
        kept as the strings read from the file.

    Raises:
        ValueError: If a ``#Parameters`` line has no ``{...}`` part, a
            parameter fragment has no ``=``, or a data row appears before the
            first ``#Parameters`` line.
    """
    sections = []  # (params, rows) pairs in file order
    params = None  # parameters of the section currently being read
    rows = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("#Parameters"):
                # A new header closes the previous section, if any.
                if params is not None:
                    sections.append((params, rows))
                params_str = line[line.index("{") + 1:line.index("}")]
                params = {}
                for fragment in params_str.split(';'):
                    if not fragment.strip():
                        continue  # tolerate a trailing ';' or empty braces
                    name, separator, value = fragment.partition('=')
                    if not separator:
                        raise ValueError(
                            f"Malformed parameter {fragment!r} in {file_path}"
                        )
                    params[name.replace(" ", "")] = value.strip()
                rows = []
            elif line.startswith("#"):
                continue
            else:
                stripped = line.strip()
                if not stripped:
                    continue  # a blank line is not a data row
                if params is None:
                    raise ValueError(
                        f"Data row found before any '#Parameters' line in {file_path}"
                    )
                rows.append(stripped.split('\t'))
    if params is not None:
        sections.append((params, rows))

    s_parameter_name = file_path.split("_parameter_data_for_")[0]
    frames = []
    for section_params, section_rows in sections:
        df = pd.DataFrame(section_rows, columns=["Frequency", s_parameter_name])
        df["ID"] = str(uuid.uuid4())[:8]
        # Broadcast each parameter value to every row of the section.
        for name, value in section_params.items():
            df[name] = value
        frames.append(df)
    return frames
# Parse every configured file and stack its sections into one long frame per file.
all_combined_data = [
    pd.concat(process_file(path), ignore_index=True)
    for paths in file_paths.values()
    for path in paths
]

# Place the per-file frames side by side in batches of `group_size`,
# renumbering the rows of each batch from zero.
group_size = 4
array = [
    pd.concat(all_combined_data[start:start + group_size], axis=1).reset_index(drop=True)
    for start in range(0, len(all_combined_data), group_size)
]

# Stack the batches vertically, renumber the rows, and keep only the first
# occurrence of any repeated column label.
stacked = pd.concat(array, axis=0).reset_index(drop=True)
training_dataframe = stacked.loc[:, ~stacked.columns.duplicated()]

In [3]:
# List the column labels of the combined frame.
print(training_dataframe.columns)

Index(['Frequency', 'data/s11_dB_freq.txt', 'ID', 'rprobe', 'rin', 'er22',
       'er2', 'cl', 'r0', 'w', 'L', 't', 'h', 'wy', 'wx', 'er11', 'er1'],
      dtype='object')


In [4]:
# Summarise every column. process_file keeps values as strings, so the summary
# reports count / unique / top / freq rather than numeric statistics.
training_dataframe.describe()

Unnamed: 0,Frequency,data/s11_dB_freq.txt,ID,rprobe,rin,er22,er2,cl,r0,w,L,t,h,wy,wx,er11,er1
count,100100.0,100100.0,100100,100100.0,100100.0,100100.0,100100.0,100100,100100.0,100100.0,100100.0,100100.0,100100.0,100100,100100,100100.0,100100.0
unique,1001.0,100100.0,100,1.0,1.0,1.0,1.0,1,1.0,100.0,100.0,1.0,1.0,1,1,1.0,1.0
top,1.0,-0.16391811348113,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55
freq,100.0,1.0,1001,100100.0,100100.0,100100.0,100100.0,100100,100100.0,1001.0,1001.0,100100.0,100100.0,100100,100100,100100.0,100100.0


In [5]:
# Preview the first five rows of the combined frame.
training_dataframe.head(5)

Unnamed: 0,Frequency,data/s11_dB_freq.txt,ID,rprobe,rin,er22,er2,cl,r0,w,L,t,h,wy,wx,er11,er1
0,1.0,-0.16391811348113,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55
1,1.0089999437332,-0.13135824869362,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55
2,1.0180000066757,-0.068439952547044,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55
3,1.0269999504089,-0.013417551672601,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55
4,1.0360000133514,-0.0001524579701212,6bd7f8e4,0.4,1.5,0.01,2.55,10,1.8,41.662565887693,66.660105420309,0.2,1.6,200,200,0.01,2.55


In [6]:
# Columns kept for modelling; every other column (including ID) is dropped.
needed_columns = [
    "Frequency",
    "data/s11_dB_freq.txt",
    "h",
    "t",
    "er11",
    "er1",
    "w",
    "L",
]
training_dataframe = training_dataframe.loc[:, needed_columns]

MODELS

In [None]:
# Training hyperparameters.
LR = 1e-4  # presumably the optimizer learning rate
L2 = 1e-4  # factor passed to the l2 kernel regularizers
WD = 1e-6  # weight_decay passed to the Adam optimizer

In [None]:
# Keep every column of the frame whose label is not in the exclusion set.
excluded = {"Permittivity_Real", "Permittivity_Imaginary", "ID", "Orientation", "Frequency"}
columns = [label for label in training_dataframe.columns if label not in excluded]
columns

In [None]:
# Build the train / validation / test arrays.
#
# NOTE(review): this cell used to select 'Permittivity_Real',
# 'Permittivity_Imaginary' and 'Orientation'. None of them exist in
# training_dataframe (see the column listing printed earlier), so the cell
# raised KeyError. 'w' and 'L' are used as the two targets here because they
# are the only columns with more than one distinct value per simulation in the
# describe() output — confirm this is the intended prediction task.
target_columns = ['w', 'L']
feature_columns = [label for label in columns if label not in target_columns]
# Stratify on whichever of the original grouping columns the frame really has.
group_columns = [
    label for label in ('Orientation', 'Frequency')
    if label in training_dataframe.columns
]

X_all = training_dataframe[feature_columns].values
y_all = training_dataframe[target_columns].values

# Random row order; it decides the order of rows inside each group before the
# per-group split below.
selected_indices = np.random.permutation(len(training_dataframe))

# Collect, per group key, the ORIGINAL row positions of its members (in
# shuffled order). groupby replaces the former per-row .iloc loop.
shuffled_keys = (
    training_dataframe[group_columns]
    .iloc[selected_indices]
    .reset_index(drop=True)
)
grouped_indices = {}
for key, group in shuffled_keys.groupby(group_columns, sort=False):
    # group.index holds positions within the shuffled order; map them back.
    grouped_indices[key] = selected_indices[group.index.to_numpy()].tolist()

# Split every group 80 % / 16 % / 4 % into train / validation / test.
X_train_indices = []
X_val_indices = []
X_test_indices = []
for indices in grouped_indices.values():
    train_indices, holdout_indices = train_test_split(indices, test_size=0.2, random_state=42)
    val_indices, test_indices = train_test_split(holdout_indices, test_size=0.2, random_state=42)
    X_train_indices.extend(train_indices)
    X_val_indices.extend(val_indices)
    X_test_indices.extend(test_indices)
X_test_indices.sort()

# The index lists hold original row positions, so they must index the
# unshuffled arrays. Indexing the permuted copies (as before) returned rows
# unrelated to the groups and to training_dataframe.loc[<indices>].
X_train = X_all[X_train_indices].astype(float)
y_train = y_all[X_train_indices].astype(float)
X_val = X_all[X_val_indices].astype(float)
y_val = y_all[X_val_indices].astype(float)
X_test = X_all[X_test_indices].astype(float)
y_test = y_all[X_test_indices].astype(float)

In [None]:
# Tabulate how many rows ended up in each split, plus their total.
num_obs_train, num_obs_val, num_obs_test = (len(part) for part in (X_train, X_val, X_test))
total_obs = sum((num_obs_train, num_obs_val, num_obs_test))
data = {
    'Dataset': ['Training', 'Validation', 'Test', 'Total'],
    'Number of Observations': [num_obs_train, num_obs_val, num_obs_test, total_obs],
}
observations_table = pd.DataFrame(data)
print(observations_table)

In [None]:
def plot_split_histograms(split_name, row_indices):
    """Draw one figure of histograms for the rows of a single data split.

    Reads the module-level ``training_dataframe``. A panel is drawn for each
    of 'Orientation' and 'Frequency' that is present in the frame; a column
    that is absent is skipped instead of raising KeyError.

    Args:
        split_name: Label used as the title prefix (e.g. 'Training').
        row_indices: Row labels of ``training_dataframe`` belonging to the split.
    """
    # (column, bins, colour, title suffix, x label)
    panels = [
        ('Orientation', 10, 'blue', 'Histogram of Selected Orientations', 'Orientation'),
        ('Frequency', 20, 'green', 'Histogram of Selected Frequencies', 'Frequency'),
    ]
    panels = [panel for panel in panels if panel[0] in training_dataframe.columns]

    plt.figure(figsize=(15, 10))
    for slot, (column, bins, color, title, xlabel) in enumerate(panels, start=1):
        values = training_dataframe.loc[row_indices, column]
        try:
            # Values are stored as strings; convert so they are binned
            # numerically rather than treated as categories.
            values = pd.to_numeric(values)
        except (ValueError, TypeError):
            pass  # genuinely non-numeric column: plot it as it is
        plt.subplot(2, 2, slot)
        plt.hist(values, bins=bins, color=color, edgecolor='black')
        plt.title(f'{split_name}: {title}')
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


# One figure per split, replacing three copy-pasted plotting blocks.
for split_name, row_indices in (
    ('Training', X_train_indices),
    ('Validation', X_val_indices),
    ('Testing', X_test_indices),
):
    plot_split_histograms(split_name, row_indices)

In [None]:
def rmse_loss(y_true, y_pred):
    """Return the root-mean-squared error between two tensors.

    The squared differences are averaged over every element (no axis is
    given to the reduction), and the square root of that mean is returned.
    """
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Fully connected regression network: four ReLU hidden layers (128-64-32-16),
# each He-uniform initialised and L2-regularised, then a linear output with
# one unit per target column.
# `tf.keras` is used throughout because a bare `keras` name is never imported
# in this notebook (the previous `keras.Sequential` raised NameError).
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],),
                          kernel_initializer='he_uniform',
                          kernel_regularizer=tf.keras.regularizers.l2(L2)),
    tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform',
                          kernel_regularizer=tf.keras.regularizers.l2(L2)),
    tf.keras.layers.Dense(32, activation='relu', kernel_initializer='he_uniform',
                          kernel_regularizer=tf.keras.regularizers.l2(L2)),
    tf.keras.layers.Dense(16, activation='relu', kernel_initializer='he_uniform',
                          kernel_regularizer=tf.keras.regularizers.l2(L2)),
    tf.keras.layers.Dense(y_train.shape[1], activation='linear'),
])

# The learning rate comes from LR. It was previously fed the L2 regularisation
# constant, which only worked because both happen to hold the same value.
nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LR, weight_decay=WD),
                 loss=[rmse_loss],
                 metrics=[tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanSquaredError()])
nn_model.summary()
history = nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
                       epochs=50, batch_size=1001, shuffle=True)

# Training curves: MAE on the left, loss on the right.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['mean_absolute_error'])
plt.plot(history.history['val_mean_absolute_error'])
plt.title('Moon Rock FNN: Mean Absolute Error')
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error')
plt.legend(['Train', 'Validation'])
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Moon Rock FNN: Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'])
plt.tight_layout()
plt.show()

In [None]:
# Gradient-boosted tree regressor, configured through a keyword dictionary.
xgb_params = dict(
    n_jobs=-1,
    device="gpu",
    n_estimators=250,
    eval_metric=mean_absolute_error,
    verbosity=3,
    max_depth=5,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

In [None]:
# NOTE(review): `tabnet` is not imported or defined anywhere in the visible
# notebook, so this cell raises NameError as written. The keyword arguments are
# the same ones passed to xgb.XGBRegressor in the XGBoost cell — confirm which
# TabNet class is intended and whether it accepts them.
tabnet_model = tabnet(n_jobs=-1,device="gpu",n_estimators=250,eval_metric=mean_absolute_error,verbosity=3,max_depth=5)
tabnet_model.fit(X_train, y_train)

In [None]:
# Number of consecutive test rows averaged into one value per target.
# NOTE(review): this assumes X_test is laid out in consecutive runs of
# `batch_size` rows that belong together, and that len(X_test) is a multiple of
# `batch_size` — verify against how the test split is built.
batch_size = 1001


def _mean_per_group(values, group_len):
    """Average a (rows, targets) array over consecutive runs of `group_len` rows."""
    values = np.asarray(values)
    return values.reshape(-1, group_len, values.shape[-1]).mean(axis=1)


y_pred_nn = _mean_per_group(nn_model.predict(X_test), batch_size)
y_pred_xgb = _mean_per_group(xgb_model.predict(X_test), batch_size)
y_pred_tab = _mean_per_group(tabnet_model.predict(X_test), batch_size)

# The ground truth is grouped exactly like the predictions. Previously it was
# averaged across its target columns (axis=1) without the reshape, which left a
# 1-D array and made the `[:, 0]` lookups below raise IndexError. The result
# goes to a new name so the cell can be re-run without reshaping y_test twice.
y_test_mean = _mean_per_group(y_test, batch_size)
y_test_0, y_test_1 = y_test_mean[:, 0], y_test_mean[:, 1]

# NOTE(review): the clipping bounds are hard-coded. If the targets live on a
# different scale (the preview above shows w and L around 41 and 66) every
# prediction is flattened to the upper bound — confirm the intended range.
clip_low, clip_high = 0.0, 0.2

# Plain assignments replace the former `x, _ = value, _` form, which depended on
# IPython's implicit `_` variable.
y_pred_0_nn = np.clip(y_pred_nn[:, 0], clip_low, clip_high)
y_pred_1_nn = np.clip(y_pred_nn[:, 1], clip_low, clip_high)
y_pred_0_xgb = np.clip(y_pred_xgb[:, 0], clip_low, clip_high)
y_pred_1_xgb = np.clip(y_pred_xgb[:, 1], clip_low, clip_high)
y_pred_0_tab = np.clip(y_pred_tab[:, 0], clip_low, clip_high)
y_pred_1_tab = np.clip(y_pred_tab[:, 1], clip_low, clip_high)

In [None]:
# Error metrics for the first target column (index 0) of each model.
mse_xgb = mean_squared_error(y_test_0, y_pred_0_xgb)
mae_xgb = mean_absolute_error(y_test_0, y_pred_0_xgb)
r_squared_xgb = r2_score(y_test_0, y_pred_0_xgb)
mse_nn = mean_squared_error(y_test_0, y_pred_0_nn)
mae_nn = mean_absolute_error(y_test_0, y_pred_0_nn)
r_squared_nn = r2_score(y_test_0, y_pred_0_nn)
mse_tab = mean_squared_error(y_test_0, y_pred_0_tab)
mae_tab = mean_absolute_error(y_test_0, y_pred_0_tab)
r_squared_tab = r2_score(y_test_0, y_pred_0_tab)
print("Metrics for XGBoost Model:")
print(f"MSE: {mse_xgb:.4f}")
print(f"MAE: {mae_xgb:.4f}")
print(f"R-squared (R²): {r_squared_xgb:.4f}")
print("\nMetrics for FNN Model:")
print(f"MSE: {mse_nn:.4f}")
print(f"MAE: {mae_nn:.4f}")
print(f"R-squared (R²): {r_squared_nn:.4f}")
print("\nMetrics for TabNet Model:")
print(f"MSE: {mse_tab:.4f}")
print(f"MAE: {mae_tab:.4f}")
print(f"R-squared (R²): {r_squared_tab:.4f}")

# Actual values versus each model's predictions, with a dashed line at every
# model's mean prediction.
x_positions = range(len(y_test_0))
mean_line_x = [0, len(y_test_0) - 1]
plt.figure(figsize=(6, 4))
plt.plot(x_positions, y_test_0, label='Actual', color='blue', alpha=0.7)
plt.plot(x_positions, y_pred_0_xgb, label='Predicted (XGBoost)', color='green', alpha=0.7)
plt.plot(x_positions, y_pred_0_nn, label='Predicted (FNN)', color='purple', alpha=0.7)
# TabNet gets its own colour; it used to share green with XGBoost.
plt.plot(x_positions, y_pred_0_tab, label='Predicted (TabNet)', color='brown', alpha=0.7)
plt.plot(mean_line_x, [np.mean(y_pred_0_xgb)] * 2, linestyle='dashed', color='red', label='Predicted Mean (XGBoost)')
plt.plot(mean_line_x, [np.mean(y_pred_0_nn)] * 2, linestyle='dashed', color='orange', label='Predicted Mean (FNN)')
# This line used to repeat the FNN mean; it now shows the TabNet mean.
plt.plot(mean_line_x, [np.mean(y_pred_0_tab)] * 2, linestyle='dashed', color='black', label='Predicted Mean (TabNet)')
plt.title('Moon Rock: Simulated Actual and Predicted Real Permittivity')
plt.xlabel('Data Point')
plt.ylabel('Value')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()