<a href="https://colab.research.google.com/github/BP-Development2024/BPredictor/blob/main/BPredictor_4_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.1 Funktionen zum Import der Daten
Hier sind der korrekte Dateipfad und -name sowie die Spalten der Input- und Outputwerte anzugeben, mit denen die ML-Modelle (u. a. das NN) trainiert werden sollen. Der Aufruf erfolgt über die GUI.

In [84]:
import pandas as pd
import numpy as np

def remove_outliers(data, allowed_deviation, max_outlier_cutoff):
    """Drop incomplete rows and rows outside ``mean ± allowed_deviation * std``.

    Rows containing missing values are always removed. Outlier filtering is
    then applied column by column to every numeric column; the statistics of
    each column are computed on the rows that survived the previous columns.

    Args:
        data: DataFrame to clean.
        allowed_deviation: Number of standard deviations a value may lie away
            from its column mean before the row is discarded.
        max_outlier_cutoff: Sentinel value; when ``allowed_deviation`` equals
            it, outlier detection is skipped entirely.

    Returns:
        The filtered DataFrame.
    """
    # Remove rows with missing values first
    data = data.dropna()

    # The maximum slider value means "do not perform outlier detection"
    if allowed_deviation == max_outlier_cutoff:
        print("No outlier detection performed")
        return data

    for column in data.columns:
        # Only numeric columns (bool, int, uint, float, complex) have bounds
        if data[column].dtype.kind not in 'biufc':
            continue

        values = data[column]
        mean = values.mean()
        std = values.std()
        lower_bound = mean - allowed_deviation * std
        upper_bound = mean + allowed_deviation * std

        # Filter with a positional mask instead of dropping by index label:
        # dropping labels would also delete non-outlier rows that happen to
        # share an index label with an outlier.
        is_outlier = ((values < lower_bound) | (values > upper_bound)).to_numpy()
        data = data.loc[~is_outlier]
    return data

def normalize_columns(data):
    """Min-max normalise ``data`` to the range [0, 1].

    Args:
        data: Object providing ``min()`` / ``max()`` and arithmetic; for a
            DataFrame the scaling is done per column.

    Returns:
        Tuple ``(normalized_data, min_val, max_val)``. The minimum and maximum
        are returned unchanged so the normalisation can be reversed later,
        e.g. for the model outputs.
    """
    min_val = data.min()
    max_val = data.max()

    # A constant column has max == min; dividing by that zero range would
    # turn the whole column into NaN. Dividing by 1 instead maps it to 0,
    # which the reverse normalisation maps back to the original constant.
    value_range = max_val - min_val
    if isinstance(value_range, pd.Series):
        value_range = value_range.replace(0, 1)
    elif np.ndim(value_range) == 0 and value_range == 0:
        value_range = 1

    normalized_data = (data - min_val) / value_range
    return normalized_data, min_val, max_val

def reverse_column_normalization(normalized_data, original_min, original_max):
    """Undo a min-max normalisation: ``x * (max - min) + min``.

    Used on model outputs to restore the original units.

    Args:
        normalized_data: DataFrame, 1-D or 2-D numpy array (or a plain
            sequence, which is converted to an array).
        original_min: Minimum values used for the normalisation (pandas
            Series, array, sequence or scalar), one entry per column.
        original_max: Matching maximum values.

    Returns:
        The de-normalised data; a DataFrame stays a DataFrame, everything
        else is returned as produced by the array arithmetic.
    """
    # Use plain arrays so the arithmetic is positional, not label-aligned
    if isinstance(original_min, pd.Series):
        original_min = original_min.values
    elif isinstance(original_min, (list, tuple)):
        original_min = np.asarray(original_min)
    if isinstance(original_max, pd.Series):
        original_max = original_max.values
    elif isinstance(original_max, (list, tuple)):
        original_max = np.asarray(original_max)

    value_range = original_max - original_min

    # Plain sequences have no arithmetic of their own
    if not hasattr(normalized_data, 'ndim'):
        normalized_data = np.asarray(normalized_data)

    # Broadcasting applies the per-column range/offset to DataFrames (across
    # columns), 2-D arrays (along the last axis) and 1-D arrays alike. The
    # result dtype follows normal promotion rules, so integer or float32
    # inputs are no longer truncated by writing into a same-dtype buffer.
    return normalized_data * value_range + original_min

def import_data(max_rows, filename, column_separator, outlierCutoffValue, max_outlier_cutoff):
    """Read a CSV-like text file, clean it and return it as a DataFrame.

    Args:
        max_rows: Maximum number of rows to keep, or ``None`` for no limit.
        filename: Path of the file to read (decoded as latin1).
        column_separator: Separator handed to ``pandas.read_csv``.
        outlierCutoffValue: Allowed deviation (in standard deviations)
            forwarded to ``remove_outliers``.
        max_outlier_cutoff: Sentinel forwarded to ``remove_outliers``.

    Returns:
        The cleaned DataFrame without rows removed by ``remove_outliers`` and
        without columns that hold the same value in every row.
    """
    import io  # local import: only needed for the in-memory CSV buffer

    # Read the file content as a string
    with open(filename, 'r', encoding='latin1') as file:
        file_content = file.read()

    # Replace '_' with '-' so categorical columns can be named correctly
    # during the one-hot encoding later on.
    file_content = file_content.replace('_', '-')

    # With ',' as column separator, '.' is assumed to be the decimal mark
    # already. Otherwise ',' is treated as decimal mark and converted, and
    # ';' is replaced by the requested column separator.
    if column_separator != ',':
        file_content = file_content.replace(',', '.')
        file_content = file_content.replace(';', column_separator)

    # Parse the cleaned text from memory; no temporary file is left behind
    # in the working directory.
    data = pd.read_csv(io.StringIO(file_content), on_bad_lines='skip', sep=column_separator)

    # Rename empty column names to column_n
    data.columns = [f'column_{i}' if col == '' else col for i, col in enumerate(data.columns)]

    datasets_before = data.shape[0]
    # Clean up outliers (and incomplete rows) as specified by the user
    data = remove_outliers(data, outlierCutoffValue, max_outlier_cutoff)
    datasets_removed = datasets_before - data.shape[0]

    # Limit the number of rows only if a limit is given and actually exceeded
    if max_rows is not None and max_rows < len(data):
        data = data.head(max_rows)
    print(f"{datasets_removed} Ausreißer entfernt. Verbliebene Datensätze: {data.shape[0]}")

    # Remove columns with the same value in every row to avoid redundant
    # data. Skipped when no rows are left, as there is no first row to
    # compare against.
    if len(data) > 0:
        data = data.loc[:, (data != data.iloc[0]).any()]

    return data


def process_data(data, input_columns, output_columns):
    """Select input/output columns and min-max normalise them.

    Args:
        data: DataFrame holding all columns.
        input_columns: Positional column indices used as model inputs.
        output_columns: Positional column indices used as model outputs.

    Returns:
        Tuple ``(normalized_input_features, normalized_output_features,
        num_inputs, input_min, input_max, output_min, output_max)``. With a
        single output column the normalised outputs are a 1-D numpy array,
        otherwise a DataFrame. The min/max values allow the normalisation to
        be reversed later on.
    """
    input_features = data.iloc[:, input_columns]
    output_features = data.iloc[:, output_columns]
    num_inputs = input_features.shape[1]

    normalized_input_features, input_min, input_max = normalize_columns(input_features)
    normalized_output_features, output_min, output_max = normalize_columns(output_features)

    # Round-trip the outputs through the reverse normalisation and show the
    # head, so the user can confirm that the original values are restored.
    original_output_features = reverse_column_normalization(normalized_output_features, output_min, output_max)
    print("Original Output Features:")
    print(original_output_features.head())

    # A single output column is handed on as a 1-D array so the ML
    # algorithms can treat single and multiple outputs alike. Selecting the
    # column explicitly keeps the result 1-D even for a single-row frame.
    if normalized_output_features.shape[1] == 1:
        normalized_output_features = normalized_output_features.iloc[:, 0].to_numpy()

    print(f"\n Dataset successfully imported. {normalized_input_features.shape[0]} entries, {num_inputs} input values.")
    return normalized_input_features, normalized_output_features, num_inputs, input_min, input_max, output_min, output_max

# 1.2 Daten in Trainings- und Testdaten aufspalten
Die resultierenden, getrennten Trainings- und Testdaten werden in allen implementierten ML-Algorithmen verwendet. Die Aufteilung erfolgt dabei zufällig mittels der sklearn-Bibliothek. Es ist eine typische Split-Ratio von 0.8/0.2 (Training/Test) festgelegt.

In [85]:
def split_data(normalized_input_features, normalized_output_features, test_size=0.2, random_state=42):
    """Randomly split inputs and outputs into training and test sets.

    Args:
        normalized_input_features: Input data to split.
        normalized_output_features: Output data, split consistently with the inputs.
        test_size: Share of the data reserved for testing (default 0.2).
        random_state: Seed making the split reproducible (default 42).

    Returns:
        Tuple ``(input_train, input_test, output_train, output_test)``.
    """
    from sklearn.model_selection import train_test_split

    input_train, input_test, output_train, output_test = train_test_split(
        normalized_input_features,
        normalized_output_features,
        test_size=test_size,
        random_state=random_state,
    )
    return input_train, input_test, output_train, output_test

# 1.3 Klassifikations-Inputs per one-hot-encoding umwandeln
Dies ist notwendig, da sonst kategoriale Inputs nicht angemessen verarbeitet werden können. So können sowohl nicht-numerische als auch numerische Klassifizierungen verarbeitet werden. In den Funktionen wird dabei angenommen, dass eine Spalte, die nur eine begrenzte Anzahl von Integers beinhaltet, eine Klassifizierung darstellt.

In [86]:
def detect_categorical_columns(df, max_categories=100):
    """Return the positional indices of columns that look categorical.

    A column counts as categorical when it has ``object``/``category`` dtype,
    or when it is numeric, contains only whole numbers, has more than one and
    at most ``max_categories`` distinct values, and is not already a plain
    0/1 indicator column.

    Args:
        df: DataFrame to inspect.
        max_categories: Largest number of distinct whole-number values a
            numeric column may have to still be treated as a classification.

    Returns:
        List of column positions: the ``object``/``category`` columns first,
        followed by the whole-number columns.
    """
    # Columns with 'object' or 'category' data types
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    integer_columns = []
    for col in df.select_dtypes(include=['number']).columns:
        column = df[col]
        # Only columns made up entirely of whole numbers are candidates
        if not (pd.api.types.is_integer_dtype(column) or (column % 1 == 0).all()):
            continue

        unique_values = column.unique()
        # Too many distinct values: assumed not to be a classification.
        # (The count of distinct values is what matters here, not their size.)
        if len(unique_values) > max_categories:
            continue
        # Constant columns and pure 0/1 columns need no encoding
        if len(unique_values) > 1 and set(unique_values) != {0, 1}:
            integer_columns.append(col)

    all_categorical_columns = categorical_columns + integer_columns
    return [df.columns.get_loc(col) for col in all_categorical_columns]


def encode_classification(input):
    """One-hot encode the categorical columns of ``input``.

    Columns holding a single unique value are removed first, since they give
    the ML algorithms no useful information. Categorical columns are found
    with ``detect_categorical_columns``.

    Args:
        input: DataFrame with the raw input columns.

    Returns:
        DataFrame with the one-hot encoded categorical columns first, followed
        by the untouched continuous columns. If no categorical column is
        found, the filtered DataFrame is returned as is.
    """
    from sklearn.preprocessing import OneHotEncoder

    # Drop columns that contain only a single unique value
    input = input.loc[:, input.nunique() > 1]

    # Identify the columns on which one-hot encoding should be used
    cat_features = detect_categorical_columns(input)
    if not cat_features:
        # Nothing to encode: avoid fitting the encoder on a zero-column frame
        return input

    # Split into categorical and continuous columns
    categorical = input.iloc[:, cat_features]
    continuous = input.drop(input.columns[cat_features], axis=1)

    # One-hot encode the categorical columns
    encoder = OneHotEncoder(sparse_output=False, categories='auto')
    cat_input_encoded = encoder.fit_transform(categorical)

    # Wrap the encoded array in a DataFrame that shares the input's index
    cat_columns_encoded = encoder.get_feature_names_out(categorical.columns)
    cat_df_encoded = pd.DataFrame(cat_input_encoded, columns=cat_columns_encoded, index=input.index)

    # Encoded categorical columns first, continuous columns after
    return pd.concat([cat_df_encoded, continuous], axis=1)

# 2.1 Erstellung und Training des FFNN

In [87]:
# Import Libraries used for Neural Networks. Here, primarily tensorflow with keras is used.
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datetime import datetime
import time
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from tensorflow.keras.optimizers import Adam

def NN(input_train, input_test, output_train, output_test, num_inputs, folder, output_min, output_max, epochs=30):
    """Train a feed-forward neural network, evaluate it on the test set and save it.

    Args:
        input_train: Normalised training inputs.
        input_test: Normalised test inputs.
        output_train: Normalised training targets; a 1-D array (single output)
            is expanded to one column.
        output_test: Normalised test targets, treated like ``output_train``.
        num_inputs: Number of input features (size of the input layer).
        folder: Directory prefix used for the saved model file name.
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(filename, timeNN, r2NN, maeNormNN, mseNormNN, maeNN, mseNN)``:
        path of the saved model, training time in seconds, and per-output
        R², MAE/MSE on normalised values and MAE/MSE in original units.
    """
    # Start the timer to measure the training time
    starttime = time.perf_counter()

    # With a single output the targets arrive as 1-D arrays; add a column
    # axis so training and the metrics below work on 2-D data.
    if len(output_train.shape) == 1:
        num_outputs = 1
        output_train = np.expand_dims(output_train, axis=1)
        output_test = np.expand_dims(output_test, axis=1)
    else:
        num_outputs = output_train.shape[1]

    # Structure of the FFNN (Feed-Forward Neural Network)
    model = keras.Sequential([
        layers.Input(shape=(num_inputs,)),  # input layer
        layers.Dense(128, activation='relu'),  # first layer after input
        layers.Dense(256, activation='relu'),  # first hidden layer
        layers.Dropout(0.1),  # dropout layer for increased robustness
        layers.Dense(128, activation='relu'),  # more hidden layers...
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(num_outputs, activation='linear'),  # linear output layer
    ])

    # Compile the model with the specified optimizer and evaluation metrics
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    # Optional TensorBoard visualisation: create the callback below and use
    # the alternative fit call instead of the one that follows.
    #   log_dir = "logs/fit/"
    #   tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    #   model.fit(input_train, output_train, epochs=epochs, batch_size=32, verbose=2, callbacks=[tensorboard_callback])
    # Afterwards, in a notebook cell:
    #   %load_ext tensorboard
    #   %tensorboard --logdir logs/fit

    # Train the model
    model.fit(input_train, output_train, epochs=epochs, batch_size=32, verbose=0)

    # Stop the timer right after fitting, as the other model functions do,
    # so the reported training times are comparable.
    timeNN = time.perf_counter() - starttime

    # Evaluate loss and MAE on the test set (Keras reports them on the console)
    model.evaluate(input_test, output_test)

    # Predictions on the test set are the basis for all metrics below
    output_pred = model.predict(input_test, verbose=0)

    # Reverse normalization for predictions and actual values
    output_pred_originalNN = reverse_column_normalization(output_pred, output_min, output_max)
    output_test_original = reverse_column_normalization(output_test, output_min, output_max)

    # Error metrics on the normalized outputs
    mseNormNN = mean_squared_error(output_test, output_pred, multioutput='raw_values')
    maeNormNN = mean_absolute_error(output_test, output_pred, multioutput='raw_values')

    # Error metrics on the reverse-normalized outputs, i.e. in original units
    mseNN = mean_squared_error(output_test_original, output_pred_originalNN, multioutput='raw_values')
    maeNN = mean_absolute_error(output_test_original, output_pred_originalNN, multioutput='raw_values')

    # R² computed on the reverse-normalized values
    r2NN = r2_score(output_test_original, output_pred_originalNN, multioutput='raw_values')

    # Save the model under a name containing the current date and time
    current_datetime = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{folder}/nn_model_{current_datetime}.keras'
    model.save(filename)

    return filename, timeNN, r2NN, maeNormNN, mseNormNN, maeNN, mseNN

# 2.2 Testen des FFNN mit Test- und eigenen Werten:

In [88]:
def testNN(model, input_test, output_test, output_min, output_max):
    """Predict the first (up to 20) test samples with a saved FFNN.

    The goal is to give the user tangible examples right after training.

    Args:
        model: Path of the saved Keras model; this should be the model that
            was trained on data matching ``input_test``.
        input_test: Normalised test inputs.
        output_test: Normalised test targets (not needed for the predictions;
            kept for a uniform signature across the test functions).
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        List with one de-normalised prediction per displayed test sample.
    """
    # Show at most 20 samples, or all of them if the test set is smaller
    testlaenge = min(20, len(input_test))
    if testlaenge == 0:
        return []

    # Load the specified model
    TestModel = tf.keras.models.load_model(model)

    # Predict the selected samples and restore the original units
    predictions = TestModel.predict(input_test[:testlaenge], verbose=0)
    predictions_originalNN = reverse_column_normalization(predictions, output_min, output_max)

    return [predictions_originalNN[i] for i in range(testlaenge)]

# 3.1 Training mit Random Forest Methode

In [89]:
# Import Libraries used for RF Regressors and evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from tqdm import tqdm # could be used for visualization of training progress, but is more confusing than helpful in the GUI
import joblib
from joblib import parallel_backend
from datetime import datetime
import time

def RF(input_train, input_test, output_train, output_test, num_inputs, folder, output_min, output_max):
    """Fit a random forest regressor, score it on the test set and save it.

    Args:
        input_train: Normalised training inputs.
        input_test: Normalised test inputs.
        output_train: Normalised training targets.
        output_test: Normalised test targets.
        num_inputs: Not used by this function.
        folder: Directory prefix used for the saved model file name.
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        Tuple ``(filename, timeRF, r2RF, maeNormRF, mseNormRF)``: path of the
        saved model, training time in seconds, per-output R² (on
        de-normalised values) and per-output MAE/MSE on normalised values.
    """
    t_start = time.perf_counter()

    # Forest of 100 trees with a fixed seed, fitted on all available cores
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    with parallel_backend('loky', n_jobs=-1):
        forest.fit(input_train, output_train)

    # Only construction and fitting count as training time
    timeRF = time.perf_counter() - t_start

    # Test-set predictions, plus both sides converted back to original units
    predicted = forest.predict(input_test)
    predicted_real = reverse_column_normalization(predicted, output_min, output_max)
    actual_real = reverse_column_normalization(output_test, output_min, output_max)

    per_output = {'multioutput': 'raw_values'}

    # Errors on the normalised scale (these are returned)
    mseNormRF = mean_squared_error(output_test, predicted, **per_output)
    maeNormRF = mean_absolute_error(output_test, predicted, **per_output)

    # Errors in original units (computed here but not part of the return value)
    mseRF = mean_squared_error(actual_real, predicted_real, **per_output)
    maeRF = mean_absolute_error(actual_real, predicted_real, **per_output)

    # R² on the de-normalised values
    r2RF = r2_score(actual_real, predicted_real, **per_output)

    # Persist the model under a timestamped file name
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{folder}/rf_model_{stamp}.pkl'
    joblib.dump(forest, filename)

    return filename, timeRF, r2RF, maeNormRF, mseNormRF

# 3.2 Testen des RF-Algorithmus

In [90]:
import joblib

def testRF(model, input_test, output_test, output_min, output_max):
    """Predict the first (up to 20) test samples with a saved random forest.

    The goal is to give the user tangible examples right after training.

    Args:
        model: Path of the model file written by ``joblib.dump``.
        input_test: Normalised test inputs.
        output_test: Normalised test targets (not needed for the predictions;
            kept for a uniform signature across the test functions).
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        List with one de-normalised prediction per displayed test sample.
    """
    # Show at most 20 samples, or all of them if the test set is smaller
    testlaenge = min(20, len(input_test))
    if testlaenge == 0:
        return []

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    TestRF = joblib.load(model)

    # Predict the selected samples and restore the original units
    output_pred = TestRF.predict(input_test[:testlaenge])
    output_pred_originalRF = reverse_column_normalization(output_pred, output_min, output_max)

    return [output_pred_originalRF[i] for i in range(testlaenge)]

# 4.1 Training mit DT mit Bagging

In [91]:
# Import libraries for Decision Tree and Bagging
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
from datetime import datetime
import time

def DT_Bagging(input_train, input_test, output_train, output_test, num_inputs, folder, output_min, output_max):
    """Fit a bagging ensemble of decision trees, score it and save it.

    Args:
        input_train: Normalised training inputs.
        input_test: Normalised test inputs.
        output_train: Normalised training targets.
        output_test: Normalised test targets.
        num_inputs: Not used by this function.
        folder: Directory prefix used for the saved model file name.
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        Tuple ``(filename, timeDTBagging, r2DTBagging, maeNormDTBagging,
        mseNormDTBagging)``: path of the saved model, training time in
        seconds, per-output R² (on de-normalised values) and per-output
        MAE/MSE on normalised values.
    """
    t_start = time.perf_counter()

    # 100 bagged decision trees with a fixed seed, using all available cores
    ensemble = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    )
    ensemble.fit(input_train, output_train)

    # Only construction and fitting count as training time
    timeDTBagging = time.perf_counter() - t_start

    # Test-set predictions, plus both sides converted back to original units
    predicted = ensemble.predict(input_test)
    predicted_real = reverse_column_normalization(predicted, output_min, output_max)
    actual_real = reverse_column_normalization(output_test, output_min, output_max)

    per_output = {'multioutput': 'raw_values'}

    # Errors on the normalised scale (these are returned)
    mseNormDTBagging = mean_squared_error(output_test, predicted, **per_output)
    maeNormDTBagging = mean_absolute_error(output_test, predicted, **per_output)

    # Errors in original units (computed here but not part of the return value)
    mseDTBagging = mean_squared_error(actual_real, predicted_real, **per_output)
    maeDTBagging = mean_absolute_error(actual_real, predicted_real, **per_output)

    # R² on the de-normalised values
    r2DTBagging = r2_score(actual_real, predicted_real, **per_output)

    # Persist the model under a timestamped file name
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{folder}/dt_bagging_model_{stamp}.pkl'
    joblib.dump(ensemble, filename)

    return filename, timeDTBagging, r2DTBagging, maeNormDTBagging, mseNormDTBagging

# 4.2 Testen des DT mit Bagging

In [92]:
import joblib

def testDT_Bagging(model, input_test, output_test, output_min, output_max):
    """Predict the first (up to 20) test samples with a saved bagging ensemble.

    The goal is to give the user tangible examples right after training.

    Args:
        model: Path of the model file written by ``joblib.dump``.
        input_test: Normalised test inputs.
        output_test: Normalised test targets (not needed for the predictions;
            kept for a uniform signature across the test functions).
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        List with one de-normalised prediction per displayed test sample.
    """
    # Show at most 20 samples, or all of them if the test set is smaller
    testlaenge = min(20, len(input_test))
    if testlaenge == 0:
        return []

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    TestDTBagging = joblib.load(model)

    # Predict the selected samples and restore the original units
    output_pred = TestDTBagging.predict(input_test[:testlaenge])
    output_pred_originalDTBagging = reverse_column_normalization(output_pred, output_min, output_max)

    return [output_pred_originalDTBagging[i] for i in range(testlaenge)]

# 5.1 SVM-Modell erstellen und trainieren

In [93]:
 # Import the libraries for the Support Vector Machine Regressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
from datetime import datetime
import time
import numpy as np

def SVM(input_train, input_test, output_train, output_test, num_inputs, folder, output_min, output_max):
    """Fit a support vector regressor, score it on the test set and save it.

    A single SVR is used for one output; for several outputs one SVR per
    output is trained through ``MultiOutputRegressor``.

    Args:
        input_train: Normalised training inputs.
        input_test: Normalised test inputs.
        output_train: Normalised training targets; a 1-D array (single output)
            is expanded to one column.
        output_test: Normalised test targets, treated like ``output_train``.
        num_inputs: Not used by this function.
        folder: Directory prefix used for the saved model file name.
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        Tuple ``(filename, timeSVM, r2SVM, maeNormSVM, mseNormSVM)``: path of
        the saved model, training time in seconds, per-output R² (on
        de-normalised values) and per-output MAE/MSE on normalised values.
    """
    # Start the timer
    starttime = time.perf_counter()

    # Work with 2-D targets throughout, also for a single output
    if len(output_train.shape) == 1:
        output_train = np.expand_dims(output_train, axis=1)
        output_test = np.expand_dims(output_test, axis=1)

    # 'linear', 'poly' and 'sigmoid' kernels are also possible, but rbf has
    # been the most reliable so far.
    if output_train.shape[1] > 1:
        # Several outputs: wrap the SVR so that one regressor is fit per output
        model = MultiOutputRegressor(SVR(kernel='rbf'))
        model.fit(input_train, output_train)
        predictions = model.predict(input_test)
    else:
        # Single output: SVR expects a flat target vector. np.asarray makes
        # this work for a single-column DataFrame as well.
        model = SVR(kernel='rbf')
        model.fit(input_train, np.asarray(output_train).ravel())
        predictions = np.expand_dims(model.predict(input_test), axis=1)

    # Evaluate the training time (includes the test-set prediction above)
    timeSVM = time.perf_counter() - starttime

    # Reverse normalization for predictions and actual values
    predictions_original = reverse_column_normalization(predictions, output_min, output_max)
    output_test_original = reverse_column_normalization(output_test, output_min, output_max)

    # Error metrics on the normalized outputs
    mseNormSVM = mean_squared_error(output_test, predictions, multioutput='raw_values')
    maeNormSVM = mean_absolute_error(output_test, predictions, multioutput='raw_values')

    # R² computed on the reverse-normalized values
    r2SVM = r2_score(output_test_original, predictions_original, multioutput='raw_values')

    # Save whichever model was trained under a timestamped file name
    current_datetime = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{folder}/svm_model_{current_datetime}.pkl'
    joblib.dump(model, filename)

    return filename, timeSVM, r2SVM, maeNormSVM, mseNormSVM


# 5.2 Testen der SVM

In [94]:
import joblib

def testSVM(model, input_test, output_test, output_min, output_max):
    """Predict the first (up to 20) test samples with a saved SVM regressor.

    The goal is to give the user tangible examples right after training.

    Args:
        model: Path of the model file written by ``joblib.dump``.
        input_test: Normalised test inputs.
        output_test: Normalised test targets (not needed for the predictions;
            kept for a uniform signature across the test functions).
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        List with one de-normalised prediction per displayed test sample.
    """
    # Show at most 20 samples, or all of them if the test set is smaller
    testlaenge = min(20, len(input_test))
    if testlaenge == 0:
        return []

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    TestSVM = joblib.load(model)

    # Predict the selected samples and restore the original units
    output_pred = TestSVM.predict(input_test[:testlaenge])
    output_pred_originalSVM = reverse_column_normalization(output_pred, output_min, output_max)

    return [output_pred_originalSVM[i] for i in range(testlaenge)]

# 6.1 Linear Regression Modell erstellen und trainieren

In [95]:
# Import the libraries needed for linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
from datetime import datetime
import time

def LR(input_train, input_test, output_train, output_test, num_inputs, folder, output_min, output_max):
    """Fit a linear regression, score it on the test set and save it.

    Args:
        input_train: Normalised training inputs.
        input_test: Normalised test inputs.
        output_train: Normalised training targets.
        output_test: Normalised test targets.
        num_inputs: Not used by this function.
        folder: Directory prefix used for the saved model file name.
        output_min: Minimum values used to normalise the outputs.
        output_max: Maximum values used to normalise the outputs.

    Returns:
        Tuple ``(filename, timeLR, r2LR, maeNormLR, mseNormLR)``: path of the
        saved model, training time in seconds, per-output R² (on
        de-normalised values) and per-output MAE/MSE on normalised values.
    """
    t_start = time.perf_counter()

    regressor = LinearRegression()
    regressor.fit(input_train, output_train)

    # Only construction and fitting count as training time
    timeLR = time.perf_counter() - t_start

    # Test-set predictions, plus both sides converted back to original units
    predicted = regressor.predict(input_test)
    predicted_real = reverse_column_normalization(predicted, output_min, output_max)
    actual_real = reverse_column_normalization(output_test, output_min, output_max)

    per_output = {'multioutput': 'raw_values'}

    # Errors on the normalised scale (these are returned)
    mseNormLR = mean_squared_error(output_test, predicted, **per_output)
    maeNormLR = mean_absolute_error(output_test, predicted, **per_output)

    # Errors in original units (computed here but not part of the return value)
    mseLR = mean_squared_error(actual_real, predicted_real, **per_output)
    maeLR = mean_absolute_error(actual_real, predicted_real, **per_output)

    # R² on the de-normalised values
    r2LR = r2_score(actual_real, predicted_real, **per_output)

    # Persist the model under a timestamped file name
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'{folder}/lr_model_{stamp}.pkl'
    joblib.dump(regressor, filename)

    return filename, timeLR, r2LR, maeNormLR, mseNormLR

# 6.2 Testen der Linearen Regression

In [96]:
import joblib
def testLR(model, input_test, output_test, output_min, output_max):
    """Predict the first (at most 20) test samples with a saved Linear Regression model.

    The goal is to provide the user with tangible example predictions
    directly after training has finished.

    Args:
        model: Path of the saved model file; it is loaded with ``joblib.load``.
        input_test: Normalized test inputs; must support ``len()`` and slicing.
        output_test: Test outputs. Not needed to produce the predictions; the
            parameter is kept so existing callers keep working.
        output_min: Minimum value(s) used to reverse the output normalization.
        output_max: Maximum value(s) used to reverse the output normalization.

    Returns:
        list: One reverse-normalized prediction per displayed test sample.
    """
    # Show at most 20 samples, bounded by the number of available test samples.
    # (Using "len(input_test) - 1" here would silently drop the last sample.)
    testlaenge = min(20, len(input_test))
    if testlaenge == 0:
        # Nothing to predict; avoid calling predict() on an empty input.
        return []

    # Load the specified model.
    # NOTE(review): joblib.load unpickles the file - only load model files you trust.
    TestLR = joblib.load(model)

    # Make predictions for the selected samples
    output_pred = TestLR.predict(input_test[:testlaenge])

    # Reverse the normalization so the predictions carry the original units
    output_pred_originalLR = reverse_column_normalization(output_pred, output_min, output_max)

    # Collect the predictions sample by sample
    return [output_pred_originalLR[i] for i in range(testlaenge)]

# 7.1 Performanceparameter und Testwerte der verschiedenen ML-Modelle anzeigen

In [97]:
import numpy as np
def print_results(res_NN, res_RF, res_DTb, res_SVM, res_LR, test_NN, test_RF, test_DTb, test_SVM, test_LR, output_name, output_test, output_min, output_max):
    """Print the performance metrics and example predictions of all ML models.

    Args:
        res_NN, res_RF, res_DTb, res_SVM, res_LR: Result sequence of each
            model. Index 1 is printed as training time, index 2 as R²,
            index 3 as mean absolute error and index 4 as mean squared error.
        test_NN, test_RF, test_DTb, test_SVM, test_LR: Example predictions of
            each model (scalars, or 1-D numpy arrays holding one value per
            output). A first value of -1 marks "model was not trained".
        output_name: List with the names of the output columns.
        output_test: Normalized actual output values of the test dataset.
        output_min: Minimum value(s) used to reverse the output normalization.
        output_max: Maximum value(s) used to reverse the output normalization.
    """
    max_rows = 20  # print at most this many example rows per output

    def has_predictions(test_values):
        """Return a true value unless test_values is the "not trained" marker."""
        if len(test_values) == 0:
            return False
        first = test_values[0]
        if isinstance(first, np.ndarray) and first.ndim == 1:
            return first[0] != -1  # multi-value rows: compare the first element
        return first != -1

    def prediction_at(test_values, row, column):
        """Return the rounded prediction for one test row and one output column."""
        if isinstance(test_values[0], np.ndarray) and test_values[0].ndim == 1:
            return round(float(test_values[row][column]), 3)
        # Scalar predictions: there is only one value per row
        return round(float(test_values[row]), 3)

    if len(output_name) == 1:
        output_name = output_name[0]

    # print the performance metrics (obtained with the test dataset) of all ML-algorithms
    print(f"Outputgröße: {output_name}")
    print(f"Trainingszeit Vergleich (in s): NN: {round(res_NN[1], 2)};   RF: {round(res_RF[1], 2)};   DT (bagging): {round(res_DTb[1], 2)};   SVM: {round(res_SVM[1],2)};   LR: {round(res_LR[1],2)}")
    print(f"R² Vergleich: NN: {res_NN[2]};    RF: {res_RF[2]};   DT (bagging): {res_DTb[2]};   SVM: {res_SVM[2]};   LR: {res_LR[2]}")
    print(f"Mean Absolute Error Vergleich: NN: {res_NN[3]};    RF: {res_RF[3]};   DT (bagging): {res_DTb[3]};   SVM: {res_SVM[3]};   LR: {res_LR[3]}")
    print(f"Mean Squared Error Vergleich: NN: {res_NN[4]};    RF: {res_RF[4]};   DT (bagging): {res_DTb[4]};   SVM: {res_SVM[4]};   LR: {res_LR[4]}")

    # perform reverse normalization to obtain the original "correct" values from the test dataset
    output_test_original = reverse_column_normalization(output_test[:max_rows], output_min, output_max)
    if not isinstance(output_test_original, pd.DataFrame):
        # numpy arrays are wrapped so that both cases can be indexed with .iloc
        output_test_original = pd.DataFrame(output_test_original)
    num_columns = output_test_original.shape[1]

    all_tests = (test_NN, test_RF, test_DTb, test_SVM, test_LR)
    trained = [has_predictions(values) for values in all_tests]

    # Number of rows to print: bounded by the actual values and by every model
    # that delivered predictions. Deriving it from the NN alone would print
    # nothing whenever the NN was not trained.
    available = [len(values) for values, ok in zip(all_tests, trained) if ok]
    testlaenge = min(available + [len(output_test_original)]) if available else 0

    for k in range(num_columns):
        # print the test values of output k together with the predictions of every model
        for i in range(testlaenge):
            actual_value = round(output_test_original.iloc[i, k], 3)
            # NaN stands for "no prediction was made" (model not trained and tested)
            prediction_nn, prediction_rf, prediction_dtb, prediction_svm, prediction_lr = (
                prediction_at(values, i, k) if ok else np.nan
                for values, ok in zip(all_tests, trained)
            )

            # Now, print all the collected predictions
            print(f'Testwert: {actual_value}, Vorhersage NN: {prediction_nn}, Vorhersage RF: {prediction_rf}, Vorhersage DT (bagging): {prediction_dtb}, Vorhersage SVM: {prediction_svm}, Vorhersage LR: {prediction_lr}')


# 7.2 Auswertung der Performance-Parameter und Beschreibung der Modelle im CSV-Format speichern

In [98]:
import pandas as pd
import os

def round_list(x, decimal_places):
    """Round a number, a python list or a numpy array to ``decimal_places``.

    Returns:
        - numpy array: a python list with every element rounded,
        - list with exactly one element: that element, rounded (as a scalar),
        - any other list: a new list with every element rounded,
        - anything else: the value itself, rounded.
    """
    if isinstance(x, np.ndarray):
        # numpy rounds element-wise; convert to a plain python list afterwards
        return np.round(x, decimal_places).tolist()

    if not isinstance(x, list):
        # a single number
        return round(x, decimal_places)

    rounded = [round(value, decimal_places) for value in x]
    # a one-element list is collapsed to its single (rounded) value
    return rounded[0] if len(rounded) == 1 else rounded


def save_results(res_NN, res_RF, res_DTb, res_SVM, res_LR, output_name, datensatz_groesse, num_inputs, folder):
  """Write the performance results of one training run to ``{folder}/results.csv``.

  One row is written per call. If the file already exists the row is appended
  without a header, otherwise the file is created with a header. Keeping all
  runs of a project in one file allows comprehensive diagrams to be generated.

  Args:
      res_NN, res_RF, res_DTb, res_SVM, res_LR: Result sequence of each model;
          index 1 is stored as training time, indices 2/3/4 as R²/MAE/MSE
          (rounded to 5 decimals with ``round_list``).
      output_name: Name(s) of the output column(s).
      datensatz_groesse: Size of the dataset.
      num_inputs: Number of inputs.
      folder: Target folder of the csv file.
  """
  model_results = (
      ("NN", res_NN),
      ("RF", res_RF),
      ("DT (bagging)", res_DTb),
      ("SVM", res_SVM),
      ("LR", res_LR),
  )

  # General information first, then the training times, then every metric for every model
  row = {
      "Größe Datensatz": datensatz_groesse,
      "Anzahl Inputs": num_inputs,
      "Outputgröße(n)": output_name,
  }
  for label, res in model_results:
    row[f"Trainingszeit {label}"] = res[1]
  for metric, position in (("R²", 2), ("MAE", 3), ("MSE", 4)):
    for label, res in model_results:
      row[f"{metric} {label}"] = round_list(res[position], 5)

  csv_file = f'{folder}/results.csv'

  # Append to an existing file (no header), otherwise create it (with header)
  append = os.path.isfile(csv_file)
  frame = pd.DataFrame([row])
  frame.to_csv(csv_file, mode='a' if append else 'w', header=not append, index=False)

  print(f"Results saved to {csv_file}")

def model_table(res_NN, res_RF, res_DTb, res_SVM, res_LR, output_name, datensatz_groesse, num_inputs, input_names, model_folder, classifications, in_min, in_max, out_min, out_max):
    """Record the properties of the trained models in ``{model_folder}/models.csv``.

    One row per model type is written (appended if the file already exists).
    Each row holds the model type, the model name (index 0 of the result), its
    R² value, dataset size, number of inputs, input/output names, the
    classification info and the min/max values that are required to reverse
    the normalization when predictions are made later.

    Args:
        res_NN, res_RF, res_DTb, res_SVM, res_LR: Result sequence of each model.
        output_name: Name(s) of the output column(s).
        datensatz_groesse: Size of the training dataset.
        num_inputs: Number of inputs.
        input_names: Names of the input columns; must provide ``tolist()``.
        model_folder: Folder in which ``models.csv`` is stored.
        classifications: Stored unchanged in the "Klassifizierung" column.
        in_min, in_max: Iterables with the minima/maxima of the inputs.
        out_min, out_max: Minima/maxima of the outputs (pandas Series or single value).
    """
    # Semicolon-separated text keeps the csv columns intact
    input_names_text = ';'.join(map(str, input_names.tolist()))

    # Output bounds are always stored as lists
    out_min = out_min.tolist() if isinstance(out_min, pd.Series) else [out_min]
    out_max = out_max.tolist() if isinstance(out_max, pd.Series) else [out_max]

    inputs_min_text = ';'.join(map(str, in_min))
    inputs_max_text = ';'.join(map(str, in_max))

    models = (
        ("NN-Model", res_NN),
        ("RF-Model", res_RF),
        ("DT mit Bagging Model", res_DTb),
        ("SVM-Model", res_SVM),
        ("LR-Model", res_LR),
    )

    # All rows share the dataset description and differ only in type, name and R²
    rows = [
        {
            "Model Typ": model_type,
            "Model Name": res[0],
            "R² Value": round_list(res[2], 5),
            "Größe Trainingsdatensatz": datensatz_groesse,
            "Anzahl Inputs": num_inputs,
            "Inputgrößen": input_names_text,
            "Outputgrößen": output_name,
            "Klassifizierung": classifications,
            "Inputs Min": inputs_min_text,
            "Inputs Max": inputs_max_text,
            "Outputs Min": out_min,
            "Outputs Max": out_max,
        }
        for model_type, res in models
    ]

    csv_file = f'{model_folder}/models.csv'

    # Append to an existing file (no header), otherwise create it (with header)
    append = os.path.isfile(csv_file)
    table = pd.DataFrame(rows)
    table.to_csv(csv_file, mode='a' if append else 'w', header=not append, index=False)

    print(f"Results saved to {csv_file}")


# 7.3 Diagramme der Performance-Parameter aus der Auswertungsdatei generieren

In [99]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np

def plot_results(folder, show_plots):
    """Plot the performance metrics stored in ``{folder}/results.csv``.

    For every output feature found in the file three scatter diagrams are
    created (R², MAE and MSE versus the size of the dataset), each with a
    different marker per ML algorithm. The diagrams are saved as PNG files in
    ``folder``.

    Args:
        folder: Folder that contains ``results.csv``; the diagrams are saved there too.
        show_plots: If true, the diagrams are also displayed and the name of
            every plotted feature is printed. Otherwise each figure is closed
            right after it has been saved.
    """
    output_column = 'Outputgröße(n)'
    size_column = 'Größe Datensatz'
    # (label used in column names and legend, marker)
    models = (('NN', 'o'), ('RF', 's'), ('DT (bagging)', 'x'), ('SVM', '^'), ('LR', 'D'))
    # (metric label used in column names / axis / title, prefix of the file name)
    metrics = (('R²', 'R2'), ('MAE', 'MAE'), ('MSE', 'MSE'))

    # Check if plots should be displayed
    if show_plots:
        plt.ion()
    else:
        plt.ioff()

    def clean_filename(filename):
        """Replace every character that is not alphanumeric/underscore by an underscore."""
        if isinstance(filename, str):
            return re.sub(r'[^\w\s]', '_', filename).replace(' ', '_')
        return "undefined"  # default for non-string inputs

    def ensure_list(value):
        """Convert one csv cell into a list (empty list if nothing usable is stored)."""
        if isinstance(value, (list, tuple)):
            return value
        if isinstance(value, str):
            try:
                # NOTE(review): eval executes arbitrary code stored in results.csv.
                # It is kept because rows may contain reprs such as
                # "np.float64(...)" that ast.literal_eval cannot parse; only
                # read results files from a trusted source.
                parsed = eval(value)
            except Exception:
                return []  # unparsable cell -> no values
            return parsed if isinstance(parsed, (list, tuple)) else [parsed]
        if pd.isna(value):
            return []
        if isinstance(value, (int, float, np.number)):
            # round_list stores single-element results as plain numbers
            return [value]
        return []

    def extract_values(frame, column, feature):
        """Return, per row, the entry of ``column`` that belongs to ``feature``."""
        def pick(row):
            position = row[output_column].index(feature)
            values = row[column]
            # Missing or too short entries become NaN instead of raising an IndexError
            return values[position] if position < len(values) else np.nan
        return frame.apply(pick, axis=1)

    # Read the results and make sure the list-valued columns really contain lists
    df = pd.read_csv(f'{folder}/results.csv')
    list_columns = [output_column] + [f'{metric} {model}' for metric, _ in metrics for model, _ in models]
    for column in list_columns:
        df[column] = df[column].apply(ensure_list)

    # Collect the unique output features over all rows
    unique_output_features = set()
    for names in df[output_column]:
        unique_output_features.update(item for item in names if isinstance(item, str))

    # Sorted for a reproducible plotting/printing order
    for output_feature in sorted(unique_output_features):
        if show_plots:
            print("Eigenschaft: ", output_feature) # print the plotted property if a plot is produced

        df_feature = df[df[output_column].apply(lambda names: output_feature in names)]
        clean_feature = clean_filename(output_feature)

        # One diagram per metric: metric vs. size of the dataset
        for metric, file_prefix in metrics:
            fig = plt.figure(figsize=(10, 6))
            for model, marker in models:
                plt.scatter(df_feature[size_column], extract_values(df_feature, f'{metric} {model}', output_feature), label=model, marker=marker)
            # Set the origin of the y-axis to zero
            plt.ylim(bottom=0)
            plt.xlabel('Größe Datensatz')
            plt.ylabel(metric)
            plt.title(f'{metric} vs. Größe Datensatz für {output_feature}')
            plt.legend()
            plt.grid(True)
            plt.savefig(f'{folder}/{file_prefix}_vs_Größe_Datensatz_{clean_feature}.png')
            if show_plots:
                plt.show()
            else:
                # pyplot keeps every figure alive until it is closed explicitly
                plt.close(fig)

# Uncomment the following line and enter the correct folder to create and show plots from an existing results.csv file.
#plot_results('/content/drive/MyDrive/Masterarbeit_ML/Auswertung/Fall1_Reproduce', True)

# 8. Create the GUI and start the program to train the ML-Models

In [100]:
# Import libraries for the creation of the GUI, access to drive and others
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
import pandas as pd
import numpy as np

importedData = []  # replaced by the imported dataset once the "import" button is pressed

# Initialize variables to store user inputs
show_plots = False
model_path = ''
analysis_path = ''
csv_file_path = ''
maxRows_value = 0
input_checkboxes = []
output_checkboxes = []
# One flag per ML algorithm, in the order NN, RF, DT (bagging), SVM, LR
fixed_checkbox_values = [False, False, False, False, False]
sample_in_norm = [] # TODO: Make this function its own input field which is used for actually useful predictions in the end.
outlier_cutoff = 3  # Initial value, same as the slider's starting value
max_cutoff = 10 # maximum standard deviations allowed in outlier detection before no outlier detection is performed

# Functions to handle button clicks
def select_file(change):
    """Store an uploaded csv file under /content and remember its path.

    Called whenever the value of the file-upload widget changes.

    Args:
        change: Change notification of the upload widget; ``change['new']``
            holds the uploaded file(s). Both the dict layout of ipywidgets 7
            (``{filename: {'content': ...}}``) and the tuple layout of
            ipywidgets 8 (``({'name': ..., 'content': ...},)``) are accepted.
    """
    global csv_file_path

    uploads = change['new']
    if not uploads:
        # The widget value was cleared/reset - there is nothing to store.
        return

    if isinstance(uploads, dict):
        # ipywidgets 7: mapping of file name -> file information
        uploaded_file = next(iter(uploads))
        content = uploads[uploaded_file]['content']
    else:
        # ipywidgets 8: sequence with one record per uploaded file
        first_upload = uploads[0]
        uploaded_file = first_upload['name']
        content = first_upload['content']

    csv_file_path = os.path.join('/content', uploaded_file)
    with open(csv_file_path, 'wb') as f:
        f.write(content)
    file_label.value = f"Ausgewählte CSV-Datei: {csv_file_path}"  # Line updated to reflect chosen file


def start_execution(button):
    """Run the complete training workflow when the "Start" button is clicked.

    Reads the settings from the GUI elements, determines the selected input
    and output columns, pre-processes and splits the imported data, trains and
    tests every selected ML model and finally prints, saves and plots the
    results. All text output is written to the ``output`` widget.

    Args:
        button: The clicked button (unused).
    """
    global model_path, analysis_path, input_checkboxes, output_checkboxes, fixed_checkbox_values, importedData, csv_file_path

    model_path = model_folder.value
    analysis_path = analysis_folder.value
    # Keep the path the data was imported from (an uploaded file takes
    # precedence there); only fall back to the typed path if none is set.
    # Overwriting it unconditionally would discard the path of an uploaded file.
    csv_file_path = csv_file_path or drive_file_path.value
    in_checkbox_values = [cb.value for cb in input_checkboxes]
    out_checkbox_values = [cb.value for cb in output_checkboxes]
    fixed_checkbox_values = [cb.value for cb in ML_checkboxes]
    show_plots = show_plots_checkbox.value  # Retrieve the checkbox value whether plots should be shown or not

    output.clear_output()
    with output:
        # display information for the training process
        print(f"Ordner zum Abspeichern der Modelle: {model_path}")
        print(f"Ordner zum Abspeichern der Auswertung der Modelle: {analysis_path}")
        print(f"CSV-Datei: {csv_file_path}")
        print(f"Maximale Anzahl Datensätze: {maxRows_value}")

        # get the positions of the checked input and output columns (several outputs are possible)
        input_columns = [i for i, checked in enumerate(in_checkbox_values) if checked]
        output_columns = [i for i, checked in enumerate(out_checkbox_values) if checked]
        if not input_columns or not output_columns:
            # Without inputs or outputs there is nothing to train on.
            print("Bitte mindestens eine Input- und eine Output-Spalte auswählen.")
            return
        input_names = importedData.columns[input_columns]
        output_names = importedData.columns[output_columns]

        # Detect classification columns and convert them into one-hot-encoded inputs.
        # The result is kept local: the checkboxes refer to the columns of the
        # imported (un-encoded) data, so the global dataset must stay unchanged
        # for a repeated click on "Start".
        encoded_data = encode_classification(importedData)
        classification_columns = list(encoded_data.columns)

        # Re-assign the input/output columns according to the new structure including classifications.
        # A column is selected if a chosen name is a substring of its name (= one-hot-encoded classification).
        # NOTE(review): substring matching also selects unrelated columns whose
        # name merely contains a chosen name (e.g. "T" matches "Temp").
        input_columns_new = [
            i for i, col_name in enumerate(encoded_data.columns)
            if any(input_name in col_name for input_name in input_names)
        ]
        output_columns_new = [
            i for i, col_name in enumerate(encoded_data.columns)
            if any(output_name in col_name for output_name in output_names)
        ]

        # finish pre-processing of dataset (normalization, selection of previously determined in- and output columns)
        processed_dataset = process_data(encoded_data, input_columns_new, output_columns_new)
        num_inputs = processed_dataset[2]
        input_min, input_max = processed_dataset[3], processed_dataset[4]
        output_min, output_max = processed_dataset[5], processed_dataset[6]

        # split dataset into training- and test data
        train_test_data = split_data(processed_dataset[0], processed_dataset[1])
        # Order: input values for training, input values for testing,
        # output values for training, output values for testing
        input_train, input_test = train_test_data[0], train_test_data[1]
        output_train, output_test = train_test_data[2], train_test_data[3]

        # For every ML algorithm: start with placeholder results ("not trained"),
        # then train and test the model if its checkbox is checked.
        # The test functions receive the test outputs (not the training outputs).
        NNresults = [np.nan, np.nan, np.nan, np.nan, np.nan]
        NNtest = [-1]
        if fixed_checkbox_values[0]:
            print("Training des Neuronalen Netzwerks... Dies kann bei großen Datensätzen einige Minuten dauern.")
            NNresults = NN(input_train, input_test, output_train, output_test, num_inputs, model_path, output_min, output_max)
            NNtest = testNN(NNresults[0], input_test, output_test, output_min, output_max)

        RFresults = [np.nan, np.nan, np.nan, np.nan, np.nan]
        RFtest = [-1]
        if fixed_checkbox_values[1]:
            print("Training des Random Forest...")
            RFresults = RF(input_train, input_test, output_train, output_test, num_inputs, model_path, output_min, output_max)
            RFtest = testRF(RFresults[0], input_test, output_test, output_min, output_max)

        DTbaggingresults = [np.nan, np.nan, np.nan, np.nan, np.nan]
        DTbaggingtest = [-1]
        if fixed_checkbox_values[2]:
            print("Training des Decision Tree Bagging...")
            DTbaggingresults = DT_Bagging(input_train, input_test, output_train, output_test, num_inputs, model_path, output_min, output_max)
            DTbaggingtest = testDT_Bagging(DTbaggingresults[0], input_test, output_test, output_min, output_max)

        SVMresults = [np.nan, np.nan, np.nan, np.nan, np.nan]
        SVMtest = [-1]
        if fixed_checkbox_values[3]:
            print("Training der SVM...")
            SVMresults = SVM(input_train, input_test, output_train, output_test, num_inputs, model_path, output_min, output_max)
            SVMtest = testSVM(SVMresults[0], input_test, output_test, output_min, output_max)

        LRresults = [np.nan, np.nan, np.nan, np.nan, np.nan]
        LRtest = [-1]
        if fixed_checkbox_values[4]:
            print("Training der linearen Regression...")
            LRresults = LR(input_train, input_test, output_train, output_test, num_inputs, model_path, output_min, output_max)
            LRtest = testLR(LRresults[0], input_test, output_test, output_min, output_max)

        print("Training aller ML-Modelle abgeschlossen.")
        output_names = output_names.tolist()

        # call functions to print & save the performance values
        print_results(NNresults, RFresults, DTbaggingresults, SVMresults, LRresults, NNtest, RFtest, DTbaggingtest, SVMtest, LRtest, output_names, output_test, output_min, output_max)
        save_results(NNresults, RFresults, DTbaggingresults, SVMresults, LRresults, output_names, maxRows_value, num_inputs, analysis_path)
        # call function to write the model properties to the "model" table
        model_table(NNresults, RFresults, DTbaggingresults, SVMresults, LRresults, output_names, maxRows_value, num_inputs, input_names, model_path, classification_columns, input_min, input_max, output_min, output_max)
        # call function to plot the performance diagrams
        plot_results(analysis_path, show_plots)


def import_data_and_reveal(button):
    """Import the dataset and reveal the selection widgets.

    Called when the "import" button is pressed. Imports the csv file, creates
    one checkbox per column for the input and for the output selection and
    makes the hidden GUI elements (column selection, ML-algorithm selection,
    plot option, buttons) visible.

    Args:
        button: The clicked button (unused).
    """
    global csv_file_path, maxRows_value, input_checkboxes, importedData, output_checkboxes, buttons_box

    maxRows_value = maxRows_input.value
    # A path set by a file upload takes precedence; the typed Google Drive
    # path is only used if no file was uploaded.
    if not csv_file_path:
        csv_file_path = drive_file_path.value
    delimiter = delimiter_input.value  # column delimiter entered by the user

    # Import the data using the appropriate function.
    importedData = import_data(maxRows_value, csv_file_path, delimiter, outlier_cutoff, max_cutoff)
    available_rows = len(importedData)
    if available_rows < maxRows_value:
        maxRows_value = available_rows
        print("Less than specified maximum of rows of data available. New training data size: ", maxRows_value)

    # One checkbox per column of the csv file to select the inputs ...
    input_checkboxes = [widgets.Checkbox(value=False, description=column) for column in importedData.columns]
    input_checkboxes_box.children = input_checkboxes

    # ... and a second, independent set to select the outputs
    output_checkboxes = [widgets.Checkbox(value=False, description=column) for column in importedData.columns]
    output_checkboxes_box.children = output_checkboxes

    # reveal the new UI elements
    revealed_elements = (
        select_inputs_label,
        input_checkboxes_box,
        select_outputs_label,
        output_checkboxes_box,
        ML_checkboxes_label,
        ML_checkboxes_box,
        show_plots_checkbox,
    )
    for element in revealed_elements:
        element.layout.display = 'block'

    # extra treatment for start-button & Performance-Diagram box: .display = 'flex' has proven to work better here.
    buttons_box.layout.display = 'flex'


# UI elements for folder input.
# Text field for the folder in which the trained models are saved
# (read as model_path when training is started).
model_folder = widgets.Text(
    description="Speicherort für trainierte Modelle:",
    placeholder='/content/drive/MyDrive/pfad/zu/modell-ordner',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='60%')
)

# Text field for the folder in which the evaluation (results and diagrams) is saved
# (read as analysis_path when training is started).
analysis_folder = widgets.Text(
    description="Speicherort für Auswertung der Modelle:",
    placeholder='/content/drive/MyDrive/pfad/zu/auswertungs-ordner',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='60%')
)

# UI elements for file selection:
# upload widget restricted to a single .csv file; select_file is called
# whenever its value changes and shows the stored path in file_label.
file_upload = widgets.FileUpload(accept='.csv', multiple=False)
file_upload.observe(select_file, names='value')
file_label = widgets.Label(value="Ausgewählte CSV-Datei: None")

# UI elements for file path input (alternative to the upload; used if no file was uploaded)
drive_file_path = widgets.Text(
    description="Pfad der CSV-Datei oder Upload:",
    placeholder='/content/drive/MyDrive/path/to/file.csv',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='60%')
)

# UI element for csv-delimiter input (column separator of the csv file, comma by default)
delimiter_input = widgets.Text(
    description="CSV-Datei Spalten-Trennzeichen:",
    value=',',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='30%')
)


# Function to update the label and cutoff value based on the value of the 'Ausreißerdetektion'-slider
def update_label(change):
    """Mirror the outlier slider's new value into the label and the global cutoff.

    Registered as observer of the ``value`` trait of ``outlier_cutoff_slider``.

    Args:
        change: Change notification of the observed trait. Only its ``'new'``
            entry (the value the slider was moved to) is read.

    Side effects:
        Sets ``value_label.value`` and rebinds the module-level
        ``outlier_cutoff`` to the new slider value.
    """
    global outlier_cutoff

    # Read the new value exactly once, via mapping access, so the label and the
    # stored cutoff can never disagree.
    new_value = change['new']

    if new_value == max_cutoff:
        # The slider's maximum position stands for "no outlier detection".
        value_label.value = 'Keine Ausreißerdetektion'
    else:
        value_label.value = f'{new_value} Standardabweichungen'

    outlier_cutoff = new_value

# UI element (slider) for setting the outlier detection cutoff.
# Range is 1 .. max_cutoff in steps of 0.1, starting at 3. max_cutoff is defined elsewhere in
# this notebook; update_label treats that maximum as "no outlier detection".
outlier_cutoff_slider = widgets.FloatSlider(
    value=3,
    min=1,
    max=max_cutoff,
    step=0.1,
    description='Empfindlichkeit der Ausreißerdetektion:',
    readout=False,  # the built-in readout is disabled; value_label below shows the value instead
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='40%')
)

# Label to display the current value of the slider; the initial text matches the slider's start value of 3
value_label = widgets.Label(value='3.0 Standardabweichungen')

# Observe changes in the 'Ausreißerdetektion'-slider's value.
# update_label is only invoked on changes, so the initial label text above is set by hand.
outlier_cutoff_slider.observe(update_label, names='value')

# UI element for integer input - number of rows of the dataset that should be used
# NOTE(review): the default of 0 presumably means "no row limit" - confirm in the import function.
maxRows_input = widgets.BoundedIntText(
    value=0,  # Default value
    min=0,    # Minimum value allowed
    max = 1000000000, # Maximum value - arbitrary, very high number. If this is not specified, the program may automatically set a too low max value.
    description="Maximale Anzahl der zu verwendenden Reihen:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='40%')
)

# UI elements for in-/output checkboxes (variable names & amount, further specified in the function import_data_and_reveal).
# All four start hidden (display='none') and carry no checkboxes yet.
select_inputs_label = widgets.Label(value="Input-Spalten auswählen", layout=widgets.Layout(display='none'))
input_checkboxes_box = widgets.VBox(layout=widgets.Layout(display='none'))
select_outputs_label = widgets.Label(value="Output-Spalten auswählen", layout=widgets.Layout(display='none'))
output_checkboxes_box = widgets.VBox(layout=widgets.Layout(display='none'))

# UI elements for checkboxes to select the ML-Algorithms.
# One unchecked checkbox per algorithm; label and box are hidden initially as well.
ML_checkboxes_label = widgets.Label(value="ML-Algorithmen auswählen", layout=widgets.Layout(display='none'))
ML_checkboxes = [
    widgets.Checkbox(value=False, description="Neuronales Netzwerk"),
    widgets.Checkbox(value=False, description="Random Forest"),
    widgets.Checkbox(value=False, description="Decision Tree mit Bagging"),
    widgets.Checkbox(value=False, description="Support Vector Machine"),
    widgets.Checkbox(value=False, description="Linear Regression")
]
ML_checkboxes_box = widgets.VBox(ML_checkboxes, layout=widgets.Layout(display='none'))

# Import Data button; a click calls import_data_and_reveal (defined earlier in this cell)
import_data_button = widgets.Button(description="Datenimport starten")
import_data_button.on_click(import_data_and_reveal)

# Start button; a click calls start_execution (defined elsewhere in this notebook).
# The button itself is visible, but its parent buttons_box is hidden further below.
start_button = widgets.Button(description="Start", layout=widgets.Layout(display='block'))
start_button.on_click(start_execution)

# Create a Checkbox to control whether the diagrams should be plotted (hidden initially)
show_plots_checkbox = widgets.Checkbox(value=False, description='Zeige Performance-Diagramme', style={'description_width': 'initial'}, layout=widgets.Layout(display='none'))

# Output area
output = widgets.Output()

# Group each set of checkboxes with its label into a separate VBox
input_group = widgets.VBox([select_inputs_label, input_checkboxes_box])
output_group = widgets.VBox([select_outputs_label, output_checkboxes_box])
ml_algorithms_group = widgets.VBox([ML_checkboxes_label, ML_checkboxes_box])

# Combine the individual grouped VBoxes into a single HBox for horizontal alignment
combined_checkboxes_box = widgets.HBox([input_group, output_group, ml_algorithms_group])

# Add the show_plots_checkbox next to the start button in the UI layout
buttons_box = widgets.HBox([start_button, show_plots_checkbox])  # Group Start button and Checkbox in a horizontal layout
buttons_box.layout.display = 'none' # hide it at first

# Add a horizontal slider box to display the changing label next to, instead of below, the slider
slider_box = widgets.HBox([outlier_cutoff_slider, value_label])

# Render all elements stacked vertically, in the order listed here
display(widgets.VBox([
    model_folder,
    analysis_folder,
    drive_file_path,
    file_upload, file_label,
    maxRows_input,
    delimiter_input,
    slider_box,
    import_data_button,
    combined_checkboxes_box,
    buttons_box,
    output
]))


VBox(children=(Text(value='', description='Speicherort für trainierte Modelle:', layout=Layout(width='60%'), p…

# 9. Create GUI to make predictions based on manual inputs

In [120]:
# Import GUI-Libraries & ML-Libraries to be able to call trained models
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import joblib
import ast

# Global variables shared between the callbacks of this cell.
# The values below are placeholders: start_action sets CalcModel_path, its nested
# on_row_selected overwrites the min/max values, CalcModel and Classes when a model is picked,
# and the nested predict_action fills custom_input.
input_min = 0
input_max = 0
output_min = 0
output_max = 0
CalcModel_path = ''
CalcModel = []
custom_input = []
Classes = {}

# Create an Output widget that collects everything the callbacks of this cell print or display.
# NOTE(review): the training-GUI cell above also binds the global names `output` and
# `start_button`. Callbacks there that look up `output` at call time would write into this
# widget once this cell has been run - confirm and consider distinct names.
output = widgets.Output()

# GUI Field to specify a folder path (the folder that holds models.csv and the saved models)
path_text = widgets.Text(
    value = '',
    placeholder='Pfad zum Ordner, der eine "models"-CSV-Datei und die entsprechenden Modelle enthält.',
    description="Ordner zum Aufrufen der trainierten Modelle: ",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='40%'),
    disabled = False
)

# Button to start the selection of the model
start_button = widgets.Button(
    description='Start',
    disabled=False,
    button_style='',
    tooltip='Click to start',
    icon='play'
)

def load_csv(path):
    """Load ``models.csv`` from *path* and decode its ';'-delimited list columns.

    Args:
        path: Folder that is expected to contain a file named ``models.csv``.

    Returns:
        The file as a DataFrame in which 'Inputgrößen' holds a list of strings
        and 'Inputs Min' / 'Inputs Max' hold lists of floats per row, or
        ``None`` (after printing a message) if the file does not exist.
    """
    list_columns = ('Inputgrößen', 'Inputs Min', 'Inputs Max')
    try:
        # Read the list columns as plain text. Without this, a file in which every
        # row has exactly one input (e.g. "0.5" instead of "0.1;0.5") is inferred
        # as numeric and the split(';') below fails on float cells.
        df = pd.read_csv(
            os.path.join(path, 'models.csv'),
            dtype={column: object for column in list_columns},
        )
    except FileNotFoundError:
        # Error message if there is no models.csv file in the specified folder
        print(f"No 'models.csv' found in the folder: {path}")
        return None

    def split_floats(cell):
        # "0.1;0.5" -> [0.1, 0.5]
        return [float(part) for part in cell.split(';')]

    # Convert the delimited strings back to lists (the lists are stored ';'-joined in the file)
    df['Inputgrößen'] = df['Inputgrößen'].apply(lambda cell: cell.split(';'))
    df['Inputs Min'] = df['Inputs Min'].apply(split_floats)
    df['Inputs Max'] = df['Inputs Max'].apply(split_floats)

    return df

def create_input_fields(inputs_list):
    """Build one input widget per entry of *inputs_list*.

    An entry that is a key of the global ``Classes`` dict becomes a Dropdown
    offering ``Classes[entry]``; every other entry becomes an empty Text field.

    Args:
        inputs_list: Iterable of input names, used for the widget labels.

    Returns:
        List of widgets in the same order as *inputs_list*.
    """
    def build_widget(name):
        # Settings common to both widget kinds; a fresh Layout/style per widget.
        shared = {
            'description': f'{name}:',
            'disabled': False,
            'style': {'description_width': 'initial'},
            'layout': widgets.Layout(width='40%'),
        }
        if name in Classes:
            # Classifying input: the user picks one of the known classes.
            return widgets.Dropdown(options=Classes[name], **shared)
        # Continuous input: the user types the value.
        return widgets.Text(value='', placeholder=f'Enter {name}', **shared)

    return [build_widget(name) for name in inputs_list]

def validate_fields(fields):
    """Return True if every input field holds a value.

    A Text widget counts as unfilled when its value is the empty string, a
    Dropdown when its value is None. Other widget types are never rejected.

    Args:
        fields: Iterable of input widgets.

    Returns:
        False as soon as one unfilled field is found, otherwise True.
    """
    def is_unfilled(widget):
        empty_text = isinstance(widget, widgets.Text) and widget.value == ''
        no_choice = isinstance(widget, widgets.Dropdown) and widget.value is None
        return empty_text or no_choice

    return not any(is_unfilled(widget) for widget in fields)

def collect_input(fields):
    """Read the values of the input widgets into one flat list.

    Each Dropdown contributes a one-hot list with one entry per option (1 for
    the selected option, 0 otherwise). Each Text field contributes its value
    as a float, or as the raw string if it cannot be converted.

    Args:
        fields: Iterable of input widgets.

    Returns:
        All one-hot entries first, followed by the Text-field values, each
        group in field order.
    """
    one_hot_values = []
    continuous_values = []

    for widget in fields:
        if isinstance(widget, widgets.Dropdown):
            # Classifying input: encode the selection against all options.
            chosen = widget.value
            one_hot_values += [1 if candidate == chosen else 0 for candidate in widget.options]
        elif isinstance(widget, widgets.Text):
            # Continuous input: convert to float where possible.
            entry = widget.value
            try:
                entry = float(entry)
            except ValueError:
                pass  # keep the text as entered
            continuous_values.append(entry)

    return one_hot_values + continuous_values

def parse_klassifizierung(klassifizierung_str):
    """Build the ``Classes`` dictionary from a 'Klassifizierung' cell.

    The cell is the text form of a list of column names, e.g.
    ``"['Material_Holz', 'Material_Beton']"``. A name containing an underscore
    denotes one class of a classifying input: the part before the underscore
    is the category (e.g. "Material"), the part after it the class (e.g.
    "Holz"). Names without an underscore are skipped.

    The name is split at its LAST underscore only, so an entry with several
    underscores (e.g. 'Wand_Material_Holz') is parsed as category
    'Wand_Material' / class 'Holz' instead of raising an unpacking error.
    NOTE(review): this assumes class names themselves contain no underscore -
    confirm against the code that writes the column.

    Args:
        klassifizierung_str: Text of the 'Klassifizierung' column for one model.

    Returns:
        Dict mapping each category to the list of its classes, in order of
        appearance. Used to provide the dropdown menus for classifying inputs.
    """
    classes_dict = {}
    for item in klassifizierung_str.strip('[]').split(', '):
        category, separator, name = item.rpartition('_')
        if not separator:
            # No underscore: not a classification column.
            continue
        # Remove the quote characters left over from the list's text form.
        classes_dict.setdefault(category.strip("'"), []).append(name.strip("'"))
    return classes_dict

# Function to parse input/output values
def parse_input_output_values(value):
    """Convert a stored input/output value into a float or a list of floats.

    Accepted forms:
      * a bracketed list as text, e.g. ``"[1.0, 2.0]"`` -> ``[1.0, 2.0]``
      * any other string, read line by line: the last whitespace-separated
        token of every non-empty line is taken, and lines ending in
        ``float64`` (such as a trailing ``dtype: float64`` line) are ignored
        -> list of floats
      * a single number -> float
      * a non-string iterable of numbers -> list of floats

    Args:
        value: The raw value.

    Returns:
        A float for a single non-string number, otherwise a list of floats.

    Raises:
        ValueError: If a token cannot be converted to float.
        TypeError: If *value* is neither a string, a number nor iterable.
    """
    if isinstance(value, str):
        text = value.strip()
        if text.startswith('[') and text.endswith(']'):
            # Bracketed list: elements may be separated by commas and/or whitespace.
            return [float(token) for token in text.strip('[]').replace(',', ' ').split()]

        # Multi-line text: one value per line, taken from the line's last token.
        lines = [line.strip() for line in value.split('\n')]
        return [float(line.split()[-1]) for line in lines if line and not line.endswith('float64')]

    try:
        # A plain number
        return float(value)
    except (TypeError, ValueError):
        # Not a single number: treat it as a sequence of numbers.
        return [float(item) for item in value]

# Start button action
def start_action(button):
    """Load ``models.csv`` from the entered folder and offer its rows for selection.

    Called when the "Start" button is clicked. Clears the output area, stores
    the folder from ``path_text`` in the global ``CalcModel_path`` and, if
    ``load_csv`` finds the file, shows a dropdown with one option per row.
    Selecting a row stores that model's data in the module-level globals and
    shows the input fields plus a "Vorhersagen" button.

    Args:
        button: The clicked button (unused).
    """
    global CalcModel_path
    with output:
        clear_output()

        # Get the model folder from the corresponding text field
        CalcModel_path = path_text.value
        # Load models.csv as a dataframe; load_csv prints a message and returns None if it is missing
        df = load_csv(CalcModel_path)
        if df is None:
            return

        # One dropdown option per row of models.csv (each row corresponds to a saved model).
        # value=None starts without a selection: the observer below only fires when the
        # value CHANGES, so a pre-selected first row could never be chosen by the user.
        options = [tuple(row) for row in df.values]
        dropdown = widgets.Dropdown(
            options=options,
            value=None,
            description='Select Model:',
            disabled=False,
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='40%')
        )

        def on_row_selected(change):
            """Store the selected model's data in the globals and build its input form."""
            global CalcModel, Classes, input_min, input_max, output_min, output_max, Model_Typ, Model_Name, Output_Name

            if change.new is None:
                # Selection was cleared - nothing to show.
                return

            with output:
                # Remove the fields and predict button of a previously selected model,
                # then show the dropdown again so another model can be picked.
                clear_output()
                display(dropdown)

                selected_row = list(change.new)
                CalcModel = selected_row

                def column_value(column_name):
                    # Value of the selected row in the named models.csv column
                    return selected_row[df.columns.get_loc(column_name)]

                # Get the classifying inputs
                Classes = parse_klassifizierung(column_value('Klassifizierung'))

                # Min and max values needed for the normalization of the inputs
                # and for the reverse normalization of the outputs later on
                input_min = column_value('Inputs Min')
                input_max = column_value('Inputs Max')
                # These cells hold Python literals as text; only the selected row is parsed.
                output_min = np.array(ast.literal_eval(column_value('Outputs Min')))
                output_max = np.array(ast.literal_eval(column_value('Outputs Max')))

                Model_Typ = column_value('Model Typ')
                Model_Name = column_value('Model Name')
                Output_Name = np.array(ast.literal_eval(column_value('Outputgrößen')))

                # Create and show one input field per input of the model
                input_fields = create_input_fields(column_value('Inputgrößen'))
                for field in input_fields:
                    display(field)

                # Predict button
                predict_button = widgets.Button(
                    description='Vorhersagen',
                    disabled=False,
                    button_style='',
                    tooltip='Click to predict',
                    icon='check'
                )

                def predict_action(button):
                    """Collect the entered values and run the prediction."""
                    global custom_input
                    with output:
                        if validate_fields(input_fields):  # every field has a value
                            custom_input = collect_input(input_fields)
                            make_predictions()
                        else:
                            print("Please fill all fields.")

                predict_button.on_click(predict_action)
                display(predict_button)

        dropdown.observe(on_row_selected, names='value')
        display(dropdown)

# Run start_action whenever the "Start" button of this cell is clicked
start_button.on_click(start_action)

# Display widgets: folder field, start button, then the shared output area below them
display(path_text)
display(start_button)
display(output)  # Everything the callbacks print or display appears inside this widget

# Prediction for the values entered in the GUI
def make_predictions():
    """Predict the outputs for ``custom_input`` with the currently selected model.

    Called (via the "Vorhersagen" button) after a model has been selected.
    Reads the module-level globals set by the selection: ``CalcModel``,
    ``custom_input``, ``input_min``/``input_max``, ``output_min``/``output_max``,
    ``Model_Typ``, ``Model_Name``, ``Output_Name`` and ``CalcModel_path``.

    Steps: print information on the model, min-max normalize the inputs, load
    the model, predict, reverse the normalization of the result and print one
    line per output variable. All printing goes to the ``output`` widget.
    If an input value is not numeric, a message is printed and no prediction
    is made.
    """
    with output:
        # Print information on predicting model (Name, Performance, ML-Algorithm Type, Training dataset size, Output variable)
        print("Information on selected Model:")
        print("Model Type: ", CalcModel[0])
        print("Model Name: ", CalcModel[1])
        print("R²-Value(s): ", CalcModel[2])
        print("Training dataset size: ", CalcModel[3])
        print("Variable(s) to predict: ", CalcModel[6])

        # Normalize custom input with the min/max values stored for the model
        normalized_input = []
        for i, raw_value in enumerate(custom_input):
            try:
                numeric_value = float(raw_value)
            except (TypeError, ValueError):
                # A text entry cannot be fed to the model; stop with a clear message
                # instead of failing later inside predict().
                print(f"Input value {raw_value!r} is not a number - no prediction was made.")
                return
            normalized_input.append((numeric_value - input_min[i]) / (input_max[i] - input_min[i]))

        # Reshape the input to a 2D array of shape (1, number_of_features)
        normalized_input = np.array(normalized_input).reshape(1, -1)

        # Use the model name as given if it resolves to an existing file/folder;
        # otherwise fall back to the same name inside the selected model folder.
        model_path = Model_Name
        if not os.path.exists(model_path):
            path_in_folder = os.path.join(CalcModel_path, Model_Name)
            if os.path.exists(path_in_folder):
                model_path = path_in_folder

        # Tensorflow/keras-based NN and the other algorithms are stored in different formats
        if Model_Typ == 'NN-Model':
            model = tf.keras.models.load_model(model_path)
        else:
            # All other models are loaded using the joblib library.
            # NOTE(review): joblib.load unpickles the file - only load models from trusted folders.
            model = joblib.load(model_path)
        predicted_output = model.predict(normalized_input)

        # Take the prediction for the single input row and make sure it is a 1-D array,
        # also when the model returns one scalar per row.
        predicted_output = np.atleast_1d(np.asarray(predicted_output[0]))

        # To get an actual, de-normalized output with a unit, perform a reverse normalization
        final_output = reverse_column_normalization(predicted_output, output_min, output_max)

        # Print the predicted output(s)
        for i, output_name in enumerate(Output_Name):
            print(f"Der vorhergesagte Wert für {output_name} beträgt: {final_output[i]}")


Text(value='', description='Ordner zum Aufrufen der trainierten Modelle: ', layout=Layout(width='40%'), placeh…

Button(description='Start', icon='play', style=ButtonStyle(), tooltip='Click to start')

Output()