In [None]:
!pip install google-auth
from google.colab import auth
auth.authenticate_user()
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive/MyDrive
import os

path = '/content/drive/My Drive'

if os.path.isdir(path):
    contents = os.listdir(path)
    print(f"Contents of {path}:")
    for item in contents:
        print(item)

files = os.listdir('/content/drive/My Drive/Sensor_dataset')
files = os.listdir('/content/drive/My Drive/Sensor_dataset/SP_Sensor')
files = os.listdir('/content/drive/My Drive/Sensor_dataset/BP_Sensor')
print(files)

In [1]:
#This code utilizes PyOperon and RFECV to determine the best features and their importances.
# It then calculates metrics and the best equation using PyOperon for each fold. Finally, it computes the averages of values for SR-pyoperon.

!pip install pandas
!pip install pyoperon
!pip install scikit-learn

import os

import re
import matplotlib.pyplot as plt
import pyoperon
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from pyoperon.sklearn import SymbolicRegressor
from sklearn.linear_model import LogisticRegression



def load_datasets():
    """Read the five feature-fold CSV files from Google Drive.

    Returns:
        A list of DataFrames, one per fold, ordered fold 0 through fold 4.
    """
    path_template = '/content/drive/My Drive/Sensor_dataset/feat_fold_{}.csv'
    fold_frames = []
    for fold_id in range(5):
        fold_frames.append(pd.read_csv(path_template.format(fold_id)))
    return fold_frames

def split_datasets(folds):
    """Build leave-one-fold-out (train, test) DataFrame pairs.

    For each fold position the test frame is that fold alone and the train
    frame is the concatenation of all remaining folds in their original
    order. Row indices are kept as they are in the input frames.

    Args:
        folds: list of DataFrames, one per fold.

    Returns:
        A list of (train_df, test_df) tuples, one per fold.
    """
    pairs = []
    for held_out in range(len(folds)):
        training_parts = [frame for pos, frame in enumerate(folds) if pos != held_out]
        testing_parts = [folds[held_out]]
        pairs.append((pd.concat(training_parts), pd.concat(testing_parts)))
    return pairs

def extract_variables(equation):
    """Return the set of distinct ``X<digits>`` tokens found in ``equation``."""
    tokens = set()
    for token in re.findall(r'X\d+', equation):
        tokens.add(token)
    return tokens




def perform_analysis(splits):
    """Select features with RFECV and fit a PyOperon symbolic regressor per fold.

    For every (train, test) pair the target is the 'DP' column and the
    candidate features are all columns except 'SP', 'DP', 'trial' and
    'patient'. RFECV (LinearRegression, 5-fold KFold, negative MSE) picks a
    feature subset, a SymbolicRegressor is fitted on that subset, and the
    held-out fold is scored.

    Args:
        splits: iterable of (train_df, test_df) DataFrame pairs.

    Returns:
        A tuple ``(results, averaged_selected_importances, best_equation,
        fold_equations, variable_mappings)`` where

        * ``results`` is a list with one metrics dict per fold,
        * ``averaged_selected_importances`` maps feature name to the mean
          absolute linear coefficient over the folds that selected it,
        * ``best_equation`` is the equation string of the last fold
          (``None`` when ``splits`` is empty),
        * ``fold_equations`` holds one equation string per fold,
        * ``variable_mappings`` maps ``"Fold i"`` to
          ``{variable token: feature name}``.
    """
    non_feature_columns = ['SP', 'DP', 'trial', 'patient']

    # Symbolic regressor configuration, identical for every fold.
    # 'mutation_probability' used to be listed twice (0.1, then 0.8); Python
    # keeps the last duplicate key, so 0.8 is the value that was in effect.
    regressor_params = {
        'allowed_symbols': 'add,sub,mul,div,constant,variable,pow,exp,log,sin,cos,tan,tanh,sqrt,cbrt,square',
        'offspring_generator': 'brood',
        'initialization_method': 'btc',
        'comparison_factor': 0,
        'crossover_internal_probability': 0.9,
        'epsilon': 1e-05,
        'female_selector': 'tournament',
        'objectives': ['r2', 'mse', 'mae', 'length', 'rmse', 'c2'],
        'reinserter': 'keep-best',
        'max_evaluations': int(1e6),
        'tournament_size': 3,
        'pool_size': 50,
        'population_size': 100,
        'generations': 500,
        'time_limit': 90,
        'crossover_probability': 1.0,
        'mutation_probability': 0.8,
        'max_depth': 5,
    }

    fold_equations = []
    selected_feature_importances = {}
    results = []
    variable_mappings = {}
    best_equation = None  # stays None if there are no splits at all

    for i, (train_data, test_data) in enumerate(splits):
        # SP and DP (the targets) and the identifier columns are not features.
        X_train = train_data.drop(non_feature_columns, axis=1)
        y_train = train_data['DP']
        X_test = test_data.drop(non_feature_columns, axis=1)
        y_test = test_data['DP']

        selector = RFECV(
            LinearRegression(), step=1, cv=KFold(5), scoring='neg_mean_squared_error'
        ).fit(X_train, y_train)
        best_features = X_train.columns[selector.support_]

        model = SymbolicRegressor(**regressor_params).fit(X_train[best_features], y_train)
        predictions = model.predict(X_test[best_features])
        best_equation = model.get_model_string(model.model_)
        print(best_equation)
        # Record each fold's equation exactly once (it used to be appended twice).
        fold_equations.append(best_equation)
        if predictions.ndim > 1:
            predictions = predictions.ravel()

        # Baseline for the scaled errors: a constant prediction equal to the
        # mean of this fold's test targets.
        baseline = np.full(len(y_test), np.mean(y_test))
        mse = mean_squared_error(y_test, predictions)
        mse_naive = mean_squared_error(y_test, baseline)
        mae = mean_absolute_error(y_test, predictions)
        mae_naive = mean_absolute_error(y_test, baseline)
        score = r2_score(y_test, predictions)
        errors = predictions.reshape(y_test.shape) - y_test
        ME = errors.mean()
        SD = errors.std()

        variables_used = extract_variables(best_equation)
        print("Variables used in the equation:", variables_used)

        # The regressor only saw the RFECV-selected columns, so a variable
        # token indexes into `best_features`, not into all of X_train.columns.
        # Tokens already carry their "X" prefix, so they are used as keys
        # directly (the old f"X{token}" produced keys such as "XX3").
        # NOTE(review): this keeps the original 0-based reading of the token
        # number; confirm against PyOperon's default variable naming.
        variable_mappings[f"Fold {i}"] = {
            token: best_features[int(token[1:])]
            for token in variables_used
            if int(token[1:]) < len(best_features)
        }

        results.append({
            'fold': i,
            'MSE': mse,
            'MAE': mae,
            'R2_score': score,
            'MMSE': mse / mse_naive,
            'MASE': mae / mae_naive,
            'ME': ME,
            'SD': SD,
            'best_equation': best_equation,
            'best_features': list(best_features),
        })

        # Importance = |coefficient| of the final LinearRegression that RFECV
        # refitted on the selected features.
        selected_importances = dict(zip(best_features, np.abs(selector.estimator_.coef_)))
        for feature, importance in selected_importances.items():
            selected_feature_importances.setdefault(feature, []).append(importance)

    # Average each feature's importance over the folds in which it was selected.
    averaged_selected_importances = {k: np.mean(v) for k, v in selected_feature_importances.items()}
    return results,  averaged_selected_importances,   best_equation, fold_equations, variable_mappings


def save_variable_mapping_to_csv(variable_mapping, file_path):
    """Write the per-fold variable mapping to a CSV file.

    Args:
        variable_mapping: dict mapping a fold label to a dict of
            ``{variable index: variable name}``.
        file_path: destination CSV path. Missing parent directories are
            created; a bare file name (no directory part) is also accepted.

    The CSV always has the columns 'Fold', 'Variable Index' and
    'Variable Name', even when the mapping is empty, so it can be read back
    by column name.
    """
    # os.makedirs('') raises, so only create a directory when there is one.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # One row per (fold, variable) pair.
    rows = [
        {'Fold': fold, 'Variable Index': index, 'Variable Name': name}
        for fold, variables in variable_mapping.items()
        for index, name in variables.items()
    ]

    # Explicit columns keep the header present for an empty mapping.
    df = pd.DataFrame(rows, columns=['Fold', 'Variable Index', 'Variable Name'])
    df.to_csv(file_path, index=False)
    print("Data saved to CSV successfully.")





def main():
    """Run the per-fold DP analysis and write all result CSVs to DP_Sensor.

    Writes the variable mapping, the per-fold metrics, the metric averages
    and the averaged feature importances, and prints the same tables.
    """
    output_dir = '/content/drive/My Drive/Sensor_dataset/DP_Sensor/'

    folds = load_datasets()
    splits = split_datasets(folds)
    results, selected_importances,  best_equation, fold_equations,  variable_mapping  = perform_analysis(splits)
    save_variable_mapping_to_csv(variable_mapping, output_dir + 'variable_mapping_sensor_dp.csv')

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_dir + 'model_performance_results_Sensor_DP_Selected_sensor.csv', index=False)

    # Average only the non-object columns (equation strings and feature
    # lists are skipped).
    numeric_columns = results_df.select_dtypes(exclude='object').columns
    average_results = results_df[numeric_columns].mean().to_frame().transpose()
    average_results.to_csv(output_dir + 'average_model_performance_results_DP_Sensor_Selected_sensor.csv', index=False)
    print("Average results:")
    print(average_results)

    selected_importances_df = pd.DataFrame(
        list(selected_importances.items()), columns=['Feature_DP', 'Importance_DP']
    ).sort_values(by='Importance_DP', ascending=False)
    # Write the importances before announcing that they were saved; the
    # message used to be printed ahead of this write.
    selected_importances_df.to_csv(output_dir + 'selected_feature_importances_DP_sensor.csv', index=False)

    print("Results and importances saved to CSV.")
    print(results_df)

    print("Selected Feature Importances_DP:")
    print(selected_importances)



if __name__ == "__main__":
    main()


Collecting pyoperon
  Downloading pyoperon-0.3.4-cp310-cp310-manylinux_2_27_x86_64.whl (891 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m892.0/892.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyoperon
Successfully installed pyoperon-0.3.4


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Sensor_dataset/feat_fold_0.csv'

In [None]:
#This code is employed to generate new folds by extracting features from the best equations obtained from each fold and the features obtained via RFECV.
import pandas as pd
# Load the variable mapping CSV from a specified path
file_path = '/content/drive/My Drive/Sensor_dataset/DP_Sensor/variable_mapping_sensor_dp.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
print(data.head())

# Unique, non-missing variable names. Missing names are dropped because
# they can never match a dataset column.
unique_variable_names = data['Variable Name'].dropna().drop_duplicates()

# Convert the Series of unique variable names to a Python list
unique_variable_names_list = unique_variable_names.tolist()
print("unique_variable_names_list",unique_variable_names_list )
# Define the dataset paths
dataset_paths = [
     '/content/drive/My Drive/Sensor_dataset/feat_fold_0.csv',
        '/content/drive/My Drive/Sensor_dataset/feat_fold_1.csv',
        '/content/drive/My Drive/Sensor_dataset/feat_fold_2.csv',
        '/content/drive/My Drive/Sensor_dataset/feat_fold_3.csv',
        '/content/drive/My Drive/Sensor_dataset/feat_fold_4.csv'
    ]


# Columns to keep: the selected features plus the targets and identifiers
feature_names = unique_variable_names_list+['SP', 'DP', 'trial', 'patient']

# Loop through each dataset file
for i, path in enumerate(dataset_paths):
    # Load the dataset
    df = pd.read_csv(path)

    # The isin() mask below ignores requested names that are not in the
    # file, so report them instead of dropping them silently.
    missing_columns = [name for name in feature_names if name not in df.columns]
    if missing_columns:
        print(f"Warning: fold {i} is missing expected columns: {missing_columns}")

    # Select the required columns based on feature names
    selected_df = df.loc[:, df.columns.isin(feature_names)]

    # Save the new DataFrame with selected features to a new CSV file
    new_file_path = f'/content/drive/My Drive/Sensor_dataset/DP_Sensor/feat_fold_{i}_selected_DP_sensor.csv'
    selected_df.to_csv(new_file_path, index=False)

    print(f"Saved selected features for fold {i} to {new_file_path}")

In [None]:
# This code evaluates traditional machine learning models on each feature-selected fold, using an 80/20 train/test split made within that fold.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

def load_data(file_paths):
    """Read each CSV in ``file_paths`` into a DataFrame, preserving order."""
    frames = []
    for csv_file in file_paths:
        frames.append(pd.read_csv(csv_file))
    return frames

def train_evaluate_models(X_train, y_train, X_test, y_test, random_state=None):
    """Fit several regressors and score them on the held-out data.

    Args:
        X_train, y_train: training features and target.
        X_test, y_test: evaluation features and target.
        random_state: optional seed passed to the stochastic models
            (LightGBM, Random Forest, MLP, AdaBoost). The default ``None``
            leaves them unseeded, exactly as before.

    Returns:
        A list with one dict per model, in the order Naive, LightGBM, SVR,
        Random Forest, MLP, AdaBoost, with the keys 'Model', 'MSE', 'MAE',
        'R2_Score', 'MMSE', 'MASE', 'ME' and 'SD'.
    """
    models = {
        'Naive': None,  # predicts the training-set mean; nothing to fit
        'LightGBM': lgb.LGBMRegressor(random_state=random_state),
        'SVR': SVR(),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'MLP': MLPRegressor(max_iter=500, random_state=random_state),
        'AdaBoost': AdaBoostRegressor(random_state=random_state)
    }

    # Reference errors for the scaled metrics: a constant prediction equal
    # to the mean of y_test. They do not depend on the model, so they are
    # computed once rather than on every loop iteration.
    test_mean_prediction = np.full(len(y_test), np.mean(y_test))
    mse_naive = mean_squared_error(y_test, test_mean_prediction)
    mae_naive = mean_absolute_error(y_test, test_mean_prediction)

    results = []
    for name, model in models.items():
        if name == 'Naive':
            y_pred_test = np.full(shape=(len(y_test),), fill_value=np.mean(y_train))
        else:
            model.fit(X_train, y_train)
            y_pred_test = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred_test)
        mae = mean_absolute_error(y_test, y_pred_test)
        score = r2_score(y_test, y_pred_test)
        # Signed errors: ME is their mean (bias), SD their spread.
        errors = y_pred_test - y_test
        ME = errors.mean()
        SD = errors.std()

        results.append({
            'Model': name,
            'MSE': mse,
            'MAE': mae,
            'R2_Score': score,
            'MMSE': mse / mse_naive,
            'MASE': mae / mae_naive,
            'ME': ME,
            'SD': SD
        })
    return results

def main():
    """Evaluate the traditional models on every selected-feature SP fold.

    Each fold file is split 80/20 (random_state=42) into train and test
    rows, the models are trained to predict 'SP', and the metrics of all
    folds are written to one CSV.
    """
    sp_dir = '/content/drive/My Drive/Sensor_dataset/SP_Sensor/'
    selected_files = [
        f'{sp_dir}feat_fold_{fold_id}_selected_SP_sensor.csv' for fold_id in range(5)
    ]
    non_feature_columns = ['SP', 'DP', 'trial', 'patient']

    selected_data = load_data(selected_files)

    all_selected_results = []
    for fold_frame in selected_data:
        # Random split inside the fold; targets and identifiers are removed
        # from the feature matrices.
        train_data, test_data = train_test_split(fold_frame, test_size=0.2, random_state=42)
        fold_results = train_evaluate_models(
            train_data.drop(non_feature_columns, axis=1),
            train_data['SP'],
            test_data.drop(non_feature_columns, axis=1),
            test_data['SP'],
        )
        all_selected_results.extend(fold_results)

    summary = pd.DataFrame(all_selected_results)
    summary.to_csv(sp_dir + 'results_selected_SP_sensor.csv', index=False)
    print("Results for original and selected datasets have been saved.")


if __name__ == "__main__":
    main()


In [2]:
# This code computes the means of the metrics acquired from traditional models.
import pandas as pd

def load_results(file_path):
    """Read a results CSV from ``file_path`` and return it as a DataFrame."""
    results_frame = pd.read_csv(file_path)
    return results_frame

def average_results_by_model(results_df):
    """Average every numeric metric per model.

    Args:
        results_df: DataFrame with a 'Model' column labelling each row and
            one column per metric.

    Returns:
        A DataFrame indexed by 'Model' holding the mean of each numeric
        column. Non-numeric columns are left out; without ``numeric_only``
        recent pandas versions raise on them instead of skipping them.
    """
    model_averages = results_df.groupby('Model').mean(numeric_only=True)
    return model_averages

def main():
    """Average the per-fold model metrics for DP and save them per model."""
    results_dir = '/content/drive/My Drive/Sensor_dataset/DP_Sensor/'

    # Per-fold metrics produced by the traditional-model evaluation.
    per_fold_results = load_results(results_dir + 'results_selected_DP_sensor_models.csv')

    # One row of mean metrics per model; 'Model' becomes the index, so the
    # index is kept when writing the CSV.
    average_selected = average_results_by_model(per_fold_results)
    average_selected.to_csv(results_dir + 'average_results_per_model_selected_DP_sensor_models.csv')

    print("Average results per model for selected data:")
    print(average_selected)

if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Sensor_dataset/DP_Sensor/results_selected_DP_sensor_models.csv'

In [None]:
# This code is designed to calculate metrics for SR models by combining data obtained from multiple folds, ultimately identifying the best equation.
!pip install pandas
!pip install pyoperon
!pip install scikit-learn

import re
import pyoperon
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from pyoperon.sklearn import SymbolicRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from pyoperon.sklearn import SymbolicRegressor
from pyoperon import R2, MSE, InfixFormatter, FitLeastSquares, Interpreter



def load_datasets():
    """Read the five selected-feature DP fold CSVs from the DP_BCG folder.

    Returns:
        A list of DataFrames ordered fold 0 through fold 4.
    """
    base_dir = '/content/drive/My Drive/DP_BCG/'
    csv_names = ['feat_fold_%d_selected_DP.csv' % fold_id for fold_id in range(5)]
    return list(map(pd.read_csv, (base_dir + name for name in csv_names)))

def extract_variable_indices(equation):
    """Return the set of integer indices n for every ``Xn`` token in ``equation``."""
    indices = set()
    for digits in re.findall(r'X(\d+)', equation):
        indices.add(int(digits))
    return indices
def replace_variable_names_with_features(equation, features):
    """Substitute each ``Xn`` token in ``equation`` with ``features[n]``.

    Tokens whose index is not smaller than ``len(features)`` are left
    unchanged.
    """
    token_pattern = re.compile(r'X(\d+)')

    def _substitute(found):
        position = int(found.group(1))  # the n of 'Xn'
        # Out-of-range tokens stay exactly as they were written.
        return found.group(0) if position >= len(features) else features[position]

    return token_pattern.sub(_substitute, equation)

def map_indices_to_names(indices, features):
    """Map each index below ``len(features)`` to its feature name."""
    limit = len(features)
    mapping = {}
    for position in indices:
        if position < limit:
            mapping[position] = features[position]
    return mapping

def run_final_model(folds):
    """Fit one symbolic regressor on all folds combined and save its metrics.

    All folds are concatenated into a single training set with target 'DP';
    'SP', 'DP', 'trial' and 'patient' are excluded from the features. The
    metrics are computed on that same training data (no hold-out set is
    used here).

    Args:
        folds: list of DataFrames, one per fold.

    Returns:
        ``(best_equation, results_df)``: the equation string of the fitted
        model and a one-row DataFrame with the metrics, the equation and
        the feature list.
    """
    output_dir = '/content/drive/My Drive/Sensor_dataset/DP_Sensor/'
    variable_names_path = output_dir + 'bcg_Combine_Variable_names_dP_sensor.csv'
    results_path = output_dir + 'bcg_Combine_model_results_dp_sensor.csv'

    train_data = pd.concat(folds, ignore_index=True)
    columns_to_exclude = ['SP', 'DP', 'trial', 'patient']
    features_to_use = [col for col in train_data.columns if col not in columns_to_exclude]
    print( features_to_use)
    X_train = train_data.drop(columns_to_exclude, axis=1)
    y_train = train_data['DP']

    model = SymbolicRegressor(
            allowed_symbols="add,sub,mul,div,constant,variable,pow,exp,log,sin,cos,tan,tanh,sqrt,cbrt,square",
            comparison_factor=0,
            crossover_internal_probability=0.9,
            crossover_probability=1.0,
            initialization_max_depth= 5,
            initialization_max_length= 10,
            initialization_method= "btc",
            irregularity_bias= 0.0,
            male_selector= "tournament",
            epsilon=1e-05,
            female_selector="tournament",
            max_depth= 10,
            max_evaluations= 500,
            max_length= 50,
            max_selection_pressure= 100,
            mutation_probability= 0.25,
            n_threads= 32,
            offspring_generator= "basic",
            reinserter= "keep-best",
            generations=300,
            pool_size=100,
            population_size=100,
            random_state=None,
            time_limit=90,
            tournament_size=3,
    )

    model.fit(X_train, y_train)
    best_equation = model.get_model_string(model.model_)
    print(best_equation)

    # NOTE(review): 'Xn' is read as a 0-based position in features_to_use,
    # as in the original code; confirm against PyOperon's variable naming.
    best_equation_with_features = replace_variable_names_with_features(best_equation, features_to_use)
    print("Best Equation with Feature Names:", best_equation_with_features)

    # Flatten once (this check used to be duplicated further down).
    predictions = model.predict(X_train)
    if predictions.ndim > 1:
        predictions = predictions.ravel()

    variable_indices = extract_variable_indices(best_equation)
    print("Extracted Variable Indices:", variable_indices)
    # Computed once and reused for both the printout and the CSV.
    variable_names = map_indices_to_names(variable_indices, features_to_use)
    print("Variable Names in Best Equation (Dictionary):", variable_names)

    # Baseline for the scaled errors: constant prediction at the target mean.
    baseline = np.full(len(y_train), np.mean(y_train))
    mse = mean_squared_error(y_train, predictions)
    mse_naive = mean_squared_error(y_train, baseline)
    mae = mean_absolute_error(y_train, predictions)
    mae_naive = mean_absolute_error(y_train, baseline)
    score = r2_score(y_train, predictions)
    errors = predictions.reshape(y_train.shape) - y_train
    ME = errors.mean()
    SD = errors.std()

    results = {
        'MSE': mse,
        'MAE': mae,
        'R2_score': score,
        'MMSE': mse / mse_naive,
        'MASE': mae / mae_naive,
        'ME': ME,
        'SD': SD,
        'best_equation': best_equation,
        'features_to_use': features_to_use,
        'best_equation_with_features': best_equation_with_features,
    }

    Variable_names_df = pd.DataFrame([variable_names])
    Variable_names_df.to_csv(variable_names_path, index=False)
    print(f"Results saved to '{variable_names_path}'")

    results_df = pd.DataFrame([results])
    results_df.to_csv(results_path, index=False)
    # Report the file that was actually written; the old message named a
    # different file ('bcg_Combine_model_results-dp.csv').
    print(f"Results saved to '{results_path}'")

    return  best_equation, results_df
def main():
    """Load the selected DP folds, fit the combined model, print its metrics."""
    final_equation, metrics_frame = run_final_model(load_datasets())
    print(metrics_frame)

if __name__ == "__main__":
    main()




In [None]:
# This code is designed to calculate metrics for SR models for multiple folds, ultimately identifying the best equations.
!pip install pandas
!pip install pyoperon
!pip install scikit-learn

import os

import re
import matplotlib.pyplot as plt
import pyoperon
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from pyoperon.sklearn import SymbolicRegressor
from sklearn.linear_model import LogisticRegression



def load_datasets():
    """Read the five 'best feature' SP fold CSVs from the SP_Sensor folder.

    Returns:
        A list of DataFrames ordered fold 0 through fold 4.
    """
    sp_dir = '/content/drive/My Drive/Sensor_dataset/SP_Sensor/'
    loaded_folds = []
    for fold_id in range(5):
        fold_path = sp_dir + 'best_feat_fold_' + str(fold_id) + '_selected_SP_sensor.csv'
        loaded_folds.append(pd.read_csv(fold_path))
    return loaded_folds

def split_datasets(folds):
    """Create one (train, test) pair per fold, holding that fold out.

    The train frame concatenates every other fold in order; the test frame
    is the held-out fold. Original row indices are preserved.
    """
    def _pair_for(held_out):
        others = folds[:held_out] + folds[held_out + 1:]
        train_frame = pd.concat(others)
        test_frame = pd.concat([folds[held_out]])
        return (train_frame, test_frame)

    return [_pair_for(position) for position in range(len(folds))]

def extract_variables(equation):
    """Collect the distinct ``X<digits>`` variable tokens used in ``equation``."""
    return {found.group(0) for found in re.finditer(r'X\d+', equation)}




def perform_analysis(splits):
    """Fit a PyOperon symbolic regressor on every fold and score it.

    The target is the 'SP' column; all columns except 'SP', 'DP', 'trial'
    and 'patient' are used as features (no RFECV step in this variant).

    Args:
        splits: iterable of (train_df, test_df) DataFrame pairs.

    Returns:
        A tuple ``(results, best_equation, fold_equations,
        variable_mappings)`` where ``results`` has one metrics dict per
        fold, ``best_equation`` is the equation of the last fold (``None``
        when ``splits`` is empty), ``fold_equations`` has one equation per
        fold and ``variable_mappings`` maps ``"Fold i"`` to
        ``{variable token: feature name}``.
    """
    non_feature_columns = ['SP', 'DP', 'trial', 'patient']

    # Symbolic regressor configuration, identical for every fold.
    # 'mutation_probability' used to be listed twice (0.1, then 0.8); Python
    # keeps the last duplicate key, so 0.8 is the value that was in effect.
    regressor_params = {
        'allowed_symbols': 'add,sub,mul,div,constant,variable,pow,exp,log,sin,cos,tan,tanh,sqrt,cbrt,square',
        'offspring_generator': 'brood',
        'initialization_method': 'btc',
        'comparison_factor': 0,
        'crossover_internal_probability': 0.9,
        'epsilon': 1e-05,
        'female_selector': 'tournament',
        'objectives': ['r2', 'mse', 'mae', 'length', 'rmse', 'c2'],
        'reinserter': 'keep-best',
        'max_evaluations': int(1e6),
        'tournament_size': 3,
        'pool_size': 50,
        'population_size': 100,
        'generations': 500,
        'time_limit': 90,
        'crossover_probability': 1.0,
        'mutation_probability': 0.8,
        'max_depth': 5,
    }

    fold_equations = []
    results = []
    variable_mappings = {}
    best_equation = None  # stays None if there are no splits at all

    for i, (train_data, test_data) in enumerate(splits):
        # SP and DP (the targets) and the identifier columns are not features.
        X_train = train_data.drop(non_feature_columns, axis=1)
        y_train = train_data['SP']
        X_test = test_data.drop(non_feature_columns, axis=1)
        y_test = test_data['SP']

        model = SymbolicRegressor(**regressor_params).fit(X_train, y_train)
        predictions = model.predict(X_test)
        best_equation = model.get_model_string(model.model_)
        print(best_equation)
        fold_equations.append(best_equation)
        if predictions.ndim > 1:
            predictions = predictions.ravel()

        # Baseline for the scaled errors: a constant prediction equal to the
        # mean of this fold's test targets.
        baseline = np.full(len(y_test), np.mean(y_test))
        mse = mean_squared_error(y_test, predictions)
        mse_naive = mean_squared_error(y_test, baseline)
        mae = mean_absolute_error(y_test, predictions)
        mae_naive = mean_absolute_error(y_test, baseline)
        score = r2_score(y_test, predictions)
        errors = predictions.reshape(y_test.shape) - y_test
        ME = errors.mean()
        SD = errors.std()

        variables_used = extract_variables(best_equation)
        print("Variables used in the equation:", variables_used)

        # Tokens already carry their "X" prefix, so they are used as keys
        # directly (the old f"X{token}" produced keys such as "XX3").
        # NOTE(review): this keeps the original 0-based reading of the token
        # number; confirm against PyOperon's default variable naming.
        variable_mappings[f"Fold {i}"] = {
            token: X_train.columns[int(token[1:])]
            for token in variables_used
            if int(token[1:]) < len(X_train.columns)
        }

        results.append({
            'fold': i,
            'MSE': mse,
            'MAE': mae,
            'R2_score': score,
            'MMSE': mse / mse_naive,
            'MASE': mae / mae_naive,
            'ME': ME,
            'SD': SD,
            'best_equation': best_equation,
        })

    return results,     best_equation, fold_equations, variable_mappings


def save_variable_mapping_to_csv(variable_mapping, file_path):
    """Flatten the per-fold variable mapping and save it as a CSV file.

    Args:
        variable_mapping: dict of ``{fold label: {variable index: name}}``.
        file_path: where to write the CSV. Parent directories are created
            when needed; a path without a directory part is written to the
            current working directory.

    The output always contains the header 'Fold', 'Variable Index',
    'Variable Name', including when there are no rows to write.
    """
    # Skip directory creation for bare file names: os.makedirs('') raises.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    rows = []
    for fold, variables in variable_mapping.items():
        rows.extend(
            {'Fold': fold, 'Variable Index': index, 'Variable Name': name}
            for index, name in variables.items()
        )

    # Fixing the columns guarantees a header even for an empty mapping.
    df = pd.DataFrame(rows, columns=['Fold', 'Variable Index', 'Variable Name'])
    df.to_csv(file_path, index=False)
    print("Data saved to CSV successfully.")





def main():
    """Run the per-fold SP symbolic-regression analysis and save its outputs.

    Writes the variable mapping, the per-fold metrics and the metric
    averages to the SP_Sensor folder and prints the two metric tables.
    """
    sp_dir = '/content/drive/My Drive/Sensor_dataset/SP_Sensor/'

    splits = split_datasets(load_datasets())
    results, best_equation, fold_equations, variable_mapping = perform_analysis(splits)
    save_variable_mapping_to_csv(variable_mapping, sp_dir + 'best2_variable_mapping_sensor_dp.csv')

    results_df = pd.DataFrame(results)
    results_df.to_csv(sp_dir + 'best2_model_performance_results_Sensor_SP_Selected_sensor.csv', index=False)

    # Average only the non-object columns (the equation strings are skipped).
    numeric_part = results_df.select_dtypes(exclude='object')
    average_results = numeric_part.mean().to_frame().transpose()
    average_results.to_csv(sp_dir + 'best2_average_model_performance_results_SP_Sensor_Selected_sensor.csv', index=False)
    print("Average results:")
    print(average_results)

    print("Results and importances saved to CSV.")
    print(results_df)


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# MASE% per model and feature-selection method; each model appears twice,
# first with RFECV and then with EFBEQ-SR.
model_names = ['AdaBoost', 'LightGBM', 'MLP', 'RF', 'SVR', 'SR-fold', 'SR-combine']
data = pd.DataFrame({
    'Model': [name for name in model_names for _ in range(2)],
    'Feature Selection': ['RFECV', 'EFBEQ-SR'] * len(model_names),
    'MASE% SBP': [90.36, 93.90, 82.32, 88.42, 89.97, 92.74, 83.25, 88.22, 97.80, 96.88, 93.04, 96.02,91.06,94.12],
    'MASE% DBP': [108.39, 106.22, 92.82, 92.82, 97.17, 97.41, 92.61, 92.90, 96.70, 96.70, 94.82, 96.82,96.73,97.04]
})

# Two stacked panels: SBP on top, DBP below.
fig, ax = plt.subplots(2, 1, figsize=(14, 12))

# One pivot table (models x feature-selection method) per target.
sbp_plot = data.pivot(index="Model", columns="Feature Selection", values="MASE% SBP")
dbp_plot = data.pivot(index="Model", columns="Feature Selection", values="MASE% DBP")

# SBP panel
sbp_bar = sbp_plot.plot(kind='bar', ax=ax[0], colormap='viridis')
ax[0].set_title('MASE% for SBP by Feature Selection Method')
ax[0].set_ylabel('MASE%')
ax[0].legend(title='Feature Selection', loc='upper left', bbox_to_anchor=(1,1))

# DBP panel
dbp_bar = dbp_plot.plot(kind='bar', ax=ax[1], colormap='viridis')
ax[1].set_title('MASE% for DBP by Feature Selection Method')
ax[1].set_ylabel('MASE%')
ax[1].legend(title='Feature Selection', loc='upper left', bbox_to_anchor=(1,1))

# Shared cosmetics for both panels.
for a in ax:
    a.set_xticklabels(a.get_xticklabels(), rotation=45)
    a.set_xlabel('Models')

# Leave room on the right for the legends placed outside the axes.
plt.tight_layout(rect=[0, 0, 0.85, 1])

# Write the figure to Drive, then display it.
plt.savefig('/content/drive/My Drive/Sensor_dataset/SP_Sensor/performance_comparison_mase_sorted.png')
plt.show()

print("Plot saved successfully at '/content/drive/My Drive/Sensor_dataset/SP_Sensor/performance_comparison_mase_sorted.png'")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data preparation
data = pd.DataFrame({
    'Model': ['AdaBoost', 'LightGBM', 'MLP', 'Naive', 'RF', 'SVR', 'SR'],
    'MASE% SBP': [42.48, 38.55, 82.96, 101.11, 38.89, 89.39, 109.81],
    'MASE% DBP': [78.94, 64.37, 127.68, 102.25, 60.85, 93.31, 117.89]
})

# Sort the models separately for each target (ascending MASE%)
data_sorted_sbp = data.sort_values('MASE% SBP').reset_index(drop=True)
data_sorted_dbp = data.sort_values('MASE% DBP').reset_index(drop=True)

# Each panel has its own model order, so the x-axes must NOT be shared:
# with a shared axis both panels use a single set of tick positions and
# labels, which cannot be right for two different orderings.
fig, ax = plt.subplots(2, 1, figsize=(10, 10), sharex=False)

# Assigning a unique color for each bar using a colormap
colors_sbp = plt.cm.viridis(np.linspace(0, 1, len(data_sorted_sbp)))
colors_dbp = plt.cm.plasma(np.linspace(0, 1, len(data_sorted_dbp)))

# SBP plot
ax[0].bar(data_sorted_sbp['Model'], data_sorted_sbp['MASE% SBP'], color=colors_sbp)
ax[0].set_title('MASE% for SBP')
ax[0].set_ylabel('MASE%')
ax[0].set_xticks(range(len(data_sorted_sbp['Model'])))
ax[0].set_xticklabels(data_sorted_sbp['Model'], rotation=45)

# DBP plot
ax[1].bar(data_sorted_dbp['Model'], data_sorted_dbp['MASE% DBP'], color=colors_dbp)
ax[1].set_title('MASE% for DBP')
ax[1].set_ylabel('MASE%')
ax[1].set_xticks(range(len(data_sorted_dbp['Model'])))
ax[1].set_xticklabels(data_sorted_dbp['Model'], rotation=45)

plt.tight_layout()
plt.savefig('/content/drive/My Drive/Sensor_dataset/SP_Sensor/performance_bcg-comparison_mase_sorted.png')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt



# Sample DataFrame based on Table X structure, adjust according to actual data
data = pd.DataFrame({
    'Category': ['Histogram Features', 'Frequency Features', 'Amplitude Features', 'Time Features'],
    'Number of Elements': [15, 4, 2, 1]  # Example values, replace with actual counts
})

# Sorting the data by the number of elements
data_sorted = data.sort_values('Number of Elements', ascending=True)

# One color per bar, applied in the sorted (plotted) order
colors = ['red', 'green', 'blue', 'purple']

# Single source of truth for the output file, so the confirmation message
# below cannot drift from the path that is actually written.
output_path = '/content/drive/My Drive/Sensor_dataset/SP_Sensor/featur-bp.png'

# Creating a horizontal bar plot
plt.figure(figsize=(8, 4))
plt.barh(data_sorted['Category'], data_sorted['Number of Elements'], color=colors)
plt.xlabel('Number of Elements')
plt.title('Feature Set Distribution by Category For DBP')
plt.tight_layout()

# Save and show plot
plt.savefig(output_path)
plt.show()

# Confirm the plot is saved (the old message named a different, non-existent path)
print(f"Plot saved successfully at '{output_path}'")
