In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files referenced below under
# /content/drive/MyDrive/ can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import joblib
import os
import pickle

In [None]:
# Step 1: Calculate ERT and RELERT
def load_data(file_path, sheet_name=0):
    """Read one sheet of an Excel workbook and coerce the metric columns to numbers.

    Args:
        file_path: Path of the Excel file to read.
        sheet_name: Sheet selector forwarded to ``pd.read_excel``.

    Returns:
        The loaded DataFrame with 'Measured Fitness', 'Fopt' and
        'Function Evaluations' converted via ``pd.to_numeric`` (unparseable
        values become NaN), or None when the file does not exist or anything
        goes wrong while reading/converting (the error is printed).
    """
    if os.path.exists(file_path):
        try:
            frame = pd.read_excel(file_path, sheet_name=sheet_name)
            # errors='coerce' turns non-numeric cells into NaN instead of raising.
            for column in ('Measured Fitness', 'Fopt', 'Function Evaluations'):
                frame[column] = pd.to_numeric(frame[column], errors='coerce')
        except Exception as e:
            print(f"Error loading data: {e}")
            return None
        return frame

    print(f"Error: File {file_path} does not exist.")
    return None

In [None]:
def calculate_ert(data, epsilon=1e-2):
    """Calculate the Expected Runtime (ERT) per (Algorithm, Function, Instance, Dimension).

    A run counts as successful when ``Measured Fitness <= Fopt + epsilon``.
    For each group, ERT is the sum of 'Function Evaluations' over all runs
    divided by the number of successful runs; groups without any successful
    run get ``inf``.

    The input frame is not modified (the success flag is computed on a
    private copy), and the aggregation uses plain grouped sums so it does not
    depend on the ``include_groups`` argument of ``groupby.apply``.

    Args:
        data: DataFrame with columns 'Algorithm', 'Function', 'Instance',
            'Dimension', 'Measured Fitness', 'Fopt', 'Function Evaluations',
            or None.
        epsilon: Tolerance added to 'Fopt' in the success criterion.

    Returns:
        DataFrame with columns 'Algorithm', 'Function', 'Instance',
        'Dimension', 'ERT' (one row per group), or None when ``data`` is None
        or a required column is missing (the missing column is printed).
    """
    if data is None:
        return None

    group_keys = ['Algorithm', 'Function', 'Instance', 'Dimension']
    try:
        # Work on a narrow copy so the caller's frame gains no extra column.
        runs = data[group_keys].copy()
        runs['evaluations'] = data['Function Evaluations']
        runs['successes'] = (data['Measured Fitness'] <= (data['Fopt'] + epsilon)).astype(int)
    except KeyError as e:
        print(f"Missing column: {e}")
        return None

    totals = runs.groupby(group_keys)[['evaluations', 'successes']].sum()
    solved = totals['successes'] > 0
    # Groups with zero successes would divide by zero; force them to +inf.
    ert = (totals['evaluations'] / totals['successes']).where(solved, float('inf'))
    return ert.rename('ERT').reset_index()

In [None]:
def calculate_relert(ert_data, penalty=1_000_000):
    """Calculate the Relative Expected Runtime (RELERT).

    RELERT is each row's ERT divided by the smallest ERT found for the same
    ('Function', 'Dimension') pair. Ratios that come out infinite (the row's
    ERT is inf, or the minimum is 0) or undefined (inf / inf, i.e. nobody
    solved the problem) are replaced by ``penalty``.

    Args:
        ert_data: DataFrame with at least 'Function', 'Dimension' and 'ERT'
            columns, or None.
        penalty: Value substituted for infinite/NaN ratios. Defaults to the
            previously hard-coded 1000000.

    Returns:
        A new DataFrame equal to ``ert_data`` plus 'min_ERT' and 'RELERT'
        columns, or None when ``ert_data`` is None or a required column is
        missing (the missing column is printed).
    """
    if ert_data is None:
        return None

    problem_keys = ['Function', 'Dimension']
    try:
        best = (
            ert_data.groupby(problem_keys)['ERT']
            .min()
            .rename('min_ERT')
            .reset_index()
        )
        merged_data = pd.merge(ert_data, best, on=problem_keys)
        ratio = merged_data['ERT'] / merged_data['min_ERT']
        # +inf and NaN both mean "no usable ratio"; map them to the penalty.
        merged_data['RELERT'] = ratio.replace([float('inf')], penalty).fillna(penalty)
        return merged_data
    except KeyError as e:
        print(f"Missing column: {e}")
        return None

In [None]:
def process_data(file_path, sheet_name=0, epsilon=1e-2):
    """Run load -> ERT -> RELERT for one workbook and save the result as CSV.

    The CSV is written next to the input, named ``<input stem>_relert.csv``.
    The name is derived with ``os.path.splitext`` so that inputs whose
    extension is not exactly '.xlsx' (e.g. '.xls', '.XLSX') never resolve to
    the input path itself and get overwritten.

    Args:
        file_path: Path of the Excel workbook with the raw run data.
        sheet_name: Sheet selector forwarded to ``load_data``.
        epsilon: Success tolerance forwarded to ``calculate_ert``.

    Returns:
        The RELERT DataFrame, or None if loading, the ERT step or the RELERT
        step failed (a message is printed for the latter two; ``load_data``
        reports its own errors).
    """
    data = load_data(file_path, sheet_name)
    if data is None:
        return None

    ert_data = calculate_ert(data, epsilon)
    if ert_data is None:
        print("Failed to calculate ERT.")
        return None

    relert_data = calculate_relert(ert_data)
    if relert_data is None:
        print("Failed to calculate RELERT.")
        return None

    # Strip only the final extension; str.replace('.xlsx', ...) would leave
    # other extensions untouched and make output_path equal to file_path.
    stem, _extension = os.path.splitext(file_path)
    output_path = stem + '_relert.csv'
    relert_data.to_csv(output_path, index=False)
    print(f"RELERT results saved to {output_path}")
    return relert_data

In [None]:
# Step 2: Select the top-N algorithms by mean ERT (top_n defaults to 11)
def select_top_algorithms(relert_data, top_n=11):
    """Return the names of the ``top_n`` algorithms with the lowest mean ERT.

    Side effect: the 'ERT' column of ``relert_data`` is overwritten in place
    with infinite values replaced by 1e6, so the caller's frame is changed.

    Args:
        relert_data: DataFrame with 'Algorithm' and 'ERT' columns, or None.
        top_n: How many algorithm names to return.

    Returns:
        List of algorithm names ordered by ascending mean ERT, or None when
        ``relert_data`` is None or a required column is missing.
    """
    if relert_data is None:
        return None
    try:
        # Cap unsolved entries so they contribute a finite value to the mean.
        capped_ert = relert_data['ERT'].replace([float('inf')], 1e6)
        relert_data['ERT'] = capped_ert

        per_algorithm = relert_data.groupby('Algorithm')['ERT'].mean().reset_index()
        ranked = per_algorithm.sort_values('ERT')
        return ranked.head(top_n)['Algorithm'].tolist()
    except KeyError as e:
        print(f"Error: Missing column {e}")
        return None

In [None]:
def filter_top_algorithms_data(
    relert_data,
    top_algorithms,
    output_path='/content/drive/MyDrive/Wail-Projet-F/Data-04/top_10_algorithms_data.csv',
):
    """Keep only the rows of the selected algorithms and save them as CSV.

    Args:
        relert_data: DataFrame with an 'Algorithm' column, or None.
        top_algorithms: List of algorithm names to keep, or None.
        output_path: Destination CSV. Defaults to the Drive location that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        ``output_path`` on success, or None when either input is None or the
        'Algorithm' column is missing.
    """
    if relert_data is None or top_algorithms is None:
        return None
    try:
        filtered_data = relert_data[relert_data['Algorithm'].isin(top_algorithms)]
        filtered_data.to_csv(output_path, index=False)
        # Report the real number of selected algorithms instead of a fixed "10".
        print(f"Filtered data for top {len(top_algorithms)} algorithms saved to {output_path}")
        return output_path
    except KeyError as e:
        print(f"Error: Missing column {e}")
        return None

In [None]:
# Step 3: Identify the best algorithm per (Function, Instance, Dimension) among the selected algorithms
def label_best_algorithms(
    relert_file,
    output_path='/content/drive/MyDrive/Wail-Projet-F/Data-04/labeledfeatures_top10.csv',
):
    """Attach a 'Best Algorithm' label to every row of a RELERT CSV.

    For each ('Function', 'Instance', 'Dimension') group the algorithm of the
    row with the smallest 'RELERT' is taken as the best one (``idxmin`` keeps
    the first row on ties), and that name is merged back onto all rows of the
    group.

    Args:
        relert_file: CSV with 'Algorithm', 'Function', 'Instance',
            'Dimension' and 'RELERT' columns.
        output_path: Destination CSV for the labelled data. Defaults to the
            Drive location that was previously hard-coded.

    Returns:
        The labelled DataFrame (input columns plus 'Best Algorithm'), or None
        when the file does not exist or a required column is missing.
    """
    if not os.path.exists(relert_file):
        print(f"Error: File {relert_file} does not exist.")
        return None

    problem_keys = ['Function', 'Instance', 'Dimension']
    try:
        ert_results = pd.read_csv(relert_file)
        winner_rows = ert_results.groupby(problem_keys)['RELERT'].idxmin()
        # rename() returns a new frame; renaming a .loc slice in place can
        # trigger pandas' SettingWithCopyWarning.
        best_algorithms = ert_results.loc[winner_rows, problem_keys + ['Algorithm']].rename(
            columns={'Algorithm': 'Best Algorithm'}
        )
        final_data = pd.merge(ert_results, best_algorithms, on=problem_keys, how='left')
        final_data.to_csv(output_path, index=False)
        print("Final labeled data saved to:", output_path)
        return final_data
    except KeyError as e:
        print(f"Error: {e} - Check column names and ensure they are correct.")
        return None

In [None]:
# Step 4: Merge with Features and Filter
def merge_and_filter(features_file, labeled_file,
                     output_dir='/content/drive/MyDrive/Wail-Projet-F/Data-04'):
    """Join the feature table with the labelled performance data and keep best-algorithm rows.

    Steps:
      1. Inner-merge features (keys 'FID', 'IID', 'Dimension') with the
         labelled data (keys 'Function', 'Instance', 'Dimension') and save
         the result as 'merged_dataset_top10.csv' in ``output_dir``.
      2. Re-read that CSV, turn +/-inf into NaN and fill NaNs in numeric
         columns with the column median.
      3. Keep only rows where 'Algorithm' equals 'Best Algorithm', drop the
         'Algorithm' column and save 'final_filtered_dataset_top10.csv' in
         ``output_dir``.

    Args:
        features_file: Excel file with the feature table.
        labeled_file: CSV with the labelled performance data.
        output_dir: Directory for both output CSVs. Defaults to the Drive
            folder that was previously hard-coded.

    Returns:
        The final filtered DataFrame, or None when an input file is missing
        or any step raises (the error is printed).
    """
    for required_file in (features_file, labeled_file):
        if not os.path.exists(required_file):
            print(f"Error: File {required_file} does not exist. Skipping merge_and_filter.")
            return None

    merged_path = os.path.join(output_dir, 'merged_dataset_top10.csv')
    final_path = os.path.join(output_dir, 'final_filtered_dataset_top10.csv')
    try:
        features_data = pd.read_excel(features_file)
        labeled_performance_data = pd.read_csv(labeled_file)
        print("Features Data Columns:", features_data.columns.tolist())
        print("Labeled Performance Data Columns:", labeled_performance_data.columns.tolist())
        merged_data = pd.merge(features_data, labeled_performance_data,
                               left_on=['FID', 'IID', 'Dimension'],
                               right_on=['Function', 'Instance', 'Dimension'])
        merged_data.to_csv(merged_path, index=False)
        # The message now names the file that is actually written (it used to
        # say 'merged_dataset_top11.csv').
        print(f"Merged data saved to '{os.path.basename(merged_path)}'.")

        # NOTE(review): the frame is re-read from disk rather than reused, as
        # in the original; kept because the CSV round trip may alter dtypes.
        data = pd.read_csv(merged_path)
        data = data.replace([np.inf, -np.inf], np.nan)
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        column_medians = data[numeric_cols].median()
        data[numeric_cols] = data[numeric_cols].fillna(column_medians)

        best_rows = data[data['Algorithm'] == data['Best Algorithm']]
        final_df = best_rows.drop(columns=['Algorithm'])
        final_df.to_csv(final_path, index=False)
        print("Final filtered dataset saved with", len(final_df), "rows.")
        return final_df
    except Exception as e:
        print(f"Error in merge_and_filter: {e}")
        return None

In [None]:
# Step 5: Normalize and Encode
def normalize_and_encode(train_file):
    """Scale the feature columns of a training CSV and integer-encode its label.

    Every column except 'Best Algorithm' is treated as a feature: values are
    clipped to [-1e6, 1e6], passed through a StandardScaler and then a
    MinMaxScaler, and any remaining NaN/inf is set to 0.0. 'Best Algorithm'
    is encoded with a LabelEncoder. The fitted scaler, normalizer and label
    encoder as well as the resulting dataset are written to fixed paths under
    /content/drive/MyDrive/Wail-Projet-F/Data-04/.

    Args:
        train_file: CSV containing the features and a 'Best Algorithm' column.

    Returns:
        The normalized DataFrame with the encoded 'Best Algorithm' column, or
        None when the file is missing or any step raises (the error is printed).
    """
    if os.path.exists(train_file):
        try:
            frame = pd.read_csv(train_file)

            # Drop these columns when present; absent ones are ignored.
            unwanted = (
                'bt.near.attractor_dists.sd',
                'bt.near.basin_intersection.sd',
                'gcm.near.basin_certain.sd',
                'gcm.near.basin_prob.sd',
                'gcm.near.basin_uncertain.sd',
            )
            present = [name for name in unwanted if name in frame.columns]
            frame = frame.drop(columns=present)

            features = frame.drop(columns=['Best Algorithm'])
            labels = frame['Best Algorithm']
            features = features.clip(-1e6, 1e6)

            # Standardize first, then squeeze the standardized values through min-max.
            scaler = StandardScaler()
            standardized = scaler.fit_transform(features)
            normalizer = MinMaxScaler()
            normalized = normalizer.fit_transform(standardized)
            normalized = np.nan_to_num(normalized, nan=0.0, posinf=0.0, neginf=0.0)

            # Persist both fitted transformers with pickle.
            fitted_transformers = (
                ('/content/drive/MyDrive/Wail-Projet-F/Data-04/scalerc_top10.pkl', scaler),
                ('/content/drive/MyDrive/Wail-Projet-F/Data-04/normalizerc_top10.pkl', normalizer),
            )
            for pickle_path, transformer in fitted_transformers:
                with open(pickle_path, 'wb') as f:
                    pickle.dump(transformer, f)

            train_data_normalized = pd.DataFrame(normalized, columns=features.columns)
            train_data_normalized['Best Algorithm'] = labels.values

            # The label encoder is persisted with joblib (not pickle), as before.
            le = LabelEncoder()
            encoded_labels = le.fit_transform(train_data_normalized['Best Algorithm'])
            train_data_normalized['Best Algorithm'] = encoded_labels
            joblib.dump(le, '/content/drive/MyDrive/Wail-Projet-F/Data-04/label_encoderc_top10.pkl')

            train_data_normalized.to_csv(
                '/content/drive/MyDrive/Wail-Projet-F/Data-04/normalized_datasetc_top10.csv',
                index=False,
            )

            print("Normalized train dataset saved with", len(train_data_normalized), "rows.")
            print("Unique classes in train:", train_data_normalized['Best Algorithm'].unique())
            print("Class mapping:", dict(zip(le.classes_, le.transform(le.classes_))))
            return train_data_normalized
        except Exception as e:
            print(f"Error in normalize_and_encode: {e}")
            return None

    print(f"Error: File {train_file} does not exist. Skipping normalize_and_encode.")
    return None

In [None]:
# Main execution
if __name__ == "__main__":
    # Input workbooks on the mounted Drive.
    optimization_file = '/content/drive/MyDrive/Wail-Projet-F/Data-04/optimization_data.xlsx'
    features_file = '/content/drive/MyDrive/Wail-Projet-F/Data-04/features.xlsx'

    # Each stage runs only if the previous one produced a result; otherwise
    # the matching "Skipping ..." message is printed and the pipeline stops.
    # Note: 11 algorithms are selected although the file names say "top10".
    relert_data = process_data(optimization_file, sheet_name=0, epsilon=1e-2)
    if relert_data is None:
        print("Skipping all steps due to failure in ERT/RELERT calculation.")
    else:
        top_algorithms = select_top_algorithms(relert_data, top_n=11)
        if top_algorithms is None:
            print("Skipping filtering, labeling, merge, and normalization due to failure in selecting top algorithms.")
        else:
            print(f"Top 11 Algorithms by Mean ERT: {top_algorithms}")
            top_11_file = filter_top_algorithms_data(relert_data, top_algorithms)
            if top_11_file is None:
                print("Skipping labeling, merge, and normalization due to missing top 11 data.")
            else:
                labeled_data = label_best_algorithms(top_11_file)
                if labeled_data is None:
                    print("Skipping merge and normalization due to missing labeled data.")
                else:
                    filtered_data = merge_and_filter(features_file, '/content/drive/MyDrive/Wail-Projet-F/Data-04/labeledfeatures_top10.csv')
                    if filtered_data is None:
                        print("Skipping normalize_and_encode due to missing filtered data.")
                    else:
                        normalize_and_encode('/content/drive/MyDrive/Wail-Projet-F/Data-04/final_filtered_dataset_top10.csv')

RELERT results saved to /content/drive/MyDrive/Wail-Projet-F/Data-04/optimization_data_relert.csv
Top 11 Algorithms by Mean ERT: ['CMAES-APOP-KMA_Nguyen', 'DE-BFGS_voglis_noiseless', 'ad-CMA-ES_Gissler', 'adm-CMA-ES_Gissler', 's-CMA-ES_Gissler', 'dm-CMA-ES_Gissler', 'a-CMA-ES', 'a-CMA-ES_Gissler', 'sd-CMA-ES_Gissler', 'BIPOP-CMA-ES', 'CMA-CSA_Atamna']
Filtered data for top 11 algorithms saved to /content/drive/MyDrive/Wail-Projet-F/Data-04/top_11_algorithms_data.csv
Final labeled data saved to: /content/drive/MyDrive/Wail-Projet-F/Data-04/labeledfeatures_top11.csv
Features Data Columns: ['FID', 'IID', 'Dimension', 'basic.blocks_max', 'basic.blocks_min', 'basic.cells_filled', 'basic.cells_total', 'basic.costs_fun_evals', 'basic.costs_runtime', 'basic.dim', 'basic.lower_max', 'basic.lower_min', 'basic.minimize_fun', 'basic.objective_max', 'basic.objective_min', 'basic.observations', 'basic.upper_max', 'basic.upper_min', 'bt.mean.attractor_dists.max', 'bt.mean.attractor_dists.mean', 'bt.m