Import the libraries

In [5]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, f1_score
import lightgbm as lgb
import xgboost as xgb
import warnings

Specify the dataset types

In [6]:
# Dataset names that process_file routes to the regression models and metrics
regression_datasets = [
    'fri_c3_1000_50',
    'fri_c2_1000_25',
    'fri_c4_500_50',
    'fri_c4_1000_50',
    'fri_c1_1000_25',
    'fri_c1_500_50',
    'fri_c3_1000_25',
    'auto93',
    'pyrim',
    'autoPrice',
    'boston',
    'Concrete_Compressive_Strength',
    'Auto_MPG',
    'Forest Fires',
    'Servo',
    'Airfoil_Self_Noise',
    'Wine_Quality',
    'BodyFat',
    'California_Housing',
    'Quake',
]

# Dataset names that process_file routes to the multi-class classifier settings;
# any dataset in neither list is handled by the default classifiers.
multi_class_classification_datasets = [
    'Balance_Scale',
    'Iris',
]

Define the models

In [7]:
# Classification models
# Maps a display name (stored in the 'Model' column of the results) to an
# estimator instance. process_file uses this table for every dataset that is
# listed in neither regression_datasets nor multi_class_classification_datasets.
# random_state=42 is passed to every estimator constructed with a seed below;
# KNeighborsClassifier is created with library defaults only.
# The same instance is re-fitted for each dataset and fold.
classification_models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "KNN Classification": KNeighborsClassifier(),
    "SVM Classification": SVC(random_state=42),
    "Random Forest Classification": RandomForestClassifier(random_state=42),
    "AdaBoost Classification": AdaBoostClassifier(random_state=42),
    "MLP Classification": MLPClassifier(random_state=42),
    "Decision Tree Classification": DecisionTreeClassifier(random_state=42),
    "Extremely Randomized Trees Classification": ExtraTreesClassifier(random_state=42),
    "Gradient Boosting Classification": GradientBoostingClassifier(random_state=42),
    "LightGBM Classification": lgb.LGBMClassifier(random_state=42),
    "XGBoost Classification": xgb.XGBClassifier(random_state=42)
}

# Regression models
# Maps a display name (stored in the 'Model' column of the results) to an
# estimator instance. process_file uses this table for every dataset listed in
# regression_datasets. LinearRegression, KNeighborsRegressor and SVR are
# created with library defaults; the remaining estimators are seeded with
# random_state=42. The same instance is re-fitted for each dataset and fold.
regression_models = {
    "Linear Regression": LinearRegression(),
    "KNN Regression": KNeighborsRegressor(),
    "SVM Regression": SVR(),
    "Random Forest Regression": RandomForestRegressor(random_state=42),
    "AdaBoost Regression": AdaBoostRegressor(random_state=42),
    "MLP Regression": MLPRegressor(random_state=42),
    "Decision Tree Regression": DecisionTreeRegressor(random_state=42),
    "Extremely Randomized Trees Regression": ExtraTreesRegressor(random_state=42),
    "Gradient Boosting Regression": GradientBoostingRegressor(random_state=42),
    "LightGBM Regression": lgb.LGBMRegressor(random_state=42),
    "XGBoost Regression": xgb.XGBRegressor(random_state=42)
}

# Multi-Class Classification Models with appropriate settings for multi-class scenarios
# process_file uses this table for datasets listed in
# multi_class_classification_datasets. Compared with classification_models,
# only three entries are configured differently:
#   * LogisticRegression: multi_class='multinomial', solver='lbfgs'
#   * SVC: probability=True, decision_function_shape='ovr'
#   * XGBClassifier: eval_metric='mlogloss'
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument of LogisticRegression — confirm against the installed version.
# NOTE(review): compute_classification_metrics only calls predict(), so
# probability=True on SVC looks unnecessary here — confirm before removing.
classification_models_multi = {
    "Logistic Regression Multi-Class": LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs'),
    "KNN Classification Multi-Class": KNeighborsClassifier(),
    "SVM Classification Multi-Class": SVC(probability=True, random_state=42, decision_function_shape='ovr'),
    "Random Forest Classification Multi-Class": RandomForestClassifier(random_state=42),
    "AdaBoost Classification Multi-Class": AdaBoostClassifier(random_state=42),  # Note: AdaBoost by default is not the best for multi-class but used here for completeness
    "MLP Classification Multi-Class": MLPClassifier(random_state=42),
    "Decision Tree Classification Multi-Class": DecisionTreeClassifier(random_state=42),
    "Extremely Randomized Trees Classification Multi-Class": ExtraTreesClassifier(random_state=42),
    "Gradient Boosting Classification Multi-Class": GradientBoostingClassifier(random_state=42),
    "LightGBM Classification Multi-Class": lgb.LGBMClassifier(random_state=42),
    "XGBoost Classification Multi-Class": xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)
}


Define functions for loading data and computing metrics

In [8]:
# Read one pickled dataset from disk
def load_data(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object the pickle file contains.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [9]:
# Fit a regressor and score it on the held-out split
def compute_regression_metrics(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        It is fitted in place.
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.

    Returns
    -------
    tuple
        ``(r2, normalized_mae)``. ``normalized_mae`` is the mean absolute
        error divided by ``max(y_test) - min(y_test)``; when that spread is
        zero the raw MAE is returned instead.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Spread of the observed test targets, used as the MAE normaliser
    target_spread = np.max(y_test) - np.min(y_test)

    # A constant y_test would mean dividing by zero, so keep the raw MAE then
    normalized_mae = mae if target_spread == 0 else mae / target_spread

    return r2, normalized_mae

In [10]:
# Fit a classifier and score it on the held-out split
def compute_classification_metrics(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        It is fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` where ``f1`` is computed with ``average='macro'``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
    )

Creating lists to store results

In [11]:
# Row accumulators appended to by process_file: one list per task family,
# each starting out empty.
regression_results, classification_results = [], []

Function to fit ML Models

In [19]:
# Function to process each dataset file
def process_file(filename, data_dir=None):
    """Evaluate every applicable model on all five folds of one pickled dataset.

    The dataset name is obtained by dropping the last 14 characters of
    ``filename``. That name selects the model table, the metric function and
    the result list:

    * in ``regression_datasets`` -> ``regression_models``,
      ``compute_regression_metrics``, ``regression_results``
    * in ``multi_class_classification_datasets`` ->
      ``classification_models_multi``, ``compute_classification_metrics``,
      ``classification_results``
    * otherwise -> ``classification_models``,
      ``compute_classification_metrics``, ``classification_results``

    Parameters
    ----------
    filename : str
        Name of the pickle file inside the data directory.
    data_dir : str, optional
        Directory containing ``filename``. When omitted, the module-level
        ``directory`` variable is used, which must then already be defined.

    Returns
    -------
    None
        One row ``[dataset_name, model_name, metric_1, metric_2]`` per model
        and fold is appended to the selected module-level result list.
    """
    if data_dir is None:
        # Backward-compatible fallback for callers relying on the notebook
        # global; passing data_dir explicitly avoids that hidden dependency.
        data_dir = directory

    data = load_data(os.path.join(data_dir, filename))

    # NOTE(review): assumes every file name ends in a fixed 14-character
    # suffix after the dataset name — confirm against the files on disk.
    dataset_name = filename[:-14]

    if dataset_name in regression_datasets:
        models = regression_models
        metric_function = compute_regression_metrics
        result_list = regression_results
    elif dataset_name in multi_class_classification_datasets:
        models = classification_models_multi
        metric_function = compute_classification_metrics
        result_list = classification_results
    else:
        models = classification_models
        metric_function = compute_classification_metrics
        result_list = classification_results

    # Compute metrics for each model and fold
    for model_name, model in models.items():
        for i in range(1, 6):  # Assuming there are 5 folds
            split = data[f'fold{i}']
            metrics = metric_function(
                model,
                split['Training_Independent'],
                split['Testing_Independent'],
                split['Training_Dependent'],
                split['Testing_Dependent'],
            )
            result_list.append([dataset_name, model_name, *metrics])


Function to collect the per-fold timing data

In [13]:
def read_pkl_files(directory):
    """
    Reads all .pkl files in the given directory, extracts 'Timing' data from each fold,
    and returns a DataFrame containing this data along with the corresponding file names.

    Files are visited in sorted name order so the row order is reproducible, and the
    result always has the two documented columns, even when no .pkl file is found.

    Parameters:
    directory (str): The path to the directory containing the .pkl files.

    Returns:
    pd.DataFrame: A DataFrame with columns 'DataFrame' and 'Time', where 'DataFrame' is the
    name of the .pkl file and 'Time' is the extracted timing data. There is one row per
    file and fold ('fold1' .. 'fold5').

    Raises:
    KeyError: If a pickle lacks one of the fold keys or a fold lacks 'Timing'.
    """
    fold_keys = ('fold1', 'fold2', 'fold3', 'fold4', 'fold5')
    records = []

    # os.listdir order is arbitrary; sorting keeps the output deterministic
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.pkl'):
            continue

        # NOTE: pickle.load can execute arbitrary code; use trusted files only
        with open(os.path.join(directory, filename), 'rb') as file:
            data = pickle.load(file)

        # One record per fold, tagged with the file it came from
        records.extend(
            {'DataFrame': filename, 'Time': data[key]['Timing']}
            for key in fold_keys
        )

    # Explicit columns keep the schema stable when `records` is empty
    return pd.DataFrame(records, columns=['DataFrame', 'Time'])

# Fitting the ML Models

OpenFE

In [None]:
directory = '../Data/OPENFE'

# Suppress all warnings (not recommended for development)
warnings.filterwarnings('ignore')

# Start from empty accumulators so that re-running this cell does not append
# a second copy of every row to the results collected below.
regression_results.clear()
classification_results.clear()

# Process each file in the directory (sorted so the run order is reproducible)
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.pkl'):
        print(filename)
        process_file(filename)

# Convert results to DataFrame (one row per dataset, model and fold)
regression_df = pd.DataFrame(regression_results, columns=['Dataframe', 'Model'] + ['R2','NMAE'])
classification_df = pd.DataFrame(classification_results, columns=['Dataframe', 'Model'] + ['Accuracy', 'F1'])

# Grouping the regression DataFrame by 'Dataframe' and 'Model', and calculating the mean of other columns
regression_results_mean = regression_df.groupby(['Dataframe', 'Model']).mean().reset_index()

# Grouping the classification DataFrame by 'Dataframe' and 'Model', and calculating the mean of other columns
classification_results_mean = classification_df.groupby(['Dataframe', 'Model']).mean().reset_index()

# Grouping the regression DataFrame by 'Dataframe' and 'Model', and calculating the standard deviation of other columns
regression_results_std = regression_df.groupby(['Dataframe', 'Model']).std().reset_index()

# Grouping the classification DataFrame by 'Dataframe' and 'Model', and calculating the standard deviation of other columns
classification_results_std = classification_df.groupby(['Dataframe', 'Model']).std().reset_index()

# Collect the times
time_df = read_pkl_files(directory)

# Create the dictionary
results = {
    'classification_df': classification_results_mean,
    'regression_df': regression_results_mean,
    'classification_df_std': classification_results_std,
    'regression_df_std': regression_results_std,
    'classification_all': classification_df,
    'regression_all': regression_df,
    'time_df': time_df
}

# Save the result; the context manager closes (and flushes) the file handle
with open("../Results/OpenFE_Results.pkl", "wb") as results_file:
    pickle.dump(results, results_file)