Importing the necessary libraries

In [1]:
import pandas as pd
from openfe import OpenFE, transform
import pickle
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import warnings
from sklearn.preprocessing import MinMaxScaler
import traceback


Import the dataset

In [2]:
# Locations of the pickled dataset dictionaries
independent_path = '../Data/Cleaned Data/Independent_Data_dictionary.pkl'
dependent_path = '../Data/Cleaned Data/Dependent_Data_dictionary.pkl'

# NOTE: pickle.load can execute arbitrary code; only load trusted files.

# Unpickle the independent data
with open(independent_path, 'rb') as independent_file:
    Independent_Data = pickle.load(independent_file)

# Unpickle the dependent data
with open(dependent_path, 'rb') as dependent_file:
    Dependent_Data = pickle.load(dependent_file)

Encode the dependent variable

In [3]:
Dependent_Data_Encoded = {}

for name, df in Dependent_Data.items():
    # One encoder per dataframe; fit_transform refits it for every column
    le = LabelEncoder()

    # Only columns of dtype 'object' or 'category' are label encoded
    columns_to_encode = [
        column for column in df.columns
        if df[column].dtype == 'object' or df[column].dtype.name == 'category'
    ]

    # Work on a copy so the original dataframe is left untouched
    encoded_df = df.copy()
    for column in columns_to_encode:
        encoded_df[column] = le.fit_transform(df[column])

    # Store the encoded dataframe under the same name
    Dependent_Data_Encoded[name] = encoded_df

# Dependent_Data_Encoded now maps each name to its label-encoded dataframe


Setting up cross-validation, the regression dataset list, the scaler and the OpenFE model

In [4]:
# Silence all warnings (e.g. metrics that might be undefined for some classifiers)
warnings.filterwarnings("ignore")

# 5-fold shuffled cross validator with a fixed random state
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Names of the datasets that are treated as regression tasks
regression_datasets = [
    'fri_c3_1000_50',
    'fri_c2_1000_25',
    'fri_c4_500_50',
    'fri_c4_1000_50',
    'fri_c1_1000_25',
    'fri_c1_500_50',
    'fri_c3_1000_25',
    'auto93',
    'pyrim',
    'autoPrice',
    'boston',
    'Concrete_Compressive_Strength',
    'Auto_MPG',
    'Forest Fires',
    'Servo',
    'Airfoil_Self_Noise',
    'Wine_Quality',
    'BodyFat',
    'California_Housing',
    'Quake',
]

# Scaler used to normalize the data
scaler = MinMaxScaler()

# OpenFE model used for feature engineering
ofe = OpenFE()

Creating a function to list the categorical ('object' or 'category') columns of a DataFrame

In [5]:
def select_object_category_vars(df):
    """
    List the columns of a pandas DataFrame whose dtype is 'object' or 'category'.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    list: Names of the 'object' and 'category' columns.
    """
    wanted_dtypes = ['object', 'category']
    return list(df.select_dtypes(include=wanted_dtypes).columns)


Creating a function to label encode the data

In [6]:
def encode_and_split(X_train, X_test):
    """
    Label encode the categorical columns of a train/test pair together.

    The two DataFrames are stacked, every column containing a NaN is dropped,
    each 'object' or 'category' column is label encoded on its string values,
    and the result is split back into its train and test parts.

    Parameters:
    X_train (pd.DataFrame): The training data.
    X_test (pd.DataFrame): The testing data.

    Returns:
    tuple: (X_train_encoded, X_test_encoded), the encoded train and test DataFrames.
    """
    # Stack both frames under an outer 'train'/'test' key; the original
    # row index is kept as the inner level
    stacked = pd.concat([X_train, X_test], keys=['train', 'test'])

    # Keep only the columns without any missing value
    stacked = stacked.dropna(axis=1, how='any')

    # Encode each categorical column over the stacked train and test rows
    encoder = LabelEncoder()
    categorical_columns = stacked.select_dtypes(include=['object', 'category']).columns
    for col in categorical_columns:
        stacked[col] = encoder.fit_transform(stacked[col].astype(str))

    # Recover the two parts through the outer key
    return stacked.xs('train'), stacked.xs('test')


Performing feature engineering (FE) with OpenFE on each cross-validation fold

In [None]:
# Processing datasets
# Only the datasets named here are processed in this run
DATASETS_TO_PROCESS = ['gina']

# Number of parallel jobs passed to OpenFE's fit and transform
OPENFE_N_JOBS = 10

for dataset_name, X in Independent_Data.items():
    if dataset_name not in DATASETS_TO_PROCESS:
        continue

    try:
        times = []      # OpenFE fit duration of every fold, in seconds
        fold_data = {}  # Dictionary to store fold data, keyed by 'fold1', 'fold2', ...
        print(f'Processing dataset: {dataset_name}')
        y = Dependent_Data_Encoded[dataset_name]

        cat_cols = select_object_category_vars(X)

        # The task depends only on the dataset, so it is decided once per dataset
        task = 'regression' if dataset_name in regression_datasets else 'classification'

        # Manually handling cross-validation so that OpenFE is fitted on the
        # training part of each fold only
        for fold_count, (train_index, test_index) in enumerate(kf.split(X), start=1):
            fold_key = f'fold{fold_count}'
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # OpenFE feature engineering.
            # Only the fit call is timed (the transform below is not included);
            # perf_counter is a monotonic clock intended for measuring durations.
            start_time = time.perf_counter()
            features = ofe.fit(
                data=X_train,
                label=y_train,
                task=task,
                categorical_features=cat_cols,
                n_jobs=OPENFE_N_JOBS,
            )
            fit_time = time.perf_counter() - start_time
            times.append(fit_time)

            # Build the new features for both parts of the fold
            X_train_generated, X_test_generated = transform(X_train, X_test, features, n_jobs=OPENFE_N_JOBS)

            # Label encode the dataframes
            X_train_encoded, X_test_encoded = encode_and_split(X_train_generated, X_test_generated)

            # Apply normalization: the scaler is fitted on the training part
            # and then applied unchanged to the testing part
            X_train_transformed = pd.DataFrame(
                scaler.fit_transform(X_train_encoded),
                columns=X_train_encoded.columns,
                index=X_train_encoded.index,
            )
            X_test_transformed = pd.DataFrame(
                scaler.transform(X_test_encoded),
                columns=X_test_encoded.columns,
                index=X_test_encoded.index,
            )

            fold_data[fold_key] = {
                'Training_Independent': X_train_transformed,
                'Training_Dependent': y_train,
                'Testing_Independent': X_test_transformed,
                'Testing_Dependent': y_test,
                'Timing': fit_time
            }

        # Persist all folds of this dataset in a single pickle file
        with open(f'../Data/OPENFE/{dataset_name}_fold_data.pkl', 'wb') as file:
            pickle.dump(fold_data, file)

    # Best effort: a failing dataset is reported and the loop moves on.
    # NOTE(review): SystemExit is caught as well, presumably because a
    # dependency may call sys.exit() - confirm this is still needed.
    except (Exception, SystemExit) as e:
        print(f"Error processing {dataset_name}: {e}")
        traceback.print_exc()  # This will print the traceback of the exception

