Import Libraries

In [2]:
import os
import pickle
import time
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from SafeTransformer import SafeTransformer


Get Data

In [3]:
# Locations of the pickled dataset dictionaries
independent_path = '../Data/Cleaned Data/Independent_Data_dictionary.pkl'
dependent_path = '../Data/Cleaned Data/Dependent_Data_dictionary.pkl'


def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # Unpickling can execute arbitrary code, so only point this at trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Both objects are iterated with .items() further down, i.e. they are
# mappings keyed by dataset name.
Independent_Data = _load_pickle(independent_path)
Dependent_Data = _load_pickle(dependent_path)

Encoding the data

In [4]:
# Label-encoded counterpart of Dependent_Data: same keys, one dataframe each
Dependent_Data_Encoded = {}

for name, df in Dependent_Data.items():
    # Pick out the columns stored with 'object' or 'category' dtype
    columns_to_encode = [
        column
        for column in df.columns
        if df[column].dtype == 'object' or df[column].dtype.name == 'category'
    ]

    # Encode into a copy so the source dataframe is left untouched
    encoded_df = df.copy()
    for column in columns_to_encode:
        # A fresh encoder per column: each column gets its own label mapping
        encoded_df[column] = LabelEncoder().fit_transform(df[column])

    # Columns of any other dtype are carried over unchanged
    Dependent_Data_Encoded[name] = encoded_df


In [5]:
# Label-encoded counterpart of Independent_Data: same keys, one dataframe each.
# NOTE: the spelling 'Indpendent' is kept on purpose - the processing loop
# further down looks the encoded features up under exactly this name.
Indpendent_Data_Encoded = {}

for name, df in Independent_Data.items():
    # Pick out the columns stored with 'object' or 'category' dtype
    columns_to_encode = [
        column
        for column in df.columns
        if df[column].dtype == 'object' or df[column].dtype.name == 'category'
    ]

    # Encode into a copy so the source dataframe is left untouched
    encoded_df = df.copy()
    for column in columns_to_encode:
        # A fresh encoder per column: each column gets its own label mapping
        encoded_df[column] = LabelEncoder().fit_transform(df[column])

    # Columns of any other dtype are carried over unchanged
    Indpendent_Data_Encoded[name] = encoded_df


Applying SAFE

In [6]:
# Silence all warnings from here on (e.g. metrics that are undefined for
# some classifiers)
warnings.filterwarnings(action="ignore")

# 5-fold cross validator; shuffling with a fixed seed keeps the splits reproducible
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Names of the datasets that are treated as regression problems
regression_datasets = [
    'fri_c3_1000_50',
    'fri_c2_1000_25',
    'fri_c4_500_50',
    'fri_c4_1000_50',
    'fri_c1_1000_25',
    'fri_c1_500_50',
    'fri_c3_1000_25',
    'auto93',
    'pyrim',
    'autoPrice',
    'boston',
    'Concrete_Compressive_Strength',
    'Auto_MPG',
    'Forest Fires',
    'Servo',
    'Airfoil_Self_Noise',
    'Wine_Quality',
    'BodyFat',
    'California_Housing',
    'Quake',
]

# Names of the multi-class classification datasets
multi_class_classification_datasets = [
    'Balance_Scale',
    'Iris',
]

# Min-max scaler used to normalize the SAFE-transformed features
scaler = MinMaxScaler()

In [None]:
# Folder that receives one pickle of fold data per dataset. It is created up
# front: otherwise a missing folder would only surface after every fold of a
# dataset had been computed, and the except below would merely print the error.
output_dir = '../Data/SAFE'
os.makedirs(output_dir, exist_ok=True)

# Processing datasets: for each one, run 5-fold CV, fit SAFE on every training
# fold, transform + normalize both splits and pickle the per-fold results.
for dataset_name, X in Independent_Data.items():
    try:
        fold_data = {}  # fold key ('fold1', ...) -> transformed splits and timing
        times = []  # SAFE fitting time of every fold of this dataset
        print(f'Processing dataset: {dataset_name}')
        y = Dependent_Data_Encoded[dataset_name]
        X_E = Indpendent_Data_Encoded[dataset_name]

        # Regression datasets get a regressor; every other dataset (including
        # those in multi_class_classification_datasets) gets the same classifier.
        if dataset_name in regression_datasets:
            model = ExtraTreesRegressor(random_state=42)
        else:
            model = RandomForestClassifier(random_state=42)

        # Manually handling cross-validation to incorporate SAFE
        for fold_count, (train_index, test_index) in enumerate(kf.split(X), start=1):
            fold_key = f'fold{fold_count}'
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            X_train_E, X_test_E = X_E.iloc[train_index], X_E.iloc[test_index]

            # SAFE feature engineering.
            # The timed section covers fitting the surrogate model AND fitting SAFE.
            # NOTE(review): the model is fitted on the label-encoded features
            # (X_train_E) while SAFE is fitted on the raw X_train - confirm that
            # SafeTransformer copes with this for datasets with text columns.
            start_time = time.time()
            model.fit(X_train_E, y_train)
            safe_transformer = SafeTransformer(model=model)
            safe_transformer = safe_transformer.fit(X_train, y_train)
            fit_time = time.time() - start_time
            times.append(fit_time)

            X_train_transformed = safe_transformer.transform(X_train)
            X_test_transformed = safe_transformer.transform(X_test)

            # Normalize: the scaler is fitted on the training fold only and then
            # reused on the test fold; column names and row index are preserved.
            X_train_transformed = pd.DataFrame(
                scaler.fit_transform(X_train_transformed),
                columns=X_train_transformed.columns,
                index=X_train_transformed.index,
            )
            X_test_transformed = pd.DataFrame(
                scaler.transform(X_test_transformed),
                columns=X_test_transformed.columns,
                index=X_test_transformed.index,
            )

            fold_data[fold_key] = {
                'Training_Independent': X_train_transformed,
                'Training_Dependent': y_train,
                'Testing_Independent': X_test_transformed,
                'Testing_Dependent': y_test,
                'Timing': fit_time
            }

        # Serialize fold data to a .pkl file
        with open(f'{output_dir}/{dataset_name}_fold_data.pkl', 'wb') as file:
            pickle.dump(fold_data, file)

    except Exception as e:
        # Deliberate best-effort: report the failing dataset and move on to the next
        print(f"Error processing {dataset_name}: {e}")
