Import the libraries

In [1]:
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import MinMaxScaler
import warnings
import traceback

Import the data

In [3]:
# Locations of the two pickled dictionaries (features and targets)
independent_path = 'Data/Independent_Data_dictionary.pkl'
dependent_path = 'Data/Dependent_Data_dictionary.pkl'

# Deserialize the features first, then the targets.
# NOTE: pickle.load can execute arbitrary code, so only point these paths at trusted files.
with open(independent_path, mode='rb') as independent_file:
    Independent_Data = pickle.load(independent_file)

with open(dependent_path, mode='rb') as dependent_file:
    Dependent_Data = pickle.load(dependent_file)

Encoding the dependent data

In [4]:
# Label-encoded copy of every dataframe in Dependent_Data, keyed identically
Dependent_Data_Encoded = {}

for name, df in Dependent_Data.items():
    # Work on a copy so the loaded dataframe stays untouched
    encoded_df = df.copy()

    # One encoder per dataframe; fit_transform refits it for every column
    le = LabelEncoder()

    # Only 'object' and 'category' columns are encoded; the rest pass through as-is
    categorical_columns = [
        col for col in df.columns
        if df[col].dtype == 'object' or df[col].dtype.name == 'category'
    ]
    for column in categorical_columns:
        encoded_df[column] = le.fit_transform(df[column])

    Dependent_Data_Encoded[name] = encoded_df

# Dependent_Data_Encoded now holds one label-encoded dataframe per dataset


Encode the independent data

In [5]:
# Label-encoded copy of every dataframe in Independent_Data, keyed identically
Independent_Data_Encoded = {}

for key, df in Independent_Data.items():
    # Encode on a copy so the loaded dataframe stays untouched
    df_encoded = df.copy()
    for column in df_encoded.columns:
        column_dtype = df_encoded[column].dtype
        # Skip anything that is neither 'object' nor 'category'
        if column_dtype != 'object' and column_dtype.name != 'category':
            continue
        # A fresh encoder per column: labels are replaced by integer codes
        le = LabelEncoder()
        df_encoded[column] = le.fit_transform(df_encoded[column])
    Independent_Data_Encoded[key] = df_encoded

# Independent_Data_Encoded now holds one encoded dataframe per dataset


Apply Cross Validation

In [6]:
# Min-max scaler used to normalize the features
scaler = MinMaxScaler()

# 5-fold cross-validation splitter; shuffling with a fixed seed keeps the folds reproducible
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Keeping only the regression datasets

In [7]:
# Names of the datasets to keep; every other key is dropped below
regression_datasets = [
    'fri_c3_1000_50',
    'fri_c2_1000_25',
    'fri_c4_500_50',
    'fri_c4_1000_50',
    'fri_c1_1000_25',
    'fri_c1_500_50',
    'fri_c3_1000_25',
    'auto93',
    'pyrim',
    'autoPrice',
    'boston',
    'Concrete_Compressive_Strength',
    'Auto_MPG',
    'Forest Fires',
    'Servo',
    'Airfoil_Self_Noise',
    'Wine_Quality',
    'BodyFat',
    'California_Housing',
    'Quake',
]

# Restrict the encoded features to the listed datasets
Independent_Data_Encoded = {
    dataset: frame
    for dataset, frame in Independent_Data_Encoded.items()
    if dataset in regression_datasets
}

# Restrict the encoded targets to the same datasets
Dependent_Data_Encoded = {
    dataset: frame
    for dataset, frame in Dependent_Data_Encoded.items()
    if dataset in regression_datasets
}


Splitting the Data into Folds and Normalizing

In [8]:
# Suppress warnings (applies to the whole kernel session from here on)
warnings.filterwarnings("ignore")

# all_data[dataset_name][fold_key] -> dict with the four train/test pieces of that fold
all_data = {}

for dataset_name, X in Independent_Data_Encoded.items():
    all_data[dataset_name] = {}
    try:
        print(f'Processing dataset: {dataset_name}')
        y = Dependent_Data_Encoded[dataset_name]

        # The folds index X and y by position, so a length mismatch would
        # silently pair features with the wrong targets.
        if len(X) != len(y):
            raise ValueError(
                f"Row count mismatch for {dataset_name}: X has {len(X)} rows, y has {len(y)}"
            )

        # Folds are numbered from 1: fold1, fold2, ...
        for fold_count, (train_index, test_index) in enumerate(kf.split(X), start=1):
            fold_key = f'fold{fold_count}'
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Fit the scaler on the training rows only, then reuse those
            # statistics for the test rows so no test information leaks in.
            X_train_transformed = pd.DataFrame(
                scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
            )
            X_test_transformed = pd.DataFrame(
                scaler.transform(X_test), columns=X_test.columns, index=X_test.index
            )

            all_data[dataset_name][fold_key] = {
                'Training_Independent': X_train_transformed,
                'Training_Dependent': y_train,
                'Testing_Independent': X_test_transformed,
                'Testing_Dependent': y_test
            }

    except Exception as e:
        # Best effort: report the failure and move on to the next dataset.
        # The entry for this dataset stays empty or partially filled.
        print(f"Error processing {dataset_name}: {e}")
        traceback.print_exc()  # Full traceback for diagnosis



Processing dataset: fri_c3_1000_50
Processing dataset: fri_c2_1000_25
Processing dataset: fri_c4_500_50
Processing dataset: fri_c4_1000_50
Processing dataset: fri_c1_1000_25
Processing dataset: fri_c1_500_50
Processing dataset: fri_c3_1000_25
Processing dataset: auto93
Processing dataset: pyrim
Processing dataset: autoPrice
Processing dataset: boston
Processing dataset: Concrete_Compressive_Strength
Processing dataset: Auto_MPG
Processing dataset: Forest Fires
Processing dataset: Servo
Processing dataset: Airfoil_Self_Noise
Processing dataset: Wine_Quality
Processing dataset: BodyFat
Processing dataset: California_Housing
Processing dataset: Quake


Saving the data dictionary

In [10]:
import pickle

# Persist the nested all_data dictionary to Data/Data.pkl
with open('Data/Data.pkl', mode='wb') as data_file:
    pickle.dump(all_data, data_file)


# Splitting the Training Data into Training and Validation Sets

In [None]:
import pickle
from sklearn.model_selection import train_test_split

# Read back the per-dataset, per-fold splits written earlier
with open('Data/Data.pkl', 'rb') as f:
    Data = pickle.load(f)

# Each fold dictionary is modified in place: the four original keys are
# replaced by '*_Full' keys, then an 80/20 train/validation split of the
# training portion is added under '*_Broken' keys.
for dataset_name, folds in Data.items():
    for fold_name, fold_data in folds.items():
        # Remove the original entries from the fold dictionary
        X_train, y_train, X_test, y_test = (
            fold_data.pop(original_key)
            for original_key in (
                'Training_Independent',
                'Training_Dependent',
                'Testing_Independent',
                'Testing_Dependent',
            )
        )

        # Store the unsplit training and testing sets again under '*_Full' names
        fold_data.update({
            'Training_Independent_Full': X_train,
            'Training_Dependent_Full': y_train,
            'Testing_Independent_Full': X_test,
            'Testing_Dependent_Full': y_test,
        })

        # Hold out 20% of the training rows for validation (fixed seed)
        # NOTE(review): the features appear to have been scaled on the whole
        # training fold before this split, so validation rows influenced the
        # scaling statistics - confirm this is acceptable.
        X_train_broken, X_val_broken, y_train_broken, y_val_broken = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42
        )

        # Store the reduced training set and the validation set
        fold_data.update({
            'Training_Independent_Broken': X_train_broken,
            'Training_Dependent_Broken': y_train_broken,
            'Validation_Independent_Broken': X_val_broken,
            'Validation_Dependent_Broken': y_val_broken,
        })




In [4]:
# Optional: write the modified Data dictionary (with validation splits) to disk
with open('Data/Validation_Data.pkl', mode='wb') as validation_file:
    pickle.dump(Data, validation_file)