Checking whether any BASIC features were dropped by AutoFeat

In [10]:
import os
import pickle

def check_columns(basic_dir="Data/BASIC", autofeat_dir="Data/AutoFeat_Data", n_folds=5):
    """
    Report BASIC training columns that are missing from the AutoFeat data.

    Every regular file whose name appears in both ``basic_dir`` and
    ``autofeat_dir`` is unpickled from each directory. Both objects are
    expected to be dictionaries with keys ``fold1`` .. ``fold<n_folds>``,
    and each fold is expected to hold a ``'Training_Independent'`` entry
    exposing ``.columns`` (a pandas DataFrame). For each fold, the function
    prints whether every BASIC column is also present in the AutoFeat
    version, or which columns are missing.

    Args:
        basic_dir: Directory holding the BASIC pickle files.
        autofeat_dir: Directory holding the AutoFeat pickle files.
        n_folds: Number of folds to inspect (``fold1`` .. ``fold<n_folds>``).

    Returns:
        None. The report is printed to stdout.

    Raises:
        KeyError: If a fold present in both files is a dict without a
            ``'Training_Independent'`` key.
    """
    # Names present in both folders. Sorted so the report order is stable:
    # iterating a set of strings can yield a different order on every run.
    common_files = sorted(
        set(os.listdir(basic_dir)).intersection(os.listdir(autofeat_dir))
    )
    fold_names = [f"fold{i}" for i in range(1, n_folds + 1)]

    for file_name in common_files:
        basic_path = os.path.join(basic_dir, file_name)
        autofeat_path = os.path.join(autofeat_dir, file_name)

        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
        # open() would fail on those, so only regular files are compared.
        if not (os.path.isfile(basic_path) and os.path.isfile(autofeat_path)):
            continue

        print(f"\nChecking file: {file_name}")

        # NOTE: pickle.load can execute arbitrary code; only point this
        # function at directories containing trusted files.
        with open(basic_path, "rb") as f:
            basic_data = pickle.load(f)

        with open(autofeat_path, "rb") as f:
            autofeat_data = pickle.load(f)

        for fold in fold_names:
            if fold not in basic_data or fold not in autofeat_data:
                print(f"  {fold}: Not found in one of the files")
                continue

            basic_cols = set(basic_data[fold]["Training_Independent"].columns)
            autofeat_cols = set(autofeat_data[fold]["Training_Independent"].columns)

            # Columns that exist in BASIC but were dropped from AutoFeat
            missing = basic_cols - autofeat_cols

            if not missing:
                print(f"  {fold}: All columns are present ✅")
            else:
                print(f"  {fold}: Missing columns ❌ -> {missing}")

# Run the comparison with the default directories
# (Data/BASIC vs Data/AutoFeat_Data); the per-fold report is printed below.
check_columns()



Checking file: Concrete_Compressive_Strength_fold_data.pkl
  fold1: All columns are present ✅
  fold2: All columns are present ✅
  fold3: All columns are present ✅
  fold4: All columns are present ✅
  fold5: All columns are present ✅

Checking file: Forest Fires_fold_data.pkl
  fold1: All columns are present ✅
  fold2: All columns are present ✅
  fold3: All columns are present ✅
  fold4: All columns are present ✅
  fold5: All columns are present ✅

Checking file: California_Housing_fold_data.pkl
  fold1: All columns are present ✅
  fold2: All columns are present ✅
  fold3: All columns are present ✅
  fold4: All columns are present ✅
  fold5: All columns are present ✅

Checking file: fri_c1_1000_25_fold_data.pkl
  fold1: All columns are present ✅
  fold2: All columns are present ✅
  fold3: All columns are present ✅
  fold4: All columns are present ✅
  fold5: All columns are present ✅

Checking file: fri_c1_500_50_fold_data.pkl
  fold1: All columns are present ✅
  fold2: All columns are