In [1]:
import pandas as pd
import numpy as np
import os 
import sys

# Make the project-local `src` package importable: per the layout this notebook
# assumes, the parent of the working directory contains both 'src' and 'notebooks'.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Guard the append so that re-running this cell does not keep stacking duplicate
# copies of the same directory onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.preprocessing import AutoPreProcessor
from src.feature_eng import AutoFeatureEngine

In [2]:
# Small hand-made frame covering each case the preprocessor has to handle:
# numeric and categorical columns, with and without missing values.
test_df = pd.DataFrame(
    data=dict(
        Age=[25, np.nan, 30],                 # numeric, one missing value
        Salary=[50000, 60000, 100000],        # numeric, needs scaling
        City=['New York', 'Paris', np.nan],   # categorical, one missing value
        Color=['Red', 'Blue', 'Red'],         # categorical, no missing values
    )
)

print(test_df)

    Age  Salary      City Color
0  25.0   50000  New York   Red
1   NaN   60000     Paris  Blue
2  30.0  100000       NaN   Red


In [3]:
# Smoke test: fit the preprocessing pipeline on the toy frame and inspect the result.
preprocessor = AutoPreProcessor()

try:
    X_clean = preprocessor.fit_transform(test_df)

    print("\nShape of Output:", X_clean.shape)

    feature_names = preprocessor.get_feature_names_out()
    print("\nNew Feature Names:", feature_names)

    print("\nTransformed Data (First row):")
    print(X_clean[0])

    # Basic invariants so that "SUCCESS" means more than "no exception was raised":
    # every input row is still present, and each output column has a name.
    assert X_clean.shape[0] == len(test_df), "preprocessing changed the number of rows"
    assert len(feature_names) == X_clean.shape[1], "feature names do not match output columns"

    print("\nSUCCESS: The pipeline processed the data without crashing.")

except Exception as e:
    print(f"\nFAILURE: {e}")
    # Re-raise after reporting: later cells use X_clean, so swallowing the error here
    # would only surface as an unrelated NameError further down and would let
    # "Run All" continue past a broken pipeline.
    raise


Shape of Output: (3, 7)

New Feature Names: ['Age' 'Salary' 'City_New York' 'City_Paris' 'City_missing' 'Color_Blue'
 'Color_Red']

Transformed Data (First row):
[-1.22474487 -0.9258201   1.          0.          0.          0.
  1.        ]

SUCCESS: The pipeline processed the data without crashing.


In [11]:
# Section banner for the feature-engineering check.
print("-" * 30, "TESTING FEATURE ENGINEERING", sep="\n")

# Only the polynomial (degree 2) expansion is switched on; log and PCA steps are off.
fe_engine = AutoFeatureEngine(
    use_poly=True,
    degree=2,
    use_log=False,
    use_pca=False,
)

# Expand the already-preprocessed matrix produced by the preprocessing cell.
X_engineered = fe_engine.fit_transform(X_clean)

# Column count before vs. after the expansion.
print(f"Original Clean Shape: {X_clean.shape}")
print(f"Engineered Shape:     {X_engineered.shape}")

# Recover readable column labels: the preprocessor's output names are passed to the
# feature engine to obtain names for the engineered columns.
input_names = preprocessor.get_feature_names_out()
print(f"input_names: {input_names}")
output_names = fe_engine.get_feature_names_out(input_names)
print(f"output_names: {output_names}")

# Wrap the engineered matrix in a labelled frame so it renders as a table.
df_engineered = pd.DataFrame(data=X_engineered, columns=output_names)

print("\nFirst 5 rows of engineered features:")
display(df_engineered.head(5))

print("\nGenerated Feature Names:", output_names, sep="\n")


------------------------------
TESTING FEATURE ENGINEERING
Original Clean Shape: (3, 7)
Engineered Shape:     (3, 35)
input_names: ['Age' 'Salary' 'City_New York' 'City_Paris' 'City_missing' 'Color_Blue'
 'Color_Red']
output_names: ['Age' 'Salary' 'City_New York' 'City_Paris' 'City_missing' 'Color_Blue'
 'Color_Red' 'Age^2' 'Age Salary' 'Age City_New York' 'Age City_Paris'
 'Age City_missing' 'Age Color_Blue' 'Age Color_Red' 'Salary^2'
 'Salary City_New York' 'Salary City_Paris' 'Salary City_missing'
 'Salary Color_Blue' 'Salary Color_Red' 'City_New York^2'
 'City_New York City_Paris' 'City_New York City_missing'
 'City_New York Color_Blue' 'City_New York Color_Red' 'City_Paris^2'
 'City_Paris City_missing' 'City_Paris Color_Blue' 'City_Paris Color_Red'
 'City_missing^2' 'City_missing Color_Blue' 'City_missing Color_Red'
 'Color_Blue^2' 'Color_Blue Color_Red' 'Color_Red^2']

First 5 rows of engineered features:


Unnamed: 0,Age,Salary,City_New York,City_Paris,City_missing,Color_Blue,Color_Red,Age^2,Age Salary,Age City_New York,...,City_Paris^2,City_Paris City_missing,City_Paris Color_Blue,City_Paris Color_Red,City_missing^2,City_missing Color_Blue,City_missing Color_Red,Color_Blue^2,Color_Blue Color_Red,Color_Red^2
0,-1.224745,-0.92582,1.0,0.0,0.0,0.0,1.0,1.5,1.133893,-1.224745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,-0.46291,0.0,1.0,0.0,1.0,0.0,0.0,-0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.224745,1.38873,0.0,0.0,1.0,0.0,1.0,1.5,1.70084,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0



Generated Feature Names:
['Age' 'Salary' 'City_New York' 'City_Paris' 'City_missing' 'Color_Blue'
 'Color_Red' 'Age^2' 'Age Salary' 'Age City_New York' 'Age City_Paris'
 'Age City_missing' 'Age Color_Blue' 'Age Color_Red' 'Salary^2'
 'Salary City_New York' 'Salary City_Paris' 'Salary City_missing'
 'Salary Color_Blue' 'Salary Color_Red' 'City_New York^2'
 'City_New York City_Paris' 'City_New York City_missing'
 'City_New York Color_Blue' 'City_New York Color_Red' 'City_Paris^2'
 'City_Paris City_missing' 'City_Paris Color_Blue' 'City_Paris Color_Red'
 'City_missing^2' 'City_missing Color_Blue' 'City_missing Color_Red'
 'Color_Blue^2' 'Color_Blue Color_Red' 'Color_Red^2']
