In [10]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California Housing dataset; with as_frame=True the returned
# object carries a pandas DataFrame under the "frame" entry.
data = fetch_california_housing(as_frame=True)
df = data["frame"]


In [22]:
# Import necessary libraries

from tidyflow import (
    clean_missing, encode_categoricals, scale_features, feature_engineer,
    handle_outliers, auto_dtype, suggest_pipeline, build_pipeline
)

In [23]:
# Load the California Housing dataset as a single DataFrame
# (the preview below shows the feature columns together with MedHouseVal).
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# Print a plain-text heading, then render the first five rows as a table
print("Original DataFrame:")
display(df.iloc[:5])

Original DataFrame:


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [24]:
# Report the per-column count of missing entries before any preprocessing.
# A single print with a newline separator emits the heading, a blank line,
# and then the counts.
print("\nMissing Values:\n", df.isna().sum(), sep="\n")


Missing Values:

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [25]:
# Apply TidyFlow preprocessing.
# Every run starts from a fresh copy of the raw frame held by `data`, so the
# cell is idempotent: the previous `df = step(df)` chain fed its own output
# back in, which meant re-running the cell scaled and polynomial-expanded
# data that had already been processed.
# NOTE(review): the target column MedHouseVal goes through scaling and the
# polynomial expansion together with the features (later output lists columns
# such as 'MedInc MedHouseVal') - confirm this is intended before using the
# engineered columns to predict MedHouseVal.
df = (
    data.frame.copy()
    .pipe(clean_missing)  # Handle missing values
    .pipe(scale_features, method='standard')  # Scale numerical features
    .pipe(feature_engineer, method='poly', degree=2)  # Generate polynomial features
    .pipe(handle_outliers, method='iqr', strategy='drop')  # Handle outliers
    .pipe(auto_dtype)  # Convert data types automatically
)

2025-03-03 14:48:14,940 - INFO - Handling missing values using strategy: mean
2025-03-03 14:48:14,947 - INFO - Scaling features using method: standard
2025-03-03 14:48:14,954 - INFO - Applying feature engineering using method: poly, degree: 2
2025-03-03 14:48:14,972 - INFO - Handling outliers using method: iqr, strategy: drop
2025-03-03 14:48:15,032 - INFO - Automatically detecting and converting data types


In [26]:
# Generate preprocessing suggestions.
# `df` here is the frame reassigned by the TidyFlow preprocessing cell, so the
# suggestions refer to the processed columns (including the generated
# polynomial terms) rather than to the raw dataset.
# The result is iterated and printed item by item in a later cell.
suggestions = suggest_pipeline(df)

2025-03-03 14:48:16,771 - INFO - Generating preprocessing suggestions


In [27]:
# Build a preprocessing pipeline.
# NOTE(review): the pipeline is built from the already-preprocessed `df`,
# not from the raw data - confirm this is the intended input.
# The returned object is later used via its fit_transform method.
pipeline = build_pipeline(df)

2025-03-03 14:48:30,324 - INFO - Building preprocessing pipeline


In [28]:
# Preview the frame produced by the TidyFlow preprocessing steps:
# a plain-text heading followed by the first five rows as a table.
print("\nProcessed DataFrame:")
display(df.iloc[:5])


Processed DataFrame:


Unnamed: 0,1,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,...,AveOccup^2,AveOccup Latitude,AveOccup Longitude,AveOccup MedHouseVal,Latitude^2,Latitude Longitude,Latitude MedHouseVal,Longitude^2,Longitude MedHouseVal,MedHouseVal^2
15,1.0,-0.918883,1.697265,-0.479596,-0.052132,-0.643284,-0.041451,1.038503,-1.342809,-0.579376,...,0.001718,-0.043047,0.055661,0.024016,1.078488,-1.394511,-0.601683,1.803136,0.777991,0.335676
19,1.0,-0.667116,1.856182,0.014734,-0.027514,-0.649465,-0.05407,1.033821,-1.3478,-0.380923,...,0.002924,-0.055899,0.072876,0.020597,1.068785,-1.393384,-0.393806,1.816566,0.513409,0.145103
21,1.0,-1.135487,1.061601,-0.384323,-0.198234,-0.438415,-0.051263,1.038503,-1.3478,-0.407788,...,0.002628,-0.053236,0.069092,0.020904,1.078488,-1.399694,-0.423489,1.816566,0.549617,0.166291
25,1.0,-0.772129,0.982143,-0.377186,-0.13307,-0.978844,-0.039168,1.038503,-1.352792,-0.861023,...,0.001534,-0.040676,0.052986,0.033724,1.078488,-1.404878,-0.894174,1.830046,1.164784,0.74136
26,1.0,-0.742704,1.617807,-0.28332,-0.159854,-0.722758,-0.051118,1.038503,-1.352792,-0.979747,...,0.002613,-0.053087,0.069153,0.050083,1.078488,-1.404878,-1.01747,1.830046,1.325394,0.959905


In [29]:
# List every suggestion returned by suggest_pipeline as a dash-prefixed bullet.
print("\nPreprocessing Suggestions:")
for suggestion in suggestions:
    # !s forces str() conversion, matching what print() does with a bare argument
    print(f"- {suggestion!s}")


Preprocessing Suggestions:
- Columns ['MedInc^2', 'MedInc AveRooms', 'MedInc MedHouseVal', 'HouseAge^2', 'AveRooms^2', 'AveBedrms^2', 'Population^2', 'AveOccup^2', 'MedHouseVal^2'] are highly skewed—consider log transformation.


In [30]:
# Apply preprocessing pipeline.
# NOTE(review): `df` has already been through the TidyFlow preprocessing
# steps, so the pipeline transforms processed data a second time - confirm
# this is intended.
# The result is wrapped in pd.DataFrame before display in the next cell,
# where its columns appear with positional labels (0-54).
df_transformed = pipeline.fit_transform(df)

In [31]:
# Wrap the pipeline output in a DataFrame so it renders as a table,
# then show its first five rows under a plain-text heading.
print("\nTransformed DataFrame (using pipeline):")
display(pd.DataFrame(df_transformed).iloc[:5])


Transformed DataFrame (using pipeline):


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,0.0,-1.377769,2.060392,-1.333542,0.350999,-1.168152,-0.601619,1.252754,-1.50149,-0.577968,...,-0.204606,-0.883758,1.112815,0.874627,0.483308,-1.140882,-0.93524,1.482888,1.399176,-0.236901
1,0.0,-0.95995,2.261169,0.217507,0.560387,-1.182215,-0.881145,1.247575,-1.506821,-0.268343,...,0.265974,-1.178867,1.49499,0.769253,0.46472,-1.138711,-0.578532,1.504472,0.951384,-0.616972
2,0.0,-1.737236,1.257283,-1.034604,-0.891641,-0.702048,-0.818952,1.252754,-1.506821,-0.310257,...,0.150517,-1.117726,1.41098,0.778734,0.483308,-1.150872,-0.629466,1.504472,1.012664,-0.574715
3,0.0,-1.134223,1.156894,-1.012213,-0.3374,-1.931597,-0.551041,1.252754,-1.512152,-1.017392,...,-0.276468,-0.829309,1.053428,1.173835,0.483308,-1.160862,-1.437141,1.526137,2.053803,0.572174
4,0.0,-1.085392,1.960003,-0.717692,-0.565207,-1.348968,-0.815758,1.252754,-1.512152,-1.202626,...,0.144754,-1.114287,1.412329,1.677995,0.483308,-1.160862,-1.648712,1.526137,2.325627,1.008029
