In [1]:
# Import essential libraries for pipeline preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Sklearn preprocessing and pipeline tools
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Set up plotting and display options
plt.style.use('default')
sns.set_palette("viridis")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

#### Load and Examine Raw Data

In [2]:
# Load the original training data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print("Dataset Overview:")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Check for missing values in training data
print(f"\nMissing values in training data: {train_df.isnull().sum().sum()}")
print(f"Missing values in test data: {test_df.isnull().sum().sum()}")

# Display basic info about the target variable
print(f"\nTarget variable (SalePrice) statistics:")
print(f"Mean: ${train_df['SalePrice'].mean():,.0f}")
print(f"Median: ${train_df['SalePrice'].median():,.0f}")
print(f"Range: ${train_df['SalePrice'].min():,.0f} - ${train_df['SalePrice'].max():,.0f}")

# Show first few rows
print(f"\nFirst 3 rows of training data:")
train_df.head(3)

Dataset Overview:
Training data shape: (1460, 81)
Test data shape: (1459, 80)

Missing values in training data: 7829
Missing values in test data: 7878

Target variable (SalePrice) statistics:
Mean: $180,921
Median: $163,000
Range: $34,900 - $755,000

First 3 rows of training data:


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500


significant missing values to handle (7,829 in training, 7,878 in test)

#### Analyze Data Types and Missing Patterns

In [3]:
# Separate features by data type
numerical_features = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

# Remove target and ID from feature lists
if 'SalePrice' in numerical_features:
    numerical_features.remove('SalePrice')
if 'Id' in numerical_features:
    numerical_features.remove('Id')

print("Feature Analysis:")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Analyze missing values by feature type
print(f"\nMissing Values Analysis:")
missing_numerical = train_df[numerical_features].isnull().sum()
missing_categorical = train_df[categorical_features].isnull().sum()

print(f"\nTop 10 numerical features with missing values:")
print(missing_numerical[missing_numerical > 0].sort_values(ascending=False).head(10))

print(f"\nTop 10 categorical features with missing values:")
print(missing_categorical[missing_categorical > 0].sort_values(ascending=False).head(10))

Feature Analysis:
Numerical features: 36
Categorical features: 43

Missing Values Analysis:

Top 10 numerical features with missing values:
LotFrontage    259
GarageYrBlt     81
MasVnrArea       8
dtype: int64

Top 10 categorical features with missing values:
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
GarageType        81
GarageFinish      81
GarageQual        81
GarageCond        81
dtype: int64


Most categorical missing values are likely "None" cases (no pool, no fence, etc.).

#### Define Preprocessing Strategy

In [4]:
# Define feature groups based on missing value patterns and domain knowledge

# Numerical features - will use median imputation
print("Numerical Features for Pipeline:")
print(f"Total: {len(numerical_features)}")
print("Sample:", numerical_features[:10])

# Categorical features that should be "None" when missing (no feature present)
none_categorical_features = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'
]

# Categorical features that should use mode imputation (missing = most common value)
mode_categorical_features = [feat for feat in categorical_features 
                           if feat not in none_categorical_features]

print(f"\nCategorical Features Strategy:")
print(f"'None' imputation: {len(none_categorical_features)} features")
print("Sample:", none_categorical_features[:5])
print(f"\nMode imputation: {len(mode_categorical_features)} features") 
print("Sample:", mode_categorical_features[:5])

# Define ordinal features with their mappings (from previous analysis)
ordinal_features = {
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
}

print(f"\nOrdinal features: {len(ordinal_features)} features")
print("Features:", list(ordinal_features.keys()))

Numerical Features for Pipeline:
Total: 36
Sample: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2']

Categorical Features Strategy:
'None' imputation: 14 features
Sample: ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

Mode imputation: 29 features
Sample: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities']

Ordinal features: 9 features
Features: ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']


#### Create ColumnTransformer Pipeline

In [5]:
# Create preprocessing pipelines for different feature types

# 1. Numerical pipeline: impute with median, then scale
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 2. Categorical pipeline for "None" features: impute with "None", then one-hot encode
none_categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# 3. Categorical pipeline for mode features: impute with mode, then one-hot encode
mode_categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# 4. Ordinal pipeline: impute with "None", then ordinal encode
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('ordinal', OrdinalEncoder(categories=list(ordinal_features.values()), 
                              handle_unknown='use_encoded_value', 
                              unknown_value=-1))
])

print("Individual pipelines created:")
print("1. Numerical pipeline: median imputation + scaling")
print("2. None categorical pipeline: 'None' imputation + one-hot encoding")
print("3. Mode categorical pipeline: mode imputation + one-hot encoding")
print("4. Ordinal pipeline: 'None' imputation + ordinal encoding")

Individual pipelines created:
1. Numerical pipeline: median imputation + scaling
2. None categorical pipeline: 'None' imputation + one-hot encoding
3. Mode categorical pipeline: mode imputation + one-hot encoding
4. Ordinal pipeline: 'None' imputation + ordinal encoding


#### Build Complete ColumnTransformer

In [6]:
# Separate ordinal features from other categorical features
ordinal_feature_names = list(ordinal_features.keys())
none_categorical_final = [feat for feat in none_categorical_features 
                         if feat not in ordinal_feature_names]
mode_categorical_final = [feat for feat in mode_categorical_features 
                         if feat not in ordinal_feature_names]

# Create the complete ColumnTransformer
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('none_categorical', none_categorical_pipeline, none_categorical_final),
    ('mode_categorical', mode_categorical_pipeline, mode_categorical_final),
    ('ordinal', ordinal_pipeline, ordinal_feature_names)
], remainder='drop')  # Drop any features not specified

print("ColumnTransformer created with:")
print(f"- Numerical features: {len(numerical_features)}")
print(f"- None categorical features: {len(none_categorical_final)}")
print(f"- Mode categorical features: {len(mode_categorical_final)}")
print(f"- Ordinal features: {len(ordinal_feature_names)}")
print(f"- Total features to process: {len(numerical_features) + len(none_categorical_final) + len(mode_categorical_final) + len(ordinal_feature_names)}")

# Verify we're not missing any features
total_features = len(numerical_features) + len(categorical_features)
processed_features = len(numerical_features) + len(none_categorical_final) + len(mode_categorical_final) + len(ordinal_feature_names)
print(f"\nFeature accounting:")
print(f"Original features (excluding Id, SalePrice): {total_features}")
print(f"Features in preprocessor: {processed_features}")
print(f"Match: {total_features == processed_features}")

ColumnTransformer created with:
- Numerical features: 36
- None categorical features: 9
- Mode categorical features: 25
- Ordinal features: 9
- Total features to process: 79

Feature accounting:
Original features (excluding Id, SalePrice): 79
Features in preprocessor: 79
Match: True


#### Test Preprocessing Pipeline

In [7]:
# Prepare data for preprocessing (exclude Id and SalePrice)
X = train_df.drop(['Id', 'SalePrice'], axis=1)
y = train_df['SalePrice']

print("Data preparation:")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Split data for testing the pipeline
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTrain-validation split:")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Fit the preprocessor on training data
print(f"\nFitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train)
print(f"Processed training data shape: {X_train_processed.shape}")

# Transform validation data using fitted preprocessor
X_val_processed = preprocessor.transform(X_val)
print(f"Processed validation data shape: {X_val_processed.shape}")

# Check for any remaining missing values
print(f"\nMissing values after preprocessing:")
print(f"Training data: {np.isnan(X_train_processed).sum()}")
print(f"Validation data: {np.isnan(X_val_processed).sum()}")

print(f"\nPreprocessing pipeline test: SUCCESS")

Data preparation:
Features (X) shape: (1460, 79)
Target (y) shape: (1460,)

Train-validation split:
X_train shape: (1168, 79)
X_val shape: (292, 79)

Fitting preprocessor on training data...
Processed training data shape: (1168, 228)
Processed validation data shape: (292, 228)

Missing values after preprocessing:
Training data: 0
Validation data: 0

Preprocessing pipeline test: SUCCESS




#### Analyze Preprocessing Results and Handle Warning

In [8]:
# Get feature names after preprocessing
feature_names = preprocessor.get_feature_names_out()
print(f"Feature expansion analysis:")
print(f"Original features: 79")
print(f"After preprocessing: {len(feature_names)}")
print(f"Expansion factor: {len(feature_names)/79:.1f}x")

# Analyze feature types in final output
print(f"\nFeature breakdown after preprocessing:")
numerical_count = len([name for name in feature_names if name.startswith('numerical')])
none_cat_count = len([name for name in feature_names if name.startswith('none_categorical')])
mode_cat_count = len([name for name in feature_names if name.startswith('mode_categorical')])
ordinal_count = len([name for name in feature_names if name.startswith('ordinal')])

print(f"- Numerical features: {numerical_count}")
print(f"- None categorical features: {none_cat_count}")
print(f"- Mode categorical features: {mode_cat_count}")
print(f"- Ordinal features: {ordinal_count}")

# Show sample of feature names
print(f"\nSample feature names:")
print("First 10:", feature_names[:10])
print("Last 10:", feature_names[-10:])

# The warning about unknown categories is expected and handled correctly
print(f"\nNote: Warning about unknown categories is normal and properly handled")
print("Unknown categories are encoded as zeros (ignored), preventing errors")

Feature expansion analysis:
Original features: 79
After preprocessing: 228
Expansion factor: 2.9x

Feature breakdown after preprocessing:
- Numerical features: 36
- None categorical features: 38
- Mode categorical features: 145
- Ordinal features: 9

Sample feature names:
First 10: ['numerical__MSSubClass' 'numerical__LotFrontage' 'numerical__LotArea'
 'numerical__OverallQual' 'numerical__OverallCond' 'numerical__YearBuilt'
 'numerical__YearRemodAdd' 'numerical__MasVnrArea' 'numerical__BsmtFinSF1'
 'numerical__BsmtFinSF2']
Last 10: ['mode_categorical__SaleCondition_Partial' 'ordinal__ExterQual'
 'ordinal__ExterCond' 'ordinal__BsmtQual' 'ordinal__BsmtCond'
 'ordinal__HeatingQC' 'ordinal__KitchenQual' 'ordinal__FireplaceQu'
 'ordinal__GarageQual' 'ordinal__GarageCond']

Unknown categories are encoded as zeros (ignored), preventing errors


#### Import Feature Engineering Module

In [9]:
# Import FeatureEngineer from our custom module (professional approach)
import sys
sys.path.append('../src')
from feature_engineering import FeatureEngineer

print("FeatureEngineer imported from module")

# Create complete pipeline with feature engineering
complete_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor)
])

print("Complete pipeline created:")
print("1. Feature Engineering: TotalSF, TotalBath, HouseAge, YearsSinceRemod, WasRemodeled")
print("2. Preprocessing: Imputation, scaling, encoding")

# Test the complete pipeline
print(f"\nTesting complete pipeline...")
X_train_final = complete_pipeline.fit_transform(X_train)
X_val_final = complete_pipeline.transform(X_val)

print(f"Final processed shapes:")
print(f"Training: {X_train_final.shape}")
print(f"Validation: {X_val_final.shape}")

FeatureEngineer imported from module
Complete pipeline created:
1. Feature Engineering: TotalSF, TotalBath, HouseAge, YearsSinceRemod, WasRemodeled
2. Preprocessing: Imputation, scaling, encoding

Testing complete pipeline...
Final processed shapes:
Training: (1168, 228)
Validation: (292, 228)




#### Save Pipeline and Test on Real Test Data

In [10]:
import joblib
from pathlib import Path

# Create models directory if it doesn't exist
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Save the complete pipeline
pipeline_path = models_dir / 'preprocessing_pipeline.pkl'
joblib.dump(complete_pipeline, pipeline_path)
print(f"Pipeline saved to: {pipeline_path}")

Pipeline saved to: ..\models\preprocessing_pipeline.pkl


#### Production Pipeline Test

In [11]:
# Simulate production scenario: load pipeline from disk
print("PRODUCTION PIPELINE TEST")
print("=" * 30)

# Clear the pipeline from memory (simulate fresh start)
del complete_pipeline

# Test pipeline on actual test dataset
print(f"\nTesting pipeline on real test data...")
test_features = test_df.drop(['Id'], axis=1)  # Remove Id, no SalePrice in test
print(f"Test data shape before preprocessing: {test_features.shape}")

# Load pipeline from saved file
loaded_pipeline = joblib.load('../models/preprocessing_pipeline.pkl')
print("Pipeline loaded from disk successfully")

# Apply loaded pipeline to test data
test_processed_production = loaded_pipeline.transform(test_features)
print(f"Test data processed using loaded pipeline: {test_processed_production.shape}")

# Verify no missing values
missing_values = np.isnan(test_processed_production).sum()
print(f"Missing values: {missing_values}")

print("\nProduction pipeline test: SUCCESS")
print("Pipeline can be deployed and used independently")

PRODUCTION PIPELINE TEST

Testing pipeline on real test data...
Test data shape before preprocessing: (1459, 79)
Pipeline loaded from disk successfully
Test data processed using loaded pipeline: (1459, 228)
Missing values: 0

Production pipeline test: SUCCESS
Pipeline can be deployed and used independently
