In [None]:
# Import essential libraries for pipeline preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Sklearn preprocessing and pipeline tools
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Plotting defaults: stock matplotlib style combined with seaborn's viridis palette
plt.style.use('default')
sns.set_palette("viridis")

# Pandas display: lift the column-count and width limits so wide frames print in full
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

#### Load and Examine Raw Data

In [None]:
# Read the raw training and test CSV files
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print("Dataset Overview:")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Total number of missing cells in each frame (summed over every column)
train_missing_total = train_df.isna().sum().sum()
test_missing_total = test_df.isna().sum().sum()
print(f"\nMissing values in training data: {train_missing_total}")
print(f"Missing values in test data: {test_missing_total}")

# Summary statistics of the target column
sale_price = train_df['SalePrice']
print(f"\nTarget variable (SalePrice) statistics:")
print(f"Mean: ${sale_price.mean():,.0f}")
print(f"Median: ${sale_price.median():,.0f}")
print(f"Range: ${sale_price.min():,.0f} - ${sale_price.max():,.0f}")

# Preview the first rows; the trailing expression is rendered as the cell output
print(f"\nFirst 3 rows of training data:")
train_df.head(3)

Both datasets contain a significant number of missing values that must be handled (7,829 in training, 7,878 in test).

#### Analyze Data Types and Missing Patterns

In [None]:
# Partition the columns by dtype: numeric columns vs. everything else.
# The target (SalePrice) and the row identifier (Id) are excluded from the numeric list.
excluded_columns = ('SalePrice', 'Id')
numerical_features = [
    column
    for column in train_df.select_dtypes(include=[np.number]).columns
    if column not in excluded_columns
]
categorical_features = train_df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Feature Analysis:")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Per-column missing counts for each feature family
print(f"\nMissing Values Analysis:")
missing_numerical = train_df[numerical_features].isnull().sum()
missing_categorical = train_df[categorical_features].isnull().sum()

# Keep only columns that actually have gaps, largest count first
numerical_with_gaps = missing_numerical[missing_numerical > 0].sort_values(ascending=False)
print(f"\nTop 10 numerical features with missing values:")
print(numerical_with_gaps.head(10))

categorical_with_gaps = missing_categorical[missing_categorical > 0].sort_values(ascending=False)
print(f"\nTop 10 categorical features with missing values:")
print(categorical_with_gaps.head(10))

Most categorical missing values are likely "None" cases (no pool, no fence, etc.).

#### Define Preprocessing Strategy

In [None]:
# Feature groups are chosen from the missing-value patterns plus domain knowledge.

# Numerical features: handled with median imputation in the pipeline
print("Numerical Features for Pipeline:")
print(f"Total: {len(numerical_features)}")
print("Sample:", numerical_features[:10])

# Categorical features where a missing entry is treated as "None" (feature not present)
none_categorical_features = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
]

# Every remaining categorical feature falls back to mode (most frequent value) imputation
none_lookup = set(none_categorical_features)
mode_categorical_features = [
    column for column in categorical_features if column not in none_lookup
]

print(f"\nCategorical Features Strategy:")
print(f"'None' imputation: {len(none_categorical_features)} features")
print("Sample:", none_categorical_features[:5])
print(f"\nMode imputation: {len(mode_categorical_features)} features") 
print("Sample:", mode_categorical_features[:5])

# Ordinal features and their ordered category levels (lowest to highest).
# Two scales are used: the plain quality scale, and the same scale prefixed by 'None'.
quality_scale = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
quality_scale_with_none = ['None'] + quality_scale

# Each entry gets its own list copy; insertion order matters because the keys and
# values are later consumed positionally.
ordinal_features = {
    'ExterQual': list(quality_scale),
    'ExterCond': list(quality_scale),
    'BsmtQual': list(quality_scale_with_none),
    'BsmtCond': list(quality_scale_with_none),
    'HeatingQC': list(quality_scale),
    'KitchenQual': list(quality_scale),
    'FireplaceQu': list(quality_scale_with_none),
    'GarageQual': list(quality_scale_with_none),
    'GarageCond': list(quality_scale_with_none),
}

print(f"\nOrdinal features: {len(ordinal_features)} features")
print("Features:", list(ordinal_features.keys()))

#### Create ColumnTransformer Pipeline

In [None]:
# Build one preprocessing sub-pipeline per feature family.

# Settings shared by both one-hot encoders: drop the first level of each feature,
# return a dense array, and ignore categories that were not seen during fit.
onehot_kwargs = dict(drop='first', sparse_output=False, handle_unknown='ignore')

# 1. Numerical: median imputation followed by standard scaling
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# 2. "None" categoricals: fill gaps with the literal 'None', then one-hot encode
none_categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(**onehot_kwargs)),
])

# 3. Mode categoricals: fill gaps with the most frequent value, then one-hot encode
mode_categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(**onehot_kwargs)),
])

# 4. Ordinal: fill gaps with 'None', then map levels to integers using the declared
#    orderings; any value outside a feature's declared levels is encoded as -1.
ordinal_encoder = OrdinalEncoder(
    categories=list(ordinal_features.values()),
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('ordinal', ordinal_encoder),
])

print("Individual pipelines created:")
print("1. Numerical pipeline: median imputation + scaling")
print("2. None categorical pipeline: 'None' imputation + one-hot encoding")
print("3. Mode categorical pipeline: mode imputation + one-hot encoding")
print("4. Ordinal pipeline: 'None' imputation + ordinal encoding")

#### Build Complete ColumnTransformer

In [None]:
# Ordinal features get their own branch, so remove them from both one-hot groups
ordinal_feature_names = list(ordinal_features.keys())
none_categorical_final = [
    column for column in none_categorical_features
    if column not in ordinal_feature_names
]
mode_categorical_final = [
    column for column in mode_categorical_features
    if column not in ordinal_feature_names
]

# Assemble the four branches; any column not listed in a branch is dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_features),
        ('none_categorical', none_categorical_pipeline, none_categorical_final),
        ('mode_categorical', mode_categorical_pipeline, mode_categorical_final),
        ('ordinal', ordinal_pipeline, ordinal_feature_names),
    ],
    remainder='drop',
)

# Number of input columns routed into the transformer, summed over all branches
routed_groups = (
    numerical_features,
    none_categorical_final,
    mode_categorical_final,
    ordinal_feature_names,
)
processed_features = sum(len(group) for group in routed_groups)

print("ColumnTransformer created with:")
print(f"- Numerical features: {len(numerical_features)}")
print(f"- None categorical features: {len(none_categorical_final)}")
print(f"- Mode categorical features: {len(mode_categorical_final)}")
print(f"- Ordinal features: {len(ordinal_feature_names)}")
print(f"- Total features to process: {processed_features}")

# Cross-check: the routed column count should equal the count of original features
total_features = len(numerical_features) + len(categorical_features)
print(f"\nFeature accounting:")
print(f"Original features (excluding Id, SalePrice): {total_features}")
print(f"Features in preprocessor: {processed_features}")
print(f"Match: {total_features == processed_features}")

#### Test Preprocessing Pipeline

In [None]:
# Prepare data for preprocessing: features are everything except Id and the target
X = train_df.drop(columns=['Id', 'SalePrice'])
y = train_df['SalePrice']

print("Data preparation:")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTrain-validation split:")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Fit on the training rows only, so no validation statistics leak into the
# imputers, scaler or encoders
print(f"\nFitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train)
print(f"Processed training data shape: {X_train_processed.shape}")

# Apply the already-fitted preprocessor to the validation rows
X_val_processed = preprocessor.transform(X_val)
print(f"Processed validation data shape: {X_val_processed.shape}")

# Count any NaNs left after imputation
train_nan_count = int(np.isnan(X_train_processed).sum())
val_nan_count = int(np.isnan(X_val_processed).sum())
print(f"\nMissing values after preprocessing:")
print(f"Training data: {train_nan_count}")
print(f"Validation data: {val_nan_count}")

# Enforce the checks instead of only printing them, so the SUCCESS line below
# cannot be reached when the pipeline output is actually broken.
assert X_train_processed.shape[0] == len(X_train), "row count changed for training data"
assert X_val_processed.shape[0] == len(X_val), "row count changed for validation data"
assert X_train_processed.shape[1] == X_val_processed.shape[1], (
    "train and validation outputs have different column counts"
)
assert train_nan_count == 0 and val_nan_count == 0, (
    f"NaNs remain after preprocessing (train={train_nan_count}, val={val_nan_count})"
)

print(f"\nPreprocessing pipeline test: SUCCESS")

#### Analyze Preprocessing Results and Handle Warning

In [None]:
# Output column names of the fitted ColumnTransformer, in "<branch>__<column>" form
feature_names = preprocessor.get_feature_names_out()

# Derive the input feature count from the data rather than hard-coding it, so the
# report stays correct if columns are added to or removed from the dataset.
n_original_features = X_train.shape[1]

print(f"Feature expansion analysis:")
print(f"Original features: {n_original_features}")
print(f"After preprocessing: {len(feature_names)}")
print(f"Expansion factor: {len(feature_names)/n_original_features:.1f}x")

# Count output columns per branch. Matching on the full "<branch>__" prefix avoids
# accidentally counting a branch whose name merely starts with the same letters.
print(f"\nFeature breakdown after preprocessing:")
numerical_count = sum(name.startswith('numerical__') for name in feature_names)
none_cat_count = sum(name.startswith('none_categorical__') for name in feature_names)
mode_cat_count = sum(name.startswith('mode_categorical__') for name in feature_names)
ordinal_count = sum(name.startswith('ordinal__') for name in feature_names)

print(f"- Numerical features: {numerical_count}")
print(f"- None categorical features: {none_cat_count}")
print(f"- Mode categorical features: {mode_cat_count}")
print(f"- Ordinal features: {ordinal_count}")

# Every output column must belong to exactly one of the four branches
assert numerical_count + none_cat_count + mode_cat_count + ordinal_count == len(feature_names), (
    "some output columns do not belong to any known branch"
)

# Show sample of feature names
print(f"\nSample feature names:")
print("First 10:", feature_names[:10])
print("Last 10:", feature_names[-10:])

# The one-hot encoders are configured with handle_unknown='ignore', so categories
# not seen during fit produce a warning rather than an error.
print(f"\nNote: Warning about unknown categories is normal and properly handled")
print("Unknown categories are encoded as zeros (ignored), preventing errors")

#### Add Feature Engineering to Pipeline

In [None]:
# Let's add some feature engineering that we know works well from your previous analysis
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends five derived columns to a DataFrame.

    The added columns are ``TotalSF``, ``TotalBath``, ``HouseAge``,
    ``YearsSinceRemod`` and ``WasRemodeled``. The input must provide the columns
    ``1stFlrSF``, ``2ndFlrSF``, ``TotalBsmtSF``, ``FullBath``, ``HalfBath``,
    ``BsmtFullBath``, ``BsmtHalfBath``, ``YearBuilt`` and ``YearRemodAdd``.

    Parameters
    ----------
    reference_year : int, default=2023
        Year from which ``YearBuilt`` and ``YearRemodAdd`` are subtracted to
        obtain ``HouseAge`` and ``YearsSinceRemod``. The default reproduces the
        previously hard-coded value.
    """

    def __init__(self, reference_year=2023):
        # Stored under the same name as the argument so that BaseEstimator's
        # get_params / set_params / clone can round-trip it.
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the source columns listed in the class docstring.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the five engineered columns added; ``X`` itself
            is not modified.

        Raises
        ------
        KeyError
            If one of the required source columns is missing from ``X``.
        """
        # Work on a copy so the caller's frame is left untouched
        X_copy = X.copy()

        # Total square footage: both floors plus the basement
        X_copy['TotalSF'] = X_copy['1stFlrSF'] + X_copy['2ndFlrSF'] + X_copy['TotalBsmtSF']

        # Total bathrooms: half baths are weighted 0.5, above grade and in the basement
        X_copy['TotalBath'] = (X_copy['FullBath'] +
                              0.5 * X_copy['HalfBath'] +
                              X_copy['BsmtFullBath'] +
                              0.5 * X_copy['BsmtHalfBath'])

        # Age of the house relative to the configured reference year
        X_copy['HouseAge'] = self.reference_year - X_copy['YearBuilt']

        # Years elapsed since the recorded remodel year
        X_copy['YearsSinceRemod'] = self.reference_year - X_copy['YearRemodAdd']

        # Binary flag: 1 when the remodel year differs from the build year
        X_copy['WasRemodeled'] = (X_copy['YearBuilt'] != X_copy['YearRemodAdd']).astype(int)

        return X_copy

# Columns appended by FeatureEngineer.transform. They have to be routed explicitly:
# `preprocessor` selects columns by name and was built with remainder='drop', so
# chaining it directly after FeatureEngineer would silently discard every
# engineered column.
engineered_features = ['TotalSF', 'TotalBath', 'HouseAge', 'YearsSinceRemod', 'WasRemodeled']

# Rebuild the column transformer from the existing branch definitions, sending the
# engineered columns through the numerical branch (median imputation + scaling).
# All other branches keep their original column lists. ColumnTransformer clones its
# transformers when fitting, so reusing the same branch objects is safe.
engineered_preprocessor = ColumnTransformer(
    [
        (
            branch_name,
            branch_transformer,
            (list(branch_columns) + engineered_features)
            if branch_name == 'numerical' else branch_columns,
        )
        for branch_name, branch_transformer, branch_columns in preprocessor.transformers
    ],
    remainder=preprocessor.remainder,
)

# Create complete pipeline with feature engineering
complete_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', engineered_preprocessor)
])

print("Complete pipeline created:")
print("1. Feature Engineering: TotalSF, TotalBath, HouseAge, YearsSinceRemod, WasRemodeled")
print("2. Preprocessing: Imputation, scaling, encoding")

# Test the complete pipeline
print(f"\nTesting complete pipeline...")
X_train_final = complete_pipeline.fit_transform(X_train)
X_val_final = complete_pipeline.transform(X_val)

# Guard against regressions: every engineered column must survive preprocessing
final_feature_names = set(complete_pipeline.named_steps['preprocessing'].get_feature_names_out())
dropped_engineered = [
    column for column in engineered_features
    if f"numerical__{column}" not in final_feature_names
]
assert not dropped_engineered, f"engineered features were dropped: {dropped_engineered}"
assert X_train_final.shape[1] == X_val_final.shape[1], (
    "train and validation outputs have different column counts"
)

print(f"Final processed shapes:")
print(f"Training: {X_train_final.shape}")
print(f"Validation: {X_val_final.shape}")

#### Save Pipeline and Test on Real Test Data

In [None]:
import joblib
from pathlib import Path

# Make sure the output directory is present (no error if it already exists)
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Serialize the complete pipeline into the models directory
pipeline_path = models_dir.joinpath('preprocessing_pipeline.pkl')
joblib.dump(complete_pipeline, pipeline_path)
print(f"Pipeline saved to: {pipeline_path}")

#### Production Pipeline Test

In [None]:
# Simulate production scenario: load pipeline from disk
print("PRODUCTION PIPELINE TEST")
print("=" * 30)

# Drop the in-memory pipeline to simulate a fresh start. The guard keeps this cell
# re-runnable: an unconditional `del` raises NameError the second time it executes.
if 'complete_pipeline' in globals():
    del complete_pipeline

# Test pipeline on actual test dataset
print(f"\nTesting pipeline on real test data...")
test_features = test_df.drop(columns=['Id'])  # Remove Id, no SalePrice in test
print(f"Test data shape before preprocessing: {test_features.shape}")

# Load pipeline from saved file.
# NOTE(review): joblib/pickle stores custom classes by reference, so FeatureEngineer
# must be defined or importable in whatever process loads this file. Only load
# pipeline files from a trusted source, since unpickling can execute code.
loaded_pipeline = joblib.load('../models/preprocessing_pipeline.pkl')
print("Pipeline loaded from disk successfully")

# Apply loaded pipeline to test data (transform only; nothing is re-fitted)
test_processed_production = loaded_pipeline.transform(test_features)
print(f"Test data processed using loaded pipeline: {test_processed_production.shape}")

# Verify no missing values
missing_values = int(np.isnan(test_processed_production).sum())
print(f"Missing values: {missing_values}")

# Enforce the checks so the SUCCESS line is only reached when they actually hold
assert test_processed_production.shape[0] == len(test_features), (
    "row count changed while processing the test data"
)
assert missing_values == 0, f"{missing_values} NaNs remain in the processed test data"

print("\nProduction pipeline test: SUCCESS")
print("Pipeline can be deployed and used independently")