# 02 Data Preprocessing — House Price Prediction (Option B)

This notebook implements a reproducible preprocessing pipeline and writes processed datasets.


In [2]:
from pathlib import Path
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def remove_outliers(df: pd.DataFrame, columns: list, factor: float = 1.5) -> pd.DataFrame:
    """
    Drops rows lying outside the IQR fences of each requested column.

    Columns are processed one after another: the quartiles for a column are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of `columns`. Rows whose value in a checked
    column is NaN fail the range test and are dropped as well. A summary
    line with the row counts before and after is printed.

    Parameters:
    ----------
    df : pd.DataFrame
        Input dataframe (left unmodified; a copy is filtered).
    columns : list
        Names of the columns to check for outliers.
    factor : float, optional
        Multiplier applied to the IQR to obtain the fences (default is 1.5).

    Returns:
    -------
    pd.DataFrame
        Filtered dataframe; the original row labels are kept.
    """
    kept = df.copy()
    for name in columns:
        q_low, q_high = (kept[name].quantile(q) for q in (0.25, 0.75))
        margin = factor * (q_high - q_low)

        # `between` is inclusive on both ends, i.e. low <= value <= high.
        in_range = kept[name].between(q_low - margin, q_high + margin)
        kept = kept[in_range]

    print(f"Outlier removal: Reduced data from {len(df)} to {len(kept)} rows.")
    return kept

def create_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds three ratio/difference features built from the room and occupancy columns.

    The input must contain the columns 'AveRooms', 'AveBedrms' and 'AveOccup'.
    The new columns are appended after the existing ones:
    'BedroomsPerRoom' (AveBedrms / AveRooms), 'RoomsPerPerson'
    (AveRooms / AveOccup) and 'RoomsMinusBedrooms' (AveRooms - AveBedrms).

    Parameters:
    ----------
    df : pd.DataFrame
        Input dataframe (left unmodified).

    Returns:
    -------
    pd.DataFrame
        New dataframe with the engineered features added.
    """
    rooms = df['AveRooms']
    bedrooms = df['AveBedrms']
    occupancy = df['AveOccup']

    # `assign` works on a copy, so the caller's frame is not touched.
    return df.assign(
        BedroomsPerRoom=bedrooms / rooms,
        RoomsPerPerson=rooms / occupancy,
        RoomsMinusBedrooms=rooms - bedrooms,
    )

def preprocess_and_split(
    df: pd.DataFrame,
    target_col: str,
    test_size: float = 0.2,
    random_state: int = 42,
):
    """
    Splits data, scales features, and prepares train/test sets.

    Every column other than `target_col` is treated as a numeric feature and
    standardized with a StandardScaler. The scaler is fit on the training
    split only and then applied to the test split, so no test statistics
    leak into the transformation.

    Parameters:
    ----------
    df : pd.DataFrame
        The dataset.
    target_col : str
        The name of the target variable column.
    test_size : float
        Proportion of dataset to include in the test split.
    random_state : int, optional
        Seed passed to train_test_split for a reproducible shuffle
        (default is 42).

    Returns:
    -------
    tuple
        (X_train_df, X_test_df, y_train, y_test, numeric_features)
        The feature frames and the target series of each split share the
        same 0..n-1 index, so they are row-aligned.

    Raises:
    ------
    KeyError
        If `target_col` is not a column of `df`.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in dataframe.")

    feature_cols = [c for c in df.columns if c != target_col]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df[target_col], test_size=test_size, random_state=random_state
    )

    # Preprocessing pipeline
    numeric_features = feature_cols
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

    # Fit on the training data only, then reuse the fitted scaler for the test data
    X_train_proc = preprocessor.fit_transform(X_train)
    X_test_proc = preprocessor.transform(X_test)

    # Convert back to DataFrame (this gives the features a fresh 0..n-1 index)
    X_train_df = pd.DataFrame(X_train_proc, columns=numeric_features)
    X_test_df = pd.DataFrame(X_test_proc, columns=numeric_features)

    # The targets still carry the shuffled original row labels; reset them so
    # that label-based operations (concat, boolean masks) line up with the features.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    return X_train_df, X_test_df, y_train, y_test, numeric_features

# --- Execution ---
# Settings used both by the pipeline calls below and by the saved config, so
# the JSON cannot drift from what was actually run.
OUTLIER_FACTOR = 3.0  # IQR multiplier; wider than the 1.5 default, so fewer rows are dropped
TEST_SIZE = 0.2

# Paths (relative paths are resolved against the current working directory)
raw_csv_path = Path('../data/raw/california_housing.csv').resolve()
processed_dir = Path('../data/processed').resolve()
processed_dir.mkdir(parents=True, exist_ok=True)

# 1. Load Data
df = pd.read_csv(raw_csv_path)
target_column = 'MedHouseVal'
base_features = [c for c in df.columns if c != target_column]

# 2. Handle Missing (Imputation)
# NOTE: medians (and the outlier bounds in step 3) are computed on the full
# dataset, i.e. before the train/test split in step 5.
df[base_features] = df[base_features].fillna(df[base_features].median())

# 3. Detect and Remove Outliers (Added Requirement)
# We check the target, median income and average rooms for outliers
cols_to_check = ['MedHouseVal', 'MedInc', 'AveRooms']
df = remove_outliers(df, cols_to_check, factor=OUTLIER_FACTOR)

# 4. Feature Engineering
columns_before = set(df.columns)
df = create_derived_features(df)
# Record exactly the columns the feature-engineering step added
engineered_features = [c for c in df.columns if c not in columns_before]

# 5. Split and Scale
X_train_df, X_test_df, y_train, y_test, feat_cols = preprocess_and_split(
    df, target_column, test_size=TEST_SIZE
)

# 6. Save Processed Data
train_out = processed_dir / 'train.csv'
test_out = processed_dir / 'test.csv'

# The scaled feature frames have a 0..n-1 index; reset the targets to match
# so the concat pairs rows by position.
pd.concat([X_train_df, y_train.reset_index(drop=True)], axis=1).to_csv(train_out, index=False)
pd.concat([X_test_df, y_test.reset_index(drop=True)], axis=1).to_csv(test_out, index=False)

print(f'Wrote processed files to: {processed_dir}')

# 7. Save Config
config = {
    'target_column': target_column,
    'feature_columns': feat_cols,
    'engineered_features': engineered_features,
    'outlier_removal': f'IQR method (factor={OUTLIER_FACTOR})',
    'scaler': 'StandardScaler()',
    # 42 is the seed preprocess_and_split uses for the call above
    'split': {'test_size': TEST_SIZE, 'random_state': 42}
}
with open(processed_dir / 'preprocessing_config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

Outlier removal: Reduced data from 20640 to 20319 rows.
Wrote processed files to: /Users/atukaberadze/Desktop/final-data-science/data/processed
