In [1]:
print("üè° HOUSE PRICE DATA PREPROCESSING PIPELINE")
print("="*65)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ===========================
# 1Ô∏è‚É£ Load Data
# ===========================
df = pd.read_csv("houseprice.csv")
print(f"‚úÖ Loaded dataset: {df.shape[0]} rows, {df.shape[1]} columns")

# ===========================
# 2Ô∏è‚É£ Handle Missing Values
# ===========================
num_cols = ['Median_Price', 'Median_PSF', 'Transactions']
cat_cols = ['Township','Area','State','Tenure','Type']

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

print("‚úÖ Missing values handled")

# ===========================
# 3Ô∏è‚É£ Feature Engineering
# ===========================
df["Price_per_Transaction"] = df["Median_Price"] / (df["Transactions"] + 1)
df["Freehold_Flag"] = df["Tenure"].str.contains("Freehold", case=False).astype(int)
df["Is_Central"] = df["Area"].str.contains(r"(City|Central|Town)", case=False).astype(int)
df["Log_Median_Price"] = np.log1p(df["Median_Price"])
df["Log_Median_PSF"] = np.log1p(df["Median_PSF"])

print("‚úÖ Feature engineering completed")

# ===========================
# 4Ô∏è‚É£ Encoding Categorical Variables
# ===========================
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
print(f"‚úÖ One-hot encoding applied ({len(df_encoded.columns)} total features)")

# ===========================
# 5Ô∏è‚É£ Select Features & Target
# ===========================
target = "Median_Price"
X = df_encoded.drop(columns=[target])
y = df_encoded[target]

print(f"‚úÖ Target set: {target}")
print(f"‚úÖ Feature count: {X.shape[1]}")

# ===========================
# 6Ô∏è‚É£ Train-Test Split
# ===========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"‚úÖ Train: {X_train.shape[0]} rows | Test: {X_test.shape[0]} rows")

# ===========================
# 7Ô∏è‚É£ Scaling (Only numeric columns)
# ===========================
scaler = StandardScaler()

num_features_scaled = X.select_dtypes(include=['int64','float64']).columns
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[num_features_scaled] = scaler.fit_transform(X_train[num_features_scaled])
X_test_scaled[num_features_scaled] = scaler.transform(X_test[num_features_scaled])

print("‚úÖ Numeric features scaled")
print(f"Scaled columns: {list(num_features_scaled)}")



üè° HOUSE PRICE DATA PREPROCESSING PIPELINE
‚úÖ Loaded dataset: 2000 rows, 8 columns
‚úÖ Missing values handled
‚úÖ Feature engineering completed
‚úÖ One-hot encoding applied (2318 total features)
‚úÖ Target set: Median_Price
‚úÖ Feature count: 2317
‚úÖ Train: 1600 rows | Test: 400 rows
‚úÖ Numeric features scaled
Scaled columns: ['Median_PSF', 'Transactions', 'Price_per_Transaction', 'Log_Median_Price', 'Log_Median_PSF']


  df["Is_Central"] = df["Area"].str.contains(r"(City|Central|Town)", case=False).astype(int)
