# House Prices Modeling
**Course:** Data Science in Production — Assignment 1  
**Author:** Ahmad Jutt  
**Goal:** Build a basic modeling pipeline to predict house prices (prepare for assignment 2).  
Notebook contains: data loading, preprocessing, feature selection, scaling/encoding, model training & evaluation (RMSLE).


In [1]:
# --- 0. Imports & util ---
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error

pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 50)

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """
    Root Mean Squared Logarithmic Error between ground truth and predictions.

    Parameters
    ----------
    y_test : np.ndarray
        Ground-truth target values.
    y_pred : np.ndarray
        Predicted target values. scikit-learn's ``mean_squared_log_error``
        rejects negative values, so clip predictions before calling this.
    precision : int, default 4
        Number of decimal places kept in the returned score.

    Returns
    -------
    float
        The rounded RMSLE as a built-in Python float.
    """
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    # np.sqrt yields a NumPy scalar; round first (same rounding as before) and
    # then cast so callers get a plain float, matching the annotation and
    # avoiding reprs such as `np.float64(0.24881)` in printed containers.
    return float(round(rmsle, precision))


In [2]:
# --- 1. Dataset loading ---
# Expecting the Kaggle "House Prices - Advanced Regression Techniques" files
# (train.csv, test.csv) directly in: ../data/
# Add /data to .gitignore (already required by the assignment)

# Relative paths: they resolve against the notebook's working directory.
train_path = "../data/train.csv"
# test_path is only defined here; the inference cells further down read the
# same file through a literal "../data/test.csv" path.
test_path  = "../data/test.csv"

# Only the training file is loaded at this point.
df = pd.read_csv(train_path)
df.head(10)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,...,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,...,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,...,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,...,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,...,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,...,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,1993,1995,Gable,CompShg,VinylSd,VinylSd,...,0,,Attchd,1993.0,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,1Story,8,5,2004,2005,Gable,CompShg,VinylSd,VinylSd,...,1,Gd,Attchd,2004.0,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,Norm,1Fam,2Story,7,6,1973,1973,Gable,CompShg,HdBoard,HdBoard,...,2,TA,Attchd,1973.0,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,Norm,1Fam,1.5Fin,7,5,1931,1950,Gable,CompShg,BrkFace,Wd Shng,...,2,TA,Detchd,1931.0,Unf,2,468,Fa,TA,Y,90,0,205,0,0,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,Artery,2fmCon,1.5Unf,5,6,1939,1950,Gable,CompShg,MetalSd,MetalSd,...,2,TA,Attchd,1939.0,RFn,1,205,Gd,TA,Y,0,4,0,0,0,0,,,,0,1,2008,WD,Normal,118000


In [3]:
# --- 2. Feature selection (2 continuous + 2 categorical) ---
target_col = "SalePrice"
continuous_feats = ["GrLivArea", "GarageArea"]
categorical_feats = ["MSZoning", "HouseStyle"]

# Narrow the raw frame to the modelling columns; .copy() detaches it from df
use_cols = [*continuous_feats, *categorical_feats, target_col]
df_small = df.loc[:, use_cols].copy()

# Basic NA handling for the selected columns: one fill value per column,
# median for the continuous features and most frequent value for the
# categorical ones. Note these statistics are taken over the whole frame,
# i.e. before the train/validation split performed in the next step.
fill_values = {feat: df_small[feat].median() for feat in continuous_feats}
fill_values.update({feat: df_small[feat].mode()[0] for feat in categorical_feats})
df_small = df_small.fillna(value=fill_values)

df_small.head(10)


Unnamed: 0,GrLivArea,GarageArea,MSZoning,HouseStyle,SalePrice
0,1710,548,RL,2Story,208500
1,1262,460,RL,1Story,181500
2,1786,608,RL,2Story,223500
3,1717,642,RL,2Story,140000
4,2198,836,RL,2Story,250000
5,1362,480,RL,1.5Fin,143000
6,1694,636,RL,1Story,307000
7,2090,484,RL,2Story,200000
8,1774,468,RM,1.5Fin,129900
9,1077,205,RL,1.5Unf,118000


In [4]:
# --- 3. Train / validation split ---
# Features keep their DataFrame form (column names are needed by the
# preprocessing step); the target is pulled out as a plain NumPy array.
feature_cols = continuous_feats + categorical_feats
X = df_small[feature_cols]
y = df_small[target_col].values

# 80/20 hold-out split; the fixed seed makes the partition repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_valid.shape)


(1168, 4) (292, 4)


In [6]:
# --- Step 1: Preprocessing refactor using OneHotEncoder + StandardScaler ---
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Both transformers learn their parameters from the TRAIN split only and are
# then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[continuous_feats])
X_valid_num = scaler.transform(X_valid[continuous_feats])

encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
X_train_cat = encoder.fit_transform(X_train[categorical_feats])
X_valid_cat = encoder.transform(X_valid[categorical_feats])

# Design matrices: scaled numeric columns first, one-hot columns after
X_train_proc = np.hstack([X_train_num, X_train_cat])
X_valid_proc = np.hstack([X_valid_num, X_valid_cat])

# --- Step 0 Verification: Compare to saved parquet ---
# Baseline frame written by the pre-refactor version of this step
expected_df = pd.read_parquet("../data/processed_df.parquet")

# Rebuild the processed training frame with named columns plus the target;
# y_train is a NumPy array, so it aligns by position (no index to reset).
processed_columns = [
    *continuous_feats,
    *encoder.get_feature_names_out(categorical_feats).tolist(),
]
actual_df = pd.DataFrame(np.hstack([X_train_num, X_train_cat]), columns=processed_columns)
actual_df["SalePrice"] = y_train

# Raises AssertionError on any mismatch; dtypes are deliberately not compared
pd.testing.assert_frame_equal(
    expected_df.reset_index(drop=True),
    actual_df.reset_index(drop=True),
    check_dtype=False,
)
print("✅ Refactor successful: DataFrames match exactly!")




✅ Refactor successful: DataFrames match exactly!


In [7]:
# --- 5. Model training ---
# Ridge is a linear model with an L2 penalty, chosen to stay robust to the
# extra columns created by one-hot expansion. fit() returns the estimator.
model = Ridge(alpha=10.0, random_state=42).fit(X_train_proc, y_train)

# Coefficient of determination on each split, rounded for display
train_r2 = round(model.score(X_train_proc, y_train), 4)
valid_r2 = round(model.score(X_valid_proc, y_valid), 4)

print("Train R^2:", train_r2)
print("Valid R^2:", valid_r2)


Train R^2: 0.6407
Valid R^2: 0.6994


In [13]:
# --- Step 3A : build_model function ---

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
import joblib
import os

def build_model(data: pd.DataFrame, model_dir: str = "../models") -> dict[str, float]:
    """
    Builds and trains a Ridge regression model on the house prices dataset.
    Saves the trained model, encoder, and scaler in ``model_dir``.

    The input frame is not modified: missing values are filled on the split
    copies only, using statistics computed from the training rows.

    Parameters
    ----------
    data : pd.DataFrame
        Full training dataset with 'SalePrice' as the target column. Must also
        contain 'GrLivArea', 'GarageArea', 'MSZoning' and 'HouseStyle'.
    model_dir : str, default "../models"
        Directory that receives model.joblib, encoder.joblib and scaler.joblib.
        It is created if it does not exist.

    Returns
    -------
    dict[str, float]
        Dictionary containing model performance metrics (e.g. {'rmsle': 0.24})
    """

    # --- feature selection ---
    continuous_feats = ["GrLivArea", "GarageArea"]
    categorical_feats = ["MSZoning", "HouseStyle"]
    target_col = "SalePrice"

    # split first, so that nothing learned below can see validation rows
    X = data[continuous_feats + categorical_feats]
    y = data[target_col]
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # handle missing values: median / most frequent value of the TRAINING rows,
    # applied to both splits. fillna returns new frames, so neither the
    # caller's DataFrame nor the split views are written to in place.
    fill_values = {c: X_train[c].median() for c in continuous_feats}
    fill_values.update({c: X_train[c].mode()[0] for c in categorical_feats})
    X_train = X_train.fillna(value=fill_values)
    X_valid = X_valid.fillna(value=fill_values)

    # preprocessing (fit on train only, then apply to validation)
    encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    scaler = StandardScaler()

    X_train_cat = encoder.fit_transform(X_train[categorical_feats])
    X_valid_cat = encoder.transform(X_valid[categorical_feats])
    X_train_num = scaler.fit_transform(X_train[continuous_feats])
    X_valid_num = scaler.transform(X_valid[continuous_feats])

    # column layout expected by the model: scaled numerics, then one-hot columns
    X_train_proc = np.concatenate([X_train_num, X_train_cat], axis=1)
    X_valid_proc = np.concatenate([X_valid_num, X_valid_cat], axis=1)

    # train model
    model = Ridge(alpha=10.0, random_state=42)
    model.fit(X_train_proc, y_train)

    # evaluate; predictions are floored at 1.0 because the log-based metric
    # cannot be computed for negative values
    y_pred = np.maximum(model.predict(X_valid_proc), 1.0)
    rmsle = np.sqrt(mean_squared_log_error(y_valid, y_pred))

    # persist objects
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
    joblib.dump(encoder, os.path.join(model_dir, "encoder.joblib"))
    joblib.dump(scaler, os.path.join(model_dir, "scaler.joblib"))

    print(f"✅ Model trained and saved. Validation RMSLE: {rmsle:.5f}")
    # round as before, then cast: the dict holds a built-in float rather than
    # a NumPy scalar (which would print as np.float64(...))
    return {"rmsle": float(round(rmsle, 5))}


In [8]:
# --- Step 2: Persist trained model, encoder, and scaler ---
import joblib
import os

# Destination directory is created on demand (no error if it already exists)
os.makedirs("../models", exist_ok=True)

# Each fitted object is written to its own joblib file
for fitted_obj, target_file in (
    (model, "../models/model.joblib"),
    (encoder, "../models/encoder.joblib"),
    (scaler, "../models/scaler.joblib"),
):
    joblib.dump(fitted_obj, target_file)

print("✅ Model, encoder, and scaler saved successfully in the 'models' folder.")


✅ Model, encoder, and scaler saved successfully in the 'models' folder.


In [9]:
# --- 6. Evaluation with RMSLE on SalePrice ---
# Competition metric: RMSLE, i.e. the error is measured between the logs of
# prediction and ground truth. mean_squared_log_error applies the log itself
# and expects positive values.

raw_valid_preds = model.predict(X_valid_proc)
# A linear model can output non-positive prices; floor them at 1.0
y_pred_valid = np.maximum(raw_valid_preds, 1.0)

rmsle = compute_rmsle(y_test=y_valid, y_pred=y_pred_valid, precision=5)
print("Validation RMSLE:", rmsle)


Validation RMSLE: 0.24881


In [12]:
## --- Step 2C: Model Inference using persisted model, encoder, and scaler ---
import joblib
import pandas as pd
import numpy as np

# Restore the three fitted artifacts from disk (model, encoder, scaler)
model, encoder, scaler = (
    joblib.load(artifact_file)
    for artifact_file in (
        "../models/model.joblib",
        "../models/encoder.joblib",
        "../models/scaler.joblib",
    )
)

# Unlabelled test set
test_df = pd.read_csv("../data/test.csv")

# Same feature lists as in training
categorical_feats = ["MSZoning", "HouseStyle"]
continuous_feats = ["GrLivArea", "GarageArea"]

# Missing values: median for continuous, most frequent value for categorical.
# The fill values are computed from the test file itself.
test_fill_values = {feat: test_df[feat].median() for feat in continuous_feats}
test_fill_values.update({feat: test_df[feat].mode()[0] for feat in categorical_feats})
test_df = test_df.fillna(value=test_fill_values)

# Reuse the fitted scaler and encoder (transform only, no refitting) and
# rebuild the column layout used in training: numerics first, one-hot after
X_test_num = scaler.transform(test_df[continuous_feats])
X_test_cat = encoder.transform(test_df[categorical_feats])
X_test_proc = np.hstack([X_test_num, X_test_cat])

# Predict, flooring prices at 1.0
predictions = np.maximum(model.predict(X_test_proc), 1.0)

# Submission frame: one predicted SalePrice per test Id
submission = pd.DataFrame({"Id": test_df["Id"], "SalePrice": predictions})

submission.head(10)





Unnamed: 0,Id,SalePrice
0,1461,140090.725579
1,1462,160679.067235
2,1463,183093.294292
3,1464,179566.950909
4,1465,176939.20709
5,1466,180957.09548
6,1467,159415.249211
7,1468,158865.383199
8,1469,182424.371714
9,1470,143174.770672


In [15]:
# --- Step 3B : make_predictions function ---

import pandas as pd
import numpy as np
import joblib

def make_predictions(input_data: pd.DataFrame, model_dir: str = "../models") -> np.ndarray:
    """
    Loads persisted model, encoder, and scaler to make house price predictions.

    The input frame is not modified; missing values are filled on an internal
    copy of the four feature columns.

    Parameters
    ----------
    input_data : pd.DataFrame
        New data (like test.csv) containing the same features used for
        training: 'GrLivArea', 'GarageArea', 'MSZoning' and 'HouseStyle'.
    model_dir : str, default "../models"
        Directory holding model.joblib, encoder.joblib and scaler.joblib.

    Returns
    -------
    np.ndarray
        Predicted house prices, floored at 1.0.
    """
    import os  # function-scope import: the defining cell does not import os

    categorical_feats = ["MSZoning", "HouseStyle"]
    continuous_feats = ["GrLivArea", "GarageArea"]

    # NOTE: joblib files are pickle-based, so loading one can execute arbitrary
    # code. Only point model_dir at artifacts produced by a trusted build.
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    encoder = joblib.load(os.path.join(model_dir, "encoder.joblib"))
    scaler = joblib.load(os.path.join(model_dir, "scaler.joblib"))

    # Work on a copy of just the needed columns so the caller's frame is
    # left untouched.
    features = input_data[continuous_feats + categorical_feats].copy()

    # handle missing values with statistics of the incoming batch. When a
    # column is entirely missing (e.g. a single-row request) the batch has no
    # median / mode, which previously left NaNs in place or raised on mode()[0].
    # In that case fall back to what the fitted transformers remember.
    # Assumes scaler and encoder were fitted on continuous_feats and
    # categorical_feats in this order, as build_model does.
    for i, c in enumerate(continuous_feats):
        fill = features[c].median()
        if pd.isna(fill):
            fill = scaler.mean_[i]  # training mean of this column
        features[c] = features[c].fillna(fill)
    for i, c in enumerate(categorical_feats):
        modes = features[c].mode()
        # First fitted category = the baseline level removed by drop="first"
        fill = modes.iloc[0] if not modes.empty else encoder.categories_[i][0]
        features[c] = features[c].fillna(fill)

    # same column layout as in training: scaled numerics, then one-hot columns
    X_cat = encoder.transform(features[categorical_feats])
    X_num = scaler.transform(features[continuous_feats])
    X_proc = np.concatenate([X_num, X_cat], axis=1)

    preds = np.maximum(model.predict(X_proc), 1.0)
    return preds


In [16]:
# End-to-end check of build_model: re-read the raw training CSV from disk and
# run the whole train/evaluate/persist pipeline on it.
train_df = pd.read_csv("../data/train.csv")
# build_model returns a dict with the validation score under the key 'rmsle'.
performance = build_model(train_df)
print(performance)


✅ Model trained and saved. Validation RMSLE: 0.24881
{'rmsle': np.float64(0.24881)}


In [17]:
# End-to-end check of make_predictions: re-read the raw test CSV and score it
# with the artifacts persisted under ../models.
test_df = pd.read_csv("../data/test.csv")
predictions = make_predictions(test_df)
# Show only the first five predicted prices.
print(predictions[:5])


[140090.72557861 160679.06723488 183093.29429195 179566.95090863
 176939.20709024]


In [None]:
# Recap of the headline metrics. Relies on notebook-level state from earlier
# cells: model, X_*_proc, y_* and the rmsle value from the evaluation step.
print("Final summary:")
summary_rows = (
    ("  Train R^2:", round(model.score(X_train_proc, y_train), 4)),
    ("  Valid R^2:", round(model.score(X_valid_proc, y_valid), 4)),
    ("  Valid RMSLE:", rmsle),
)
for summary_label, summary_value in summary_rows:
    print(summary_label, summary_value)
