# House Prices Modeling
**Course:** Data Science in Production — Assignment 1  
**Author:** Ahmad Jutt  
**Goal:** Build a basic modeling pipeline to predict house prices (prepare for assignment 2).  
Notebook contains: data loading, preprocessing, feature selection, scaling/encoding, model training & evaluation (RMSLE).


In [12]:
# Show the working directory so the relative data paths used below can be checked.
# `os` is imported here because this cell sits above the notebook's import cell.
import os

print(os.getcwd())

C:\Users\HP\dsp-ahmad-shamoon\notebooks


In [1]:
# --- 0. Imports & util ---
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error

# Cap how many rows/columns pandas renders so DataFrame previews stay compact.
for _option, _limit in (("display.max_rows", 20), ("display.max_columns", 50)):
    pd.set_option(_option, _limit)

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)


In [4]:
# --- 1. Dataset loading ---
# Expecting the Kaggle "House Prices - Advanced Regression Techniques" files
# (train.csv and test.csv) directly in ../data/, relative to this notebook.
# Add /data to .gitignore (already required by the assignment)

train_path = "../data/train.csv"
test_path  = "../data/test.csv"

# Only the training file is read in this cell; test_path is just defined here.
df = pd.read_csv(train_path)
df.head(10)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,...,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,...,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,...,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,...,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,...,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,...,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1.5Fin,5,5,1993,1995,Gable,CompShg,VinylSd,VinylSd,...,0,,Attchd,1993.0,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,1Story,8,5,2004,2005,Gable,CompShg,VinylSd,VinylSd,...,1,Gd,Attchd,2004.0,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,PosN,Norm,1Fam,2Story,7,6,1973,1973,Gable,CompShg,HdBoard,HdBoard,...,2,TA,Attchd,1973.0,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,Norm,1Fam,1.5Fin,7,5,1931,1950,Gable,CompShg,BrkFace,Wd Shng,...,2,TA,Detchd,1931.0,Unf,2,468,Fa,TA,Y,90,0,205,0,0,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,Corner,Gtl,BrkSide,Artery,Artery,2fmCon,1.5Unf,5,6,1939,1950,Gable,CompShg,MetalSd,MetalSd,...,2,TA,Attchd,1939.0,RFn,1,205,Gd,TA,Y,0,4,0,0,0,0,,,,0,1,2008,WD,Normal,118000


In [5]:
# --- 2. Feature selection (2 continuous + 2 categorical) ---
continuous_feats = ["GrLivArea", "GarageArea"]
categorical_feats = ["MSZoning", "HouseStyle"]
target_col = "SalePrice"

# Work on a copy restricted to the chosen predictors plus the target
use_cols = [*continuous_feats, *categorical_feats, target_col]
df_small = df[use_cols].copy()

# Basic NA handling for the selected columns:
# median for continuous features, most frequent value for categorical ones.
# NOTE(review): these fill values are computed over every row of df_small.
na_fill = {col: df_small[col].median() for col in continuous_feats}
na_fill.update({col: df_small[col].mode()[0] for col in categorical_feats})
df_small = df_small.fillna(value=na_fill)

df_small.head(10)


Unnamed: 0,GrLivArea,GarageArea,MSZoning,HouseStyle,SalePrice
0,1710,548,RL,2Story,208500
1,1262,460,RL,1Story,181500
2,1786,608,RL,2Story,223500
3,1717,642,RL,2Story,140000
4,2198,836,RL,2Story,250000
5,1362,480,RL,1.5Fin,143000
6,1694,636,RL,1Story,307000
7,2090,484,RL,2Story,200000
8,1774,468,RM,1.5Fin,129900
9,1077,205,RL,1.5Unf,118000


In [6]:
# --- 3. Train / validation split ---
feature_cols = continuous_feats + categorical_feats
X = df_small[feature_cols]
y = df_small[target_col].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_valid.shape)


(1168, 4) (292, 4)


In [7]:
# --- 4. Manual preprocessing (no Pipeline/ColumnTransformer) ---
# One-hot encode categoricals on TRAIN, then align VALID to same columns

# 4.1 One-hot on train.
# drop_first=True removes one baseline level per feature; the remaining
# columns (X_train_cat.columns) define the schema the model is trained on.
X_train_num = X_train[continuous_feats].copy()
X_train_cat = pd.get_dummies(X_train[categorical_feats], drop_first=True)
X_train_proc = pd.concat([X_train_num, X_train_cat], axis=1)

# 4.2 Fit scaler on TRAIN numeric only
scaler = StandardScaler()
X_train_proc[continuous_feats] = scaler.fit_transform(X_train_proc[continuous_feats])

# 4.3 Prepare VALID using the same columns
X_valid_num = X_valid[continuous_feats].copy()
# Encode every category present in VALID (no drop_first here). With drop_first,
# pandas drops the first level found in VALID, which is not necessarily the
# baseline dropped on TRAIN; rows of that level would then be encoded as all
# zeros after the reindex below, i.e. silently treated as the TRAIN baseline.
X_valid_cat = pd.get_dummies(X_valid[categorical_feats])

# Align columns to train's one-hot columns: this drops the TRAIN baseline and
# any category unseen in TRAIN, and adds all-zero columns for categories that
# are absent from VALID.
X_valid_cat = X_valid_cat.reindex(columns=X_train_cat.columns, fill_value=0)

X_valid_proc = pd.concat([X_valid_num, X_valid_cat], axis=1)
X_valid_proc[continuous_feats] = scaler.transform(X_valid_proc[continuous_feats])

X_train_proc.head(10)


Unnamed: 0,GrLivArea,GarageArea,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl
254,-0.407093,-0.863837,False,False,True,False,False,True,False,False,False,False,False
1066,0.08317,-0.456264,False,False,True,False,False,False,False,False,True,False,False
638,-1.39525,-2.257169,False,False,True,False,False,True,False,False,False,False,False
799,0.458975,-1.119755,False,False,True,False,False,False,False,False,False,False,False
380,0.312087,-0.797488,False,False,True,False,False,False,False,False,False,False,False
303,-1.208302,0.358883,False,False,True,False,False,True,False,False,False,False,False
86,0.062186,-0.361479,False,False,True,False,False,False,False,False,True,False,False
1385,-0.712315,-1.119755,False,False,False,True,False,False,False,False,False,False,False
265,-0.201068,0.472624,False,False,True,False,False,True,False,False,False,False,False
793,-0.059903,-0.010777,False,False,True,False,False,True,False,False,False,False,False


In [8]:
# --- 5. Model training ---
# Ridge = linear regression with an L2 penalty; chosen to stay stable after one-hot expansion
model = Ridge(alpha=10.0, random_state=42).fit(X_train_proc, y_train)

# Report the coefficient of determination on both splits
for split_name, split_X, split_y in (
    ("Train", X_train_proc, y_train),
    ("Valid", X_valid_proc, y_valid),
):
    print(f"{split_name} R^2:", round(model.score(split_X, split_y), 4))


Train R^2: 0.6407
Valid R^2: 0.6994


In [9]:
# --- 6. Evaluation with RMSLE on SalePrice ---
# The competition scores the RMSE between the logs of predicted and actual prices.
# compute_rmsle delegates the log transform to mean_squared_log_error, so raw
# prices are passed in; predictions are floored at 1.0 beforehand because a
# linear model can output non-positive values.

raw_valid_pred = model.predict(X_valid_proc)
y_pred_valid = np.maximum(raw_valid_pred, 1.0)

rmsle = compute_rmsle(y_valid, y_pred_valid, precision=5)
print("Validation RMSLE:", rmsle)


Validation RMSLE: 0.24881


In [10]:
# --- 7. (Optional) Prepare Kaggle submission using same preprocessing on test.csv ---
test_df = pd.read_csv(test_path)

# Keep only selected features; fill NAs with the medians/modes of the training data (df_small)
test_small = test_df[continuous_feats + categorical_feats].copy()
for c in continuous_feats:
    test_small[c] = test_small[c].fillna(df_small[c].median())
for c in categorical_feats:
    test_small[c] = test_small[c].fillna(df_small[c].mode()[0])

# One-hot and align to train columns.
# No drop_first on the test data: with drop_first, pandas drops the first level
# found in the TEST data, which may differ from the baseline dropped on TRAIN,
# and rows of that level would be encoded as all zeros after the reindex.
# Encoding every level and reindexing to X_train_cat.columns applies the TRAIN
# schema instead (baseline and unseen categories dropped, missing columns = 0).
test_num = test_small[continuous_feats].copy()
test_cat = pd.get_dummies(test_small[categorical_feats])
test_cat = test_cat.reindex(columns=X_train_cat.columns, fill_value=0)

test_proc = pd.concat([test_num, test_cat], axis=1)
# Reuse the scaler fitted on the training split; never refit on test data
test_proc[continuous_feats] = scaler.transform(test_proc[continuous_feats])

# Predict, flooring at 1.0 so every submitted price is positive
test_pred = model.predict(test_proc)
test_pred = np.maximum(test_pred, 1.0)

# Build submission (Id + SalePrice)
submission = pd.DataFrame({"Id": test_df["Id"], "SalePrice": test_pred})
submission.head(10)


Unnamed: 0,Id,SalePrice
0,1461,140090.725579
1,1462,160679.067235
2,1463,183093.294292
3,1464,179566.950909
4,1465,176939.20709
5,1466,180957.09548
6,1467,159415.249211
7,1468,158865.383199
8,1469,182424.371714
9,1470,143174.770672


In [12]:
# Recap of the headline metrics; R^2 is recomputed from the fitted model,
# RMSLE reuses the value stored by the evaluation cell.
train_r2 = round(model.score(X_train_proc, y_train), 4)
valid_r2 = round(model.score(X_valid_proc, y_valid), 4)

print("Final summary:")
print("  Train R^2:", train_r2)
print("  Valid R^2:", valid_r2)
print("  Valid RMSLE:", rmsle)


Final summary:
  Train R^2: 0.6407
  Valid R^2: 0.6994
  Valid RMSLE: 0.24881
