# House Modeling Pipeline

## Data setup

- Load the dataset, split it into training and test sets, etc.

In [1]:
import pandas as pd

In [2]:
import os

# Location of the training CSV. The default is the original author's local
# path; set the HOUSE_PRICES_TRAIN_CSV environment variable to run this
# notebook on another machine without editing the cell.
train_file_path = os.environ.get(
    "HOUSE_PRICES_TRAIN_CSV",
    "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/data/house-prices-advanced-regression-techniques/train.csv",
)
dataset_df = pd.read_csv(train_file_path)

# Preview the first ten rows and report the (rows, columns) shape of the frame.
print(dataset_df.head(10))
print("Full train dataset shape is {}".format(dataset_df.shape))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1         Lvl    AllPub  ...       

In [3]:
import numpy as np

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly partition ``dataset`` row-wise into two subsets.

  Every row is independently assigned to the second subset with probability
  ``test_ratio``, so the resulting sizes are only approximately proportional
  to the ratio.

  Args:
    dataset: Object supporting ``len()`` and boolean-mask indexing, e.g. a
      pandas DataFrame or a NumPy array.
    test_ratio: Probability that a given row lands in the second subset.
    seed: Optional seed for a reproducible split. When ``None`` (the default)
      the global NumPy random state is used, exactly as before this
      parameter existed.

  Returns:
    A ``(first, second)`` tuple: ``second`` holds the randomly selected rows
    and ``first`` holds all remaining rows.
  """
  n_rows = len(dataset)
  if seed is None:
    # Legacy behaviour: draw from the global NumPy random state.
    draws = np.random.rand(n_rows)
  else:
    # A dedicated generator makes the split repeatable for a given seed.
    draws = np.random.default_rng(seed).random(n_rows)
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Randomly hold out a subset of rows, then report how many rows ended up on
# each side of the split.
# NOTE(review): neither frame is referenced by any later cell shown in this
# notebook - confirm whether this split is still needed.
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print(
    "{} examples in training, {} examples in testing.".format(
        *map(len, (train_ds_pd, valid_ds_pd))
    )
)

1038 examples in training, 422 examples in testing.


# Feature selection

## Continuous features: 
- GrLivArea, YearBuilt

**Reason**: capture size and age effects.
## Categorical features: 
- Neighborhood, KitchenQual

**Reason**: capture location and quality effects.

In [4]:
from sklearn.preprocessing import StandardScaler

# Continuous predictors, chosen to capture size and age effects.
cont_features = [
    "GrLivArea",
    "YearBuilt",
]


In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Categorical predictors, split by how they are encoded further down:
# nominal columns are one-hot encoded, ordinal columns are mapped to ranks.
cat_nom_features = ["Neighborhood"]
cat_ord_features = ["KitchenQual"]


# Model Training

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split



# --- Select the predictors and the target -----------------------------------
X = dataset_df.loc[:, cont_features + cat_nom_features + cat_ord_features].copy()
y = dataset_df.loc[:, 'SalePrice'].copy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Continuous features: standardise ---------------------------------------
# The scaler is fitted on the training rows only and then applied to both sets.
scaler = StandardScaler().fit(X_train[cont_features])
X_train_cont = scaler.transform(X_train[cont_features])
X_test_cont = scaler.transform(X_test[cont_features])

# --- Nominal features: one-hot encode ---------------------------------------
# Fitted on the training rows only; handle_unknown='ignore' makes categories
# unseen during fitting encode as all zeros instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train[cat_nom_features])
X_train_ohe = ohe.transform(X_train[cat_nom_features])
X_test_ohe = ohe.transform(X_test[cat_nom_features])

# Generated one-hot column names, used to label the DataFrames built below.
ohe_cols = list(ohe.get_feature_names_out(cat_nom_features))

# --- Ordinal features: map to integer ranks ---------------------------------
# Categories are encoded by their position in this explicit list.
ord_categories = [['Po', 'Fa', 'TA', 'Gd', 'Ex']]
ord_enc = OrdinalEncoder(categories=ord_categories).fit(X_train[cat_ord_features])
X_train_ord = ord_enc.transform(X_train[cat_ord_features])
X_test_ord = ord_enc.transform(X_test[cat_ord_features])


# --- Reassemble the encoded pieces into labelled frames ---------------------
def _as_frame(values, like, columns):
    """Wrap an encoded array in a DataFrame that reuses ``like``'s row index."""
    return pd.DataFrame(values, index=like.index, columns=columns)


df_train_cont = _as_frame(X_train_cont, X_train, cont_features)
df_train_ohe = _as_frame(X_train_ohe, X_train, ohe_cols)
df_train_ord = _as_frame(X_train_ord, X_train, cat_ord_features)

df_test_cont = _as_frame(X_test_cont, X_test, cont_features)
df_test_ohe = _as_frame(X_test_ohe, X_test, ohe_cols)
df_test_ord = _as_frame(X_test_ord, X_test, cat_ord_features)

# Side-by-side concatenation; both sets use the same piece order, so their
# columns line up.
X_train_final_df = pd.concat([df_train_cont, df_train_ohe, df_train_ord], axis=1)
X_test_final_df = pd.concat([df_test_cont, df_test_ohe, df_test_ord], axis=1)

# Plain NumPy arrays are what gets passed to the estimator.
X_train_final = X_train_final_df.values
X_test_final = X_test_final_df.values

# --- Train the linear regression --------------------------------------------
model = LinearRegression().fit(X_train_final, y_train)


# Model Evaluation 

In [7]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the rounded root mean squared logarithmic error (RMSLE).

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values to compare against ``y_test``.
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)


In [8]:
# Score the fitted model on the held-out rows and report its RMSLE.
y_pred = model.predict(X_test_final)
rmsle = compute_rmsle(y_test=y_test.values, y_pred=y_pred)
print("RMSLE:", rmsle)

RMSLE: 0.2
