# House Modeling Pipeline

## Data setup

- load, train and test sets split, etc

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_file_path = "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/data/house-prices-advanced-regression-techniques/train.csv"
train_dataset_df = pd.read_csv(train_file_path)
print(train_dataset_df.head(10))
print("Full train dataset shape is {}".format(train_dataset_df.shape))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0    NaN    NaN         NaN       0   
1         Lvl    AllPub  ...       

# Model Building

## Model Training

## Feature selection

## Continuous features: 
- GrLivArea, YearBuilt

**reason**:capture size + age effects
## Categorical features: 
- Neighborhood, KitchenQual

**reason**:location + quality effects

In [3]:
from sklearn.preprocessing import StandardScaler

cont_features = ['GrLivArea', 'YearBuilt']


In [4]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

cat_nom_features = ['Neighborhood']
cat_ord_features = ['KitchenQual']


## Divide Train and test

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


X = train_dataset_df[cont_features + cat_nom_features + cat_ord_features].copy()
y = train_dataset_df['SalePrice'].copy()


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Standardize Model

In [6]:
#Standard Scaler
standard_scaler = StandardScaler()

standard_scaler.fit(X_train[cont_features])

X_train_cont = standard_scaler.transform(X_train[cont_features])
X_test_cont  = standard_scaler.transform(X_test[cont_features])



In [7]:
#one hot encoder
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')#also called ohe
one_hot_encoder.fit(X_train[cat_nom_features])              # fit on train only


In [8]:
# transform
X_train_ohe = one_hot_encoder.transform(X_train[cat_nom_features])
X_test_ohe  = one_hot_encoder.transform(X_test[cat_nom_features])
ohe_cols = list(one_hot_encoder.get_feature_names_out(cat_nom_features))

In [9]:
!pip install joblib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Ordinal Encoder Train Dataset

In [10]:
import joblib
# Get column names for OHE so we can build a DataFrame (optional but helpful)

#Ordinal Catgeries
ord_categories = [['Po', 'Fa', 'TA', 'Gd', 'Ex']]
ordinal_encoder = OrdinalEncoder(
    categories=ord_categories,  # optional
    handle_unknown='use_encoded_value',
    unknown_value=-1
)
ordinal_encoder.fit(X_train[cat_ord_features])

In [11]:
#Tranform Train Data set with ordinal Encoder
X_train_ord = ordinal_encoder.transform(X_train[cat_ord_features])

In [12]:
# Create DataFrames for Train data
df_train_cont = pd.DataFrame(X_train_cont, index=X_train.index, columns=cont_features)
df_train_ohe  = pd.DataFrame(X_train_ohe, index=X_train.index, columns=ohe_cols)
df_train_ord  = pd.DataFrame(X_train_ord, index=X_train.index, columns=cat_ord_features)


In [13]:
# Concatenate horizontally (column order is consistent)
X_train_final_df = pd.concat([df_train_cont, df_train_ohe, df_train_ord], axis=1)


# Convert to numpy arrays to feed into sklearn models (or keep DataFrame)
X_train_final = X_train_final_df.values


#Train Linear Regression
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train_final, y_train)

joblib.dump(linear_regression_model, "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/linear_regression_model.joblib", compress=3)
joblib.dump(one_hot_encoder, "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/one_hot_encoder.joblib", compress=3)
joblib.dump(ordinal_encoder, "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/ordinal_encoder.joblib", compress=3)
joblib.dump(standard_scaler, "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/standard_scaler.joblib", compress=3)

print("Model saved successfully!")

Model saved successfully!


## Model Evaluation 

In [14]:
# Transform TEST data using the fitted transformers
df_test_cont  = pd.DataFrame(X_test_cont, index=X_test.index, columns=cont_features)

#transform Ordinal Encoder for test
X_test_ord  = ordinal_encoder.transform(X_test[cat_ord_features])
#transform One Hot Encoder for test

# Create DataFrames for TEST data
df_test_ohe   = pd.DataFrame(X_test_ohe, index=X_test.index, columns=ohe_cols)
df_test_ord   = pd.DataFrame(X_test_ord, index=X_test.index, columns=cat_ord_features)
# Concatenate TEST processed features
X_test_final_df  = pd.concat([df_test_cont,  df_test_ohe,  df_test_ord],  axis=1)


X_test_final  = X_test_final_df.values

In [15]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)


In [16]:
y_pred = linear_regression_model.predict(X_test_final)
rmsle = compute_rmsle(y_test.values, y_pred)
print("RMSLE:", rmsle)

RMSLE: 0.2


# Model inference


In [17]:
test_file_path = "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/data/house-prices-advanced-regression-techniques/test.csv"
inference_df  = pd.read_csv(test_file_path)
print(inference_df.head(10))
print("Full test dataset shape is {}".format(inference_df.shape))

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   
5  1466          60       RL         75.0    10000   Pave   NaN      IR1   
6  1467          20       RL          NaN     7980   Pave   NaN      IR1   
7  1468          60       RL         63.0     8402   Pave   NaN      IR1   
8  1469          20       RL         85.0    10176   Pave   NaN      Reg   
9  1470          20       RL         70.0     8400   Pave   NaN      Reg   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1    

In [18]:
# Make sure the columns match the training features
X_infer = inference_df[cont_features + cat_nom_features + cat_ord_features].copy()

In [19]:
# Preprocessing & Feature Engineering (Use previously fitted transformers)
standard_scaler_load =joblib.load('/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/standard_scaler.joblib')
# Continuous features
X_infer_cont = standard_scaler_load.transform(X_infer[cont_features])
df_infer_cont = pd.DataFrame(X_infer_cont, index=X_infer.index, columns=cont_features)

In [20]:
one_hot_encoder_load =joblib.load('/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/one_hot_encoder.joblib')
# Nominal features
X_infer_ohe = one_hot_encoder_load.transform(X_infer[cat_nom_features])
df_infer_ohe = pd.DataFrame(X_infer_ohe, index=X_infer.index, columns=one_hot_encoder_load.get_feature_names_out(cat_nom_features))

In [21]:
ordinal_encoder_load =joblib.load('/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/ordinal_encoder.joblib')
# Ordinal features
X_infer_ord = ordinal_encoder_load.transform(X_infer[cat_ord_features])
df_infer_ord = pd.DataFrame(X_infer_ord, index=X_infer.index, columns=cat_ord_features)

In [22]:
# Combine preprocessed inference features
X_infer_final_df = pd.concat([df_infer_cont, df_infer_ohe, df_infer_ord], axis=1)


In [23]:
linear_regression_model_loaded=joblib.load('/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/models/linear_regression_model.joblib')
# Predict House Prices
predictions = linear_regression_model_loaded.predict(X_infer_final_df)

#  Create a submission-like output (optional)
results_df = pd.DataFrame({
    "Id": inference_df.index,
    "Predicted_SalePrice": predictions
})

# Display first few predictions
print("Inference Completed! Sample Predictions:")
print(results_df.head())


Inference Completed! Sample Predictions:
   Id  Predicted_SalePrice
0   0        113507.794483
1   1        167163.837046
2   2        176695.421315
3   3        201372.004412
4   4        252136.525235


