# House Modeling Project

## Data setup (loading, train/test split, etc.)

In [8]:
import pandas as pd

In [9]:
import os

# Location of the training CSV. The default is the original absolute path on
# the author's machine; set the HOUSE_PRICES_TRAIN_CSV environment variable to
# point at a different copy of train.csv so the notebook can run elsewhere
# without editing this cell.
DEFAULT_TRAIN_FILE_PATH = "/Users/ebotfabien/Desktop/school/hosuing_pipelinw/dsp-fabienmbi-ebot/data/house-prices-advanced-regression-techniques/train.csv"
train_file_path = os.environ.get("HOUSE_PRICES_TRAIN_CSV", DEFAULT_TRAIN_FILE_PATH)

# Read the whole training table into memory and report its (rows, columns) shape.
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (1460, 81)


In [10]:
import numpy as np

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly partition ``dataset`` into two disjoint subsets.

  Every row is assigned independently: one uniform number in [0, 1) is drawn
  per row and the row goes to the second subset when that number is below
  ``test_ratio``. The subset sizes are therefore only approximately
  proportional to ``test_ratio``.

  Args:
    dataset: Object with a length that supports boolean-mask indexing
      (for example a pandas DataFrame).
    test_ratio: Probability for a row to land in the second subset.
    seed: Optional seed for a dedicated NumPy generator, making the split
      reproducible. With ``None`` (the default) the draw comes from NumPy's
      global random state, exactly as before this parameter existed.

  Returns:
    A ``(first, second)`` tuple: the rows not selected by the mask, then the
    rows selected by it.
  """
  if seed is None:
    # Legacy path: honours any np.random.seed(...) set by the caller.
    draws = np.random.rand(len(dataset))
  else:
    # Isolated generator so the result does not depend on global state.
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Partition the loaded table with the default ratio and report both subset sizes.
split_summary = "{} examples in training, {} examples in testing."
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
n_train, n_valid = len(train_ds_pd), len(valid_ds_pd)
print(split_summary.format(n_train, n_valid))

1022 examples in training, 438 examples in testing.


# Feature selection

## Continuous features
- `GrLivArea`, `YearBuilt`

Reason: they capture size and age effects.
## Categorical features
- `Neighborhood`, `KitchenQual`

Reason: they capture location and quality effects.

In [11]:
from sklearn.preprocessing import StandardScaler

# Continuous predictors; the preprocessing step standardises these columns.
cont_features = [
    "GrLivArea",
    "YearBuilt",
]


In [12]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Categorical predictors, kept in two groups because they are encoded differently:
# nominal columns are one-hot encoded, ordinal columns are mapped to ordered ranks.
cat_nom_features = ["Neighborhood"]
cat_ord_features = ["KitchenQual"]


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Column-wise preprocessing: each feature group is routed to its own transformer.
# The ordinal encoder receives the quality levels explicitly, in this order.
kitchen_quality_levels = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

continuous_step = ('cont', StandardScaler(), cont_features)
nominal_step = ('cat_nom', OneHotEncoder(handle_unknown='ignore'), cat_nom_features)
# NOTE(review): unlike the one-hot encoder, no handle_unknown is set here, so
# values outside kitchen_quality_levels are presumably rejected at transform
# time -- confirm this is the intended behaviour for new data.
ordinal_step = ('cat_ord', OrdinalEncoder(categories=[kitchen_quality_levels]), cat_ord_features)

preprocessor = ColumnTransformer(
    transformers=[continuous_step, nominal_step, ordinal_step]
)

# Chain the preprocessing and the linear regressor into a single estimator,
# so fitting and predicting always apply the same transformations.
pipeline_steps = [
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Build the feature matrix from the selected columns and take SalePrice as target.
selected_columns = cont_features + cat_nom_features + cat_ord_features
X = dataset_df[selected_columns]
y = dataset_df['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessing and regression together on the training portion only.
model.fit(X_train, y_train)



# Model Training

In [14]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
import pickle

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the rounded root mean squared logarithmic error (RMSLE).

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    root_msle = np.sqrt(msle)
    return round(root_msle, precision)


# Persist the fitted pipeline as a pickle file in the current working directory.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
    
# Score the held-out rows with the fitted pipeline.
y_pred = model.predict(X_test)

# Compare the predictions with the true prices and report the rounded RMSLE.
y_true = y_test.values
rmsle = compute_rmsle(y_true, y_pred)
print("RMSLE:", rmsle)


RMSLE: 0.2
