In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import plotly.express as px

import mlflow
from mlflow.models import infer_signature

import json

In [4]:
# Load Dataset

# Read the training and test CSV files from the sibling data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Report (rows, columns) for each frame.
print(f"Train Shape: {train.shape}")
print(f"Test Shape: {test.shape}")

# Show the first rows of the training frame as the cell output.
train.head()

Train Shape: (1460, 81)
Test Shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Drop sparse columns and the row identifier.
#
# A column is discarded when more than 30% of its values are missing; the
# target column SalePrice is exempt from that rule whatever its null share.
null_percent = train.isnull().mean()
is_sparse_feature = (null_percent > 0.3) & (null_percent.index != 'SalePrice')
drop_cols = null_percent[is_sparse_feature].index

# Build the modelling frame in a single chain (no in-place mutation):
# first the sparse columns go, then the 'Id' column.
train_df = train.drop(columns=drop_cols).drop(columns=['Id'])

In [None]:
# Histogram of the target column with a marginal box plot.
histogram_options = dict(
    x='SalePrice',
    labels={'SalePrice': 'Sale Price'},
    title='Distribution of House Sale Price',
    marginal='box',
)
fig = px.histogram(train_df, **histogram_options)
fig.show()

In [None]:
# Separate the target (SalePrice) from the predictor columns.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

In [None]:
# Train/validation split: 20% of the rows are held out, and the fixed
# random_state keeps the split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Partition the training features by dtype; the two column lists are what
# the imputation and encoding transformers are configured with.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']

numeric_columns = X_train.select_dtypes(include=numeric_dtypes).columns
categorical_columns = X_train.select_dtypes(include=categorical_dtypes).columns

print(f'Numeric columns: {numeric_columns}')
print(f'Categorical columns: {categorical_columns}')

In [None]:
# Imputation stage: numeric columns are filled with the median, categorical
# columns with the most frequent value.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

impute_transformer = ColumnTransformer(
    transformers=[
        ('median_imputer', median_imputer, numeric_columns),
        ('mode_imputer', mode_imputer, categorical_columns),
    ],
    # Columns that are in neither list are not forwarded to the next step.
    remainder='drop',
    # Keep the plain column names (no "<transformer>__" prefix).
    verbose_feature_names_out=False,
)

In [None]:
# Encoding stage: one-hot encode the categorical columns as a dense output;
# categories not seen during fit are ignored rather than raising.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encode_transformer = ColumnTransformer(
    transformers=[
        ('categorical_one_hot', one_hot_encoder, categorical_columns),
    ],
    # All other columns are passed through untouched.
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [None]:
# Random forest regressor with 100 trees and a fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Chain imputation -> one-hot encoding -> random forest into one estimator.
pipeline = Pipeline(
    steps=[
        ('impute', impute_transformer),
        ('encode', encode_transformer),
        ('model', model),
    ]
)
# Request pandas output from the transformer steps. The encode step is
# configured with column names, so it needs named columns from the impute
# step. set_output returns the pipeline itself, so the name is rebound to
# the same object.
pipeline = pipeline.set_output(transform="pandas")

In [None]:
# Point MLflow at the tracking server and select the experiment that the
# following run is recorded under.
tracking_uri = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(uri=tracking_uri)
mlflow.set_experiment('Housing Price Prediction')

In [None]:
# Start an MLflow run: fit the pipeline, score it on the validation split,
# and log metrics, the diagnostic figure, and the fitted model.
with mlflow.start_run():
    pipeline.fit(X_train, y_train)

    # Predictions on the held-out validation rows.
    y_pred = pipeline.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    fig = px.scatter(
        x=y_val,
        y=y_pred,
        labels={
            'x': 'Actual House Sale Price',
            'y': 'Predicted House Sale Price'
        },
        title='Actual vs Predicted House Sale Price'
    )

    # Log metrics and figures
    mlflow.log_metric('RMSE', rmse)
    mlflow.log_metric('MAE', mae)
    mlflow.log_figure(fig, 'actual_vs_predicted.html')

    # Infer the model signature from the feature columns
    # and the model's predictions.
    signature = infer_signature(train[X_train.columns], y_pred)

    # Log the model.
    # The input example is a small sample of rows: passing the whole X_train
    # would store every training row alongside the model artifact.
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        signature=signature,
        input_example=X_train.head(5),
        registered_model_name="housing-price-regression-model"
    )
print("🌲 Random Forest RMSE:", rmse)
print("🌲 Random Forest MAE:", mae)

In [None]:
# Display the actual-vs-predicted scatter plot that the MLflow run cell
# assigned to `fig`.
fig.show()

In [None]:
# Predict sale prices for the rows loaded from test.csv.
# NOTE(review): `test` is passed unfiltered, so it still contains 'Id' and the
# sparse columns removed from train_df; this presumably relies on the
# pipeline's ColumnTransformer selecting its inputs by column name -- confirm.
test_submit = pipeline.predict(test)

In [None]:
# Show the test-set predictions as the cell output.
test_submit

In [18]:
# Serialize the first five test rows in pandas "split" orientation and wrap
# them under a "dataframe_split" key. The to_json/json.loads round trip
# turns the frame into plain JSON-compatible Python objects first.
# NOTE(review): presumably a request body for a served MLflow model -- confirm.
sample_split = json.loads(test.head(5).to_json(orient="split"))
json.dumps({"dataframe_split": sample_split})

'{"dataframe_split": {"columns": ["Id", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish", "GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "S