# House Prediction Regression pipeline with ZenML

ZenML Installation:

In [8]:
%pip install zenml
!zenml integration install sklearn -y
%pip install pyparsing==2.4.2  # required for Colab

import IPython

# automatically restart kernel
IPython.Application.instance().kernel.do_shutdown(restart=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
INFO:numexpr.utils:NumExpr defaulting to 2 threads.
[2K[32m⠸[0m Installing integrations...
[1A[2KLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


{'status': 'ok', 'restart': True}

ZenML Setup:

In [1]:
!rm -rf .zen

In [2]:
!zenml init

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
[?25l[32m⠋[0m Initializing ZenML repository at /content.
[2K[1A[2K[32m⠙[0m Initializing ZenML repository at /content.
[2K[1A[2K[32m⠹[0m Initializing ZenML repository at /content.
[2K[1A[2K[2;36mZenML repository initialized at [0m[2;35m/[0m[2;95mcontent.[0m
[2;32m⠹[0m[2;36m [0m[2;36mInitializing ZenML repository at /content.[0m
[2K[1A[2K[32m⠹[0m Initializing ZenML repository at /content.

[1A[2K[1A[2K[2;36mThe local active profile was initialized to [0m[2;32m'default'[0m[2;36m and the local active stack[0m
[2;36mto [0m[2;32m'default'[0m[2;36m. This local configuration will only take effect when you're running[0m
[2;36mZenML from the initialized repository root, or from a subdirectory. For more [0m
[2;36minformation on profile and stack configuration, please visit [0m
[2;4;94mhttps://docs.zenml.io/developer-guide/stacks-profiles-repositories.[0m


In [None]:
!zenml profile create zenbytes

In [3]:
!zenml profile set zenbytes

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
[2;36mRunning with active profile: [0m[2;32m'default'[0m[2;36m [0m[1;2;36m([0m[2;36mlocal[0m[1;2;36m)[0m
[?25l[2;36mActive profile changed to: [0m[2;32m'zenbytes'[0m
[2K[32m⠋[0m Setting the active profile to 'zenbytes'...
[1A[2K

In [4]:
!zenml stack set default

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
[2;36mRunning with active profile: [0m[2;32m'zenbytes'[0m[2;36m [0m[1;2;36m([0m[2;36mlocal[0m[1;2;36m)[0m
[2K[2;36mActive stack set to: [0m[2;32m'default'[0m
[2K[32m⠹[0m Setting the active stack to 'default'...
[1A[2K

In [5]:
!zenml stack get

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
[2;36mRunning with active profile: [0m[2;32m'zenbytes'[0m[2;36m [0m[1;2;36m([0m[2;36mlocal[0m[1;2;36m)[0m
[?25l[2;36mThe active stack is: [0m[2;32m'default'[0m
[2K[32m⠋[0m Getting the active stack...
[1A[2K

Defining house prediction using linear regression pipeline with ZenML:

In [26]:
from zenml.steps import step, Output
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model._base import LinearRegression
%matplotlib inline

In [7]:
@step
def data_loading() -> Output(
    dataset=pd.core.frame.DataFrame,
    ):
    """Load the data"""
    dataset = pd.read_csv("housePrice.csv",header= 0 , sep=',')
    return dataset

In [8]:
@step
def data_preprocessing(dataset: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Preparing the data"""
    dataset = dataset.dropna(subset =["Address"])
    dataset['Area'] = dataset['Area'].str.replace(',','')
    dataset['Area'] = dataset['Area'].apply(lambda x : float(x[:]))
    dataset = dataset[dataset['Area'] < 1000]
    dataset = dataset[dataset['Parking'].notnull()].copy()
    dataset['Parking'] = dataset['Parking'].astype(int)
    dataset = dataset[dataset['Warehouse'].notnull()].copy()
    dataset['Warehouse'] = dataset['Warehouse'].astype(int)
    dataset = dataset[dataset['Elevator'].notnull()].copy()
    dataset['Elevator'] = dataset['Elevator'].astype(int)
    encoder =LabelEncoder()
    dataset['Address'] = encoder.fit_transform(dataset.Address)
    data_for_set_model =dataset.copy()
    mean =(data_for_set_model.groupby('Address').mean()['Area'])
    mean_price =data_for_set_model.groupby('Address').mean()['Price']
    data_for_set_model['The_value_of_each_address'] =(data_for_set_model['Address'].map(mean_price))/(data_for_set_model['Address'].map(mean))
    data_for_set_model["Room_pre_Area"] =data_for_set_model['Room']/data_for_set_model["Area"]
    return data_for_set_model

In [18]:
@step
def split_the_data(data_for_set_model: pd.core.frame.DataFrame) -> Output(
    train_set=pd.core.frame.DataFrame,
    test_set=pd.core.frame.DataFrame,
    ):
    """Split the data into train and test"""
    data=data_for_set_model.copy()
    train_set , test_set =train_test_split(data , test_size =0.2 , random_state =42)
    return train_set,test_set

In [27]:
@step
def model_training(train_set:pd.core.frame.DataFrame,)-> LinearRegression:
    """Train the Linear regression model"""
    regr = linear_model.LinearRegression()
    x = np.asanyarray(train_set[['The_value_of_each_address','Room','Parking', 'Area']])
    y = np.asanyarray(train_set[['Price']])
    regr.fit (x, y)
    print ('Coefficients: ', regr.coef_)
    print('Intercept: ', regr.intercept_)
    return regr

In [28]:
@step
def model_inference(test_set:pd.core.frame.DataFrame, regr: LinearRegression) -> float:
    """Predict the model"""
    y_hat =regr.predict(test_set[['The_value_of_each_address','Room','Parking','Area']])
    x =np.asanyarray(test_set[['The_value_of_each_address','Room','Parking','Area']])
    y =np.asanyarray(test_set[['Price']])
    print("Residual sum of squares: %.2f" % np.mean((y_hat - y) ** 2))
    print('Variance score: %.2f' % regr.score(x, y))
    return regr.score(x, y)

In [29]:
from zenml.pipelines import pipeline

@pipeline
def house_prediction_pipeline(load, prepare, split, train, predict):
    """Links all the steps together in a pipeline"""
    dataset = load()
    data_for_set_model = prepare(dataset)
    train_set,test_set = split(data_for_set_model)
    model = train(train_set)
    score = predict(test_set, model)

Running zenML Pipeline:

In [30]:
house_prediction_instance = house_prediction_pipeline(
    load=data_loading(), prepare=data_preprocessing(), split=split_the_data(), train=model_training(), predict=model_inference()
)

In [31]:
house_prediction_instance.run()

[1;35mCreating run for pipeline: [0m[33mhouse_predictio_pipeline[1;35m[0m
[1;35mCache enabled for pipeline [0m[33mhouse_predictio_pipeline[1;35m[0m
[1;35mUsing stack [0m[33mdefault[1;35m to run pipeline [0m[33mhouse_predictio_pipeline[1;35m...[0m
[1;35mStep [0m[33mdata_loading[1;35m has started.[0m
[1;35mUsing cached version of [0m[33mdata_loading[1;35m.[0m
[1;35mStep [0m[33mdata_loading[1;35m has finished in 0.047s.[0m
[1;35mStep [0m[33mdata_preprocessing[1;35m has started.[0m
[1;35mUsing cached version of [0m[33mdata_preprocessing[1;35m.[0m
[1;35mStep [0m[33mdata_preprocessing[1;35m has finished in 0.042s.[0m
[1;35mStep [0m[33msplit_the_data[1;35m has started.[0m
[1;35mUsing cached version of [0m[33msplit_the_data[1;35m.[0m
[1;35mStep [0m[33msplit_the_data[1;35m has finished in 0.041s.[0m
[1;35mStep [0m[33mmodel_training[1;35m has started.[0m
Coefficients:  [[ 1.29351109e+02  5.27371860e+08 -1.41202432e+09  6.0416712

  f"X has feature names, but {self.__class__.__name__} was fitted without"
