#Prepare Data

In [5]:
#Import libraries
 
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split

!pip install category_encoders 

from category_encoders import OneHotEncoder

from ipywidgets import Dropdown, IntSlider, interact

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


To take our Dataset to the state it was in the previous repository by defining a function termed "wrangle" which will process the data when called;

In [6]:

def wrangle(filepath):
    #import Data

    df = pd.read_csv(filepath)

    #Subset Data:Detached Duplex in Lagos
    mask_state = df["state"] == "Lagos"
    mask_title = df["title"] == "Detached Duplex"
    df = df[mask_state & mask_title]

    #Removing outliers in price distribution
    high = df["price"].quantile(.9)
    mask_price = df["price"] < high
    df = df[mask_price]
    
    #Drop columns with high and low cardinality categorical values 
    df.drop(columns=["title", "state"], inplace=True)

    #Drop columns with multicollinearities
    df.drop(columns = ["bathrooms", "toilets"], inplace =True)
    
     
    return df

To import and process the dataset by calling the already defined wrangle function

In [7]:
df = wrangle ("/content/drive/MyDrive/Project /nigeria_houses_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10587 entries, 2 to 24325
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       10587 non-null  float64
 1   parking_space  10587 non-null  float64
 2   town           10587 non-null  object 
 3   price          10587 non-null  float64
dtypes: float64(3), object(1)
memory usage: 413.6+ KB


#Split Data

In [8]:
#split: feature matrix (X) and target vector (y) 

target = "price"

X = df.drop(columns= target)
y = df[target]

In [9]:
#Train—Test Split 
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=.2)


#Build Model

Baseline

In [10]:
y_mean = y_train.mean()
y_pred_baseline = [y_mean]*len(y_train)

print (f"Mean Det_Dup Price : {round(y_mean, 2)}")
print ("Baseline MAE: ", mean_absolute_error(y_train, y_pred_baseline).round(2))

Mean Det_Dup Price : 143483388.83
Baseline MAE:  67356893.86


Observations: 
* The information above is telling us that if we always predicted that a Detached Duplex price is #143,483,388.83 our prediction will be off by an average of #67,356,893.86 .

* Also, our model needs to have mean absolute error below #67,356,893.86 in order to be useful

Iterate :

In [11]:
model = make_pipeline (
 OneHotEncoder(),
 Ridge(),
)

model.fit(X_train, y_train)

Pipeline(steps=[('onehotencoder', OneHotEncoder(cols=['town'])),
                ('ridge', Ridge())])

Evaluate:

In [12]:
y_train_pred = model.predict(X_train)


print ("Training MAE: ", mean_absolute_error(y_train, y_train_pred).round(2))

Training MAE:  54600537.99


Wow! Our model beat the baseline by over #13,000,000 ! That's a good indicator that it will be helpful in predicting Detached Duplex prices.

Now, the real test is how the model performs on our test set (y_test)...

In [13]:
y_test_pred = model.predict(X_test)

print ( "Test MAE: ", mean_absolute_error(y_test, y_test_pred). round (2))

Test MAE:  53463885.52


Very impressive the Test MAE is very close to the the training MAE. 


Thus, our Linear Regression model predicting the price of Detached Duplex in Lagos will generalise well


#Deploy Model

To create a function that will make a prediction of a Detached Duplex in Lagos, when called by taking 3 arguments — bedrooms, parking_space, and town .

In [22]:
def make_prediction ( bedrooms, parking_space, town):
    data = { "bedrooms" : bedrooms,
             "parking_space": parking_space,
             "town" : town
            }
    df= pd.DataFrame(data, index= [0])
    prediction= model.predict(df).round(2)[0]
 
    return f"Predicted apartment price: #{prediction}"

Let's see our function in action !

In [23]:
print (make_prediction(5, 4, "Ikoyi"))
print(make_prediction(4, 4, "Ajah"))
print (make_prediction(4, 4, "Ikeja"))
make_prediction(5, 4, "Lekki")

Predicted apartment price: #306701866.22
Predicted apartment price: #61104387.11
Predicted apartment price: #126020005.56


'Predicted apartment price: #157129779.34'

Now, to create an Interactive dashboard, where a user can supply values and receive a prediction.

In [25]:
interact(
    make_prediction,
    bedrooms= IntSlider(
              min= X_train["bedrooms"].min(),
              max= X_train["bedrooms"].max(),
              value= X_train["bedrooms"].mean(),
    ),
    parking_space= IntSlider (
                   min= X_train["parking_space"].min(),
                   max= X_train["parking_space"].max(),
                   value= X_train["parking_space"].mean(),
    ),
    town= Dropdown(options=sorted(X_train["town"].unique())),
);

interactive(children=(IntSlider(value=4, description='bedrooms', max=9, min=1), IntSlider(value=4, description…