# Model to predict house prices in Lagos

In [2]:
import pandas as pd 
import numpy as np

In [70]:
# Load the property listings from the CSV file (path is relative to the working directory).
prop = pd.read_csv(filepath_or_buffer='Lagos_property.csv')

In [71]:
# Preview the first five rows of the raw listings.
prop.head(n=5)

Unnamed: 0,title,price,location,bed,bath,toilet
0,1 BEDROOM MINI FLAT MINI FLAT FLAT / APARTMENT...,200000,meiran abule_egba abule_egba lagos,1,1,1
1,"OFFICE SPACE COMMERCIAL PROPERTY FOR RENT 1, S...",20000000,"1, sylvia crescent, anthony village, ilupeju...",0,0,1
2,2 BEDROOM BLOCKS OF FLATS HOUSE FOR RENT OFF I...,300000,off ikola road command alagbado abule_egba l...,2,2,3
3,2 BEDROOM FLAT / APARTMENT FOR RENT OLD IKOYI ...,8000000,old ikoyi ikoyi lagos,2,2,3
4,4 BEDROOM OFFICE SPACE COMMERCIAL PROPERTY FOR...,3800000,toyin street ikeja lagos,4,4,4


In [72]:
# (number of rows, number of columns) of the loaded frame.
prop.shape

(13520, 6)

In [73]:
# Count the missing values in each column.
prop.isnull().sum()

title       0
price       0
location    0
bed         0
bath        0
toilet      0
dtype: int64

### Extract the second-to-last word of the location column (the area name, e.g. `ikoyi` from `old ikoyi ikoyi lagos`)

In [76]:
# Keep only the second-to-last whitespace-separated word of each location.
# The column is overwritten in place: values with fewer than two words become
# NaN, so re-running this cell on the already-reduced column would blank it out.
prop['location'] = prop['location'].str.split().str.get(-2)


In [77]:
# Check the reduced location values on the first five rows.
prop.head(n=5)

Unnamed: 0,title,price,location,bed,bath,toilet
0,1 BEDROOM MINI FLAT MINI FLAT FLAT / APARTMENT...,200000,abule_egba,1,1,1
1,"OFFICE SPACE COMMERCIAL PROPERTY FOR RENT 1, S...",20000000,ilupeju,0,0,1
2,2 BEDROOM BLOCKS OF FLATS HOUSE FOR RENT OFF I...,300000,abule_egba,2,2,3
3,2 BEDROOM FLAT / APARTMENT FOR RENT OLD IKOYI ...,8000000,ikoyi,2,2,3
4,4 BEDROOM OFFICE SPACE COMMERCIAL PROPERTY FOR...,3800000,ikeja,4,4,4


### Select columns for prediction

In [80]:
# Keep the target (price) and the candidate feature columns; the title column is dropped.
prop = prop[['price', 'location', 'bed', 'bath', 'toilet']]

In [81]:
# Preview the frame after the column selection.
prop.head(n=5)

Unnamed: 0,price,location,bed,bath,toilet
0,200000,abule_egba,1,1,1
1,20000000,ilupeju,0,0,1
2,300000,abule_egba,2,2,3
3,8000000,ikoyi,2,2,3
4,3800000,ikeja,4,4,4


### Define x and y.
### x holds every column we want to use for prediction. Note that the model only accepts numerical variables, so the text column `location` is left out for now
### y is the column we want to predict (`price`)

In [166]:
# Numeric feature columns for the baseline model.
x = prop[['bed', 'bath', 'toilet']]
# Target column.
y = prop['price']

In [91]:
# (rows, columns) of the feature frame.
# NOTE(review): the saved output of this cell reports a single column although
# x is built from three columns (bed, bath, toilet); the output looks stale,
# re-run the notebook top to bottom to confirm.
x.shape


(13520, 1)

In [92]:
# Length of the target Series; it should match the number of rows in x.
y.shape

(13520,)

### Import the logistic regression model
### Create an instance of it
### Import `cross_val_score`
### Pass the model instance to `cross_val_score` and take the mean of the per-fold accuracy scores
### Note: logistic regression is a classifier, so every distinct price is treated as its own class and accuracy only counts exact price matches

In [93]:
from sklearn.linear_model import LogisticRegression

In [94]:
# Logistic-regression estimator using the L-BFGS solver.
# NOTE(review): LogisticRegression is a classifier; with `price` as the target,
# each distinct price value is treated as a class label. Confirm this is
# intended rather than a regression model.
logreg= LogisticRegression(solver='lbfgs')

In [95]:
from sklearn.model_selection import cross_val_score

In [167]:
# Baseline: 5-fold cross-validation on the numeric features only,
# reporting the mean accuracy across the folds.
cross_val_score(estimator=logreg, X=x, y=y, cv=5, scoring='accuracy').mean()

0.65

## Transform the location column into numeric variables with a one-hot encoder
### Import `OneHotEncoder`
### Fit and transform the location column
### Create X, which holds every column except the price column

In [99]:
# Reminder of the current columns before encoding the location.
prop.head(n=5)

Unnamed: 0,price,location,bed,bath,toilet
0,200000,abule_egba,1,1,1
1,20000000,ilupeju,0,0,1
2,300000,abule_egba,2,2,3
3,8000000,ikoyi,2,2,3
4,3800000,ikeja,4,4,4


In [169]:
from sklearn.preprocessing import OneHotEncoder

# Request a dense array so the encoded result can be displayed directly.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [170]:
# Learn the location categories and show the one-hot encoded array.
# A one-column DataFrame (not a Series) is passed because the encoder expects 2-D input.
ohe.fit_transform(prop.loc[:, ['location']])

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]])

In [171]:
# Categories learned by the fitted encoder (one array per encoded column);
# their order matches the columns of the one-hot array.
ohe.categories_

[array(['abule_egba', 'ikeja', 'ikoyi', 'ilupeju', 'island', 'lekki'],
       dtype=object)]

In [149]:
# Feature frame: every column except the target.
X = prop.drop(columns='price')

In [150]:
# Preview the feature frame (location is still text at this point).
X.head(n=5)

Unnamed: 0,location,bed,bath,toilet
0,abule_egba,1,1,1
1,ilupeju,0,0,1
2,abule_egba,2,2,3
3,ikoyi,2,2,3
4,ikeja,4,4,4


### Import the column transformer helper
### Pass the one-hot encoder to it, then fit and transform X with the column transformer

In [151]:
from sklearn.compose import make_column_transformer

In [154]:
# One-hot encode `location`; every other column is passed through unchanged.
column_trans = make_column_transformer(
    (OneHotEncoder(), ['location']),
    remainder='passthrough',
)

In [153]:
# Fit the column transformer on X and show the result: the one-hot location
# columns come first, followed by the passed-through remaining columns.
column_trans.fit_transform(X)

array([[1., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 2., 2., 3.],
       ...,
       [0., 0., 0., ..., 3., 3., 4.],
       [0., 0., 0., ..., 3., 3., 4.],
       [0., 0., 0., ..., 4., 4., 5.]])

### Make a pipeline from the column transformer and the logistic regression model
### Draw a sample from the training dataset, fit the pipeline on the full dataset, and predict the outcome for the sample

In [155]:
from sklearn.pipeline import make_pipeline

In [156]:
# Chain preprocessing and model so that both are fitted together on whatever
# data the pipeline is given.
pipe= make_pipeline(column_trans, logreg)

In [161]:
# 5-fold cross-validation of the full pipeline (encoded location + numeric
# features), reporting the mean accuracy across the folds.
cross_val_score(estimator=pipe, X=X, y=y, scoring='accuracy', cv=5).mean()

0.95

In [163]:
# Draw five rows from the feature frame; the fixed seed makes the draw repeatable.
X_new = X.sample(n=5, random_state=99)
X_new

Unnamed: 0,location,bed,bath,toilet
13318,lekki,3,3,4
9219,lekki,4,4,5
5741,ilupeju,0,0,1
8903,ikoyi,2,2,3
2088,ikoyi,3,0,0


In [164]:
# Fit the pipeline on the full dataset (all rows of X and y).
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['location'])])),
                ('logisticregression', LogisticRegression())])

In [165]:
# Predict a price for each of the sampled rows. The rows were drawn from the
# data the pipeline was fitted on, so this is a demonstration, not an
# out-of-sample evaluation.
pipe.predict(X_new)

array([ 4000000,  4000000, 20000000,  8000000, 16000000], dtype=int64)