In [178]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression


In [179]:
data0 = pd.read_csv("apartments.csv")
data0.shape

(2520, 7)

In [180]:
data0["location"].unique()


array(['Riverside Dr Nairobi, Riverside, Nairobi', 'Kileleshwa, Nairobi',
       'Links Rd Mombasa, Nyali, Mombasa',
       'Near Valley Arcade, Lavington, Nairobi',
       'Thika Rd Nairobi, Kahawa Wendani, Nairobi', 'Kilimani, Nairobi',
       nan, 'Nyali, Mombasa', 'Muthaiga, Nairobi', 'Westlands, Nairobi',
       'Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu', 'Shanzu, Mombasa',
       'Westlands downtown, Westlands, Nairobi',
       'Kileleshwa Nairobi, Kileleshwa, Nairobi',
       'Grevillea Grove Spring Valley, Spring Valley, Nairobi',
       'Vihiga road, Kileleshwa, Nairobi',
       'Off Othaya road, Lavington, Nairobi',
       'Jabavu court, Kilimani, Nairobi'], dtype=object)

In [181]:
data0.columns

Index(['Unnamed: 0', 'title', 'location', 'bedrooms', 'bathrooms', 'price',
       'rate'],
      dtype='object')

In [182]:
data1 = data0.drop(["title", "rate", "Unnamed: 0"], axis = "columns")
data1.columns

Index(['location', 'bedrooms', 'bathrooms', 'price'], dtype='object')

In [183]:
data1["location"].value_counts()

Kileleshwa, Nairobi                                      257
Westlands, Nairobi                                       252
Kilimani, Nairobi                                        251
Shanzu, Mombasa                                          129
Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu              128
Westlands downtown, Westlands, Nairobi                   127
Links Rd Mombasa, Nyali, Mombasa                         126
Muthaiga, Nairobi                                        126
Grevillea Grove Spring Valley, Spring Valley, Nairobi    125
Kileleshwa Nairobi, Kileleshwa, Nairobi                  125
Riverside Dr Nairobi, Riverside, Nairobi                 125
Nyali, Mombasa                                           125
Thika Rd Nairobi, Kahawa Wendani, Nairobi                125
Vihiga road, Kileleshwa, Nairobi                          56
Near Valley Arcade, Lavington, Nairobi                    35
Off Othaya road, Lavington, Nairobi                       20
Jabavu court, Kilimani, 

In [184]:
data1.isnull().sum()

location     376
bedrooms       0
bathrooms      0
price          0
dtype: int64

In [185]:
data1.shape

(2520, 4)

In [186]:
data2 = data1.dropna()
data2.shape

(2144, 4)

In [187]:
data2.isnull().sum()

location     0
bedrooms     0
bathrooms    0
price        0
dtype: int64

In [188]:
data2.columns

Index(['location', 'bedrooms', 'bathrooms', 'price'], dtype='object')

Introducing the bedroom bathroom ratio


In [189]:
df = pd.DataFrame(data2)
df.columns


Index(['location', 'bedrooms', 'bathrooms', 'price'], dtype='object')

In [190]:
df["Bed/Bath"] = df["bedrooms"]/df["bathrooms"]
df.columns

Index(['location', 'bedrooms', 'bathrooms', 'price', 'Bed/Bath'], dtype='object')

A little data exploration to check on the possible outliers

In [191]:
max(df["bedrooms"])

4

In [192]:
min(df["bedrooms"])

1

In [193]:
max(df["bathrooms"])


5

In [194]:
min(df["bathrooms"])

1

It looks good, lets continue

In [195]:
df.to_csv("APartments1.csv")

In [196]:
df.shape

(2144, 5)

Some one hot encoding

In [197]:
dummies = pd.get_dummies(df.location)
dummies.head()

Unnamed: 0,"Grevillea Grove Spring Valley, Spring Valley, Nairobi","Jabavu court, Kilimani, Nairobi","Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu","Kileleshwa Nairobi, Kileleshwa, Nairobi","Kileleshwa, Nairobi","Kilimani, Nairobi","Links Rd Mombasa, Nyali, Mombasa","Muthaiga, Nairobi","Near Valley Arcade, Lavington, Nairobi","Nyali, Mombasa","Off Othaya road, Lavington, Nairobi","Riverside Dr Nairobi, Riverside, Nairobi","Shanzu, Mombasa","Thika Rd Nairobi, Kahawa Wendani, Nairobi","Vihiga road, Kileleshwa, Nairobi","Westlands downtown, Westlands, Nairobi","Westlands, Nairobi"
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [198]:
df1 = pd.concat([df, dummies], axis = "columns")
df1

Unnamed: 0,location,bedrooms,bathrooms,price,Bed/Bath,"Grevillea Grove Spring Valley, Spring Valley, Nairobi","Jabavu court, Kilimani, Nairobi","Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu","Kileleshwa Nairobi, Kileleshwa, Nairobi","Kileleshwa, Nairobi",...,"Muthaiga, Nairobi","Near Valley Arcade, Lavington, Nairobi","Nyali, Mombasa","Off Othaya road, Lavington, Nairobi","Riverside Dr Nairobi, Riverside, Nairobi","Shanzu, Mombasa","Thika Rd Nairobi, Kahawa Wendani, Nairobi","Vihiga road, Kileleshwa, Nairobi","Westlands downtown, Westlands, Nairobi","Westlands, Nairobi"
0,"Riverside Dr Nairobi, Riverside, Nairobi",3,3,200000,1.000000,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,"Kileleshwa, Nairobi",3,4,70000,0.750000,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"Links Rd Mombasa, Nyali, Mombasa",3,2,38000,1.500000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Near Valley Arcade, Lavington, Nairobi",3,3,80000,1.000000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,"Kileleshwa, Nairobi",1,1,110000,1.000000,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514,"Kileleshwa Nairobi, Kileleshwa, Nairobi",2,2,55000,1.000000,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2515,"Kilimani, Nairobi",3,3,75000,1.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2517,"Grevillea Grove Spring Valley, Spring Valley, ...",2,3,65000,0.666667,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2518,"Muthaiga, Nairobi",4,5,300000,0.800000,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [199]:
df2 = df1.drop("location", axis = "columns")
df2.head()

Unnamed: 0,bedrooms,bathrooms,price,Bed/Bath,"Grevillea Grove Spring Valley, Spring Valley, Nairobi","Jabavu court, Kilimani, Nairobi","Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu","Kileleshwa Nairobi, Kileleshwa, Nairobi","Kileleshwa, Nairobi","Kilimani, Nairobi",...,"Muthaiga, Nairobi","Near Valley Arcade, Lavington, Nairobi","Nyali, Mombasa","Off Othaya road, Lavington, Nairobi","Riverside Dr Nairobi, Riverside, Nairobi","Shanzu, Mombasa","Thika Rd Nairobi, Kahawa Wendani, Nairobi","Vihiga road, Kileleshwa, Nairobi","Westlands downtown, Westlands, Nairobi","Westlands, Nairobi"
0,3,3,200000,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,3,4,70000,0.75,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2,38000,1.5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,80000,1.0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,1,1,110000,1.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Now building the model

In [200]:
X = df2.drop(["price"], axis = "columns")
X

Unnamed: 0,bedrooms,bathrooms,Bed/Bath,"Grevillea Grove Spring Valley, Spring Valley, Nairobi","Jabavu court, Kilimani, Nairobi","Kikuyu Town Bus park Kikuyu, Kikuyu, Kikuyu","Kileleshwa Nairobi, Kileleshwa, Nairobi","Kileleshwa, Nairobi","Kilimani, Nairobi","Links Rd Mombasa, Nyali, Mombasa","Muthaiga, Nairobi","Near Valley Arcade, Lavington, Nairobi","Nyali, Mombasa","Off Othaya road, Lavington, Nairobi","Riverside Dr Nairobi, Riverside, Nairobi","Shanzu, Mombasa","Thika Rd Nairobi, Kahawa Wendani, Nairobi","Vihiga road, Kileleshwa, Nairobi","Westlands downtown, Westlands, Nairobi","Westlands, Nairobi"
0,3,3,1.000000,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,3,4,0.750000,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2,1.500000,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,3,1.000000,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,1,1,1.000000,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2514,2,2,1.000000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2515,3,3,1.000000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2517,2,3,0.666667,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2518,4,5,0.800000,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [201]:
X.shape

(2144, 20)

In [202]:
y = df2.price
y

0       200000
1        70000
2        38000
3        80000
4       110000
         ...  
2514     55000
2515     75000
2517     65000
2518    300000
2519    100000
Name: price, Length: 2144, dtype: int64

In [203]:
 X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [204]:
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.9999498058868628

In [205]:
cv = ShuffleSplit(n_splits=4, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.99994782, 0.98494799, 0.99992467, 0.99553832])

building a prediction model

In [206]:
def predict_price(location,bathrooms,bedrooms):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = bathrooms
    x[1] = bedrooms
    
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [207]:
predict_price('Kilimani, Nairobi', 1, 2)


-13010.204081632837

In [208]:
# TODO: check why the price returns a negative value