In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
houses = pd.read_csv('../data/nigerian_houses.csv')

In [3]:
data_train, data_test = train_test_split(houses, test_size=0.2, random_state=42)

In [4]:
features = data_train.drop(columns=['price'])

target = data_train['price']

In [5]:
def attr_adder(dataframe):
    
    dataframe['bathrooms_per_bedrooms'] = dataframe['bathrooms'] / dataframe['bedrooms']
    dataframe['bedrooms_per_bathrooms'] = dataframe['bedrooms'] / dataframe['bathrooms']
    dataframe['bedrooms_bathrooms'] = dataframe['bedrooms'] * dataframe['bathrooms']
    dataframe['bathrooms_toilets'] = dataframe['bathrooms'] * dataframe['toilets']
    
    return dataframe

In [6]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [7]:
features = attr_adder(features)

In [8]:
features.head()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,bathrooms_per_bedrooms,bedrooms_per_bathrooms,bedrooms_bathrooms,bathrooms_toilets
4058,4.0,4.0,5.0,,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
15290,4.0,4.0,5.0,4.0,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
19500,4.0,4.0,5.0,3.0,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
3461,1.0,1.0,1.0,,Detached Duplex,Ikeja,Lagos,1.0,1.0,1.0,1.0
977,4.0,4.0,5.0,,Terraced Duplexes,Guzape District,Abuja,1.0,1.0,16.0,20.0


In [9]:
def full_pipe(dataframe, transform=False):
    
    num_attr = list(dataframe.drop(columns=['title', 'town', 'state']))

    cat_attr = ['title', 'town', 'state']
    
    result = ColumnTransformer([
        ('num', num_pipeline, num_attr),
        ('cat', OrdinalEncoder(), cat_attr)
    ])
    
    if transform:
        return result.transform(dataframe)
    return result.fit_transform(dataframe)

In [10]:
prepared_data = full_pipe(features)

In [11]:
pd.DataFrame(features)

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,bathrooms_per_bedrooms,bedrooms_per_bathrooms,bedrooms_bathrooms,bathrooms_toilets
4058,4.0,4.0,5.0,,Detached Duplex,Lekki,Lagos,1.0,1.000000,16.0,20.0
15290,4.0,4.0,5.0,4.0,Detached Duplex,Lekki,Lagos,1.0,1.000000,16.0,20.0
19500,4.0,4.0,5.0,3.0,Detached Duplex,Lekki,Lagos,1.0,1.000000,16.0,20.0
3461,1.0,1.0,1.0,,Detached Duplex,Ikeja,Lagos,1.0,1.000000,1.0,1.0
977,4.0,4.0,5.0,,Terraced Duplexes,Guzape District,Abuja,1.0,1.000000,16.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...
21575,4.0,4.0,5.0,8.0,Detached Duplex,Isheri North,Lagos,1.0,1.000000,16.0,20.0
5390,5.0,6.0,7.0,1.0,Detached Duplex,Ajah,Lagos,1.2,0.833333,30.0,42.0
860,5.0,5.0,6.0,,Detached Duplex,Lekki,Lagos,1.0,1.000000,25.0,30.0
15795,5.0,6.0,6.0,,Detached Duplex,Katampe,Abuja,1.2,0.833333,30.0,36.0


In [12]:
gbr = GradientBoostingRegressor(random_state=42)

In [13]:
gbr.fit(prepared_data, target)

GradientBoostingRegressor(random_state=42)

In [14]:
some_data = full_pipe(features.iloc[:5])
some_labels = target.iloc[:5]


some_pred = gbr.predict(some_data)

print("Predictions: ", some_pred)
print("Labels: ", list(some_labels))
print("Error: ", float("{:.2f}".format(mean_squared_error(some_labels, some_pred, squared=False))))

Predictions:  [1.74827835e+08 1.74827835e+08 1.68805701e+08 1.35876679e+08
 1.74827835e+08]
Labels:  [75000000.0, 120000000.0, 70000000.0, 600000000.0, 95000000.0]
Error:  221141219.67


In [15]:
predictions = gbr.predict(prepared_data)

reg_mse = mean_squared_error(target, predictions)

reg_mse

1.5905801364385173e+20

In [16]:
scores = cross_val_score(gbr, prepared_data, target, cv=10, n_jobs=-1, scoring="neg_mean_squared_error")

results = np.sqrt(-scores)

In [17]:
def display_score(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation: ", float("{:.2f}".format(scores.std())))

In [18]:
display_score(results)

Scores: [5.57933808e+09 2.70696521e+09 2.44284100e+09 4.29478470e+10
 4.23758343e+09 7.21088170e+09 3.32163165e+09 4.42961728e+09
 5.47606240e+09 5.69862336e+09]
Mean: 8405139116.36854
Standard deviation:  11599777230.5


## Fine tuning the model

In [19]:
params = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 8],
    "n_estimators": [200, 350, 500],
    "min_samples_split": [2, 3, 5]
}

In [20]:
rand_model = RandomizedSearchCV(gbr, param_distributions=params, scoring="neg_mean_squared_error", cv=5, n_jobs=-1,
                             return_train_score=True)

In [None]:
_results = rand_model.fit(prepared_data, target)

In [None]:
pd.DataFrame(_results.cv_results_)

In [None]:
# n_estimator = 500, min_sample_split = 5, max_depth = 3, learning_rate = 0.1

_results.best_params_

In [None]:
best_model = _results.best_estimator_