In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

from xgboost import XGBRegressor

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
houses = pd.read_csv('../data/nigerian_houses.csv')

In [3]:
data_train, data_test = train_test_split(houses, test_size=0.2, random_state=42)

In [4]:
features = data_train.drop(columns=['price'])

target = data_train['price']

In [5]:
def attr_adder(dataframe):
    
    dataframe['bathrooms_per_bedrooms'] = dataframe['bathrooms'] / dataframe['bedrooms']
    dataframe['bedrooms_per_bathrooms'] = dataframe['bedrooms'] / dataframe['bathrooms']
    dataframe['bedrooms_bathrooms'] = dataframe['bedrooms'] * dataframe['bathrooms']
    dataframe['bathrooms_toilets'] = dataframe['bathrooms'] * dataframe['toilets']
    
    return dataframe

In [6]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [7]:
features = attr_adder(features)

In [8]:
features.head()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,bathrooms_per_bedrooms,bedrooms_per_bathrooms,bedrooms_bathrooms,bathrooms_toilets
4058,4.0,4.0,5.0,,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
15290,4.0,4.0,5.0,4.0,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
19500,4.0,4.0,5.0,3.0,Detached Duplex,Lekki,Lagos,1.0,1.0,16.0,20.0
3461,1.0,1.0,1.0,,Detached Duplex,Ikeja,Lagos,1.0,1.0,1.0,1.0
977,4.0,4.0,5.0,,Terraced Duplexes,Guzape District,Abuja,1.0,1.0,16.0,20.0


In [9]:
def full_pipe(dataframe, transform=False):
    
    num_attr = list(dataframe.drop(columns=['title', 'town', 'state']))

    cat_attr = ['title', 'town', 'state']
    
    result = ColumnTransformer([
        ('num', num_pipeline, num_attr),
        ('cat', OrdinalEncoder(), cat_attr)
    ])
    
    if transform:
        return result.transform(dataframe)
    return result.fit_transform(dataframe)

In [10]:
prepared_data = full_pipe(features)

In [12]:
pd.DataFrame(prepared_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.296297,-0.514351,-0.141636,-0.031299,-0.203960,0.013177,-0.498936,-0.464075,2.0,117.0,16.0
1,-0.296297,-0.514351,-0.141636,-0.031299,-0.203960,0.013177,-0.498936,-0.464075,2.0,117.0,16.0
2,-0.296297,-0.514351,-0.141636,-0.742841,-0.203960,0.013177,-0.498936,-0.464075,2.0,117.0,16.0
3,-2.956680,-3.111915,-3.425792,-0.031299,-0.203960,0.013177,-2.065625,-2.288518,2.0,74.0,16.0
4,-0.296297,-0.514351,-0.141636,-0.031299,-0.203960,0.013177,-0.498936,-0.464075,6.0,52.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
19455,-0.296297,-0.514351,-0.141636,2.814868,-0.203960,0.013177,-0.498936,-0.464075,2.0,87.0,16.0
19456,0.590497,1.217358,1.500443,-2.165924,0.695684,-0.407504,0.963307,1.648438,2.0,9.0,16.0
19457,0.590497,0.351503,0.679404,-0.031299,-0.203960,0.013177,0.441077,0.496158,2.0,117.0,16.0
19458,0.590497,1.217358,0.679404,-0.031299,0.695684,-0.407504,0.963307,1.072298,2.0,105.0,1.0


In [13]:
xgb = XGBRegressor(random_state=42)

In [14]:
xgb.fit(prepared_data, target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
some_data = full_pipe(features.iloc[:5])
some_labels = target.iloc[:5]


some_pred = xgb.predict(some_data)

print("Predictions: ", some_pred)
print("Labels: ", list(some_labels))
print("Error: ", float("{:.2f}".format(mean_squared_error(some_labels, some_pred, squared=False))))

Predictions:  [ 8.7815610e+08  1.2926344e+09  2.4178312e+08  7.6132040e+07
 -2.3899744e+07]
Labels:  [75000000.0, 120000000.0, 70000000.0, 600000000.0, 95000000.0]
Error:  683844325.05


In [16]:
predictions = xgb.predict(prepared_data)

reg_mse = mean_squared_error(target, predictions)

reg_mse

1.273369047898411e+20

In [17]:
scores = cross_val_score(xgb, prepared_data, target, cv=10, n_jobs=-1, scoring="neg_mean_squared_error")

results = np.sqrt(-scores)

In [18]:
def display_score(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation: ", float("{:.2f}".format(scores.std())))

In [19]:
display_score(results)

Scores: [5.67139894e+08 4.16410923e+09 7.27281673e+08 4.29388560e+10
 3.04658922e+09 2.04020636e+10 1.96481407e+10 3.78178268e+09
 1.37511432e+10 2.06050962e+10]
Mean: 12963220239.489145
Standard deviation:  12731952332.7


## Fine tunning model

In [20]:
params = {
    "learning_rate": [0.005, 0.01, 0.02, 0.1, 0.2],
    "max_depth": [2, 3, 5, 7, 8],
    "gamma": [0.05, 0.01, 0.1, 0.3, 0.5],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "n_estimators": [150, 200, 350, 500],
}

In [21]:
rand_model = RandomizedSearchCV(xgb, param_distributions=params, scoring="neg_mean_squared_error", cv=10, n_jobs=-1,
                             return_train_score=True)

In [22]:
_results = rand_model.fit(prepared_data, target)

In [24]:
# best params subsample=1.0, n_estimators = 350, max_depth=2 learning_rate=0.005 gamma = 0.3
_results.best_params_

{'subsample': 1.0,
 'n_estimators': 350,
 'max_depth': 2,
 'learning_rate': 0.005,
 'gamma': 0.3}

In [25]:
best_model_1 = _results.best_estimator_

### Fine tuning again

In [26]:
params_2 = {
    "learning_rate": [0.001, 0.003, 0.005, 0.009],
    "max_depth": [1, 2, 3, 8],
    "gamma": [0.1, 0.2, 0.3, 0.5, 0.7],
    "subsample": [0.7, 0.9, 1.0],
    "n_estimators": [250, 350, 400, 500],
}

In [27]:
rand_model_2 = RandomizedSearchCV(xgb, param_distributions=params_2, scoring="neg_mean_squared_error", cv=10, n_jobs=-1,
                             return_train_score=True)

In [28]:
result_1 = _results

In [29]:
result_2 = rand_model_2.fit(prepared_data, target)

In [32]:
# subsample = 0.9, n_estm = 500, max_depth = 1, learnin_rate = 0.001, gamma = 0.5

result_2.best_params_

{'subsample': 0.9,
 'n_estimators': 500,
 'max_depth': 1,
 'learning_rate': 0.001,
 'gamma': 0.5}

In [31]:
best_model_2 = result_2.best_estimator_

In [33]:
some_data_ = full_pipe(features.iloc[5:20])
some_labels_ = target.iloc[5:20]


some_pred_1 = best_model_1.predict(some_data_)

print("Predictions: ", some_pred_1)
print("Labels: ", list(some_labels_))
print("Error: ", float("{:.2f}".format(mean_squared_error(some_labels_, some_pred_1, squared=False))))

Predictions:  [1.9064739e+08 2.1886443e+08 2.1886443e+08 1.9064739e+08 2.1886443e+08
 2.1886443e+08 2.1886443e+08 1.9064739e+08 2.1886443e+08 1.9987480e+08
 2.1886443e+08 2.1886443e+08 1.9064739e+08 2.1886443e+08 1.9064739e+08]
Labels:  [28500000.0, 49000000.0, 88000000.0, 130000000.0, 150000000.0, 82000000.0, 150000000.0, 22000000.0, 60000000.0, 88000000.0, 90000000.0, 75000000.0, 65000000.0, 150000000.0, 250000000.0]
Error:  124289164.91


In [34]:
some_pred_2 = best_model_2.predict(some_data_)


print("Predictions: ", some_pred_2)
print("Labels: ", list(some_labels_))
print("Error: ", float("{:.2f}".format(mean_squared_error(some_labels_, some_pred_2, squared=False))))

Predictions:  [85469690. 85469690. 85469690. 85469690. 85469690. 85469690. 85469690.
 85469690. 85469690. 85469690. 85469690. 85469690. 85469690. 85469690.
 85469690.]
Labels:  [28500000.0, 49000000.0, 88000000.0, 130000000.0, 150000000.0, 82000000.0, 150000000.0, 22000000.0, 60000000.0, 88000000.0, 90000000.0, 75000000.0, 65000000.0, 150000000.0, 250000000.0]
Error:  58521981.89


In [35]:
predictions_1 = best_model_1.predict(prepared_data)

reg_mse_1 = mean_squared_error(target, predictions_1)

reg_mse_1

1.8168168806817772e+20

In [36]:
predictions_2 = best_model_2.predict(prepared_data)

reg_mse_2 = mean_squared_error(target, predictions_2)

reg_mse_2

1.857575854393379e+20

## Cross Validation with fine-tuned best models

In [37]:
scores_1 = cross_val_score(best_model_1, prepared_data, target, cv=10, n_jobs=-1, scoring="neg_mean_squared_error")

re_1 = np.sqrt(-scores_1)

In [38]:
scores_2 = cross_val_score(best_model_2, prepared_data, target, cv=10, n_jobs=-1, scoring="neg_mean_squared_error")

re_2 = np.sqrt(-scores_2)

In [39]:
def display_score(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation: ", float("{:.2f}".format(scores.std())))

In [40]:
display_score(re_1)

Scores: [1.69295026e+09 1.94453004e+09 1.46685538e+09 4.29596937e+10
 3.05701341e+09 1.55441675e+09 1.26049025e+09 1.78325549e+09
 2.67790105e+09 1.72933208e+09]
Mean: 6012643841.63992
Standard deviation:  12326830444.75


In [41]:
display_score(re_2)

Scores: [4.21157833e+08 1.27032969e+09 5.05053501e+08 4.29718461e+10
 2.86246966e+09 5.96839810e+08 4.24557988e+08 3.90122861e+08
 2.01263396e+09 1.13681656e+09]
Mean: 5259182794.736845
Standard deviation:  12594685994.54
