# Gradient Boosting Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("./Datasets/clean_immo.csv", sep=',')
df.head()

Unnamed: 0,zip,type_of_property,subtype_of_property,price,number_of_rooms,house_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,commune,province,region,rank
0,1050,house,house,340000,6,203,95,2,0,to be done up,IXELLES,Bruxelles (19 communes),Région Bruxelles-capitale,564.0
1,1880,house,villa,525000,6,250,826,4,0,as new,KAPELLE-OP-DEN-BOS,Brabant Flamand,Région flamande,497.0
2,4900,house,exceptional property,550000,11,475,1543,4,0,good,SPA,Liège,Région wallonne,522.0
3,7912,house,villa,550000,4,325,3570,4,0,good,FRASNES-LEZ-ANVAING,Hainaut,Région wallonne,340.0
4,6032,house,house,550000,5,400,616,3,0,as new,CHARLEROI,Hainaut,Région wallonne,19.0


In [3]:
print(df.isnull().sum())

zip                      0
type_of_property         0
subtype_of_property      0
price                    0
number_of_rooms          0
house_area               0
surface_of_the_land      0
number_of_facades        0
swimming_pool            0
state_of_the_building    0
commune                  0
province                 0
region                   0
rank                     0
dtype: int64


In [4]:
house = df[df['type_of_property'] == 'house']
apart = df[df['type_of_property'] == 'apartment']

In [5]:
hX = house.drop(['zip', 'type_of_property',  'price', 'commune', 'province', 'region', 'subtype_of_property', 'number_of_rooms', 'number_of_facades', 'swimming_pool'], axis=1)
hy = pd.DataFrame()
hy['price'] = house['price']
print(hX.dtypes)
print(hX.shape)
print(hy.shape)
hX.head()

house_area                 int64
surface_of_the_land        int64
state_of_the_building     object
rank                     float64
dtype: object
(21603, 4)
(21603, 1)


Unnamed: 0,house_area,surface_of_the_land,state_of_the_building,rank
0,203,95,to be done up,564.0
1,250,826,as new,497.0
2,475,1543,good,522.0
3,325,3570,good,340.0
4,400,616,as new,19.0


In [6]:
aX = apart.drop(['zip', 'type_of_property',  'price', 'commune', 'province', 'region'], axis=1)
ay = pd.DataFrame()
ay['price'] = apart['price']
print(aX.dtypes)
print(aX.shape)
print(ay.shape)
aX.head()

subtype_of_property       object
number_of_rooms            int64
house_area                 int64
surface_of_the_land        int64
number_of_facades          int64
swimming_pool              int64
state_of_the_building     object
rank                     float64
dtype: object
(19824, 8)
(19824, 1)


Unnamed: 0,subtype_of_property,number_of_rooms,house_area,surface_of_the_land,number_of_facades,swimming_pool,state_of_the_building,rank
19825,ground floor,1,102,166,3,0,as new,391.0
19826,flat studio,1,30,5036,1,0,good,118.0
19827,flat studio,1,30,5036,1,0,good,118.0
19828,apartment,2,60,310,2,0,as new,70.0
19829,apartment,1,48,128,3,0,just renovated,40.0


In [7]:
# Get dummies for object type
X_columns_name = ['subtype_of_property', 'state_of_the_building']
hX = pd.get_dummies(hX, columns=X_columns_name[1], prefix=X_columns_name, drop_first=True)
aX = pd.get_dummies(aX, columns=X_columns_name, prefix=X_columns_name, drop_first=True)

TypeError: Input must be a list-like for parameter `columns`

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [None]:
hX_train, hX_test, hy_train, hy_test = train_test_split(hX, hy, test_size=0.2)
aX_train, aX_test, ay_train, ay_test = train_test_split(aX, ay, test_size=0.2)

In [None]:
hest = GradientBoostingRegressor(n_estimators=1000, max_depth=4, min_samples_split=2, learning_rate=0.1, loss='ls')
aest = GradientBoostingRegressor(n_estimators=1000, max_depth=4, min_samples_split=2, learning_rate=0.1, loss='ls')

In [None]:
hest.fit(hX_train, hy_train)

In [None]:
aest.fit(aX_train, ay_train)

In [None]:
print(f"Score training House : {hest.score(hX_train, hy_train)}")
print(f"Score training Apart : {aest.score(aX_train, ay_train)}")

In [None]:
hy_pred = hest.predict(hX_test)
ay_pred = aest.predict(aX_test)

In [None]:
print(f"Score test House : {hest.score(hX_test, hy_test)}")
print(f"Score test Apart : {aest.score(aX_test, ay_test)}")