In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingRegressor

In [5]:
data = pd.read_csv("drive/My Drive/Colab Notebooks/Houses.csv", encoding='latin-1')

In [6]:
data

Unnamed: 0.1,Unnamed: 0,address,city,floor,id,latitude,longitude,price,rooms,sq,year
0,0,Podgórze Zab³ocie Stanis³awa Klimeckiego,Kraków,2.0,23918.0,50.049224,19.970379,749000.0,3.0,74.05,2021.0
1,1,Praga-Po³udnie Grochowska,Warszawa,3.0,17828.0,52.249775,21.106886,240548.0,1.0,24.38,2021.0
2,2,Krowodrza Czarnowiejska,Kraków,2.0,22784.0,50.066964,19.920025,427000.0,2.0,37.00,1970.0
3,3,Grunwald,Poznañ,2.0,4315.0,52.404212,16.882542,1290000.0,5.0,166.00,1935.0
4,4,Ochota Gotowy budynek. Stan deweloperski. Osta...,Warszawa,1.0,11770.0,52.212225,20.972630,996000.0,5.0,105.00,2020.0
...,...,...,...,...,...,...,...,...,...,...,...
23759,23759,Stare Miasto Naramowice,Poznañ,0.0,3976.0,52.449649,16.949408,543000.0,4.0,77.00,2020.0
23760,23760,W³ochy,Warszawa,4.0,10206.0,52.186109,20.948438,910000.0,3.0,71.00,2017.0
23761,23761,Nowe Miasto Malta ul. Katowicka,Poznañ,0.0,4952.0,52.397345,16.961939,430695.0,3.0,50.67,2022.0
23762,23762,Podgórze Duchackie Walerego S³awka,Kraków,6.0,24148.0,50.024231,19.959569,359000.0,2.0,38.86,2021.0


In [15]:
def preprocess_inputs(df):
  df = df.copy()

  #drop unwanted columns
  df = df.drop(['Unnamed: 0', 'address', 'id'], axis=1)

  #features and label
  y = df['price']
  X = df.drop('price', axis=1)

  #split the data
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, shuffle=True, random_state=1)

  return X_train, X_test, y_train, y_test

In [17]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [18]:
X_train

Unnamed: 0,city,floor,latitude,longitude,rooms,sq,year
14581,Kraków,7.0,50.064972,19.968826,2.0,53.06,2021.0
20969,Kraków,0.0,50.096642,20.013767,2.0,52.46,2020.0
3858,Warszawa,8.0,52.183927,21.008725,2.0,32.79,2022.0
9364,Poznañ,0.0,52.417260,16.957358,2.0,40.33,2021.0
6157,Poznañ,1.0,52.400663,16.919733,5.0,165.53,2020.0
...,...,...,...,...,...,...,...
10955,Warszawa,4.0,52.277527,21.022353,2.0,37.50,1968.0
17289,Kraków,4.0,50.056192,19.928406,1.0,35.35,2020.0
5192,Warszawa,3.0,52.231958,21.006725,2.0,36.00,2020.0
12172,Poznañ,0.0,52.387661,16.914801,1.0,20.00,1902.0


In [19]:
X_test

Unnamed: 0,city,floor,latitude,longitude,rooms,sq,year
6912,Kraków,3.0,50.064972,19.968826,2.0,56.00,2009.0
18988,Kraków,0.0,50.085575,20.022019,3.0,45.00,1968.0
9847,Warszawa,4.0,52.236238,20.954781,1.0,24.50,1966.0
6709,Warszawa,10.0,52.231158,21.010063,1.0,63.00,2021.0
8286,Poznañ,2.0,52.391608,16.994063,3.0,61.22,2019.0
...,...,...,...,...,...,...,...
14867,Kraków,1.0,50.058010,19.928423,1.0,44.28,1896.0
15632,Poznañ,3.0,52.404212,16.882542,2.0,48.77,2021.0
9210,Warszawa,10.0,52.286932,21.043068,2.0,38.00,1971.0
2415,Kraków,3.0,50.083526,19.903817,2.0,56.00,2000.0


#Building Pipeline

In [20]:
nominal_transformer = Pipeline(steps=[
                                      ('onehotencoder', OneHotEncoder(sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
                                               ('nominal', nominal_transformer, ['city'])
], remainder = 'passthrough')

model = Pipeline(steps=[
                        ('preprocessor', preprocessor),
                        ('scaler', StandardScaler()),
                        ('regressor', GradientBoostingRegressor())
])

# Model Training

In [23]:
model = model.fit(X_train, y_train)

In [24]:
model_accuracy = model.score(X_test, y_test)

In [25]:
print(model_accuracy)

0.7761825658077705


In [26]:
y_pred = model.predict(X_test)

In [27]:
print(y_pred)

[676100.3618961  380714.70627695 337871.08552428 ... 375504.84707122
 476455.55348193 331002.91793455]


In [28]:
y_test

6912     750000.0
18988    399900.0
9847     300000.0
6709     999000.0
8286     427928.0
           ...   
14867    595000.0
15632    399914.0
9210     330000.0
2415     515000.0
8374     315000.0
Name: price, Length: 5941, dtype: float64

In [30]:
print(y_test - y_pred)

6912      73899.638104
18988     19185.293723
9847     -37871.085524
6709      99802.948948
8286    -130441.707887
             ...      
14867      3887.823931
15632     16398.116296
9210     -45504.847071
2415      38544.446518
8374     -16002.917935
Name: price, Length: 5941, dtype: float64


In [31]:
print((y_test - y_pred) ** 2)

6912     5.461157e+09
18988    3.680755e+08
9847     1.434219e+09
6709     9.960629e+09
8286     1.701504e+10
             ...     
14867    1.511517e+07
15632    2.688982e+08
9210     2.070691e+09
2415     1.485674e+09
8374     2.560934e+08
Name: price, Length: 5941, dtype: float64


In [33]:
print(np.mean((y_test - y_pred)**2))

65680150552.983765


In [36]:
rmse = np.sqrt(np.mean((y_test - y_pred)**2))