# Description of the dataset:

1. address - Full addres
2. city - Warszawa (Warsaw), Kraków (Cracow), Poznań (Poznan).
3. floor - The number of the floor where the apartment is located
4. id - id
5. latitude - latitude
6. longitude - longitude
7. price - Price of apartment in PLN [TARGET]
8. rooms - Number of rooms in the apartment
9. sq - Number of square meters of the apartment
10. year - Year of the building / apartment

# Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Loading the dataset

In [3]:
# Load the dataset from the 'data' folder
file_path = '../data/Houses.csv'

try:
    dataset = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    # If utf-8 fails, try 'latin1' encoding
    dataset = pd.read_csv(file_path, encoding='latin1')

In [4]:
print(dataset.head())

   Unnamed: 0                                            address      city  \
0           0           Podgórze Zab³ocie Stanis³awa Klimeckiego    Kraków   
1           1                          Praga-Po³udnie Grochowska  Warszawa   
2           2                            Krowodrza Czarnowiejska    Kraków   
3           3                                           Grunwald    Poznañ   
4           4  Ochota Gotowy budynek. Stan deweloperski. Osta...  Warszawa   

   floor       id   latitude  longitude      price  rooms      sq    year  
0    2.0  23918.0  50.049224  19.970379   749000.0    3.0   74.05  2021.0  
1    3.0  17828.0  52.249775  21.106886   240548.0    1.0   24.38  2021.0  
2    2.0  22784.0  50.066964  19.920025   427000.0    2.0   37.00  1970.0  
3    2.0   4315.0  52.404212  16.882542  1290000.0    5.0  166.00  1935.0  
4    1.0  11770.0  52.212225  20.972630   996000.0    5.0  105.00  2020.0  


In [5]:
print(dataset.shape)

(23764, 11)


# Dropping unnecessary columns

In [6]:
# Drop unnecessary columns for now
dataset = dataset.drop(['id', 'address', 'latitude', 'longitude'], axis=1)

In [7]:
print(dataset.head())

   Unnamed: 0      city  floor      price  rooms      sq    year
0           0    Kraków    2.0   749000.0    3.0   74.05  2021.0
1           1  Warszawa    3.0   240548.0    1.0   24.38  2021.0
2           2    Kraków    2.0   427000.0    2.0   37.00  1970.0
3           3    Poznañ    2.0  1290000.0    5.0  166.00  1935.0
4           4  Warszawa    1.0   996000.0    5.0  105.00  2020.0


In [8]:
print(dataset.shape)

(23764, 7)


# Separate features (X) and target variable (y)

In [9]:
# Separate features (X) and target variable (y)
X = dataset.drop('price', axis=1)
y = dataset['price']

In [10]:
print(X)

       Unnamed: 0      city  floor  rooms      sq    year
0               0    Kraków    2.0    3.0   74.05  2021.0
1               1  Warszawa    3.0    1.0   24.38  2021.0
2               2    Kraków    2.0    2.0   37.00  1970.0
3               3    Poznañ    2.0    5.0  166.00  1935.0
4               4  Warszawa    1.0    5.0  105.00  2020.0
...           ...       ...    ...    ...     ...     ...
23759       23759    Poznañ    0.0    4.0   77.00  2020.0
23760       23760  Warszawa    4.0    3.0   71.00  2017.0
23761       23761    Poznañ    0.0    3.0   50.67  2022.0
23762       23762    Kraków    6.0    2.0   38.86  2021.0
23763       23763  Warszawa    2.0    3.0   63.00  1978.0

[23764 rows x 6 columns]


In [11]:
print(y)

0         749000.0
1         240548.0
2         427000.0
3        1290000.0
4         996000.0
           ...    
23759     543000.0
23760     910000.0
23761     430695.0
23762     359000.0
23763     604800.0
Name: price, Length: 23764, dtype: float64


# Handling missing data

In [12]:
missing_data = dataset.isnull()
missing_data_count_per_column = missing_data.sum()

In [13]:
print(missing_data_count_per_column)

Unnamed: 0    0
city          0
floor         0
price         0
rooms         0
sq            0
year          0
dtype: int64


The output indicates that there are no missing values in the columns of the dataset. Each column is reported to have 0 missing values, as indicated by the count of 0 for each column in the missing_data_count_per_column result.

# Encoding categorical data

In [14]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])] , remainder='passthrough')
X = ct.fit_transform(X)

In [15]:
print(X)

[[1.000e+00 0.000e+00 0.000e+00 ... 3.000e+00 7.405e+01 2.021e+03]
 [0.000e+00 0.000e+00 1.000e+00 ... 1.000e+00 2.438e+01 2.021e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 2.000e+00 3.700e+01 1.970e+03]
 ...
 [0.000e+00 1.000e+00 0.000e+00 ... 3.000e+00 5.067e+01 2.022e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 2.000e+00 3.886e+01 2.021e+03]
 [0.000e+00 0.000e+00 1.000e+00 ... 3.000e+00 6.300e+01 1.978e+03]]


# Splitting the dataset into the train set and the test set

In [16]:
# train_test_split() takes numpy arrays only as arguments
X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
print(X_train)

[[1.0000e+00 0.0000e+00 0.0000e+00 ... 5.0000e+00 1.4000e+02 2.0210e+03]
 [0.0000e+00 0.0000e+00 1.0000e+00 ... 3.0000e+00 5.6000e+01 1.9700e+03]
 [0.0000e+00 0.0000e+00 1.0000e+00 ... 3.0000e+00 1.1280e+02 2.0010e+03]
 ...
 [1.0000e+00 0.0000e+00 0.0000e+00 ... 4.0000e+00 1.4000e+02 1.9000e+03]
 [1.0000e+00 0.0000e+00 0.0000e+00 ... 4.0000e+00 6.9190e+01 2.0210e+03]
 [0.0000e+00 1.0000e+00 0.0000e+00 ... 6.0000e+00 1.3647e+02 2.0200e+03]]


In [18]:
print(X_test)

[[1.000e+00 0.000e+00 0.000e+00 ... 2.000e+00 3.100e+01 2.009e+03]
 [0.000e+00 0.000e+00 1.000e+00 ... 4.000e+00 9.404e+01 2.000e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 3.000e+00 8.175e+01 2.020e+03]
 ...
 [0.000e+00 0.000e+00 1.000e+00 ... 1.000e+00 2.472e+01 1.970e+03]
 [1.000e+00 0.000e+00 0.000e+00 ... 2.000e+00 5.250e+01 2.013e+03]
 [0.000e+00 0.000e+00 1.000e+00 ... 3.000e+00 5.000e+01 2.021e+03]]


In [19]:
print(y_train)

[1118600.  620000. 2932800. ... 1350000.  664224. 1100000.]


In [20]:
print(y_test)

[ 410000. 1175000. 1021875. ...  368000.  640000.  639000.]


# Feature scaling (Standardization)

In [21]:
sc = StandardScaler()
X_train_scaled = X_train
X_test_scaled = X_test
X_train_scaled[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test_scaled[:, 3:] = sc.transform(X_test[:, 3:])

In [22]:
print(X_train_scaled)

[[ 1.00000000e+00  0.00000000e+00  0.00000000e+00 ...  2.37805150e+00
   3.61158788e-03  4.14272885e-01]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 ...  3.76938846e-01
  -7.88783202e-03 -6.11091291e-01]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 ...  3.76938846e-01
  -1.12033804e-04  1.21692865e-02]
 ...
 [ 1.00000000e+00  0.00000000e+00  0.00000000e+00 ...  1.37749517e+00
   3.61158788e-03 -2.01845389e+00]
 [ 1.00000000e+00  0.00000000e+00  0.00000000e+00 ...  1.37749517e+00
  -6.08214930e-03  4.14272885e-01]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00 ...  3.37860782e+00
   3.12833845e-03  3.94167705e-01]]


In [23]:
print(X_test_scaled)

[[ 1.          0.          0.         ... -0.62361748 -0.01131028
   0.17301073]
 [ 0.          0.          1.         ...  1.37749517 -0.00268024
  -0.00793589]
 [ 1.          0.          0.         ...  0.37693885 -0.00436271
   0.39416771]
 ...
 [ 0.          0.          1.         ... -1.62417381 -0.01217
  -0.61109129]
 [ 1.          0.          0.         ... -0.62361748 -0.00836697
   0.25343145]
 [ 0.          0.          1.         ...  0.37693885 -0.00870922
   0.41427289]]
