In [25]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderServiceError
from tqdm import tqdm

housing = pd.read_csv('datasets/mieszkania_poland/Houses.csv', encoding='latin-1', index_col=0)

columns = housing.columns
for column in columns:
    if housing[column].dtype == object:
        try:
            housing[column] = housing[column].str.encode('latin-1').str.decode('utf-8', errors='ignore')
        except UnicodeDecodeError:
            print(f"Error decoding column: {column}")
    else:
        print(f"Skipping column: {column} (not of string data type)")

housing = housing[housing['sq'] < 500]
housing = housing[housing['price'] < 3000000]
housing = housing[housing['year'] <= 2023]
housing = housing[housing['year'] >= 1500]
housing = housing.drop('id',axis=1)

geolocator = Nominatim(user_agent="my_app")
rest_housing = pd.read_csv('datasets/mieszkania_poland/data.csv', encoding='utf-8', index_col=0)

rest_housing.loc[:, 'latitude'] = None
rest_housing.loc[:, 'longitude'] = None

total_records = len(rest_housing)
processed_records = 0

for index, row in tqdm(rest_housing.iterrows(), total=total_records, desc='Geocoding Progress'):
    city = row['city']
    address = row['address']

    location = f'{address}, {city}'

    try:
        location_data = geolocator.geocode(location)
        if location_data is not None:
            latitude = location_data.latitude
            longitude = location_data.longitude
            rest_housing.loc[index, 'latitude'] = latitude
            rest_housing.loc[index, 'longitude'] = longitude
        else:
            continue
    except GeocoderServiceError as e:
        print(f"Geocoding service error: {str(e)}")
        continue

    processed_records += 1

print(f"Total records: {total_records}")
print(f"Processed records: {processed_records}")

housing = pd.concat([housing, rest_housing], ignore_index=True)
housing.to_csv('datasets/mieszkania_poland/merged_dataset.csv')
housing.head()

Skipping column: floor (not of string data type)
Skipping column: id (not of string data type)
Skipping column: latitude (not of string data type)
Skipping column: longitude (not of string data type)
Skipping column: price (not of string data type)
Skipping column: rooms (not of string data type)
Skipping column: sq (not of string data type)
Skipping column: year (not of string data type)


Geocoding Progress: 100%|██████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.16it/s]

Total records: 10
Processed records: 10





Unnamed: 0,address,city,floor,latitude,longitude,price,rooms,sq,year
0,Podgrze Zabocie Stanisawa Klimeckiego,Krakw,2.0,50.049224,19.970379,749000.0,3.0,74.05,2021.0
1,Praga-Poudnie Grochowska,Warszawa,3.0,52.249775,21.106886,240548.0,1.0,24.38,2021.0
2,Krowodrza Czarnowiejska,Krakw,2.0,50.066964,19.920025,427000.0,2.0,37.0,1970.0
3,Grunwald,Pozna,2.0,52.404212,16.882542,1290000.0,5.0,166.0,1935.0
4,Ochota Gotowy budynek. Stan deweloperski. Osta...,Warszawa,1.0,52.212225,20.97263,996000.0,5.0,105.0,2020.0


In [26]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23583 entries, 0 to 23582
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   address    23583 non-null  object 
 1   city       23583 non-null  object 
 2   floor      23583 non-null  float64
 3   latitude   23583 non-null  object 
 4   longitude  23583 non-null  object 
 5   price      23583 non-null  float64
 6   rooms      23583 non-null  float64
 7   sq         23583 non-null  float64
 8   year       23583 non-null  float64
dtypes: float64(5), object(4)
memory usage: 1.6+ MB


In [29]:
housing['city'].value_counts()

city
Krakw        9733
Warszawa     9695
Pozna        4145
Białystok      10
Name: count, dtype: int64

In [28]:
summary = housing.describe()
summary.round(0)

Unnamed: 0,floor,price,rooms,sq,year
count,23583.0,23583.0,23583.0,23583.0,23583.0
mean,3.0,618324.0,3.0,60.0,2001.0
std,2.0,356800.0,1.0,71.0,30.0
min,0.0,5000.0,1.0,9.0,1535.0
25%,1.0,410062.0,2.0,42.0,1985.0
50%,2.0,520000.0,3.0,54.0,2019.0
75%,4.0,697215.0,3.0,68.0,2021.0
max,10.0,2995333.0,10.0,7133.0,2023.0


In [None]:
housing_max_price = housing['price'].idxmax()

In [None]:
address_with_max_price = housing.loc[housing_max_price, 'address']

In [None]:
print(address_with_max_price)

In [None]:
housing_max_sq = housing['sq'].idxmax()
address_with_max_sq = housing.loc[housing_max_sq, 'city']
address_with_max_sq

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
from pandas.plotting import scatter_matrix

attributes = ['price', 'sq', 'rooms', 'year']

scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
import numpy as np

housing['avg_room_size'] = housing['sq'] / housing['rooms']

numeric_columns = housing.select_dtypes(include=np.number)
correlation_matrix = numeric_columns.corr()
correlation_matrix['price'].sort_values(ascending=False)

In [None]:
housing = train_set.copy()

housing_labels = train_set['price'].copy()
housing = train_set.drop('price', axis=1)
housing_without = housing.drop('address', axis=1)

In [None]:
housing_without_cat = housing_without[['city']]
housing_without_cat.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder()

housing_without_cat_encoded = one_hot.fit_transform(housing_without_cat)
housing_without_cat_encoded.toarray()

In [None]:
one_hot.categories_

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

housing_num = housing_without.drop("city", axis=1)
sq_ix = housing_num.columns.get_loc("sq")
rooms_ix = housing_num.columns.get_loc("rooms")

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, avg_m_squared = True):
        self.avg_m_squared = avg_m_squared
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if self.avg_m_squared:
            avg_room_size = X.iloc[:, sq_ix] / X.iloc[:, rooms_ix]
            return np.c_[X, avg_room_size]

In [None]:
housing_num.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ['city']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing_without)