In [1]:
# Project Two
# 1. Sprawdzić i poprawić ewentualne błędy w danych
# 2. Dodać kolumnę - cena za metr kw.
# 3. Przeprowadzić badanie z wykorzystaniem walidacji krzyżowej dla co najmniej 5 algorytmów
# 4. Zwizualizować wyniki eksperymentu
# 5. Przedstawić wnioski 
### UWAGA. Cena za metr kw. jest celem budowy modelu

In [2]:
# Data Loading

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Load the raw listings; the first CSV column is used as the DataFrame index.
rawData = pd.read_csv("./Houses.csv", encoding='ISO-8859-2', index_col=0)

# Sanitize integer data: floor, id, rooms and year are cast to int.
# astype() returns a new frame, so rawData itself is left untouched.
integerColumns = ['floor', 'id', 'rooms', 'year']
sanitizedData = rawData.astype({column: int for column in integerColumns})

# 1995 is treated as the denomination year. Rescaling pre-1995 prices by a
# factor of 10000 was considered (original note, still to verify: the average
# price is 4000 for 35 m2, so prices here do not match historical data).
# Instead of rescaling, rows with an earlier year are dropped.
denominationYear = 1995
# .copy() gives later cells an independent frame, so adding or replacing
# columns on it does not trigger pandas' SettingWithCopyWarning.
sanitizedData = sanitizedData[sanitizedData['year'] >= denominationYear].copy()

# Remove data about building status
def adjust_address(address):
    """Strip the building-status text from an address.

    Parameters
    ----------
    address : str
        Address text; it is searched with the ``in`` operator, so a string
        is expected.

    Returns
    -------
    str
        The first whitespace-separated word of ``address`` when it contains
        'Gotowy budynek'; otherwise ``address`` unchanged.
    """
    statusMarker = 'Gotowy budynek'
    if statusMarker not in address:
        return address
    # Only the leading word is kept; everything after it is discarded.
    return address.split(maxsplit=1)[0]
        
# assign() builds a new frame instead of writing a column into a frame that was
# produced by boolean filtering (which raises SettingWithCopyWarning).
sanitizedData = sanitizedData.assign(address=sanitizedData['address'].apply(adjust_address))

# Drop rows with negative floor, rooms or price. The area must be strictly
# positive as well: 'sq' is the divisor of the price-per-square-metre target,
# so zero, negative or missing areas would yield infinite, negative or NaN targets.
hasValidValues = (
    (sanitizedData['floor'] >= 0)
    & (sanitizedData['rooms'] >= 0)
    & (sanitizedData['price'] >= 0)
    & (sanitizedData['sq'] > 0)
)
# .copy() keeps the result independent of the unfiltered frame for later cells.
sanitizedData = sanitizedData[hasValidValues].copy()

In [6]:
# Remove rows that are not matching city long/lat
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter

## Sanitize city names
# Lower-case the names so that spelling variants differing only in case share
# one dictionary key. assign() returns a new frame rather than writing into a
# previously filtered one.
sanitizedData = sanitizedData.assign(city=sanitizedData['city'].str.lower())

## Initialize Nominatim API
geolocator = Nominatim(user_agent="MyApp")
# At most one geocoding request per second.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

## Create a dictionary to store city coordinates
# Maps city name -> (latitude, longitude) as returned by the geocoder.
cityCoords = {}
ungeocodedCities = []
for city in sanitizedData['city'].unique():
    location = geocode(city)
    if location:
        cityCoords[city] = (location.latitude, location.longitude)
    else:
        ungeocodedCities.append(city)

# Rows whose city is missing from cityCoords are rejected by
# locationWithinTolerance, so a failed lookup removes every listing of that
# city. Report it instead of losing the data silently.
if ungeocodedCities:
    warnings.warn(
        f"Could not geocode {len(ungeocodedCities)} city name(s): {ungeocodedCities}; "
        "all listings for these cities will be dropped by the distance filter."
    )
        
def locationWithinTolerance(row, tolerance_km=10):
    """Tell whether a listing lies close enough to its city's geocoded position.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'city', 'latitude' and 'longitude'.
    tolerance_km : float, default 10
        Largest accepted geodesic distance, in kilometres, between the
        listing and the coordinates stored for its city in ``cityCoords``.

    Returns
    -------
    bool
        True when the distance is at most ``tolerance_km``. False when the
        listing is farther away, when the city has no entry in
        ``cityCoords``, or when geopy rejects the listing's coordinates.
    """
    city = row['city']
    propertyCoords = (row['latitude'], row['longitude'])

    if city not in cityCoords:
        return False

    try:
        distance = geodesic(cityCoords[city], propertyCoords).kilometers
    except ValueError:
        # geopy raises ValueError for non-finite coordinates or a latitude
        # outside [-90, 90]; such a row cannot match its city, so reject it
        # rather than aborting the whole filtering pass.
        return False
    return distance <= tolerance_km

# Row-wise check: True for listings within the default tolerance of their city.
isNearCity = sanitizedData.apply(locationWithinTolerance, axis=1)
sanitizedData = sanitizedData[isNearCity]
sanitizedData

Unnamed: 0,address,city,floor,id,latitude,longitude,price,rooms,sq,year
0,Podgórze Zabłocie Stanisława Klimeckiego,kraków,2,23918,50.049224,19.970379,749000.0,3,74.05,2021
1,Praga-Południe Grochowska,warszawa,3,17828,52.249775,21.106886,240548.0,1,24.38,2021
4,Ochota,warszawa,1,11770,52.212225,20.972630,996000.0,5,105.00,2020
5,Nowa Huta Czyżyny ul. Woniców,kraków,2,26071,50.046943,19.997153,414600.0,1,34.55,2022
6,Podgórze Płaszów Koszykarska,kraków,0,22569,50.049893,19.990603,750000.0,4,81.40,2021
...,...,...,...,...,...,...,...,...,...,...
23758,Nowe Miasto Rataje Wagrowska,poznań,3,4516,52.378997,16.944244,294021.0,2,35.23,2022
23759,Stare Miasto Naramowice,poznań,0,3976,52.449649,16.949408,543000.0,4,77.00,2020
23760,Włochy,warszawa,4,10206,52.186109,20.948438,910000.0,3,71.00,2017
23761,Nowe Miasto Malta ul. Katowicka,poznań,0,4952,52.397345,16.961939,430695.0,3,50.67,2022


In [9]:
# Calculate price per square meter (price / sq), rounded to 2 decimal places.
# assign() returns a new frame, so no column is written into the filtered
# frame produced by the location check (avoids SettingWithCopyWarning).
sanitizedData = sanitizedData.assign(
    price_per_sqm=sanitizedData['price'].div(sanitizedData['sq']).round(2)
)
sanitizedData

Unnamed: 0,address,city,floor,id,latitude,longitude,price,rooms,sq,year,price_per_sqm
0,Podgórze Zabłocie Stanisława Klimeckiego,kraków,2,23918,50.049224,19.970379,749000.0,3,74.05,2021,10114.79
1,Praga-Południe Grochowska,warszawa,3,17828,52.249775,21.106886,240548.0,1,24.38,2021,9866.61
4,Ochota,warszawa,1,11770,52.212225,20.972630,996000.0,5,105.00,2020,9485.71
5,Nowa Huta Czyżyny ul. Woniców,kraków,2,26071,50.046943,19.997153,414600.0,1,34.55,2022,12000.00
6,Podgórze Płaszów Koszykarska,kraków,0,22569,50.049893,19.990603,750000.0,4,81.40,2021,9213.76
...,...,...,...,...,...,...,...,...,...,...,...
23758,Nowe Miasto Rataje Wagrowska,poznań,3,4516,52.378997,16.944244,294021.0,2,35.23,2022,8345.76
23759,Stare Miasto Naramowice,poznań,0,3976,52.449649,16.949408,543000.0,4,77.00,2020,7051.95
23760,Włochy,warszawa,4,10206,52.186109,20.948438,910000.0,3,71.00,2017,12816.90
23761,Nowe Miasto Malta ul. Katowicka,poznań,0,4952,52.397345,16.961939,430695.0,3,50.67,2022,8500.00
