In [14]:
import pandas as pd
from ConnectDB import *
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time


In [None]:
# Pull the listings snapshot for one scrape date and summarise its columns.
snapshot_date = '2025-02-02'
df = get_data(date_str=snapshot_date)
df.info()

In [None]:
# Peek at the first rows of the freshly loaded frame.
df.head(n=3)

In [7]:
# Clean the raw listings frame: numeric price, location split into
# neighborhood / city / state, numeric room count, helper columns removed.
# NOTE: this cell rebinds `df` (price becomes float, "location" is dropped),
# so re-run the load cell before running it a second time.

# 1. Remove rows where price is "Preço sob consulta" (price on request).
#    .copy() detaches the result from the raw frame so the column assignments
#    below do not write into a slice (avoids SettingWithCopyWarning).
price_on_request = df["price"].str.contains("Preço sob consulta", na=False)
df = df.loc[~price_on_request].copy()

# 2. Convert price to numeric. The three replacements are literal characters,
#    so regex matching is switched off.
df["price"] = (
    df["price"]
    .str.replace("\xa0", "", regex=False)  # Remove non-breaking spaces
    .str.replace("€", "", regex=False)  # Remove euro symbol
    .str.replace(",", "", regex=False)  # Remove thousand separators
    .astype(float)
)

# 3. Split location into neighborhood, city, and state.
#    The last comma-separated part is the state, the one before it the city,
#    and everything earlier is the neighborhood. Missing locations become ""
#    (state "", city NaN, neighborhood "").
location_parts = df["location"].fillna("").str.split(",")

df["state"] = location_parts.str[-1].str.strip()
df["city"] = location_parts.str[-2].str.strip()
# Strip each piece before re-joining; otherwise "A, B, city, state" yields
# "A,  B" (double space) because the split keeps the leading blanks.
df["neighborhood"] = location_parts.apply(
    lambda parts: ", ".join(part.strip() for part in parts[:-2])
)

# 4. Clean rooms column: keep the first run of digits as a float (NaN if none).
#    Raw string so "\d" is a regex class, not an invalid string escape.
df["rooms"] = df["rooms"].str.extract(r"(\d+)", expand=False).astype(float)

# 5. Drop page and location column
df = df.drop(columns=["page", "location"])

In [None]:
# Check the result of the cleaning step on the first rows.
df.head(n=3)

In [None]:
# Nominatim geocoding client used by the lookups below
geolocator = Nominatim(user_agent="imovirtual_scraper")

# Results of earlier lookups, keyed by the address string that was queried
location_cache = dict()

def get_lat_lon(neighborhood, city, state):
    """Geocode a listing location in Portugal, using ``location_cache``.

    Args:
        neighborhood: Neighborhood text; may be empty or missing.
        city: City text; may be empty or missing (e.g. NaN).
        state: State/district text; may be empty or missing.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(None, None)`` when the
        address is unusable, not found, or the request timed out.
    """
    # Keep only non-blank strings so the query never contains an empty
    # component (", Lisboa, ...") or a stringified NaN ("nan").
    parts = [
        part.strip()
        for part in (neighborhood, city, state)
        if isinstance(part, str) and part.strip()
    ]
    if not parts:
        # Nothing to look up; querying just "Portugal" would return a
        # country-level point that says nothing about the listing.
        return (None, None)

    address = ", ".join(parts + ["Portugal"])

    if address in location_cache:  # Check if already cached
        return location_cache[address]

    try:
        location = geolocator.geocode(address, timeout=10)
    except GeocoderTimedOut:
        # Deliberately not cached, so a later call for this address retries.
        return (None, None)
    finally:
        # Respect Nominatim rate limits (1 request/sec) after every request,
        # including the ones that timed out.
        time.sleep(1)

    if location:
        location_cache[address] = (location.latitude, location.longitude)
    else:
        # Cache misses too, so unknown addresses are not re-queried.
        location_cache[address] = (None, None)
    return location_cache[address]

# Geocode every row. Repeated addresses are answered from location_cache
# inside get_lat_lon, so each distinct address costs at most one successful
# request (timeouts are not cached and are retried on the next matching row).
coords = [
    get_lat_lon(neighborhood, city, state)
    for neighborhood, city, state in zip(df["neighborhood"], df["city"], df["state"])
]

# Build both columns in one go, aligned on df's index. This avoids creating a
# pd.Series per row, works on an empty frame, and astype(float) turns the
# (None, None) misses into NaN so the columns are always numeric.
df[["latitude", "longitude"]] = pd.DataFrame(
    coords, index=df.index, columns=["latitude", "longitude"]
).astype(float)

# Save cache to a file (address as index, latitude/longitude columns).
# NOTE(review): no visible cell reads this file back, so it does not yet
# resume anything on its own.
cache_frame = pd.DataFrame.from_dict(
    location_cache, orient="index", columns=["latitude", "longitude"]
)
cache_frame.to_csv("location_cache.csv")

In [None]:
# Inspect the first rows now that latitude/longitude have been added.
df.head(n=3)