# Imports

In [426]:
import datetime as dt
import json
import math
import sys
from io import StringIO

import bs4
import pandas as pd
import requests

# Lisbon Properties for Sale - SUPERCASA Web Scraping

In [546]:
url = 'https://supercasa.pt/comprar-casas/lisboa/pagina-1'
result = requests.get(url)
soup = bs4.BeautifulSoup(result.text, 'lxml')
num_of_properties = int(soup.find_all('h1', id='searchTitle')[0].get_text().split()[0].replace('.',''))
num_prop_per_page = 25
total_pages = int(num_of_properties / num_prop_per_page)
time_estimation_for_12505 = 20
time_estimation = round(num_of_properties * 20 / 12505)

print(f'Time estimation for this task: {time_estimation} minutes.')
estimation = (dt.datetime.now() + dt.timedelta(minutes=time_estimation))
start = f'Tasks started at {dt.datetime.now().hour}h:{dt.datetime.now().minute}min. Estimated finish time {estimation.hour}h:{estimation.minute}min.'
print(start)

title, price, num_rooms, total_area, latitude, longitude, region, extras, id = ([] for i in range(9))

for n in range(1, total_pages+2):
    url = f'https://supercasa.pt/comprar-casas/lisboa/pagina-{n}'
    result = requests.get(url)
    soup = bs4.BeautifulSoup(result.text, 'lxml')

    # Find all properties on the current page
    properties = soup.find_all('div', class_='property big-picture') 

    for prop in properties:
        # Title
        a = prop.find('h2', class_='property-list-title').find('a')
        title.append(a.get_text().strip() if a else ' ')

        # Price
        span = prop.find('div', class_='property-price').find('span')
        price.append(span.get_text(strip=True) if span else ' ')

        # Features
        feature = prop.find('div', class_='property-features')
        spans = feature.find_all('span') if feature else []
        rooms = spans[0].get_text() if len(spans) > 0 else "Unknown"
        area = spans[1].get_text() if len(spans) > 1 else "Unknown"
        num_rooms.append(rooms)
        total_area.append(area)

        # Links for latitude and longitude
        link = prop.find('a', class_='property-link')
        latitude.append(link.get('data-latitude') if link else 'Unknown')
        longitude.append(link.get('data-longitude') if link else 'Unknown')

        # Extras
        highlight = prop.find('div', class_='property-highlights')
        if highlight:
            extra_spans = highlight.find_all('span')
            extras.append(', '.join([span.get_text(strip=True) for span in extra_spans]))
        else:
            extras.append(' ')

        # Address region from JSON-LD script if necessary
        script = prop.find('script', type='application/ld+json')
        if script:
            data = json.loads(script.string)
            if data.get('@type') == 'Offer':
                available_at_or_from = data.get('availableAtOrFrom', {})
                address_info = available_at_or_from.get('address', {})
                address_region = address_info.get('addressRegion', 'Not provided')
                region.append(address_region)
        else:
            region.append('Not provided')

        sys.stdout.write(f"\rProgress: {int((n / total_pages) * 100)}%")
        sys.stdout.flush()

for i in range(len(title)):
    id.append(i)

sys.stdout.write(f"\rProgress: 100%")
sys.stdout.flush()
print('\nCompleted!')

headers = ['id', 'title', 'price', 'num_rooms', 'total_area', 'latitude', 'longitude', 'region', 'extras']
final_data = [id, title, price, num_rooms, total_area, latitude, longitude, region, extras]
LisbonProperties = pd.DataFrame(dict(zip(headers, final_data)))

print(f"\nYou now have data on {len(lisbon['id'])} properties located in Lisbon!")

Time estimation for this task: 20 minutes.
Tasks started at 18h:45min. Estimated finish time 19h:5min.
Progress: 2%

KeyboardInterrupt: 

# Lisbon Metro Info - Wikipedia Web Scraping

In [445]:
url = 'https://pt.wikipedia.org/wiki/Lista_de_esta%C3%A7%C3%B5es_do_Metropolitano_de_Lisboa'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
data = str(soup.find('table', {'class': 'wikitable'}))

table = pd.read_html(data)[0]
columns = ['Nome','Outros nomes','Linha','Lat.','Long.']
LisbonMetro = table[columns]

# Data

In [446]:
# Preview the first rows of the scraped listings table.
LisbonProperties.head()

Unnamed: 0,id,title,price,num_rooms,total_area,latitude,longitude,region,extras
0,0,"Apartamento T1 em Benfica, Lisboa",269.900 €,1 quarto,Área bruta 97 m²,3875171,-92009,Benfica,
1,1,"Apartamento T2 em Alvalade, Lisboa",430.000 €,2 quartos,Área bruta 90 m²,387457392,-91425898,Alvalade,
2,2,"Apartamento T3 na Rua António Nobre, São Domin...",399.900 €,3 quartos,Área bruta 120 m²,3874657,-917989,São Domingos de Benfica,
3,3,"Apartamento T5 na Rua Sousa Pinto, Santo Antón...",3.950.000 €,5 quartos,Área bruta 416 m²,387234746455,-91581178942,Santo António,"De luxo, Com garagem"
4,4,"Apartamento T1 em Praça de Luís de Camões, Mis...",690.000 €,1 quarto,Área bruta 93 m²,3871078,-914385,Misericórdia,Com garagem


In [447]:
LisbonMetro['NomeConcat'] = LisbonMetro['Nome']+LisbonMetro['Outros nomes']
LisbonMetro['Lat.'] = LisbonMetro['Lat.'].astype(float)
LisbonMetro['Long.'] = LisbonMetro['Long.'].astype(float)
LisbonMetro.head()

Unnamed: 0,Nome,Outros nomes,Linha,Lat.,Long.
0,Aeroporto,—,Vermelha,38.76861,−9.12861
1,Alameda,Alameda I (técn.),Verde,38.73713,−9.13388
2,Alameda,Alameda II (técn.),Vermelha,38.73697,−9.13261
3,Alfornelos,—,Azul,38.76038,−9.20435
4,Alto dos Moinhos,Centro Administrativo (prev.),Azul,38.74994,−9.18003


In [556]:
def extra_rooms(x):
    """Return the text between the first and second '+' of a typology string.

    For a value such as 'T1+2' this is the string '2'. When the value has no
    '+' at all, the integer 0 is returned instead.
    """
    if '+' not in x:
        return 0
    pieces = x.split('+')
    return pieces[1]

def area(x):
    """Return the third whitespace-separated token of an 'Área ...' string.

    For 'Área bruta 97 m²' this is '97'. Strings that do not contain 'Área'
    yield the placeholder 'Unknown'.
    """
    return x.split()[2] if 'Área' in x else 'Unknown'

def num_extras(x):
    """Count the comma-separated extras listed in ``x``.

    Args:
        x: Extras text such as 'De luxo, Com garagem'. Blank or empty strings
            and non-string values (e.g. NaN) mean "no extras".

    Returns:
        int: Number of non-empty comma-separated items.
    """
    if not isinstance(x, str):
        return 0
    # Empty fragments (blank text, stray or trailing commas) are not extras.
    return sum(1 for item in x.split(',') if item.strip())

def separate_extras(x):
    """Split an extras string into its individual items.

    Args:
        x: Extras text such as 'De luxo, Com garagem'.

    Returns:
        The string 'None' when ``x`` is blank, empty or not a string; a list
        of whitespace-stripped items when ``x`` contains a comma; otherwise
        ``x`` itself.
    """
    if not isinstance(x, str) or not x.strip():
        return 'None'
    if ',' in x:
        # Items are joined with ', ' upstream, so strip the leading space.
        return [item.strip() for item in x.split(',') if item.strip()]
    return x

In [515]:
# Derive cleaned-up columns from the raw scraped text columns.
# The .str accessor yields a missing value where a token does not exist
# (e.g. the blank ' ' placeholder used for missing titles and prices)
# instead of raising IndexError like x.split()[i] does.
title_tokens = LisbonProperties['title'].str.split()

# First word of the title (e.g. 'Apartamento'); empty when the title is blank.
LisbonProperties['Type'] = title_tokens.str[0].fillna('')
# Second word of the title (e.g. 'T2' or 'T1+2'); empty when it is absent.
LisbonProperties['Typology'] = title_tokens.str[1].fillna('')
LisbonProperties['Extra_Rooms'] = LisbonProperties['Typology'].apply(extra_rooms)
# Leading token of texts such as '2 quartos'.
LisbonProperties['N_Rooms'] = LisbonProperties['num_rooms'].str.split().str[0]
# '269.900 €' -> '269900': drop the dots, keep the first token.
LisbonProperties['Price'] = (
    LisbonProperties['price'].str.replace('.', '', regex=False).str.split().str[0]
)
LisbonProperties['Area_m2'] = LisbonProperties['total_area'].apply(area)
LisbonProperties['N_Extras'] = LisbonProperties['extras'].apply(num_extras)

In [544]:
# Distinct values of the 'extras' column, leaving out whitespace-only entries.
unique_extras = [
    extras_text
    for extras_text in LisbonProperties['extras'].unique()
    if not extras_text.isspace()
]

In [553]:
# Break every extras string on commas and keep each distinct, stripped piece.
unique_features = {
    piece.strip()
    for extras_text in unique_extras
    for piece in extras_text.split(',')
}
# Same elements as the set, exposed as a list.
unique_features_list = [*unique_features]

In [558]:
# for x in LisbonProperties['extras']:
#     if x in unique_features_list:
#         return 1
#     else:
#         return 0

['Vista para mar',
 'Com elevador',
 'De luxo',
 'Piscina',
 'Com garagem',
 'Rés do chão',
 'Último andar']

In [555]:
# Display the list of distinct extras strings.
unique_extras

['De luxo, Com garagem',
 'Com garagem',
 'Com elevador, Com garagem',
 'Com elevador',
 'De luxo, Com elevador, Com garagem',
 'Rés do chão',
 'Piscina, Com elevador, Com garagem',
 'De luxo, Piscina, Com elevador',
 'Vista para mar, Com elevador, Com garagem',
 'Piscina, Vista para mar, Com garagem',
 'Rés do chão, Com elevador, Com garagem',
 'Último andar, Com elevador, Com garagem',
 'Piscina, Com elevador',
 'De luxo, Piscina, Com garagem',
 'De luxo, Vista para mar, Com elevador, Com garagem',
 'De luxo, Com elevador',
 'Rés do chão, Com garagem',
 'Último andar',
 'Piscina',
 'De luxo',
 'Piscina, Último andar, Com elevador, Com garagem',
 'Rés do chão, Com elevador',
 'De luxo, Piscina, Rés do chão, Com elevador, Com garagem',
 'De luxo, Último andar, Com elevador',
 'De luxo, Piscina, Com elevador, Com garagem',
 'Vista para mar, Com elevador',
 'De luxo, Piscina, Rés do chão',
 'Piscina, Vista para mar, Com elevador',
 'De luxo, Piscina',
 'Piscina, Rés do chão',
 'De luxo, 

In [535]:
def flatten_list(l):
    """Lazily yield the leaves of an arbitrarily nested list, depth first.

    Only ``list`` instances are descended into; any other element (strings,
    tuples, numbers, ...) is yielded unchanged, in its original order.
    """
    # Stack of partially consumed iterators, innermost list on top.
    pending = [iter(l)]
    while pending:
        for el in pending[-1]:
            if isinstance(el, list):
                # Descend; the parent iterator keeps its position for later.
                pending.append(iter(el))
                break
            yield el
        else:
            # Innermost list exhausted: resume its parent.
            pending.pop()

# Split every extras entry into its individual, stripped items. Entries that
# are lists are flattened; string entries are split on commas.
flattened = [item.strip() for sublist in unique_extras for item in (flatten_list(sublist) if isinstance(sublist, list) else sublist.split(','))]

# Extract unique values. Sorting gives a stable order: plain set iteration
# order for strings can differ between interpreter sessions, which would
# reorder the feature columns built from this list. Empty items are dropped.
unique_values = sorted({item for item in flattened if item})
print(unique_values)

{'Vista para mar', 'Com elevador', 'De luxo', 'Piscina', 'Com garagem', 'Rés do chão', 'Último andar'}


Converti o set numa lista para a poder usar a seguir.

In [None]:
# Create a copy of LisbonProperties so the original frame stays untouched
lx_prop = LisbonProperties.copy()
# One-column DataFrame listing every distinct feature
df = pd.DataFrame({'Description': unique_values})

# Split each row's extras once into a set of stripped items. Missing values
# become an empty string first so the .str accessor never sees NaN.
extras_items = (
    lx_prop['extras']
    .fillna('')
    .str.split(',')
    .apply(lambda parts: {part.strip() for part in parts})
)

# Add one binary column per feature: 1 when the feature is one of the row's
# extras, 0 otherwise. Exact item matching avoids the false positives a
# substring test gives when one feature name is contained in another.
for feature in unique_values:
    lx_prop[feature] = extras_items.apply(lambda items, feature=feature: int(feature in items))

lx_prop.head()

In [None]:
# Drop the raw extras text. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
lx_prop = lx_prop.drop(columns='extras', errors='ignore')
lx_prop.head()

In [516]:
# Distinct station names from the 'Nome' column: whitespace-only names are
# skipped, each name is split on commas and every piece is stripped.
# The names are handled as strings (isspace/split are called on them), so no
# list-flattening helper is needed here. Sorting gives a stable order across
# runs, unlike iterating a set of strings.
unique_stations = sorted({
    piece.strip()
    for station_name in LisbonMetro['Nome'].unique()
    if not station_name.isspace()
    for piece in station_name.split(',')
})
print(unique_stations)

Unnamed: 0,id,title,price,num_rooms,total_area,latitude,longitude,region,extras,Type,Typology,N_Rooms,Extra_Rooms,Price,Area_m2,N_Extras
0,0,"Apartamento T1 em Benfica, Lisboa",269.900 €,1 quarto,Área bruta 97 m²,3875171,-92009,Benfica,,Apartamento,T1,1,0,269900,97,0
1,1,"Apartamento T2 em Alvalade, Lisboa",430.000 €,2 quartos,Área bruta 90 m²,387457392,-91425898,Alvalade,,Apartamento,T2,2,0,430000,90,0
2,2,"Apartamento T3 na Rua António Nobre, São Domin...",399.900 €,3 quartos,Área bruta 120 m²,3874657,-917989,São Domingos de Benfica,,Apartamento,T3,3,0,399900,120,0
3,3,"Apartamento T5 na Rua Sousa Pinto, Santo Antón...",3.950.000 €,5 quartos,Área bruta 416 m²,387234746455,-91581178942,Santo António,"De luxo, Com garagem",Apartamento,T5,5,0,3950000,416,2
4,4,"Apartamento T1 em Praça de Luís de Camões, Mis...",690.000 €,1 quarto,Área bruta 93 m²,3871078,-914385,Misericórdia,Com garagem,Apartamento,T1,1,0,690000,93,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9827,9827,"Apartamento T1+2 na Rua Cecílio de Sousa, Mise...",320.000 €,1 quarto,Área bruta 68 m²,387160237,-91500739,Misericórdia,Rés do chão,Apartamento,T1+2,1,2,320000,68,1
9828,9828,"Apartamento T2 Duplex em Calçada da Ajuda, Aju...",325.000 €,2 quartos,Área bruta 48 m²,3870531,-919939,Ajuda,,Apartamento,T2,2,0,325000,48,0
9829,9829,"Apartamento T2 em Largo Conde de Ottolini, São...",318.000 €,2 quartos,Área bruta 85 m²,3874253,-917759,São Domingos de Benfica,Com elevador,Apartamento,T2,2,0,318000,85,1
9830,9830,"Apartamento T1 na Rua dos Arneiros, Benfica, L...",269.900 €,1 quarto,Área bruta 44 m²,3875171,-92009,Benfica,,Apartamento,T1,1,0,269900,44,0


In [None]:
# Give lx_prop one column per station name, every row starting at 0.
zero_flags = dict.fromkeys(unique_stations, 0)
for station, initial_flag in zero_flags.items():
    lx_prop[station] = initial_flag

lx_prop.head()

Tentativa de assinalar (flag) as estações a menos de 1 km.

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine distance between two points on the earth specified in decimal degrees.

    Args:
        lat1, lon1: Latitude and longitude of the first point. Numbers or
            numeric strings are accepted.
        lat2, lon2: Latitude and longitude of the second point, same rules.

    Returns:
        float: Great-circle distance in kilometres (Earth radius 6371 km).
            NaN inputs produce a NaN distance.

    Raises:
        ValueError: If a coordinate string cannot be parsed as a float.
    """
    # Coerce to float first so coordinates read as text also work,
    # then convert decimal degrees to radians.
    lat1, lon1, lat2, lon2 = (math.radians(float(value)) for value in (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 for near-antipodal points, which
    # would make sqrt(1 - a) fail. The comparison is False for NaN, so NaN
    # still propagates instead of being turned into a real distance.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

    # Radius of Earth in kilometers: 6371
    distance = 6371 * c
    return distance

def update_house_station_flags(lx_prop, LisbonMetro):
    """
    Flag, for every house in lx_prop, which metro stations lie within 1 km.

    One column per station is written to lx_prop, named after the station's
    'NomeConcat' value. The frame is modified in place and also returned.

    Args:
    lx_prop (pd.DataFrame): Houses; must have 'latitude' and 'longitude' columns
        (numbers or numeric strings, in decimal degrees).
    LisbonMetro (pd.DataFrame): Stations; must have 'NomeConcat', 'Lat.' and
        'Long.' columns. 'NomeConcat' values must be unique, because they are
        used as dictionary keys and as column names.

    Returns:
    pd.DataFrame: The same lx_prop object with a flag per station
        (1 if within 1km, 0 otherwise). Houses whose coordinates are missing
        or not numeric get 0 for every station.
    """
    # Extract station coordinates into a dictionary keyed by station name
    station_coords = LisbonMetro.set_index('NomeConcat')[['Lat.', 'Long.']].to_dict('index')

    # Scraped coordinates may be text; anything unparsable becomes NaN.
    house_lats = pd.to_numeric(lx_prop['latitude'], errors='coerce')
    house_lons = pd.to_numeric(lx_prop['longitude'], errors='coerce')

    # Build each station column in one go instead of writing cell by cell.
    for station, coords in station_coords.items():
        station_lat = coords['Lat.']
        station_lon = coords['Long.']
        lx_prop[station] = [
            int(
                not (pd.isna(house_lat) or pd.isna(house_lon))
                and haversine(house_lat, house_lon, station_lat, station_lon) <= 1
            )
            for house_lat, house_lon in zip(house_lats, house_lons)
        ]

    return lx_prop

# Example usage and setup commented out to prevent execution in PCI
# (column names are the ones update_house_station_flags actually reads)
# lx_prop = pd.DataFrame({
#     'HouseID': [1, 2],
#     'latitude': [38.75171, 38.752],
#     'longitude': [-9.2009, -9.201]
# })
# LisbonMetro = pd.DataFrame({
#     'NomeConcat': ['Station A', 'Station B', 'Station C'],
#     'Lat.': [38.752, 38.760, 38.749],
#     'Long.': [-9.199, -9.195, -9.204]
# })
# updated_lx_prop = update_house_station_flags(lx_prop, LisbonMetro)
# print(updated_lx_prop)
