In [13]:
from bs4 import BeautifulSoup
import cloudscraper
import pandas as pd
import itertools
import re
import json

In [2]:
def get_house_links(soup):
    """Return the link of every sold-house card found on a listing page.

    Args:
        soup: Parsed listing page; any object exposing BeautifulSoup's
            ``find_all(name, class_=...)`` works.

    Returns:
        list[str]: The ``href`` of each matching ``<a>`` tag, in the order
        ``find_all`` yields them. Anchors with a missing or empty ``href``
        are skipped.
    """
    a_tags = soup.find_all('a', class_='sold-property-link js-sold-property-card-link')
    hrefs = (tag.get('href') for tag in a_tags)
    # tag.get() returns None when the attribute is absent; None is not a
    # fetchable URL, so drop it here instead of handing it to the caller.
    return [href for href in hrefs if href]

def _attribute_text(attributes, label):
    """Return the text of the <dd> following the <dt> whose string is `label`, or None if absent."""
    if attributes is None:
        return None
    term = attributes.find("dt", string=label)
    if term is None:
        return None
    value = term.find_next_sibling('dd')
    return value.text if value is not None else None


def get_houses_info(links:list[str]) -> list[dict[str, str]]:
    """Download every house page in `links` and return one property dict per house.

    Most fields come from the JSON stored in the ``data-initial-data``
    attribute of the page's map ``<div>``; build year and operating cost are
    read from the ``sold-property__attributes`` definition list.

    Args:
        links: URLs of individual sold-house pages.

    Returns:
        A list with one dict per link, in the same order. Every value is a
        string: ``patio`` / ``balcony`` are ``'1'`` or ``'0'``, ``build_year``
        and ``operating cost`` are the string ``'None'`` when the page does not
        list them, and ``operating cost`` / ``sold_price`` hold the digit
        groups of the displayed amount joined by single spaces.

    Raises:
        ValueError: If a page has no map ``<div>`` carrying the listing JSON.
        KeyError: If the listing JSON lacks one of the expected fields.
    """
    scraper = cloudscraper.create_scraper()  # one CloudScraper instance reused for every link

    house_properties = []

    for link in links:
        html = scraper.get(link).content
        soup = BeautifulSoup(html, 'html.parser')

        map_div = soup.find('div', class_='sold-property__map js-listing-map-sold')
        if map_div is None:
            # Fail with the offending URL instead of an opaque "NoneType is not subscriptable".
            raise ValueError(f"No listing data found on page: {link}")
        listing = json.loads(map_div['data-initial-data'])['listing']

        # Look these up once per page; both are consulted for two fields below.
        labels = {feature['text'] for feature in listing['labels']}
        attributes = soup.find("dl", class_="sold-property__attributes")
        build_year = _attribute_text(attributes, "Byggår")
        operating_cost = _attribute_text(attributes, "Driftskostnad")

        house_properties.append({
            "adress": f"{listing['streetAddress']}, {listing['area']}, {listing['municipality']['fullName']}",
            # NOTE(review): assumes `coordinate` is ordered [longitude, latitude] — confirm against the payload.
            "longitude": str(listing['coordinate'][0]),
            "latitude": str(listing['coordinate'][1]),
            "living_area": str(listing['livingArea']),
            "land_area": str(listing['landArea']),
            "supplemental_area": str(listing['supplementalArea']),
            "patio": '1' if 'Uteplats' in labels else '0',
            "balcony": '1' if 'Balkong' in labels else '0',
            "number_of_rooms": str(listing['numberOfRooms']),
            "build_year": build_year.strip() if build_year is not None else 'None',
            "operating cost": ' '.join(re.findall(r'\d+', operating_cost)) if operating_cost is not None else 'None',
            "sold_price": ' '.join(re.findall(r'\d+', str(listing['sellingPrice']['formatted']))),
        })

    return house_properties
    


In [16]:
import concurrent.futures

# URLs of search-result pages 1 through 50 for villas sold in the last 12 months.
listing_links = list(
    map(
        lambda page: f"https://www.hemnet.se/salda/bostader?item_types%5B%5D=villa&page={page}&sold_age=12m",
        range(1, 51),
    )
)

# Collects the per-page scraping results produced by the thread pool.
data_pararell = list()

def process_listing_page(listing_page):
    """Scrape one listing page and return the properties of every house it links to.

    Args:
        listing_page: URL of a search-result page.

    Returns:
        The result of ``get_houses_info`` for the house links found on the page.
    """
    # Create the scraper here: the visible cells define no module-level `scraper`,
    # so relying on one fails on a fresh kernel. A per-call instance also means
    # worker threads never share a single session.
    scraper = cloudscraper.create_scraper()
    html = scraper.get(listing_page).content
    soup = BeautifulSoup(html, 'html.parser')

    links = get_house_links(soup)

    return get_houses_info(links)

# Scrape all listing pages on a thread pool. executor.map yields results in the
# order of listing_links, one list of house dicts per page.
# Leaving the `with` block shuts the executor down and waits for pending work,
# so no explicit executor.shutdown() call is needed afterwards.
with concurrent.futures.ThreadPoolExecutor() as executor:
    data_pararell.extend(executor.map(process_listing_page, listing_links))

# Merge the per-page lists into one flat list of house dicts.
flattened_data = list(itertools.chain.from_iterable(data_pararell))


In [17]:
# Build a table with one row per scraped house and one column per property.
df = pd.DataFrame(data=flattened_data)

# Save the table as data.csv in the working directory, leaving out the row index.
df.to_csv(path_or_buf='data.csv', index=False)