In [9]:
import pandas as pd
import json
from bs4 import BeautifulSoup
import logging
from typing import List, Dict
import aiohttp


In [10]:
# Root logger setup for the whole notebook: emit INFO and above,
# rendered as "[LEVEL] - message".
fmt = '[%(levelname)s] - %(message)s'
level = logging.INFO
logging.basicConfig(format=fmt, level=level)


In [11]:

# Load the feature definitions and index them by their 'key' field
# (on duplicate keys the later entry wins).
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# any non-ASCII text intact on platforms whose default encoding differs.
with open('features.json', encoding='utf-8') as f:
    feature_list = json.load(f)
    # Use a distinct loop name so the file handle `f` is not shadowed.
    features = {feature['key']: feature for feature in feature_list}


In [12]:
async def find_ad_url_list(amount_of_page: int, district: str) -> List[str]:
    """Collect advertisement URLs from the paginated listing of one district.

    Requests listing pages ``0 .. amount_of_page - 1`` one after another over
    a single HTTP session and gathers the ``data-original-url`` attribute of
    every element that carries one.

    Args:
        amount_of_page: Number of listing pages to request.
        district: District identifier interpolated into the listing URL.

    Returns:
        All collected attribute values, in page order.
    """
    collected = []
    async with aiohttp.ClientSession() as http:
        for page_no in range(amount_of_page):
            listing_url = f'https://ingatlanok.hu/elado/lakas/budapest-{district}/20Mft-tol;70Mft-ig?page={page_no}'
            async with http.get(listing_url) as response:
                markup = await response.text()
                parsed = BeautifulSoup(markup, 'html.parser')
                # Every element exposing the attribute contributes one URL.
                for tag in parsed.select('[data-original-url]'):
                    collected.append(tag['data-original-url'])
                logging.info('scraping ad urls: %d / %d', page_no, amount_of_page)
    return collected


In [13]:
def find_feature_by_displaying_name(name):
    """Look up a feature definition by its display label.

    Scans the module-level ``features`` mapping in insertion order and
    returns the first entry whose ``'displayingName'`` equals ``name``,
    or ``None`` when no entry matches.
    """
    for candidate in features.values():
        if candidate['displayingName'] == name:
            return candidate
    return None


In [14]:
def process_ad_data(html:str, district:str)-> Dict[str, str]:
    """Extract the known features of a single advertisement page.

    Reads the rows of the first two ``<tbody>`` elements of the page. Each row
    is expected to hold two ``<td>`` cells: a label and a value. Labels are
    matched against the configured features via
    ``find_feature_by_displaying_name``; rows with unknown labels are ignored.

    Pages with fewer than two ``<tbody>`` elements and rows that do not have
    exactly two cells no longer raise; the available data is used and the
    malformed parts are skipped.

    Args:
        html: Raw HTML of the advertisement page.
        district: District the advertisement was listed under; stored under
            the ``'DISTRICT'`` key of the result.

    Returns:
        Mapping from feature key to the cell text with newlines removed,
        always containing ``'DISTRICT'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('tbody')
    if len(tables) < 2:
        # Make incomplete pages visible instead of failing on tables[1].
        logging.warning('ad page has %d feature table(s), expected 2', len(tables))
    ad = {'DISTRICT': district}
    # Only the first two tables are read; slicing tolerates pages with fewer.
    for table in tables[:2]:
        for tr_element in table.find_all('tr'):
            cells = tr_element.find_all('td')
            if len(cells) != 2:
                # Not a (label, value) row; unpacking it would raise ValueError.
                continue
            td_1, td_2 = cells
            # Drop the label's last character -- presumably a trailing ':'
            # (TODO confirm against the page markup).
            displaying_name = td_1.text[:-1]
            feature = find_feature_by_displaying_name(displaying_name)
            if not feature:
                continue
            ad[feature["key"]] = td_2.text.replace('\n', '')
    return ad

In [15]:
async def get_and_process_ad_list(ad_url_list:List[str], district:str) -> List[Dict[str, str]]:
    """Download every advertisement page and turn it into a feature record.

    Pages are fetched one after another over a single HTTP session and parsed
    with ``process_ad_data``. Progress is logged after every 50th processed ad.

    Args:
        ad_url_list: URLs of the advertisement pages to fetch.
        district: District the URLs belong to; forwarded to
            ``process_ad_data`` and included in the progress log.

    Returns:
        One record (feature key -> value) per URL, in input order.
    """
    async with aiohttp.ClientSession() as session:
        ad_data = []
        for url in ad_url_list:
            async with session.get(url) as resp:
                html_text = await resp.text()
                ad_data.append(process_ad_data(html_text, district))
                if len(ad_data) % 50 == 0:
                    # %s rather than %d: district is a str, and %d would make
                    # the log record fail to format.
                    logging.info('processing ad of district %s: %d / %d', district, len(ad_data), len(ad_url_list))
        return ad_data

In [16]:
ad_raw_data = []
for district in features['DISTRICT']['values']:
    ad_url_list = await find_ad_url_list(15, district)
    data = await get_and_process_ad_list(ad_url_list, district)
    ad_raw_data.extend(data)

[INFO] - scraping ad urls: 0 / 15
[INFO] - scraping ad urls: 1 / 15
[INFO] - scraping ad urls: 2 / 15
[INFO] - scraping ad urls: 3 / 15
[INFO] - scraping ad urls: 4 / 15
[INFO] - scraping ad urls: 5 / 15
[INFO] - scraping ad urls: 6 / 15
[INFO] - scraping ad urls: 7 / 15
[INFO] - scraping ad urls: 8 / 15
[INFO] - scraping ad urls: 9 / 15
[INFO] - scraping ad urls: 10 / 15
[INFO] - scraping ad urls: 11 / 15
[INFO] - scraping ad urls: 12 / 15
[INFO] - scraping ad urls: 13 / 15
[INFO] - scraping ad urls: 14 / 15
[INFO] - processing ad: 10 / 97
[INFO] - processing ad: 20 / 97
[INFO] - processing ad: 30 / 97
[INFO] - processing ad: 40 / 97
[INFO] - processing ad: 50 / 97
[INFO] - processing ad: 60 / 97
[INFO] - processing ad: 70 / 97
[INFO] - processing ad: 80 / 97
[INFO] - processing ad: 90 / 97
[INFO] - scraping ad urls: 0 / 15
[INFO] - scraping ad urls: 1 / 15
[INFO] - scraping ad urls: 2 / 15
[INFO] - scraping ad urls: 3 / 15
[INFO] - scraping ad urls: 4 / 15
[INFO] - scraping ad urls: 

CancelledError: 

In [None]:
# Turn the scraped records into a table (one row per record) and persist it
# as a pickle file in the working directory.
raw_dataset = pd.DataFrame(data=ad_raw_data)
raw_dataset.to_pickle(path='ad.pkl')