In [9]:
import pandas as pd
import json
from bs4 import BeautifulSoup
import logging
from typing import List, Dict
import aiohttp


In [4]:
level = logging.INFO
fmt = '[%(levelname)s] - %(message)s'
logging.basicConfig(level=level, format=fmt)


In [2]:

with open('features.json') as f:
    feature_list = json.loads(f.read())
    features = {f['key']: f for f in feature_list}


In [11]:
async def find_ad_url_list(amount_of_page: int, district: str) -> List[str]:
    ad_url_list = []
    async with aiohttp.ClientSession() as session:
        for page in range(amount_of_page):
            url = f'https://ingatlanok.hu/elado/lakas/budapest-{district}/20Mft-tol;70Mft-ig?page={page}'
            async with session.get(url) as resp:
                html_text = await resp.text()
                soup = BeautifulSoup(html_text, 'html.parser')
                ad_url = [item['data-original-url']
                          for item in soup.select('[data-original-url]')]
                ad_url_list.extend(ad_url)
                logging.info('%d / %d', page, amount_of_page)
    return ad_url_list


In [19]:
def find_feature_by_displaying_name(name):
    return next((f for f in features.values() if f['displayingName'] == name), None)


In [20]:
def process_ad_data(html:str, district_name:str)-> Dict[str, str]:
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.findAll('tbody')
    tr_elements = [*tables[0].find_all('tr'), *tables[1].find_all('tr')]
    ad = {'DISTRICT_NUM':district_name}
    for tr_element in tr_elements:
        td_1, td_2 = tr_element.find_all('td')
        displaying_name = td_1.text[:-1]
        feature = find_feature_by_displaying_name(displaying_name)
        if not feature:
            continue
        value = td_2.text.replace('\n', '')
        ad[feature["key"]]=value
    return ad

In [21]:
async def fetch_and_process_ad_list(ad_url_list:List[str], district_num:int) -> List[str]:
    async with aiohttp.ClientSession() as session:
        ad_data = []
        for url in ad_url_list:
            async with session.get(url) as resp:
                html_text = await resp.text()
                ad_data.append(process_ad_data(html_text, district_num))
                if len(ad_data) % 10 ==0:
                    logging.info('%d / %d', len(ad_data), len(ad_url_list))
        return ad_data

In [22]:
ad_raw_data = []
for district in features['DISTRICT_NUM']['values']:
    ad_url_list = await find_ad_url_list(1, district)
    data = await fetch_and_process_ad_list(ad_url_list, district)
    ad_raw_data.extend(data)

[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
[INFO] - 0 / 1
[INFO] - 10 / 20
[INFO] - 20 / 20
