In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import math 
import logging
from tqdm import tqdm

In [None]:
# Log to both the "info.log" file and the console, at INFO level and above.
_log_handlers = [logging.FileHandler("info.log"), logging.StreamHandler()]
logging.basicConfig(
    handlers=_log_handlers,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

class PageURLs:
    """
    Scrape the result-page URLs of the listing search rooted at base_url.

    Attributes:
        base_url (str): The base URL that contains all the house listings.
        params (dict): Query parameters shared by every page URL.

    Methods:
        get_npage: Extract the number of result pages in base_url.
        construct_page_url: Construct an individual page URL.
        get_page_urls: Get all page URLs in the base_url.
    """

    # The page count is derived by assuming this many listings per page.
    ENTRIES_PER_PAGE = 20

    def __init__(self, base_url):
        self.base_url = base_url
        self.params = {
            "d": "true",
            "sd": "DESC",
            "sf": "RELEVANCE",
        }

    @staticmethod
    def _parse_entry_count(heading_text):
        """
        Return the listing count that starts heading_text, or None.

        The first whitespace-separated token is read as an integer after
        stripping "." characters, because large numbers contain dots
        (e.g. "1.697").
        """
        tokens = heading_text.split()
        if not tokens:
            return None
        try:
            return int(tokens[0].replace(".", ""))
        except ValueError:
            return None

    def get_npage(self):
        """
        Return the number of result pages advertised by base_url.

        The listing count is read from the page's <h1> headings; when several
        headings carry a count, the last one wins. Returns 1 when the request
        fails or when no heading starts with a number, so that callers still
        try the first page.
        """
        try:
            page = requests.get(self.base_url, timeout=10)
            page.raise_for_status()  # Raise an exception for HTTP errors
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to open base_url: {e}")
            return 1

        elements = BeautifulSoup(page.content, "html.parser")
        logging.info("Base_url is opened")

        npage = None
        for element in elements.find_all("h1"):
            nentries = self._parse_entry_count(element.text)
            if nentries is None:
                # Heading without a leading number: not the result counter.
                continue
            logging.info(f"There are {nentries} house listings in the base_url.")
            npage = math.ceil(nentries / self.ENTRIES_PER_PAGE)
            logging.info(f"There are {npage} pages of house listings in the base_url.")

        if npage is None:
            logging.warning("No listing count found in base_url; assuming a single page.")
            return 1

        return npage

    def construct_page_url(self, page):
        """Return base_url with the shared query parameters plus sp=page."""
        from urllib.parse import urlencode

        # Work on a per-call copy so the page number never leaks into self.params.
        query = {**self.params, "sp": page}
        return f"{self.base_url}?{urlencode(query)}"

    def get_page_urls(self):
        """Return the URLs of pages 1..npage; an empty list on unexpected errors."""
        try:
            npage = self.get_npage()
            return [self.construct_page_url(page) for page in range(1, npage + 1)]
        except Exception as e:
            logging.error(f"An unexpected error occurred while constructing page URLS: {e}")
            return []

class ExposeLinkExtractor:
    """
    Collects the housing-entry hrefs (expose links) found on one page URL.
    """

    def __init__(self, url):
        self.url = url

    def extract_expose_links(self):
        """Return the expose hrefs on self.url, or an empty list on any failure."""
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            hrefs = [anchor.get("href") for anchor in soup.find_all("a", href=True)]

            # Keep "/expose/" hrefs, except those under "/projekte/expose/".
            return [
                href
                for href in hrefs
                if "/expose/" in href and "/projekte/expose/" not in href
            ]

        except requests.exceptions.RequestException as e:
            logging.error(f"Error while processing the URL {self.url}: {e}")
            return []

        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            return []

class AllURLs:
    """
    Gathers every house listing link reachable from base_url by combining PageURLs() and ExposeLinkExtractor().
    """
    def __init__(self, base_url):
        self.base_url = base_url

    def scrape_all_links(self):
        """Return the expose links of all result pages as one flat list."""
        page_urls = PageURLs(self.base_url).get_page_urls()
        collected = []

        # TODO: remove progress bar when deploying
        progress = tqdm(total=len(page_urls), desc="Scraping house listings (expose link) from each page")
        with progress:
            for page_url in page_urls:
                try:
                    found = ExposeLinkExtractor(page_url).extract_expose_links()
                    if found is not None:
                        collected.extend(found)
                except Exception as e:
                    # A failing page is logged and skipped; the rest still run.
                    logging.error(f"Failed to scrape links from {page_url}: {e}")

                progress.update(1)

        logging.info(f"Total links scraped: {len(collected)}")
        return collected

In [None]:
class ExposeLinkExtractor:
    """
    Pulls the housing-entry hrefs (expose links) out of a single page URL.
    """

    def __init__(self, url):
        self.url = url

    def extract_expose_links(self):
        # Returns a list of href strings; [] when fetching or parsing fails.
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            collected = []
            for anchor in soup.find_all("a", href=True):
                target = anchor.get("href")
                # Skip anything that is not an "/expose/" href, and also
                # hrefs that live under "/projekte/expose/".
                if "/expose/" not in target or "/projekte/expose/" in target:
                    continue
                collected.append(target)

            return collected

        except requests.exceptions.RequestException as e:
            logging.error(f"Error while processing the URL {self.url}: {e}")
            return []

        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            return []

In [None]:
class AllURLs:
    """
    Walks every result page of base_url (via PageURLs()) and merges the expose links found by ExposeLinkExtractor().
    """
    def __init__(self, base_url):
        self.base_url = base_url

    def scrape_all_links(self):
        """Return one flat list with the expose links of every result page."""
        result_pages = PageURLs(self.base_url).get_page_urls()
        gathered = []

        # TODO: remove progress bar when deploying
        with tqdm(total=len(result_pages), desc="Scraping house listings (expose link) from each page") as bar:
            for result_page in result_pages:
                try:
                    page_links = ExposeLinkExtractor(result_page).extract_expose_links()
                    if page_links is not None:
                        gathered.extend(page_links)
                except Exception as e:
                    # Log the failing page and carry on with the next one.
                    logging.error(f"Failed to scrape links from {result_page}: {e}")

                bar.update(1)

        logging.info(f"Total links scraped: {len(gathered)}")
        return gathered

In [None]:

# Define the base URL, hamburg rental units
base_url = "https://www.immowelt.de/liste/hamburg/wohnungen/mieten"

# Collect the expose links of every result page under base_url.
# This issues one HTTP request per result page (see AllURLs.scrape_all_links).
links = AllURLs(base_url)
expose_links = links.scrape_all_links()

In [None]:
from immowelt_urls import AllURLs
import pandas as pd
from tqdm import tqdm

# Same scrape as before, but using AllURLs imported from the immowelt_urls module.
# NOTE(review): presumably immowelt_urls holds the classes defined earlier in this notebook - confirm.
base_url = "https://www.immowelt.de/liste/hamburg/wohnungen/mieten"

all_urls_scraper = AllURLs(base_url)
expose_links = all_urls_scraper.scrape_all_links()


In [None]:
expose_links

In [None]:
from immowelt_scrapper import ImmoWeltScrapper

# Scrape one listing with ImmoWeltScrapper (imported from immowelt_scrapper;
# its implementation is not part of this notebook).
scraper = ImmoWeltScrapper('https://www.immowelt.de/expose/2dvz55j')

# Build a flat record with one entry per scraper getter.
row = {
    "title": scraper.get_title(),
    "address": scraper.get_address(),
    "cold_rent": scraper.get_cold_rent(),
    "warm_rent": scraper.get_warm_rent(),
    "deposit": scraper.get_deposit(),
    "living_space": scraper.get_living_space(),
    "room_number": scraper.get_room_number(),
    "floor": scraper.get_floor(),
    "availability": scraper.get_availability(),
    "amenities": scraper.get_amenities(),
    "built_year": scraper.get_built_year(),
    "energy_consumption": scraper.get_energy_consumption(),
    "object_detail": scraper.get_detail_object(),
    "furnishing": scraper.get_detail_furnishing(),
    "extra": scraper.get_detail_extra(),
    "timestamp": scraper.get_query_time()
}


In [None]:
import json

# `data` is the raw text of the page's __NEXT_DATA__ script tag, set by another cell.
json_object = json.loads(data)

# Strip the branches listed below before pretty-printing. A missing key is
# simply skipped: .get(..., {}) yields a throwaway dict and pop(key, None)
# never raises.
props = json_object.get("props", {})
page_props = props.get("pageProps", {})
classified = page_props.get("classified", {})

classified.get("sections", {}).pop("similarListings", None)
for key in ("seo", "advertising", "tracking", "contactSections"):
    classified.pop(key, None)
for key in ("i18nResources", "headerFooterConfig"):
    page_props.pop(key, None)
for key in ("page", "query", "buildId", "assetPrefix", "isFallback", "gssp", "scriptLoader"):
    json_object.pop(key, None)

json_formatted_str = json.dumps(json_object, indent=4)
print(json_formatted_str)


In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
import json


url = 'https://www.immowelt.de/expose/2dvz55j'

# Bound the wait and fail fast on HTTP errors, as the scraper classes do;
# without a timeout the request can block indefinitely.
page = requests.get(url, timeout=10)
page.raise_for_status()
elements = BeautifulSoup(page.content, "html.parser")
# The listing payload is read from the <script id="__NEXT_DATA__"> tag.
data = elements.find('script', id='__NEXT_DATA__').text

json_object = json.loads(data)

json_formatted_str = json.dumps(json_object, indent=4)
print(json_formatted_str)

In [None]:
data = {}

# Every field below lives under props.pageProps.classified; resolve the
# shared prefixes once instead of repeating the full path on each line.
classified = json_object["props"]["pageProps"]["classified"]
sections = classified["sections"]
address = sections["location"]["address"]
energy = sections["energy"]
# Only the first energy certificate is used ("certificates" is a list).
certificate = energy["certificates"][0]

# NOTE: `id` shadows the builtin, but the record-building cell reads this name.
id = classified["id"]
title = classified["title"]

# Was `creationDate = c`, which raised NameError; creationDate sits next to
# updateDate under "metadata".
creationDate = classified["metadata"]["creationDate"]
updateDate = classified["metadata"]["updateDate"]

city = address["city"]
zipCode = address["zipCode"]
street = address["street"]
district = address["district"]

coordinates = sections["location"]["geometry"]["coordinates"]

# No need for the description yet: there is too much text to parse in a meaningful way.

facts_dict = {}
facts = sections["hardFacts"]["facts"]

for fact in facts:
    print(fact["type"], fact["value"])
    facts_dict[fact["type"]] = fact["value"]

locationDescription = sections["hardFacts"]["locationDescription"]

features = energy["features"]
features_dict = {feature["type"]: feature["value"] for feature in features}

certificates = certificate["features"]  # It is a list
certificates_dict = {feature["type"]: feature["value"] for feature in certificates}

energyRating = certificate["scales"][0]["efficiencyClass"]["rating"]
energyRequirement = certificate["scales"][0]["values"][0]["value"]

energyRating_dict = {"energyRating": energyRating, "energyRequirement": energyRequirement}

housePrices = sections["price"]["base"]["details"]
housePrices_dict = {
    housePrice["label"]["main"]: housePrice["value"]["main"]["value"]
    for housePrice in housePrices
}

# No need for additional price elements, e.g. monthly parking rent or caution. This has nothing to do with the house quality, hence isn't necessary in the model.

details = sections["features"]["details"]["categories"]
details_dict = {
    element["icon"]: element["value"]
    for detail in details
    for element in detail["elements"]
}



In [None]:
# Flat record for one listing: the scalar fields first, then each grouped
# dictionary unpacked in order, so a later group wins on a key clash.
data = {
    "id": id,
    "title": title,
    "creationDate": creationDate,
    "updateDate": updateDate,
    "city": city,
    "zipCode": zipCode,
    "street": street,
    "district": district,
    "coordinates": coordinates,
    "locationDescription": locationDescription,
    **facts_dict,
    **features_dict,
    **certificates_dict,
    **energyRating_dict,
    **housePrices_dict,
    **details_dict,
}

	

In [None]:
data

In [None]:
# Flatten every element of every feature category into one icon -> value mapping.
details = json_object["props"]["pageProps"]["classified"]["sections"]["features"]["details"]["categories"]
details_dict = {
    element["icon"]: element["value"]
    for detail in details
    for element in detail["elements"]
}


In [None]:
# NOTE(review): priceAdditionals is not assigned anywhere in the visible notebook;
# this cell raises NameError unless an earlier (removed) cell defined it.
print(priceAdditionals["label"])

In [None]:
# Skipped the description because it contains too much text

#facts_dict = {}
#facts = json_object["props"]["pageProps"]["classified"]["sections"]["hardFacts"]["facts"]
#for fact in facts:
#    value = re.sub(r'[^\d.,]', '', fact["value"]).replace(',', '.')
#    if value:
#        facts_dict[fact["type"]] = float(value)

In [None]:
import requests
from bs4 import BeautifulSoup
import logging


# Route log records to the "info.log" file as well as to the console.
_file_handler = logging.FileHandler("info.log")
_console_handler = logging.StreamHandler()
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[_file_handler, _console_handler],
    level=logging.INFO,
)

class ParsePageJson:
    """
    Extracts housing related data from the JSON object obtained from the webpage.

    Attributes:
        page_url (str): The URL of the webpage containing the JSON object.
        page_json (dict): The JSON object to parse.
        num_total_features (int): The total number of features to extract
            (10 scalar fields + 6 feature groups).
        num_extracted_features (int): The number of features successfully
            extracted by the most recent extract_features() call.
        data (dict): The extracted data.

    Methods:
        __init__(page_url): Initializes the object with the page URL and loads the JSON object.
        get_json_object(): Loads the JSON object from the webpage.
        extract_features(): Extracts the housing related data from the JSON object.
    """

    # (output key, key path below props.pageProps.classified) of each scalar field.
    _SCALAR_FIELDS = (
        ("id", ("id",)),
        ("title", ("title",)),
        ("creationDate", ("metadata", "creationDate")),
        ("updateDate", ("metadata", "updateDate")),
        ("city", ("sections", "location", "address", "city")),
        ("zipCode", ("sections", "location", "address", "zipCode")),
        ("street", ("sections", "location", "address", "street")),
        ("district", ("sections", "location", "address", "district")),
        ("coordinates", ("sections", "location", "geometry", "coordinates")),
        ("locationDescription", ("sections", "hardFacts", "locationDescription")),
    )

    # Errors raised when a key/index is missing or an intermediate node is None.
    _LOOKUP_ERRORS = (KeyError, IndexError, TypeError)

    def __init__(self, page_url):
        self.page_url = page_url
        self.num_total_features = 16  # 10 scalar fields + 6 feature groups
        self.num_extracted_features = 0
        self.data = {}
        self.get_json_object()

    def get_json_object(self):
        """
        Download page_url and store its __NEXT_DATA__ JSON in self.page_json.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors.
            ValueError: if the page has no __NEXT_DATA__ script tag or the
                tag does not contain valid JSON.
        """
        import json  # local import: this cell's import list does not include json

        page = requests.get(self.page_url, timeout=10)
        page.raise_for_status()  # Raise an exception for HTTP errors
        elements = BeautifulSoup(page.content, "html.parser")
        script = elements.find('script', id='__NEXT_DATA__')
        if script is None:
            raise ValueError(f"No __NEXT_DATA__ script tag found at {self.page_url}")
        self.page_json = json.loads(script.text)

    @staticmethod
    def _dig(node, path):
        """Follow path (a sequence of keys/indices) down from node."""
        for key in path:
            node = node[key]
        return node

    @staticmethod
    def _hard_facts(classified):
        """Map each hard fact's type to its value."""
        facts = classified["sections"]["hardFacts"]["facts"]
        return {fact["type"]: fact["value"] for fact in facts}

    @staticmethod
    def _energy_features(classified):
        """Map each energy feature's type to its value."""
        features = classified["sections"]["energy"]["features"]
        return {feature["type"]: feature["value"] for feature in features}

    @staticmethod
    def _certificate_features(classified):
        """Map each feature of the first energy certificate to its value."""
        features = classified["sections"]["energy"]["certificates"][0]["features"]
        return {feature["type"]: feature["value"] for feature in features}

    @staticmethod
    def _energy_rating(classified):
        """Return rating and requirement from the first certificate's first scale."""
        scale = classified["sections"]["energy"]["certificates"][0]["scales"][0]
        return {
            "energyRating": scale["efficiencyClass"]["rating"],
            "energyRequirement": scale["values"][0]["value"],
        }

    @staticmethod
    def _house_prices(classified):
        """Map each base price label to its main value."""
        prices = classified["sections"]["price"]["base"]["details"]
        return {
            price["label"]["main"]: price["value"]["main"]["value"]
            for price in prices
        }

    @staticmethod
    def _feature_details(classified):
        """Map the icon of every element in every detail category to its value."""
        categories = classified["sections"]["features"]["details"]["categories"]
        return {
            element["icon"]: element["value"]
            for category in categories
            for element in category["elements"]
        }

    def extract_features(self):
        """
        Populate and return self.data from self.page_json.

        Scalar fields that cannot be found are stored as None. A feature
        group that cannot be read completely contributes no keys at all.
        num_extracted_features is reset on every call, so it always
        reflects this call only.
        """
        self.num_extracted_features = 0

        try:
            classified = self.page_json["props"]["pageProps"]["classified"]
        except self._LOOKUP_ERRORS:
            # Every lookup below then fails with TypeError and is handled there.
            classified = None

        for name, path in self._SCALAR_FIELDS:
            try:
                self.data[name] = self._dig(classified, path)
                self.num_extracted_features += 1
            except self._LOOKUP_ERRORS:
                self.data[name] = None

        group_extractors = (
            self._hard_facts,
            self._energy_features,
            self._certificate_features,
            self._energy_rating,
            self._house_prices,
            self._feature_details,
        )
        for extract_group in group_extractors:
            try:
                # Build the whole group first so a partial group is never merged.
                self.data.update(extract_group(classified))
                self.num_extracted_features += 1
            except self._LOOKUP_ERRORS:
                logging.debug(f"Feature group {extract_group.__name__} missing for {self.page_url}")

        logging.info(
            f"Total number of features extracted from {self.page_url}: "
            f"{self.num_extracted_features}/{self.num_total_features}"
        )

        return self.data


In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
import json


url = 'https://www.immowelt.de/expose/2fc3d5w'

# Constructing ParsePageJson downloads the page and loads its JSON (see __init__).
pg = ParsePageJson(url)

# Flat dict of the extracted features for this listing.
data = pg.extract_features()

In [None]:
data

In [None]:

    # NOTE(review): scratch fragment - it is indented as if it were a function
    # body, but no enclosing def is present in this cell, and it does not parse
    # as written. It looks like an earlier draft of ParsePageJson.extract_features
    # that stored each group under its own key - confirm before reviving.
    try:
        features_dict = {}
        features = json_object["props"]["pageProps"]["classified"]["sections"]["energy"]["features"]
        for feature in features:
            features_dict[feature["type"]] = feature["value"]
        # NOTE(review): update() is given the string "features_dict", not the dict.
        data.update("features_dict")
        # NOTE(review): this `if` has no colon and no body (syntax error).
        if data["features_dict"]
        successful_extractions += 1
    except Exception as e:
        data["features"] = {}


    # Features of the first energy certificate, stored under "certificates".
    try:
        certificates_dict = {}
        certificates = json_object["props"]["pageProps"]["classified"]["sections"]["energy"]["certificates"][0]["features"]
        for feature in certificates:
            certificates_dict[feature["type"]] = feature["value"]
        data["certificates"] = certificates_dict
        successful_extractions += 1
    except Exception as e:
        data["certificates"] = {}


    # Rating and requirement of the first certificate's first scale.
    try:
        energyRating_dict = {}
        energyRating = json_object["props"]["pageProps"]["classified"]["sections"]["energy"]["certificates"][0]["scales"][0]["efficiencyClass"]["rating"]
        energyRequirement = json_object["props"]["pageProps"]["classified"]["sections"]["energy"]["certificates"][0]["scales"][0]["values"][0]["value"]
        energyRating_dict = {"energyRating": energyRating, "energyRequirement": energyRequirement}
        data["energyRating"] = energyRating_dict
        successful_extractions += 1
    except Exception as e:
        data["energyRating"] = {}
    # NOTE(review): this increment runs whether or not the block above succeeded.
    successful_extractions += 1

    # NOTE(review): this try has no except/finally clause (syntax error).
    try:
        housePrices_dict = {}
        housePrices = json_object["props"]["pageProps"]["classified"]["sections"]["price"]["base"]["details"]
        for housePrice in housePrices:
            housePrices_dict[housePrice["label"]["main"]] = housePrice["value"]["main"]["value"]
        data["housePrices"] = housePrices_dict
        successful_extractions += 1

In [None]:
from immowelt_urls import AllURLs
from immowelt_parse import ParsePageJson


base_url = "https://www.immowelt.de/liste/hamburg/wohnungen/mieten"

all_urls_scraper = AllURLs(base_url)
expose_links = all_urls_scraper.scrape_all_links()

# One dict per listing. Merging every page into a single dict with
# data.update() made each listing overwrite the previous one, because all
# listings share the same keys ("id", "title", ...).
data = []

for expose_link in expose_links:
    page = ParsePageJson(expose_link)
    page_data = page.extract_features()
    data.append(page_data)


In [1]:
from immowelt_urls import AllURLs
from immowelt_parse import ParsePageJson
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import logging
import json


# Configure logging to file and stdout
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler("info.log"), logging.StreamHandler()],
)


base_url = "https://www.immowelt.de/liste/hamburg/wohnungen/mieten"

all_urls_scraper = AllURLs(base_url)
expose_links = all_urls_scraper.scrape_all_links()

data = []
num = 0

try:
    for num, link in enumerate(expose_links, start=1):
        try:
            page = ParsePageJson(link)
            page_data = page.extract_features()
        except Exception as e:
            # Top-level boundary: one broken listing (HTTP error, missing JSON)
            # must not abort the whole run.
            logging.error(f"Skipping {link}: {e}")
        else:
            data.append(page_data)
        logging.info(f"{num}/{len(expose_links)}")
finally:
    # Write whatever was collected, even if the run is interrupted
    # (e.g. KeyboardInterrupt), so a long scrape is never lost entirely.
    with open('data.json', 'w', encoding="utf-8") as f:
        json.dump(data, f)

2024-10-10 18:27:14,827 - INFO - Base_url is opened
2024-10-10 18:27:14,828 - INFO - There are 1697 house listings in the base_url.
2024-10-10 18:27:14,829 - INFO - There are 57 pages of house listings in the base_url.
2024-10-10 18:27:14,829 - INFO - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page
2024-10-10 18:27:14,829 - INFO - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page=2
2024-10-10 18:27:14,830 - INFO - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page=3
2024-10-10 18:27:14,830 - INFO - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page=4
2024-10-10 18:27:14,830 - INFO - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page=5
2024-10-10 18:27:14,831 - INFO - https://ww

KeyboardInterrupt: 

In [3]:
data

[{'id': '24TRNCT4U8W1',
  'url': 'https://www.immowelt.de/expose/55db9a19-9a5e-464f-9a13-9a26dc3fb427',
  'title': '2 Zi.-Whg., 62 qm, DBad, Einbauküche, großer Balkon. Soeben saniert, zahlreiche Geräte/Einrichtungen neu (Dusche, Waschmaschine, Herd, Kühlschrank, Lampen, Fussboden, Rollos).  Möbliert. TV/WLAN incl.. Fahrstuhl, Schwimmbad, Sauna im Gebäude. Tiefgaragenplatz ab Sommer 2025 möglich.<br>Uneinsehbare Endetage mit fantastischem Blick ins Alstertal. Ungestört sonnen, grillen, die Natur genießen. Binnen 1-3 Minuten im Alstertal spazieren oder Fahrrad fahren.<br>Für 1-2 Personen.<br><br>Objektzustand: Erstbezug nach Sanierung',
  'creationDate': '2024-10-08T08:36:03.353Z',
  'updateDate': '2024-10-08T08:54:16.687Z',
  'city': 'Hamburg',
  'zipCode': '22399',
  'street': None,
  'district': 'Poppenbüttel',
  'coordinates': None,
  'locationDescription': 'Alsterkehre (Poppenbüttel). Kein Verkehrslärm. Gute Busanbindung zu AEZ u. S-Bahn Poppenbüttel bzw. U-Bahn Borgweg. Direkt neb

In [None]:
from immowelt_urls import PageURLs
from immowelt_urls import ExposeLinkExtractor
from immowelt_urls import AllURLs

base_url = "https://www.immowelt.de/liste/hamburg/wohnungen/mieten"

# Build the list of result-page URLs only (no listing links yet).
page_scraper = PageURLs(base_url)
urls = page_scraper.get_page_urls()

In [None]:
# NOTE(review): bare expression in the middle of a cell - it has no effect
# here, since a notebook only echoes the last expression of a cell.
urls[0]

# Extract the expose links from the first result page only.
extractor = ExposeLinkExtractor(urls[0])
expose_links = extractor.extract_expose_links()

In [None]:
expose_links

In [None]:
all_urls_scraper = AllURLs(base_url)
# scrape_all_links() as defined in this notebook takes no arguments (passing
# urls[0] raised TypeError); it derives every page URL from base_url itself.
# To scrape a single page, use ExposeLinkExtractor(urls[0]) instead.
expose_links = all_urls_scraper.scrape_all_links()

In [None]:
expose_links

Add details, and save a template. 

In [None]:
url = expose_links[18]

# Bound the wait and fail fast on HTTP errors, as the scraper classes do.
page = requests.get(url, timeout=10)
page.raise_for_status()
elements = BeautifulSoup(page.content, "html.parser")
# The listing payload is read from the <script id="__NEXT_DATA__"> tag.
data = elements.find('script', id='__NEXT_DATA__').text

json_object = json.loads(data)

json_formatted_str = json.dumps(json_object, indent=4)
print(json_formatted_str)

expose_links[18]

In [None]:
url = expose_links[1]

# Bound the wait and fail fast on HTTP errors, as the scraper classes do.
page = requests.get(url, timeout=10)
page.raise_for_status()
elements = BeautifulSoup(page.content, "html.parser")
# The listing payload is read from the <script id="__NEXT_DATA__"> tag.
data = elements.find('script', id='__NEXT_DATA__').text

json_object = json.loads(data)

json_formatted_str = json.dumps(json_object, indent=4)
print(json_formatted_str)

expose_links[1]

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
import json


url = 'https://www.immowelt.de/expose/2dvz55j'

# Bound the wait and fail fast on HTTP errors, as the scraper classes do.
page = requests.get(url, timeout=10)
page.raise_for_status()
elements = BeautifulSoup(page.content, "html.parser")
# The listing payload is read from the <script id="__NEXT_DATA__"> tag.
data = elements.find('script', id='__NEXT_DATA__').text

json_object = json.loads(data)

json_formatted_str = json.dumps(json_object, indent=4)
print(json_formatted_str)


# Status value stored under the listing's metadata.
kk = json_object["props"]["pageProps"]["classified"]["metadata"]["status"]["status"]

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd
import json


url = 'https://www.immowelt.de/expose/2dvz55j'

# Bound the wait and fail fast on HTTP errors, as the scraper classes do.
page = requests.get(url, timeout=10)
page.raise_for_status()
elements = BeautifulSoup(page.content, "html.parser")
# `data` first holds the raw JSON text of the __NEXT_DATA__ script tag ...
data = elements.find('script', id='__NEXT_DATA__').text

json_object = json.loads(data)


# ... and is then reused as the output record.
data = {}

textDetails_dict = {}
num_extracted_features = 0
textDetails = json_object["props"]["pageProps"]["classified"]["sections"]["description"]["texts"]
for textDetail in textDetails:
    # Text blocks without a headline are filed under "Description"; several
    # headline-less blocks therefore overwrite each other.
    headline = textDetail.get("headline", "Description")
    text = textDetail["text"]
    textDetails_dict[headline] = text
    num_extracted_features += 1

data.update(textDetails_dict)

    

https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=Apartment&locations=AD08DE1113&page=2
https://www.immowelt.de/classified-search?distributionTypes=Buy,Buy_Auction,Compulsory_Auction&estateTypes=House&locations=AD08DE7633&page=3

In [None]:
data

In [None]:
location = json_object["props"]["pageProps"]["classified"]["sections"]["location"]
addressPublished = location["isAddressPublished"]

# The original `if` had no body, which is a syntax error.
# NOTE(review): assumed intent - read the coordinates only when the address is
# published and fall back to None otherwise; confirm against real listings.
if addressPublished is True:
    coordinates = location["geometry"]["coordinates"]
else:
    coordinates = None