# Immoscout24.ch Scraper

Script to create a `.csv` file for real estate, that is offered on http://immoscout24.ch

In [1]:
import urllib.request
from random import choice, randint
import time
import json
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [2]:
def urlquery(url):
    # Function cycles randomly through different user agents and time intervals to simulate more natural queries
    try:
        # sleeptime = float(randint(1, 6) / 5) # uncomment for faster Speed, might get rejected
        sleeptime = float(randint(1, 6)) 
        time.sleep(sleeptime)

        agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.33 Mobile/15E148 Safari/604.1',
            'Mozilla/5.0 (iPad; CPU OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.33 Mobile/15E148 Safari/604.1',
            'Mozilla/5.0 (iPod; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/125.0.6422.33 Mobile/15E18 Safari/604.1',
            'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.179 Mobile Safari/537.36'
        ]

        agent = choice(agents)
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', agent)]

        response = opener.open(url)
        html = response.read()
        time.sleep(sleeptime)

        return html.decode('utf-8')

    except Exception as e:
        print(f'Something went wrong with Crawling:\n{e}')
        return None


In [3]:
def immoscout24parser(url):
    ''' Parser holt aus Immoscout24.de Suchergebnisseiten die Immobilien '''
    try:
        html_content = urlquery(url)
        if html_content is None:
            return None
        
        #print(f"HTML content of page {url}:\n{html_content[:500]}...\n")  # Print the first 500 characters of the HTML content
        
        soup = BeautifulSoup(html_content, 'html.parser')
        scripts = soup.findAll('script')
        for script in scripts:
            if 'window.__INITIAL_STATE__' in script.text:
                #print(f"Found script with 'window.__INITIAL_STATE__' keyword:\n{script.text.strip()[:500]}...\n")  # Print the first 500 characters of the script
                json_text = script.string.strip().replace('window.__INITIAL_STATE__=', '', 1).rstrip(';')
                try:
                    initial_state = json.loads(json_text)
                    resultlist_json = initial_state['resultList']['search']['fullSearch']['result']
                    #print(f"Extracted JSON: {resultlist_json}")
                    return resultlist_json
                except json.JSONDecodeError as json_err:
                    print(f"JSON decoding error: {json_err}")
                except KeyError as key_err:
                    print(f"Key error: {key_err}")
                except Exception as e:
                    print(f"Error parsing JSON: {e}")

    except Exception as e:
        print("Fehler in immoscout24 parser: %s" % e)
    return None

In [4]:
immos = {}
previous_len = 0 

# Define parameters
# s = 'kanton-st-gallen' # City, Kanton 
s = 'land-schweiz-fl'
k = 'Wohnung' # Flat or house ('Wohnung' or 'Haus')
w = 'mieten' # Rent or buy ('mieten' or 'buy')

max_ads = 1000  # Set limit max adverts (usually only 1000 are possible due to restriction to page 50)
page = 0
pageCount = None

print('Suche %s / %s' % (k, w))

while True:
    page += 1
    # print(f"Accessing page {page}")
    url = f'https://www.immoscout24.ch/de/immobilien/{w}/{s}?pn={page}'
    
    resultlist_json = None
    while resultlist_json is None:
        try:
            resultlist_json = immoscout24parser(url)
            if resultlist_json is None:
                print(f"Failed to retrieve or parse page {page}.")
                break
        except Exception as e:
            print(f"Request failed: {e}")
            break

    if resultlist_json:
        # Extract page count and current page number if available
        if 'numberOfPages' in resultlist_json and 'pageNumber' in resultlist_json:
            pageCount = int(resultlist_json['numberOfPages'])
            print(f"Number of pages: {pageCount}")
        else:
            pass
            # print("Page count information not available on this page.")
        
        # Process the data
        for resultlistEntry in resultlist_json.get('listings', []):
            listing = resultlistEntry['listing']
            
            realEstate = {}

            realEstate['type'] = listing.get('categories', None)

            realEstate['address'] = listing['address'].get('street', None)
            realEstate['city'] = listing['address']['locality']
            realEstate['postcode'] = listing['address']['postalCode']
        
            try:
                realEstate['lat'] = listing['address']['geoCoordinates']['latitude']
                realEstate['lon'] = listing['address']['geoCoordinates']['longitude']
            except:
                realEstate['lat'] = None
                realEstate['lon'] = None
            
            # Specifics for rent
            if w == 'mieten':
                if 'prices' in listing and 'rent' in listing['prices']:
                    realEstate['rentNet'] = listing['prices']['rent'].get('net', None)
                    realEstate['rentGross'] = listing['prices']['rent'].get('gross', None)
                    realEstate['currency'] = listing['prices'].get('currency', None)
                else:
                    print(f'{page} has no price')
                    realEstate['rentNet'] = None
                    realEstate['rentGross'] = None
                    realEstate['currency'] = None
            
                if 'characteristics' in listing:
                    realEstate['livingSpace'] = listing['characteristics'].get('livingSpace', None)
                    realEstate['arePetsAllowed'] = listing['characteristics'].get('arePetsAllowed', None)
                    realEstate['hasFlatSharingCommunity'] = listing['characteristics'].get('hasFlatSharingCommunity', None)
                    realEstate['isUnderRoof'] = listing['characteristics'].get('isUnderRoof', None)
                    realEstate['CHF/m2'] = realEstate['rentGross'] / realEstate['livingSpace'] if realEstate['rentGross'] is not None and realEstate['livingSpace'] is not None else None
                else:
                    realEstate['livingSpace'] = None
                    realEstate['arePetsAllowed'] = None
                    realEstate['hasFlatSharingCommunity'] = None
                    realEstate['isUnderRoof'] = None
                    realEstate['CHF/m2'] = None    
                
                
            # Specifics for buy
            if w == 'kaufen':
                if 'prices' in listing and 'buy' in listing['prices']:
                    realEstate['buyPrice'] = listing['prices']['buy'].get('price', None)
                    realEstate['currency'] = listing['prices'].get('currency', None)
                else:
                    print(f'{page} has no price')
                    realEstate['buyPrice'] = None
                    realEstate['currency'] = None
                realEstate['livingSpace'] = listing['characteristics'].get('livingSpace', None)
                realEstate['CHF/m2'] = realEstate['buyPrice'] / realEstate['livingSpace'] if realEstate['buyPrice'] is not None and realEstate['livingSpace'] is not None else None
                
                if 'characteristics' in listing:
                    realEstate['livingSpace'] = listing['characteristics'].get('livingSpace', None)
                    realEstate['CHF/m2'] = realEstate['buyPrice'] / realEstate['livingSpace'] if realEstate['buyPrice'] is not None and realEstate['livingSpace'] is not None else None
                else:
                    realEstate['livingSpace'] = None
                    realEstate['CHF/m2'] = None
                    
                    
            # Characteristics
            realEstate['title'] = listing['localization']['de']['text']['title']
            realEstate['categories'] = listing.get('categories', [])
            
            if 'characteristics' in listing:
                realEstate['numberOfRooms'] = listing['characteristics'].get('numberOfRooms', None)
                realEstate['floor'] = listing['characteristics'].get('floor', None)
                realEstate['yearBuilt'] = listing['characteristics'].get('yearBuilt', None)
                realEstate['isNewBuilding'] = listing['characteristics'].get('isNewBuilding', None)
                realEstate['yearLastRenovated'] = listing['characteristics'].get('yearLastRenovated', None)
            
                # Parking
                realEstate['hasGarage'] = listing['characteristics'].get('hasGarage', None)
                realEstate['hasParking'] = listing['characteristics'].get('hasParking', None)     

                # Barriers
                realEstate['hasRamp'] = listing['characteristics'].get('hasRamp', None)
                realEstate['isWheelchairAccessible'] = listing['characteristics'].get('isWheelchairAccessible', None)
            
                # Other
                realEstate['isOldBuilding'] = listing['characteristics'].get('isOldBuilding', None)
                realEstate['hasSwimmingPool'] = listing['characteristics'].get('hasSwimmingPool', None)
                realEstate['hasFireplace'] = listing['characteristics'].get('hasFireplace', None)   
                realEstate['isCornerHouse'] = listing['characteristics'].get('isCornerHouse', None)
                realEstate['isMiddleHouse'] = listing['characteristics'].get('isMiddleHouse', None)
                realEstate['hasGardenShed'] = listing['characteristics'].get('hasGardenShed', None)
                realEstate['hasLiftingPlatform'] = listing['characteristics'].get('hasLiftingPlatform', None)
                realEstate['hasBalcony'] = listing['characteristics'].get('hasBalcony', None)
                realEstate['hasCableTv'] = listing['characteristics'].get('hasCableTv', None)
                realEstate['isGroundFloorRaised'] = listing['characteristics'].get('isGroundFloorRaised', None)
                realEstate['hasNiceView'] = listing['characteristics'].get('hasNiceView', None)
                realEstate['hasConnectedBuildingLand'] = listing['characteristics'].get('hasConnectedBuildingLand', None)
                realEstate['hasElevator'] = listing['characteristics'].get('hasElevator', None)
                realEstate['isChildFriendly'] = listing['characteristics'].get('isChildFriendly', None)
            
                # Range
                realEstate['distanceHighSchool'] = listing['characteristics'].get('distanceHighSchool', None)
                realEstate['distanceKindergarten'] = listing['characteristics'].get('distanceKindergarten', None)
                realEstate['distancePrimarySchool'] = listing['characteristics'].get('distancePrimarySchool', None)
                realEstate['distanceMotorway'] = listing['characteristics'].get('distanceMotorway', None)
                realEstate['distancePublicTransport'] = listing['characteristics'].get('distancePublicTransport', None)
                  
            else:
                print(f'{page} has no characteristics')
                realEstate['numberOfRooms'] = None
                realEstate['floor'] = None
                realEstate['yearBuilt'] = None
                realEstate['isNewBuilding'] = None
                realEstate['yearLastRenovated'] = None
            
                # Parking
                realEstate['hasGarage'] = None
                realEstate['hasParking'] = None   

                # Barriers
                realEstate['hasRamp'] = None
                realEstate['isWheelchairAccessible'] = None
            
                # Other
                realEstate['isOldBuilding'] = None
                realEstate['hasSwimmingPool'] = None
                realEstate['hasFireplace'] = None  
                realEstate['isCornerHouse'] = None
                realEstate['isMiddleHouse'] = None
                realEstate['hasGardenShed'] = None
                realEstate['hasLiftingPlatform'] = None
                realEstate['hasBalcony'] = None
                realEstate['hasCableTv'] = None
                realEstate['isGroundFloorRaised'] = None
                realEstate['hasNiceView'] = None
                realEstate['hasConnectedBuildingLand'] = None
                realEstate['hasElevator'] = None
                realEstate['isChildFriendly'] = None
            
                # Range
                realEstate['distanceHighSchool'] = None
                realEstate['distanceKindergarten'] = None
                realEstate['distancePrimarySchool'] = None
                realEstate['distanceMotorway'] = None
                realEstate['distancePublicTransport'] = None
                  
                
                
            if k == 'Wohnung':
                if 'characteristics' in listing:
                    realEstate['balcony'] = listing['characteristics'].get('balcony', None)
                    realEstate['builtInKitchen'] = listing['characteristics'].get('builtInKitchen', None)
                    realEstate['garden'] = listing['characteristics'].get('garden', None)
                else:
                    realEstate['balcony'] = None
                    realEstate['builtInKitchen'] = None
                    realEstate['garden'] = None
                realEstate['offerType'] = listing.get('offerType', None)
                
            elif k == 'Haus':
                realEstate['plotArea'] = realEstate['listing'].get('plotArea', None)
                realEstate['offerType'] = listing.get('offerType', None)
                realEstate['energyPerformanceCertificate'] = realEstate['listing'].get('energyPerformanceCertificate', None)
                if 'characteristics' in listings:
                    realEstate['hasBuildingLawRestrictions'] = listing['characteristics'].get('hasBuildingLawRestrictions', None)
                else:
                    realEstate['hasBuildingLawRestrictions'] = None
                    
                # Power
                if 'characteristics' in listing:
                    realEstate['hasPowerSupply'] = listing['characteristics'].get('hasPowerSupply', None)
                    realEstate['isMinergieGeneral'] = listing['characteristics'].get('isMinergieGeneral', None)
                    realEstate['isMinergieCertified'] = listing['characteristics'].get('isMinergieCertified', None)
                    realEstate['hasWaterSupply'] = listing['characteristics'].get('hasWaterSupply', None)
                    realEstate['hasGasSupply'] = listing['characteristics'].get('hasGasSupply', None)
                    realEstate['hasSewageSupply'] = listing['characteristics'].get('hasSewageSupply', None)
                else:
                    realEstate['hasPowerSupply'] = None
                    realEstate['isMinergieGeneral'] = None
                    realEstate['isMinergieCertified'] = None
                    realEstate['hasWaterSupply'] = None
                    realEstate['hasGasSupply'] = None
                    realEstate['hasSewageSupply'] = None
            
            realEstate['ID'] = listing['id']
            if 'legacy' in listing:
                realEstate['personId'] = listing['legacy'].get('personId', None)
            else:
                print(f'{page} has no personId')
                realEstate['personId'] = None
            
            immos[realEstate['ID']] = realEstate

        
        print('Scraping Page %i (%i Immobilien %s zum %s gefunden)' % (page, len(immos), k, w))
        
        
        
        # Check if we have reached the last page
        if pageCount and page >= pageCount or page >= max_ads:
            print("Reached the last page.")
            break
        
        if len(immos) >= max_ads:
            print("Reached the max adverts.")
            break
        
        current_len = len(immos)
        if current_len == previous_len:
            print("No new listings found, stopping the loop.")
            break
        previous_len = current_len
    
    else:
        print("Failed to retrieve or parse page data.")
        break


Suche Wohnung / mieten
Scraping Page 1 (20 Immobilien Wohnung zum mieten gefunden)
Scraping Page 2 (40 Immobilien Wohnung zum mieten gefunden)
3 has no price
Scraping Page 3 (60 Immobilien Wohnung zum mieten gefunden)
Scraping Page 4 (80 Immobilien Wohnung zum mieten gefunden)
Scraping Page 5 (100 Immobilien Wohnung zum mieten gefunden)
Scraping Page 6 (120 Immobilien Wohnung zum mieten gefunden)
Scraping Page 7 (140 Immobilien Wohnung zum mieten gefunden)
Scraping Page 8 (160 Immobilien Wohnung zum mieten gefunden)
Scraping Page 9 (180 Immobilien Wohnung zum mieten gefunden)
Scraping Page 10 (200 Immobilien Wohnung zum mieten gefunden)
Scraping Page 11 (220 Immobilien Wohnung zum mieten gefunden)
12 has no price
12 has no price
Scraping Page 12 (240 Immobilien Wohnung zum mieten gefunden)
13 has no price
Scraping Page 13 (260 Immobilien Wohnung zum mieten gefunden)
Scraping Page 14 (280 Immobilien Wohnung zum mieten gefunden)
Scraping Page 15 (300 Immobilien Wohnung zum mieten gefunde

In [5]:
df = pd.DataFrame(immos).T
df.index.name = 'ID'
df.drop(columns=['ID'], inplace=True)
df.livingSpace[df.livingSpace==0] = None
kanton = s.split('-')[1]
df['region'] = kanton
print("Scraped %i Immos" % len(immos))

Scraped 1000 Immos


In [None]:
# Select your desired order
desired_order = [
    'ID', 'personId', 'title', 'offerType', 'type',
    'address', 'city', 'postcode', 'quarter', 'region', 'lat', 'lon',
    'categories', 'numberOfRooms',
    'livingSpace', 'floor', 'yearBuilt', 'yearLastRenovated', 'isNewBuilding',
    'isOldBuilding', 'isUnderRoof', 'rentNet', 'rentGross', 'currency', 'CHF/m2',
    'arePetsAllowed', 'hasFlatSharingCommunity', 'hasGarage', 'hasParking', 'hasRamp',
    'isWheelchairAccessible', 'hasSwimmingPool', 'hasFireplace', 'isCornerHouse',
    'isMiddleHouse', 'hasGardenShed', 'hasLiftingPlatform', 'hasBalcony', 'hasCableTv',
    'isGroundFloorRaised', 'hasNiceView', 'hasConnectedBuildingLand', 'hasElevator',
    'isChildFriendly', 'balcony', 'builtInKitchen', 'garden',
    'distanceHighSchool', 'distanceKindergarten', 'distancePrimarySchool',
    'distanceMotorway', 'distancePublicTransport'
]

df = df[desired_order]

In [6]:
def save_to_csv(df, s, k, w):
  
    # Translation of options
    transaction_dict = {'mieten': 'rent', 'kaufen': 'buy'}
    property_dict = {'Wohnung': 'apartment', 'Haus': 'house'}

    # Translate
    transaction = transaction_dict.get(w, 'unknown')
    property = property_dict.get(k, 'unknown')

    # Get current date and time
    date_time = datetime.now().strftime("%Y%m%d")
    
    s = s.split('-')[1]
    
    # Create filename
    filename = f'{transaction}_{s}_{property.lower()}_{date_time}.csv'

    # Save DataFrame to CSV
    df.to_csv(filename, index=True)
    
    print(f'File saved as: {filename}')


# Call the function
save_to_csv(df, s, k, w)

File saved as: rent_land-schweiz-fl_apartment_20240522.csv


In [7]:
df.columns

Index(['type', 'address', 'city', 'postcode', 'lat', 'lon', 'rentNet',
       'rentGross', 'currency', 'livingSpace', 'arePetsAllowed',
       'hasFlatSharingCommunity', 'isUnderRoof', 'CHF/m2', 'title',
       'categories', 'numberOfRooms', 'floor', 'yearBuilt', 'isNewBuilding',
       'yearLastRenovated', 'hasGarage', 'hasParking', 'hasRamp',
       'isWheelchairAccessible', 'isOldBuilding', 'hasSwimmingPool',
       'hasFireplace', 'isCornerHouse', 'isMiddleHouse', 'hasGardenShed',
       'hasLiftingPlatform', 'hasBalcony', 'hasCableTv', 'isGroundFloorRaised',
       'hasNiceView', 'hasConnectedBuildingLand', 'hasElevator',
       'isChildFriendly', 'distanceHighSchool', 'distanceKindergarten',
       'distancePrimarySchool', 'distanceMotorway', 'distancePublicTransport',
       'balcony', 'builtInKitchen', 'garden', 'offerType', 'personId',
       'region'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0_level_0,type,address,city,postcode,lat,lon,rentNet,rentGross,currency,livingSpace,...,distanceKindergarten,distancePrimarySchool,distanceMotorway,distancePublicTransport,balcony,builtInKitchen,garden,offerType,personId,region
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4000906965,"[HOUSE, SINGLE_HOUSE]",Alterstrasse 4,Filzbach,8757,47.120462,9.131932,1920,1980,CHF,82,...,,,,,,,,RENT,2268787,schweiz
4000997183,"[APARTMENT, STUDIO]",Hauptstr. 30 (Nord),Döttingen,5312,47.570972,8.256332,750,915,CHF,45,...,700.0,700.0,,170.0,,,,RENT,679,schweiz
4001099145,"[APARTMENT, FLAT]",Falmenstrasse 2d,Uster,8610,47.352412,8.718372,1970,2270,CHF,88,...,,,,,,,,RENT,679,schweiz
4001028409,"[APARTMENT, DUPLEX]",Am Mattenhof 2b,Kriens,6010,47.028212,8.301092,2150,2490,CHF,117,...,,,,,,,,RENT,679,schweiz
4000888461,[APARTMENT],Dorfstrasse 31,Benzenschwil,5636,47.247752,8.365612,2015,2300,CHF,94,...,,,,,,,,RENT,1743533,schweiz
