In [0]:
import json
import random
import time
import urllib.request as urllib2
from random import choice

from bs4 import BeautifulSoup

In [0]:
# urlquery from Achim Tack. Thank you!
# https://github.com/ATack/GoogleTrafficParser/blob/master/google_traffic_parser.py
def urlquery(url, timeout=30):
    """Download ``url`` and return the raw response body.

    To make the requests look less mechanical, every call pauses for a random
    interval (0.2 s to 1.2 s) before and again after the request, and sends a
    ``User-agent`` header picked at random from a fixed list.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float, optional
        Seconds to wait on blocking network operations before the request is
        abandoned (default 30).

    Returns
    -------
    bytes or None
        The response body, or ``None`` if anything went wrong. Failures are
        reported on stdout instead of being raised, so callers must check for
        ``None``.
    """
    agents = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',
              'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
              'Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02',
              'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
              'Mozilla/3.0',
              'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3',
              'Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522+ (KHTML, like Gecko) Safari/419.3',
              'Opera/9.00 (Windows NT 5.1; U; en)']

    try:
        # The same randomly drawn pause is applied before and after the request.
        sleeptime = float(random.randint(1, 6)) / 5
        time.sleep(sleeptime)

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', choice(agents))]

        # The context manager closes the response even if read() fails.
        with opener.open(url, timeout=timeout) as response:
            html = response.read()

        time.sleep(sleeptime)
        return html

    except Exception as e:
        # Best effort by design: report the problem and signal failure with None.
        print('Something went wrong with Crawling:\n%s' % e)
        return None

In [0]:
def immoscout24parser(url):
    """Extract the result list from an Immoscout24.de search result page.

    The page is downloaded with ``urlquery`` and searched for the inline
    ``<script>`` that mentions ``IS24.resultList``. Inside that script, the
    line starting with ``resultListModel`` carries a JSON object; the value
    stored under ``searchResponseModel`` -> ``resultlist.resultlist`` is
    returned.

    Parameters
    ----------
    url : str
        Address of a search result page.

    Returns
    -------
    dict or None
        The decoded ``resultlist.resultlist`` object, or ``None`` when the
        page could not be fetched, the marker line was not found, or parsing
        failed. Errors are printed rather than raised.
    """
    marker = 'resultListModel'

    try:
        soup = BeautifulSoup(urlquery(url), 'html.parser')

        for script in soup.findAll('script'):
            script_source = script.string
            # script.string is None for empty or multi-node tags; skip those
            # instead of aborting the whole parse.
            if not script_source or 'IS24.resultList' not in script_source:
                continue

            for line in script_source.split('\n'):
                line = line.strip()
                if not line.startswith(marker):
                    continue

                # Take everything after the first ':' as the JSON payload.
                # str.strip('resultListModel: ') would treat its argument as a
                # set of characters, not as a prefix, so it is avoided here.
                _, _, payload = line.partition(':')
                # Drop the comma that separates this property from the next
                # one, if there is one.
                payload = payload.strip().rstrip(',')

                immo_json = json.loads(payload)
                searchResponseModel = immo_json[u'searchResponseModel']
                return searchResponseModel[u'resultlist.resultlist']

    except Exception as e:
        print("Fehler in immoscout24 parser: %s" % e)

    return None

In [0]:
# Collected listings, keyed by listing ID (duplicates across pages collapse).
immos = {}

b = 'Nordrhein-Westfalen' # Bundesland (federal state)
s = 'Wuppertal' # Stadt (city)
k = 'Wohnung' # 'Wohnung' (flat) or 'Haus' (house)
w = 'Miete' # 'Miete' (rent) or 'Kauf' (buy)

# Upper bound on fetch attempts per result page, so that a permanently failing
# page cannot keep the cell spinning forever.
MAX_RETRIES = 10

# Listing attributes that are only copied for one property type.
TYPE_SPECIFIC_KEYS = {
    'Wohnung': ('balcony', 'builtInKitchen', 'garden'),
    'Haus': ('isBarrierFree', 'cellar', 'plotArea'),
}

page = 0
print('Suche %s / %s' % (k, w))

while True:
    page += 1
    url = 'http://www.immobilienscout24.de/Suche/S-T/P-%s/%s-%s/%s/%s?pagerReporting=true' % (page, k, w, b, s)

    # Timeouts and server-side errors happen, so each page is retried a
    # bounded number of times. A response only counts as successful when the
    # paging information could be read from it.
    resultlist_json = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            candidate = immoscout24parser(url)
            numberOfPages = int(candidate[u'paging'][u'numberOfPages'])
        except Exception as e:
            # `except Exception` (not a bare except) keeps Ctrl-C working.
            print('Retry %i/%i for page %i: %s' % (attempt, MAX_RETRIES, page, e))
            continue
        resultlist_json = candidate
        break

    if resultlist_json is None:
        print('Giving up on page %i after %i failed attempts; keeping the %i listings scraped so far'
              % (page, MAX_RETRIES, len(immos)))
        break

    # Covers an empty search (numberOfPages == 0).
    if page > numberOfPages:
        break

    # Get the data
    for resultlistEntry in resultlist_json['resultlistEntries'][0][u'resultlistEntry']:
        realEstate_json = resultlistEntry[u'resultlist.realEstate']
        address_json = realEstate_json['address']

        realEstate = {}

        realEstate[u'Miete/Kauf'] = w
        realEstate[u'Haus/Wohnung'] = k

        realEstate['address'] = address_json['description']['text']
        realEstate['city'] = address_json['city']
        realEstate['postcode'] = address_json['postcode']
        realEstate['quarter'] = address_json['quarter']
        try:
            realEstate['lat'] = address_json[u'wgs84Coordinate']['latitude']
            realEstate['lon'] = address_json[u'wgs84Coordinate']['longitude']
        except (KeyError, TypeError):
            # Some listings carry no coordinates; record them as missing.
            realEstate['lat'] = None
            realEstate['lon'] = None

        realEstate['title'] = realEstate_json['title']

        realEstate['numberOfRooms'] = realEstate_json['numberOfRooms']
        realEstate['livingSpace'] = realEstate_json['livingSpace']

        # Price and offer type are only copied for the two known property
        # types, together with their type-specific attributes.
        if k in TYPE_SPECIFIC_KEYS:
            for key in TYPE_SPECIFIC_KEYS[k]:
                realEstate[key] = realEstate_json[key]
            realEstate['price'] = realEstate_json['price']['value']
            realEstate['privateOffer'] = realEstate_json['privateOffer']

        realEstate['floorplan'] = realEstate_json['floorplan']
        realEstate['from'] = realEstate_json['companyWideCustomerId']
        realEstate['ID'] = realEstate_json[u'@id']
        realEstate['url'] = u'https://www.immobilienscout24.de/expose/%s' % realEstate['ID']

        immos[realEstate['ID']] = realEstate

    print('Scrape Page %i/%i (%i Immobilien %s %s gefunden)' % (page, numberOfPages, len(immos), k, w))

    # Stop right after the last page instead of requesting one page too many.
    if page >= numberOfPages:
        break

Suche Wohnung / Miete
Scrape Page 1/42 (20 Immobilien Wohnung Miete gefunden)
Scrape Page 2/42 (40 Immobilien Wohnung Miete gefunden)
Scrape Page 3/42 (60 Immobilien Wohnung Miete gefunden)
Scrape Page 4/42 (80 Immobilien Wohnung Miete gefunden)
Scrape Page 5/42 (100 Immobilien Wohnung Miete gefunden)
Scrape Page 6/42 (120 Immobilien Wohnung Miete gefunden)
Scrape Page 7/42 (140 Immobilien Wohnung Miete gefunden)
Scrape Page 8/42 (160 Immobilien Wohnung Miete gefunden)
Scrape Page 9/42 (180 Immobilien Wohnung Miete gefunden)
Scrape Page 10/42 (200 Immobilien Wohnung Miete gefunden)
Scrape Page 11/42 (220 Immobilien Wohnung Miete gefunden)
Scrape Page 12/42 (240 Immobilien Wohnung Miete gefunden)
Scrape Page 13/42 (260 Immobilien Wohnung Miete gefunden)
Scrape Page 14/42 (280 Immobilien Wohnung Miete gefunden)
Scrape Page 15/42 (300 Immobilien Wohnung Miete gefunden)
Scrape Page 16/42 (320 Immobilien Wohnung Miete gefunden)
Scrape Page 17/42 (340 Immobilien Wohnung Miete gefunden)
Scrap

In [0]:
# Report how many distinct listings ended up in the result dictionary.
immo_count = len(immos)
print("Scraped %i Immos" % immo_count)

Scraped 836 Immos


In [0]:
from datetime import datetime

# Minute-resolution timestamp of the current local time, used to label the
# export files.
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')

In [0]:
import pandas as pd

# `immos` maps listing ID -> attribute dict, so the raw frame has one column
# per listing; transposing gives one row per listing, indexed by ID.
df = pd.DataFrame(immos).transpose()
df.index.name = 'ID'
df.head()

Unnamed: 0_level_0,Haus/Wohnung,ID,Miete/Kauf,address,balcony,builtInKitchen,city,floorplan,from,garden,lat,livingSpace,lon,numberOfRooms,postcode,price,privateOffer,quarter,title,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
107576336,Wohnung,107576336,Miete,"Bramdelle 35, Langerfeld-Beyenburg, Wuppertal",False,False,Wuppertal,False,1.947582,False,51.2881,44.62,7.2417,2,42279,334.65,False,Langerfeld-Beyenburg,ERSTBEZUG nach Renovierung**helle 2-Zimmerwohn...,https://www.immobilienscout24.de/expose/107576336
110256105,Wohnung,110256105,Miete,"Hamburger Straße 24, Uellendahl-Katernberg, Wu...",False,False,Wuppertal,True,1.947582,True,51.2697,79.8,7.14778,2,42109,598.5,False,Uellendahl-Katernberg,Großzügige 2-Zimmer DG-Wohnung**Schöner denkma...,https://www.immobilienscout24.de/expose/110256105
110293125,Wohnung,110293125,Miete,"Jesinghauser Str. 83, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2826,55.0,7.2524,3,42389,323.0,False,Langerfeld-Beyenburg,Ihr neues Wohlfühlheim! **mit Tageslichtbad**,https://www.immobilienscout24.de/expose/110293125
110293111,Wohnung,110293111,Miete,"Jesinghauser Str. 79, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2825,55.0,7.25189,3,42389,324.0,False,Langerfeld-Beyenburg,Einziehen! Wohlfühlen! Fertig! **mit Tageslich...,https://www.immobilienscout24.de/expose/110293111
110613634,Wohnung,110613634,Miete,"Jesinghauser Str. 79, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2825,56.0,7.25189,3,42389,330.0,False,Langerfeld-Beyenburg,Einziehen! Wohlfühlem! Fertig! ** mit Tageslic...,https://www.immobilienscout24.de/expose/110613634


In [0]:
# Export the listings that match the current search to a timestamped CSV file
# whose first line is a '#' comment describing the content.
csv_path = '%s-%s-%s.csv' % (timestamp, k, w)
selection = df[(df['Haus/Wohnung'] == k) & (df['Miete/Kauf'] == w)]

# The handle is opened as UTF-8 explicitly: to_csv's `encoding` argument does
# not apply to an already-open text handle, so the platform default would
# otherwise decide how umlauts are written. newline='' lets the CSV writer
# control line endings, and `with` closes the file even if the export fails.
with open(csv_path, 'w', encoding='utf-8', newline='') as f:
    f.write('# %s %s from immoscout24.de on %s\n' % (k, w, timestamp))
    selection.to_csv(f)

In [0]:
# Write the complete frame to an Excel workbook named after the timestamp and
# the search parameters.
excel_path = '%s-%s-%s.xlsx' % (timestamp, k, w)
df.to_excel(excel_path)

In [0]:
# Show the first five rows of the frame.
df.head(n=5)

Unnamed: 0_level_0,Haus/Wohnung,ID,Miete/Kauf,address,balcony,builtInKitchen,city,floorplan,from,garden,lat,livingSpace,lon,numberOfRooms,postcode,price,privateOffer,quarter,title,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
107576336,Wohnung,107576336,Miete,"Bramdelle 35, Langerfeld-Beyenburg, Wuppertal",False,False,Wuppertal,False,1.947582,False,51.2881,44.62,7.2417,2,42279,334.65,False,Langerfeld-Beyenburg,ERSTBEZUG nach Renovierung**helle 2-Zimmerwohn...,https://www.immobilienscout24.de/expose/107576336
110256105,Wohnung,110256105,Miete,"Hamburger Straße 24, Uellendahl-Katernberg, Wu...",False,False,Wuppertal,True,1.947582,True,51.2697,79.8,7.14778,2,42109,598.5,False,Uellendahl-Katernberg,Großzügige 2-Zimmer DG-Wohnung**Schöner denkma...,https://www.immobilienscout24.de/expose/110256105
110293125,Wohnung,110293125,Miete,"Jesinghauser Str. 83, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2826,55.0,7.2524,3,42389,323.0,False,Langerfeld-Beyenburg,Ihr neues Wohlfühlheim! **mit Tageslichtbad**,https://www.immobilienscout24.de/expose/110293125
110293111,Wohnung,110293111,Miete,"Jesinghauser Str. 79, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2825,55.0,7.25189,3,42389,324.0,False,Langerfeld-Beyenburg,Einziehen! Wohlfühlen! Fertig! **mit Tageslich...,https://www.immobilienscout24.de/expose/110293111
110613634,Wohnung,110613634,Miete,"Jesinghauser Str. 79, Langerfeld-Beyenburg, Wu...",False,False,Wuppertal,False,1.1924949,False,51.2825,56.0,7.25189,3,42389,330.0,False,Langerfeld-Beyenburg,Einziehen! Wohlfühlem! Fertig! ** mit Tageslic...,https://www.immobilienscout24.de/expose/110613634


In [0]:
# Give the columns used in the analysis shorter, capitalized names.
# The frame is rebound instead of being renamed with inplace=True, so the
# statement yields a new object and can be chained or re-run safely.
column_renames = {
    'lat': 'Latitude',
    'lon': 'Longitude',
    'numberOfRooms': 'Rooms',
    'postcode': 'PostalCode',
    'quarter': 'Neighborhood',
    'livingSpace': 'Space',
    'builtInKitchen': 'Kitchen',
    'garden': 'Garden',
    'price': 'Price',
}
df = df.rename(columns=column_renames)
df.tail()

Unnamed: 0_level_0,Haus/Wohnung,ID,Miete/Kauf,address,balcony,Kitchen,city,floorplan,from,Garden,Latitude,Space,Longitude,Rooms,PostalCode,Price,privateOffer,Neighborhood,title,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
51813825,Wohnung,51813825,Miete,"Grünewalder Berg 20, Elberfeld, Wuppertal",False,False,Wuppertal,False,1.2649978,False,51.2572,73.0,7.13775,3,42105,500.0,True,Elberfeld,"Schöne, helle Dreizimmerwohnung im Luisenviert...",https://www.immobilienscout24.de/expose/51813825
50764285,Wohnung,50764285,Miete,"Sonnenstr. 5, Oberbarmen, Wuppertal",True,False,Wuppertal,False,1.596601,True,51.277,70.0,7.22659,2,42277,455.0,True,Oberbarmen,Wohnen im Grünen und doch citynah in Wuppertal...,https://www.immobilienscout24.de/expose/50764285
40457817,Wohnung,40457817,Miete,"Hahnerberger Straße 109+111, Cronenberg, Wuppe...",False,True,Wuppertal,True,1.1071159,True,51.2242,22.42,7.15206,1,42349,120.17,False,Cronenberg,Studenten-Appartements in Uni-Nähe (Muster-App...,https://www.immobilienscout24.de/expose/40457817
35761307,Wohnung,35761307,Miete,"Rolandstr. 10, Elberfeld, Wuppertal",False,False,Wuppertal,False,1.553128,False,51.2603,72.0,7.14117,2,42105,465.0,True,Elberfeld,hmmm-WHIRLPOOL-2 Zimmer G-WC-Parkett-Citylage,https://www.immobilienscout24.de/expose/35761307
30826857,Wohnung,30826857,Miete,"Hammer Weg 8, Cronenberg, Wuppertal",False,True,Wuppertal,False,1.328073,False,51.2165,24.88,7.15112,1,42349,145.77,True,Cronenberg,"Studenten-Appartement in Uni Nähe, WBS erforde...",https://www.immobilienscout24.de/expose/30826857


In [0]:
# Several columns are not needed for the analysis, so they are left out of
# the working copy.
unused_columns = [
    'Haus/Wohnung', 'Garden', 'ID', 'balcony', 'floorplan', 'from',
    'privateOffer', 'title', 'city', 'Miete/Kauf', 'PostalCode', 'address',
]
df_apartment = df.drop(columns=unused_columns)
df_apartment.head()

Unnamed: 0_level_0,Kitchen,Latitude,Space,Longitude,Rooms,Price,Neighborhood,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
107576336,False,51.2881,44.62,7.2417,2,334.65,Langerfeld-Beyenburg,https://www.immobilienscout24.de/expose/107576336
110256105,False,51.2697,79.8,7.14778,2,598.5,Uellendahl-Katernberg,https://www.immobilienscout24.de/expose/110256105
110293125,False,51.2826,55.0,7.2524,3,323.0,Langerfeld-Beyenburg,https://www.immobilienscout24.de/expose/110293125
110293111,False,51.2825,55.0,7.25189,3,324.0,Langerfeld-Beyenburg,https://www.immobilienscout24.de/expose/110293111
110613634,False,51.2825,56.0,7.25189,3,330.0,Langerfeld-Beyenburg,https://www.immobilienscout24.de/expose/110613634


In [0]:
# Put the columns into an order that is easier to read: location first, then
# price and size, then the remaining attributes.
column_order = ['Neighborhood', 'Latitude', 'Longitude', 'Price', 'Rooms', 'Space', 'Kitchen', 'url']
df_apartment = df_apartment.loc[:, column_order]
df_apartment.head()

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Price,Rooms,Space,Kitchen,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
107576336,Langerfeld-Beyenburg,51.2881,7.2417,334.65,2,44.62,False,https://www.immobilienscout24.de/expose/107576336
110256105,Uellendahl-Katernberg,51.2697,7.14778,598.5,2,79.8,False,https://www.immobilienscout24.de/expose/110256105
110293125,Langerfeld-Beyenburg,51.2826,7.2524,323.0,3,55.0,False,https://www.immobilienscout24.de/expose/110293125
110293111,Langerfeld-Beyenburg,51.2825,7.25189,324.0,3,55.0,False,https://www.immobilienscout24.de/expose/110293111
110613634,Langerfeld-Beyenburg,51.2825,7.25189,330.0,3,56.0,False,https://www.immobilienscout24.de/expose/110613634


In [0]:
# Discard every row that has a missing value in any of the remaining columns.
df_apartment = df_apartment.dropna(axis=0, how='any')
df_apartment.head()

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Price,Rooms,Space,Kitchen,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
107576336,Langerfeld-Beyenburg,51.2881,7.2417,334.65,2,44.62,False,https://www.immobilienscout24.de/expose/107576336
110256105,Uellendahl-Katernberg,51.2697,7.14778,598.5,2,79.8,False,https://www.immobilienscout24.de/expose/110256105
110293125,Langerfeld-Beyenburg,51.2826,7.2524,323.0,3,55.0,False,https://www.immobilienscout24.de/expose/110293125
110293111,Langerfeld-Beyenburg,51.2825,7.25189,324.0,3,55.0,False,https://www.immobilienscout24.de/expose/110293111
110613634,Langerfeld-Beyenburg,51.2825,7.25189,330.0,3,56.0,False,https://www.immobilienscout24.de/expose/110613634


In [0]:
# Keep only flats that cost less than MAX_PRICE euros and have a built-in
# kitchen.
MAX_PRICE = 500

# Compare on a numeric view of the price so that values stored as strings do
# not break the comparison; anything unparseable becomes NaN and is excluded.
price = pd.to_numeric(df_apartment.Price, errors='coerce')

# NOTE(review): the original compared against the string 'true', which
# suggests the scraped flag arrives as text - confirm. Normalising to a
# lower-case string accepts both 'true' and a real boolean True.
has_kitchen = df_apartment.Kitchen.astype(str).str.lower() == 'true'

df_apartment = df_apartment[(price < MAX_PRICE) & has_kitchen]
df_apartment.head()

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,Price,Rooms,Space,Kitchen,url
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
110429907,Barmen,51.2632,7.19012,331,2,48,True,https://www.immobilienscout24.de/expose/110429907
110986178,Vohwinkel,51.2328,7.08696,430,2,48,True,https://www.immobilienscout24.de/expose/110986178
77406973,Elberfeld,51.2636,7.14511,280,1,30,True,https://www.immobilienscout24.de/expose/77406973
109896859,Ronsdorf,51.2277,7.20453,250,1,25,True,https://www.immobilienscout24.de/expose/109896859
111024017,Elberfeld West,51.2497,7.12178,470,2,57,True,https://www.immobilienscout24.de/expose/111024017


In [0]:
# Mount Google Drive into the Colab runtime so the result can be stored there.
from google.colab import drive

mount_point = 'drive'
drive.mount(mount_point)

Mounted at drive


In [0]:
# Write the filtered listings to a CSV file in the working directory, then
# copy that file into the "My Drive" folder below the `drive` mount point.
# The copy is an IPython shell escape (`!`), so it only works inside a notebook.
df_apartment.to_csv('scraping_apartments.csv')
!cp scraping_apartments.csv drive/My\ Drive/