# 1. Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np
import itertools
import time
import random
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# 2. Bot for Web Scraping

## 2.1 Define the Scraping Process

In this step I define the function that scrapes the website for flats and their attributes. The attributes that I am collecting are:

- price
- address
- number of rooms
- square metres
- link of the flat
- title, which includes a description of the flat
- extra, which lists additional features of the flat

In [5]:
def _first_number(text):
    """Return the first number found in ``text`` as a float, or NaN if there is none.

    A decimal comma is read as a decimal point, so ``'85,5'`` yields ``85.5``.
    """
    match = re.search(r'\d+(?:\.\d+)?', text.replace(',', '.'))
    return float(match.group()) if match else np.nan


def collect_data(container=None):
    """Parse one listing container and append its attributes to the result lists.

    Parameters
    ----------
    container : optional
        The listing element to parse (anything offering ``find`` / ``find_all``
        in the BeautifulSoup style). When omitted, the module-level
        ``house_containers[p]`` is used, which is how the scraping loop calls
        this function.

    Side effects
    ------------
    Appends exactly one entry to each of the module-level lists ``prices``,
    ``addresses``, ``rooms``, ``sqms``, ``hrefs``, ``titles`` and ``extras``.
    Any attribute that is missing or cannot be parsed is recorded as ``np.nan``,
    so the lists always stay the same length.
    """
    if container is None:
        container = house_containers[p]

    # The numeric fields all live in 'resultlist-value' spans:
    # index 0 is the price, index 1 the area and index 2 the room count.
    values = container.find_all('span', class_='resultlist-value')

    # Prices: keep only the digits and divide by 100. This assumes the price
    # text always carries exactly two decimal places -- TODO confirm on the site.
    price_digits = re.sub(r'\D', '', values[0].text) if len(values) > 0 else ''
    price = float(price_digits) / 100 if price_digits else np.nan

    # Addresses (all whitespace removed)
    address_tag = container.find('div', class_='resultlist-address')
    address = re.sub(r'\s', '', address_tag.text) if address_tag is not None else np.nan

    # Rooms and square metres: NaN when the listing has fewer value spans
    room = _first_number(values[2].text) if len(values) > 2 else np.nan
    sqm = _first_number(values[1].text) if len(values) > 1 else np.nan

    # Links and titles both come from the same anchor element
    title_tag = container.find('a', class_='resultlist-title')
    if title_tag is None:
        href = np.nan
        title = np.nan
    else:
        raw_href = title_tag.get('href')
        href = 'https:' + raw_href if raw_href is not None else np.nan
        title = title_tag.text

    # Extra (all whitespace removed)
    extra_tag = container.find('ul', class_='resultlist-properties')
    extra = re.sub(r'\s', '', extra_tag.text) if extra_tag is not None else np.nan

    # Append only after everything has been parsed, so that an unexpected error
    # above cannot leave the lists with different lengths.
    prices.append(price)
    addresses.append(address)
    rooms.append(room)
    sqms.append(sqm)
    hrefs.append(href)
    titles.append(title)
    extras.append(extra)

## 2.2 Run the Bot

In this section I run the bot to scrape the website and insert the collected information into the appropriate lists. At the end I print how many properties I managed to scrape - 3266.

In [6]:
# Scraping configuration
N_PAGES = 273          # number of result pages to request
BASE_URL = 'https://www.immobilienscout24.de/wohnen/berlin,berlin/mietwohnungen'
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests can wait forever

n_pages = 0
header = ({'User-Agent': 
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})

# Result lists filled by collect_data(); one entry per scraped listing.
prices = []
addresses = []
rooms = []
sqms = []
hrefs = []
titles = []
extras = []

for page_number in range(1, N_PAGES + 1):
    n_pages = page_number

    # The first result page has no ',seite-N' suffix.
    if page_number == 1:
        url = BASE_URL + '.html'
    else:
        url = BASE_URL + ',seite-' + str(page_number) + '.html'

    try:
        page = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as error:
        # Report and move on instead of silently parsing an error page
        # or aborting the whole run because of one bad request.
        print('Skipping page {}: {}'.format(page_number, error))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        # Listings use one of two container classes; fall back to the second
        # when the first yields nothing.
        house_containers = (soup.find_all('div', class_='grid resultlist-container-big')
                            or soup.find_all('div', class_='grid resultlist-container'))
        # collect_data() reads the module-level house_containers[p].
        for p in range(len(house_containers)):
            collect_data()

    # Pause between requests to avoid hammering the server.
    time.sleep(random.randint(1, 2))

print('You scraped {} pages containing {} properties.'.format(n_pages, len(addresses)))
    

You scraped 273 pages containing 3266 properties.


# 3. Save Data

I reformat the data and save it all to one dataframe. I also create a new field called 'price_sqm' (the price per square metre).

In [7]:
# Save to dataframe
berlin_rent = pd.DataFrame({'address': addresses,
                   'rooms': rooms,
                   'area sqm': sqms,
                   'price': prices,
                   'links': hrefs,
                   'title': titles,
                   'extra': extras})


def _extract_postcode(address):
    """Return the 5-digit postal code contained in ``address``, or NaN.

    Missing addresses (NaN) and addresses without a standalone 5-digit run
    yield NaN instead of raising. Requiring exactly five digits keeps a
    leading house number from being mistaken for the postal code.
    """
    if not isinstance(address, str):
        return np.nan
    match = re.search(r'(?<!\d)\d{5}(?!\d)', address)
    return match.group() if match else np.nan


# Extract postal code data and add it to the dataframe
berlin_rent['postcode'] = berlin_rent['address'].map(_extract_postcode)

# Create a new field: square metre price
berlin_rent['price_sqm'] = berlin_rent['price'] / berlin_rent['area sqm']

# 4. Save to csv

I save everything to a csv file.

In [13]:
# Output location, relative to the notebook's working directory so the cell
# runs on any machine; point this at another folder if needed.
OUTPUT_CSV = 'berlin_rent.csv'

berlin_rent.to_csv(OUTPUT_CSV)