# Screen scraping
Sometimes we need to fetch the data ourselves.

This is an example that fetches a table from Bergen municipality's budget for 2023.

It uses Selenium to get the page, and then BeautifulSoup for parsing.

Example inspired by https://medium.com/free-code-camp/better-web-scraping-in-python-with-selenium-beautiful-soup-and-pandas-d6390592e251

In [26]:
# Notebook config, environment and logging
import os
import sys

import logging
from dotenv import load_dotenv

import pakkenellik.config as conf
from pakkenellik.log import logger

# Add module path and load config
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

# Autoreload extension
if "autoreload" not in get_ipython().extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Load the .env file into local env
load_dotenv()

# Create the config
config = conf.Config(module_path)

# Enable logging and crank up log level to DEBUG.
# This is particularly useful when developing code in your project module and using it from a notebook.
logger.setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
## Data manipulation and scraping
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

from pakkenellik.dataframe.clean_column_headers import clean_column_headers

# Display limits for DataFrames rendered in the notebook
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 30)

## Fetch url

In [15]:
# Address of the page to open
url = "https://pub.framsikt.net/2023/bergen/bm-2023-kortversjon_23-26_/#/"

# Start a new Firefox session; element lookups wait up to this many seconds
implicit_wait_seconds = 30
driver = webdriver.Firefox()
driver.implicitly_wait(implicit_wait_seconds)

# Load the page in the browser
driver.get(url)

[96m[D] 2023-01-03 14:43:26: driver not found in PATH, trying Selenium Manager[0m
[96m[D] 2023-01-03 14:43:26: Executing: /Users/Lasse.Lambrechts@bt.no/Library/Caches/pypoetry/virtualenvs/bord4-analysis-templates-1LN8Krm6-py3.10/lib/python3.10/site-packages/selenium/webdriver/common/macos/selenium-manager --browser firefox[0m
[96m[D] 2023-01-03 14:43:26: Using driver at: /Users/Lasse.Lambrechts@bt.no/.cache/selenium/geckodriver/mac64/0.32.0/geckodriver[0m
[96m[D] 2023-01-03 14:43:26: Started executable: `geckodriver` in a child process with pid: 15406[0m
[96m[D] 2023-01-03 14:43:27: POST http://localhost:56295/session {"capabilities": {"firstMatch": [{}], "alwaysMatch": {"browserName": "firefox", "acceptInsecureCerts": true, "moz:debuggerAddress": true, "pageLoadStrategy": "normal"}}}[0m
[96m[D] 2023-01-03 14:43:27: Starting new HTTP connection (1): localhost:56295[0m
[96m[D] 2023-01-03 14:43:29: http://localhost:56295 "POST /session HTTP/1.1" 200 798[0m
[96m[D] 2023-01-

In [23]:
# With the page open, locate the link that wraps the "01B-Skole" heading
# (the school budget) and let Selenium click it
school_xpath = "//a[div/h3[text()='01B-Skole']]"
school_link = driver.find_element(by=By.XPATH, value=school_xpath)
school_link.click()

[96m[D] 2023-01-03 15:24:56: POST http://localhost:56295/session/e8aeb217-a9ba-4d0a-a2fd-a6e2ad36e272/element {"using": "xpath", "value": "//a[div/h3[text()='01B-Skole']]"}[0m
[96m[D] 2023-01-03 15:24:56: http://localhost:56295 "POST /session/e8aeb217-a9ba-4d0a-a2fd-a6e2ad36e272/element HTTP/1.1" 200 88[0m
[96m[D] 2023-01-03 15:24:56: Remote response: status=200 | data={"value":{"element-6066-11e4-a52e-4f735466cecf":"9c268c2d-b0fc-44f2-8c9b-089b49238d83"}} | headers=HTTPHeaderDict({'content-type': 'application/json; charset=utf-8', 'cache-control': 'no-cache', 'content-length': '88', 'date': 'Tue, 03 Jan 2023 14:24:56 GMT'})[0m
[96m[D] 2023-01-03 15:24:56: Finished Request[0m


## Beautiful Soup 

In [27]:
# Wait until a table inside the "Investeringsbudsjett" section exists before
# taking the snapshot. The page presumably renders its content asynchronously
# after navigation, so reading page_source straight away (e.g. on "Run All")
# can miss it. find_element honours the implicit wait configured on the driver
# and raises NoSuchElementException if the table never appears.
driver.find_element(By.CSS_SELECTOR, "[tooltiptext='Investeringsbudsjett'] table")

# Selenium hands the page source to Beautiful Soup
beta_soup = BeautifulSoup(driver.page_source, "lxml")

In [31]:
# Find the element whose tooltiptext attribute is "Investeringsbudsjett"
# and collect the tables nested inside it
h3_tag = beta_soup.find(attrs={"tooltiptext": "Investeringsbudsjett"})
if h3_tag is None:
    # Fail with a clear message instead of an AttributeError on None
    raise ValueError(
        "No element with tooltiptext='Investeringsbudsjett' found in the page source"
    )

# find_all is the current method name; findAll is a deprecated alias
tables = h3_tag.find_all("table")

In [40]:
# Parse the tables into DataFrames; `df` is a list with one frame per table.
# The HTML is wrapped in StringIO because passing literal HTML as a plain
# string to read_html is deprecated in newer pandas versions.
df = pd.read_html(StringIO(str(tables)), header=0)

In [99]:
# The first dataframe contains the headers over two rows,
# so let's merge them together.
# After reset_index, "first" holds the column labels of df[0] and "second"
# holds the values of its first row.
line1 = df[0].iloc[0].reset_index()
line1.columns = ["first", "second"]

# Replace to get rid of annoying values: the placeholder label of the unnamed
# first column, and dot-plus-digit fragments (presumably the ".1"-style
# suffixes pandas appends to duplicated header names — confirm).
line1 = line1.replace("Unnamed: 0", "")
# Raw string: "\." and "\d" are not valid Python escape sequences, so the
# non-raw form triggers an invalid-escape warning on current Python versions.
line1 = line1.replace(r"\.\d", "", regex=True)

line1["column_header"] = line1["first"].astype(str) + " " + line1["second"].astype(str)

# Use it for better column headers
df[1].columns = line1.column_header.to_list()

# Clean the column headers
df[1] = clean_column_headers(df[1])

In [101]:
# Keep the second parsed table under a descriptive name for the school budget data
school_df = df[1]

In [102]:
# Shut down the WebDriver session now that the page has been scraped
driver.quit()