In [None]:
import re
import os

import requests
import dateparser

from bs4 import BeautifulSoup
from datetime import datetime

import pandas as pd
from time import sleep

In [None]:
# Debug aid: show the value of BORD4_ENV_TEST (prints None when it is unset).
print(os.getenv('BORD4_ENV_TEST'))

In [None]:
# Pause, in seconds, passed to sleep() before each throttled request.
# Environment variables are always strings, so convert explicitly: sleep() rejects a str.
# Falls back to 1 second when BORD4_TIMEOUT is unset or empty.
timeout = float(os.environ.get('BORD4_TIMEOUT') or 1)

In [None]:
# One shared HTTP session; every request sent through it carries a User-Agent
# naming the bot and the sender (BORD4_SENDER, defaulting to 'Bord 4').
sender = os.environ.get('BORD4_SENDER') or 'Bord 4'
session = requests.session()
session.headers["User-Agent"] = f"BT_badetemperaturbot (Bergens Tidende - {sender})"

In [None]:
def get_spots_at_page(url):
    """Return the link target of every spot listed on one page.

    Sleeps for `timeout` seconds, fetches `url` through the shared session and
    collects the `href` of the first `<a>` inside each `ece_module_container`
    element found under the `ece_module_list` element.
    """
    sleep(timeout)
    listing_html = session.get(url).text

    containers = (
        BeautifulSoup(listing_html)
        .find(class_='ece_module_list')
        .find_all(class_='ece_module_container')
    )
    return [container.find('a').get('href') for container in containers]

In [None]:
def get_all_spots():
    """Gather the spot links from every sub-page linked on the overview page.

    The overview page is fetched once; each `<a>` in the first `<ul>` of its
    `article__content` element is treated as a sub-page and handed to
    `get_spots_at_page`. The per-page results are concatenated in link order.
    """
    url = 'https://www.bergen.kommune.no/hvaskjer/tema/badevann-og-parker/badeplasser'

    overview = BeautifulSoup(session.get(url).text)
    link_list = overview.find(class_='article__content').find('ul')

    spots = []
    for anchor in link_list.find_all('a'):
        spots.extend(get_spots_at_page(anchor.get('href')))
    return spots

In [None]:
def get_base_data_for_spot(spot_url):
    """Scrape the basic facts of one spot page.

    Parameters
    ----------
    spot_url : str
        Path that is appended to ``https://www.bergen.kommune.no``.

    Returns
    -------
    dict
        ``url``         -- the full page URL that was fetched.
        ``name``        -- text of the first ``<h1>``.
        ``api``         -- ``src`` of the first ``<iframe>``, or ``None`` when the
                           page has no iframe.
        ``description`` -- whitespace-normalised text of the ``article__content``
                           element.

    Raises
    ------
    AttributeError
        If the page has no ``<h1>`` or no ``article__content`` element.
    """
    data = {}
    data["url"] = f'https://www.bergen.kommune.no{spot_url}'

    sleep(timeout)
    response = session.get(data["url"])

    page = BeautifulSoup(response.text)

    data["name"] = page.find('h1').text

    # Always emit the key: the DataFrame built from these records is accessed via
    # `df.api`, which fails if no record ever supplied an "api" entry, and a
    # constant key set keeps the column order independent of the first record.
    iframe = page.find('iframe')
    data["api"] = iframe.get('src') if iframe is not None else None

    # First collapse runs of spaces, then squeeze every stretch of whitespace that
    # contains a line break into a single newline. The pattern is a raw string
    # because '\s' is an invalid escape sequence in a plain literal.
    content = page.find(class_='article__content').text.strip()
    data["description"] = re.sub(r'(\s*\n\s*)+', '\n', re.sub(' +', ' ', content))

    return data

In [None]:
def get_data_from_api(path):
    """Fetch the measurement widget at `path` and return its raw text values.

    Returns None for an empty `path`. Otherwise sleeps for `timeout` seconds,
    requests `https://www.bergen.kommune.no{path}` and returns a dict with the
    unparsed text of the value (`bade_verdi`) and timestamp (`bade_tidspkt`)
    elements of the `bade_kvalitet` and `bade_temperatur` sections.
    """
    if path == '':
        return None

    sleep(timeout)
    soup = BeautifulSoup(session.get(f'https://www.bergen.kommune.no{path}').text)

    sections = (
        ("quality", "quality_reported_at", "bade_kvalitet"),
        ("temperature", "temperature_reported_at", "bade_temperatur"),
    )

    readings = {}
    for value_key, time_key, css_class in sections:
        section = soup.find(class_=css_class)
        readings[value_key] = section.find(class_="bade_verdi").text
        readings[time_key] = section.find(class_="bade_tidspkt").text

    return readings

In [None]:
def custom_dateparse(string):
    """Convert a scraped timestamp label into an ISO 8601 string.

    The prefixes "sist målt:" and "klokken" are stripped case-insensitively
    before the remainder is handed to `dateparser.parse`.

    Parameters
    ----------
    string : str or None
        Raw label text.

    Returns
    -------
    str or None
        `isoformat()` of the parsed datetime, or None when the input is empty
        or missing, or when dateparser cannot interpret what is left.
    """
    if not string:
        return None

    cleaned = re.sub('sist målt: ?', '', string, flags=re.I)
    cleaned = re.sub('klokken ?', '', cleaned, flags=re.I)

    # dateparser.parse returns None for text it cannot interpret; pass that on
    # as a missing value rather than failing on `.isoformat()`.
    parsed = dateparser.parse(cleaned)
    if parsed is None:
        return None
    return parsed.isoformat()

In [None]:
# Crawl the overview page and each of its sub-pages for the link of every spot.
# This issues one HTTP request per page, throttled by `timeout` on the sub-pages.
spots = get_all_spots()

In [None]:
# Scrape every spot page now and keep the records in a list. A bare `map` is a
# one-shot iterator: once the DataFrame cell has consumed it, re-running that
# cell would silently build an empty frame.
data = [get_base_data_for_spot(spot_url) for spot_url in spots]

In [None]:
# One row per spot. Normalise `api` to a string and blank out every value that
# is not a site-relative path (does not start with '/'), including missing ones.
df = pd.DataFrame.from_records(data)
df["api"] = (
    df.api.fillna('').astype(str)
    .pipe(lambda paths: paths.where(paths.str.startswith('/'), ''))
)

In [None]:
# One API lookup per row, in row order; rows with a blank `api` get None.
df["api_data"] = [get_data_from_api(api_path) for api_path in df.api]

In [None]:
# Expand the per-spot API dicts into columns. Spots without API data (None)
# contribute an empty record, so they end up with missing values.
df_api_data = pd.DataFrame.from_records(
    [record if record is not None else {} for record in df.api_data]
)

df[df_api_data.columns] = df_api_data

# Timestamp labels -> ISO 8601 strings (None where there is no label).
df.temperature_reported_at = [
    custom_dateparse(label) for label in df.temperature_reported_at.fillna('')
]
df.quality_reported_at = [
    custom_dateparse(label) for label in df.quality_reported_at.fillna('')
]

# Temperature text -> float: decimal comma to point, drop the unit, trim.
# Blank entries are masked to NaN explicitly. `Series.replace('', None)` is
# version-dependent: older pandas ignores the explicit None and pad-fills, so a
# spot without a reading would inherit the previous spot's temperature.
temperature_text = (
    df.temperature.fillna('')
    .str.replace(',', '.', regex=False)
    .str.replace('°C', '', regex=False)
    .str.strip()
)
df.temperature = temperature_text.mask(temperature_text == '').astype(float)

In [None]:
# Persist only when the result passes a sanity check: a DataFrame with at
# least one row and exactly nine columns.
if isinstance(df, pd.DataFrame) and df.shape[0] > 0 and df.shape[1] == 9:
    # to_csv does not create missing directories.
    os.makedirs('data', exist_ok=True)

    # The dated snapshot omits the description text; latest.csv keeps every column.
    df.drop(columns=['description']).to_csv(f'data/{datetime.now().strftime("%Y-%m-%d")}.csv', index=False)
    df.to_csv('data/latest.csv', index=False)

    # Report success only after both files have been written.
    print('Saved!')
else:
    # Make a skipped save visible rather than finishing silently.
    print(f'Not saved: expected a non-empty DataFrame with 9 columns, got shape {getattr(df, "shape", None)}')