In [104]:
import pandas as pd
import numpy as np
import functools as ft
import pickle
import requests

import sys

sys.path.append("..")
from tools import data_path, output_path, cache_path

### Data origin: 
The data was provided privately by the authorities, so this notebook cannot be reproduced without the original files.

In [105]:
# Parsing this format requires a lot of RAM ...

In [5]:
# Open the raw text dump in read mode; a later cell reads it through `f`.
raw_dump_path = data_path + "brandenburg_15.txt"
f = open(raw_dump_path, "r")

In [36]:
# Pull the whole dump into memory, then release the file handle right away:
# nothing reads from `f` after this point.
content = f.read()
f.close()
# Cut the text at every "]]}"; the parsing cell treats each piece except the
# last one as a single object.
content = content.split("]]}")

In [None]:
# Turn every chunk of the dump into two frames:
#   * a two-column time series (datetime, value), restricted to 2019-2023
#   * the key/value metadata that precedes the '"data": [' marker
stack = []
meta_stack = []
for chunk in content[:-1]:
    meta_text, data_text = chunk.split('"data": [')

    # Metadata: strip the quotes, cut at ", " and split each field at ":".
    # The first two characters and the last field are discarded.
    meta_fields = meta_text[2:].replace('"', "").split(", ")[:-1]
    meta = pd.DataFrame([field.split(":") for field in meta_fields])

    # Data: rows are separated by "], [", columns by ", ".
    data_rows = data_text[1:].replace('"', "").split("], [")
    data = pd.DataFrame([row.split(", ") for row in data_rows])

    data[0] = pd.to_datetime(data[0])
    years = data[0].dt.year
    data = data[(years > 2018) & (years < 2024)].reset_index(drop=True)

    # The literal string "null" marks a missing value.
    data.loc[data[1] == "null", 1] = np.nan
    data[1] = data[1].astype(float)

    # Name the value column after the station number (its first character
    # is dropped) plus a "_b" tag.
    station_no = meta[meta[0] == "station_no"][1].values[0][1:]
    data.columns = ["datetime", station_no + "_b"]

    stack.append(data)
    meta_stack.append(meta)

In [None]:
# Cache the parsed frames so the expensive parsing cell does not have to be
# re-run. Both files go straight into cache_path, which is where the reload
# cell looks for them (the metadata stack used to be written into a "rivers/"
# sub-folder that the reload cell never reads).
with open(cache_path + "brandenburg_data_stack.p", "wb") as data_cache:
    pickle.dump(stack, data_cache)
with open(cache_path + "brandenburg_meta_stack.p", "wb") as meta_cache:
    pickle.dump(meta_stack, meta_cache)

### Reload from the cache to free RAM

In [106]:
# Load the cached frames. The files are opened in `with` blocks so the handles
# are closed as soon as unpickling is done.
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# that were written by this notebook.
with open(cache_path + "/brandenburg_data_stack.p", "rb") as data_cache:
    stack = pickle.load(data_cache)
with open(cache_path + "brandenburg_meta_stack.p", "rb") as meta_cache:
    meta_stack = pickle.load(meta_cache)

In [107]:
# Keep only the series whose metadata says parametertype_name is " Q"
# (note the leading blank left over from parsing).
def _is_q(meta):
    """Return True if this metadata frame has parametertype_name == ' Q'."""
    return meta.loc[meta[0] == "parametertype_name", 1].values[0] == " Q"


# `stack` has to be filtered first: it is matched against the still unfiltered
# `meta_stack` by position.
stack = [frame for frame, meta in zip(stack, meta_stack) if _is_q(meta)]
meta_stack = [meta for meta in meta_stack if _is_q(meta)]

In [108]:
# Some value-column names occur more than once; list them.
value_columns = [frame.columns[1] for frame in stack]
counts = {name: value_columns.count(name) for name in value_columns}
[name for name, occurrences in counts.items() if occurrences != 1]

['6949700_b', '5896600_b', '5934903_b']

In [109]:
# Something is off with these series (NaN) - TODO: revisit later.
# For now they are removed by their position in the Q-only lists; the same
# positions are removed from both lists so they stay aligned.
drop = [102, 104, 105, 115, 117, 126, 128]
stack = [frame for position, frame in enumerate(stack) if not (position in drop)]
meta_stack = [
    meta for position, meta in enumerate(meta_stack) if not (position in drop)
]

In [110]:
# Fold the list of per-station frames into one wide frame by merging them
# pairwise on the shared "datetime" column (default join type).
stack = ft.reduce(lambda merged, frame: merged.merge(frame, on="datetime"), stack)

In [111]:
# Move the "datetime" column into the index (the column itself is removed).
stack = stack.set_index("datetime")

In [112]:
# The columns were tagged incorrectly earlier: cut the last two characters of
# every column name and append "_br" instead.
stack = stack.rename(columns=lambda name: name[:-2] + "_br")

In [113]:
# Strip the timezone information from the datetime index.
naive_index = stack.index.tz_localize(None)
stack.index = naive_index

In [114]:
# Reshape every metadata frame into a single column: keys in the index, the
# station number (as int) as the column label.
# The frames in `meta_stack` are left untouched (a new frame is built for each
# one), so this cell can be re-run without failing on an already dropped
# column 0.
meta_prep = []
for station_meta in meta_stack:
    prepared = station_meta.set_index(0)
    prepared.columns = [int(prepared.loc["station_no"].values[0])]
    # Remove stray "{" characters from the keys; regex=False makes the literal
    # match explicit instead of relying on the pandas-version-dependent default.
    prepared.index = prepared.index.str.replace("{", "", regex=False)
    meta_prep.append(prepared)

In [115]:
# One row per series: put the single-column frames side by side and transpose.
meta_wide = pd.concat(meta_prep, axis=1).T
# Only the first seven metadata fields are kept.
leading_fields = meta_wide.columns[:7]
meta_prep = meta_wide[leading_fields]
# Drop the first character of every station number.
meta_prep["station_no"] = meta_prep["station_no"].str.slice(1)

In [116]:
def grab_meta_br(x, timeout=30):
    """Scrape the station info table for one gauge from pegelportal.brandenburg.de.

    Parameters
    ----------
    x : str
        Station number; it is concatenated into the ``pkz`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single unresponsive request would block the whole crawl.

    Returns
    -------
    list of str
        Every second non-empty line found between the ``station ID:`` label
        and the ``status:`` row once the table tags are removed. The first
        element is the text that follows the ``station ID:`` label.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    IndexError
        If the page does not contain the ``station ID:</td>`` marker.
    """
    response = requests.get(
        "https://pegelportal.brandenburg.de/messstelle.php?fgid=5&pkz="
        + x
        + "&thema=q_graph&language=en#loaded",
        timeout=timeout,
    )
    # Fail on error pages instead of trying to parse them.
    response.raise_for_status()

    # Keep only the part of the page between the "station ID:" label and the
    # "status:" row.
    table_text = response.text.split("station ID:</td>")[1].split(
        "</td> \n</tr>\n<tr>\n<td>status:</td>\n<td>Aktuell"
    )[0]

    # Strip the table markup; what remains is one piece of text per line.
    for tag in ("</td>", "<td>", "</tr>", "<tr>", "<td style='text-align:left'>"):
        table_text = table_text.replace(tag, "")

    lines = [line for line in table_text.split("\n") if len(line) > 0]

    # Keep positions 0, 2, 4, ... of the non-empty lines.
    return lines[::2]

In [117]:
# Crawl the portal once per distinct station number. This is best effort:
# stations that cannot be fetched or parsed are collected in `fail` and simply
# have no entry in `meta_upgrade`.
meta_upgrade = []
fail = []
for station_no in meta_prep["station_no"].unique():
    try:
        meta_upgrade.append(grab_meta_br(station_no))
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the crawl.
        fail.append(station_no)
        print(fail)

['5873500']
['5873500', '6945301']
['5873500', '6945301', '5895001']
['5873500', '6945301', '5895001', '6910302']
['5873500', '6945301', '5895001', '6910302', '5811701']
['5873500', '6945301', '5895001', '6910302', '5811701', '6970800']


In [118]:
# Manual repairs so that the crawled records line up:
# record 17 is cut down to its first nine entries ...
meta_upgrade[17] = meta_upgrade[17][:9]
# ... and every record whose length is not nine gets an empty string inserted
# at position 4.
for position, record in enumerate(meta_upgrade):
    if len(record) == 9:
        continue
    meta_upgrade[position] = record[:4] + [""] + record[4:]

In [119]:
# One row per crawled station, columns numbered 0, 1, 2, ...
meta_upgrade = pd.DataFrame(data=meta_upgrade)

In [159]:
# Attach the crawled fields to the parsed metadata. Column 0 of the crawl
# result is matched against station_no; with a left join, rows without a
# crawl result are kept.
meta_out = pd.merge(
    meta_prep,
    meta_upgrade,
    how="left",
    left_on="station_no",
    right_on=0,
)

In [160]:
# Keep at most the first 19 columns of the merged table.
leading_columns = meta_out.columns[:19]
meta_out = meta_out[leading_columns]

## Bring to joint format

In [161]:
# Small manual fixes: in row 20 the values sit one column too far to the
# right, so columns 5 and 8 are filled from columns 6 and 9.
shifted_row = 20
meta_out.loc[shifted_row, 5] = meta_out.loc[shifted_row, 6]
meta_out.loc[shifted_row, 8] = meta_out.loc[shifted_row, 9]

# Missing entries in column 8 get a "?" placeholder.
missing_in_8 = meta_out[8].isnull()
meta_out.loc[missing_in_8, 8] = "?"

In [162]:
# Column 8 (height): keep the text before " m", switch the decimal comma to a
# decimal point, blank out the "?" and "" placeholders, then convert to float.
height_text = meta_out[8].str.split(" m").str[0].str.replace(",", ".")
height_text = height_text.mask(height_text.isin(["?", ""]))
meta_out[8] = height_text.astype(float)

In [163]:
# Index the table by station number under the name "ID".
meta_out.index = pd.Index(meta_out["station_no"], name="ID")

# Select the five fields of the joint format and give them their short names.
# NOTE(review): X is filled from station_latitude and Y from station_longitude -
# confirm that this is the intended axis convention.
joint_names = {
    3: "R",
    "station_latitude": "X",
    "station_longitude": "Y",
    5: "D",
    8: "H",
}
meta_out = meta_out[list(joint_names.keys())]
meta_out.columns = list(joint_names.values())

# Placeholder so that the string operations on "D" below see no missing values.
distance_missing = meta_out["D"].isnull()
meta_out.loc[distance_missing, "D"] = "Unknown"

In [164]:
# Standard case: "<number> km<br> oberhalb der Mündung".
# Remove the text and use a decimal point instead of the decimal comma.
above_mouth = meta_out["D"].str.contains("oberhalb der Mündung")
above_mouth_km = (
    meta_out.loc[above_mouth, "D"]
    .str.replace(" km<br> oberhalb der Mündung", "")
    .str.replace(",", ".")
)
meta_out.loc[above_mouth, "D"] = above_mouth_km

In [165]:
# The distance is not specified consistently. Entries given as
# "Kilometer an der Wasserstraße" are stored with a leading minus sign so they
# can be told apart from the standard case later.
on_waterway = meta_out["D"].str.contains("km<br> Kilometer an der Wasserstraße")
waterway_km = (
    meta_out.loc[on_waterway, "D"]
    .str.replace(" km<br> Kilometer an der Wasserstraße", "")
    .str.replace(",", ".")
)
meta_out.loc[on_waterway, "D"] = "-" + waterway_km

In [166]:
# This cell used to repeat the previous cell's "Kilometer an der Wasserstraße"
# conversion verbatim. Once that conversion has run, no value contains the
# phrase any more, so the repeat had nothing left to change. Only a check that
# the conversion really happened is kept.
unconverted_waterway = meta_out["D"].str.contains(
    "km<br> Kilometer an der Wasserstraße"
)
assert (
    not unconverted_waterway.any()
), "unconverted 'Kilometer an der Wasserstraße' entries left in column D"

In [167]:
# Three entries use yet another reference point ("unterhalb ..."); they are
# matched by their exact text and stored with a leading minus sign.
special_distances = {
    "55,63 km<br> unterhalb Grenze CZ / DE": "-55.63",
    "664,95 km<br> unterhalb der Oppamündung": "-664.95",
    "554,14 km<br> unterhalb der Oppamündung": "-554.14",
}
for raw_text, encoded in special_distances.items():
    meta_out.loc[meta_out["D"] == raw_text, "D"] = encoded

In [168]:
# "keine Angabe" and the "Unknown" placeholder both mean: no distance given.
no_distance = meta_out["D"].isin(["keine Angabe", "Unknown"])
meta_out.loc[no_distance, "D"] = np.nan

In [169]:
# Number of missing values per column.
meta_out.isna().sum()

R    6
X    0
Y    0
D    8
H    6
dtype: int64

In [170]:
# All distances are numeric strings or NaN by now; store them as floats.
distance_values = meta_out["D"].astype(float)
meta_out["D"] = distance_values

In [171]:
# Manual corrections. This information was available earlier but can no longer
# be crawled, so it is entered by hand (one entry per station ID; station
# 6910302 has no "H" value).
manual_values = {
    "6945301": {"R": "Volzine", "D": 0.04, "H": 1.765},
    "5895001": {"R": "Temnitz", "D": 2.75, "H": 28.65},
    "6910302": {"R": "Brieskower Kanal", "D": 3.37},
    "5811701": {"R": "Lychener Gewässer", "D": 15.85, "H": 61.97},
    "6970800": {"R": "Salveybach", "D": 7.15, "H": 12.523},
}
for station_id, corrections in manual_values.items():
    for column, value in corrections.items():
        meta_out.loc[station_id, column] = value

In [172]:
# Add one flag column per field, initialised to -1.
for flag_column in ("QD", "QH", "QX", "QY", "QR"):
    meta_out[flag_column] = -1

In [174]:
# Wherever a field has a value, set its flag column ("Q" + field name) to 0;
# flags of missing fields keep their previous value.
for field in ("X", "Y", "D", "R", "H"):
    has_value = meta_out[field].notnull()
    meta_out.loc[has_value, "Q" + field] = 0

In [175]:
## Check if leading 0 indices are unique.

In [177]:
# Write the station metadata table (index "ID" included) to the output folder.
meta_csv_path = output_path + "brandenburg_meta_data.csv"
meta_out.to_csv(meta_csv_path)

In [137]:
# Write the merged time series (datetime index included) to the output folder.
data_csv_path = output_path + "brandenburg_processed.csv"
stack.to_csv(data_csv_path)