This notebook reads the XML files obtained from BROloket: https://www.broloket.nl/ondergrondgegevens
Some minor preprocessing is applied to the data.
The CPT data and the bore data are then stored in separate CSV files.

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import folium
from geotexxx.gefxml_reader import Bore, Cpt
import scipy.stats as st

In [2]:
def load_cptdata(directory="../Data/SoildataComplete/Geotechnisch sondeeronderzoek BRO"):
    """
    Load the CPT data from the XML files in a directory,
    storing metadata and measurements in a dataframe.

    Every file with the extension ``.xml`` is parsed with ``Cpt``. The
    measurement table of each file gets three extra columns holding the
    metadata of that file (easting, northing, groundlevel), after which
    the tables of all files are stacked into one dataframe.

    Args:
        directory: Folder that contains the CPT XML files. Defaults to
            the folder used by this notebook.

    Returns:
        A dataframe with the measurement rows of all XML files. An empty
        dataframe is returned when the folder holds no XML files.
    """

    frames = []

    # iterate over files; sorting makes the row order reproducible,
    # because os.listdir returns the names in arbitrary order
    for file in tqdm(sorted(os.listdir(directory))):

        # only extract from the xml files; splitext also copes with
        # names that contain no dot or more than one dot
        if os.path.splitext(file)[1] != '.xml':
            continue

        # use a fresh Cpt instance per file so that nothing parsed from
        # a previous file can linger on the reader
        xml = Cpt()
        xml.load_xml(os.path.join(directory, file))

        # add metadata
        data = xml.data
        data['easting'] = xml.easting
        data['northing'] = xml.northing
        data['groundlevel'] = xml.groundlevel

        frames.append(data)

    # concatenate once at the end: growing a dataframe inside the loop
    # copies all previously loaded rows again for every file
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [3]:
# Suppress every warning from here on. No category or module filter is given,
# so this applies to all warnings raised in later cells as well.
warnings.filterwarnings(action='ignore')

def load_boredata(directory="../Data/soildata/Geotechnisch booronderzoek BRO"):
    """
    Load the Bore data from the XML files in a directory,
    storing metadata and measurements in a dataframe.

    Every ``.xml`` file is parsed with ``Bore``, except files whose name
    (the part before the first dot) ends with 'A' (the IMBRO_A files).
    The soil layers are stored per research environment; each layer table
    gets extra columns with the environment key and the metadata of the
    file, after which all tables are stacked into one dataframe.

    Args:
        directory: Folder that contains the Bore XML files. Defaults to
            the folder used by this notebook.

    Returns:
        A dataframe with the soil layer rows of all loaded files. An
        empty dataframe is returned when no file yields any layers.
    """

    frames = []

    # iterate over files; sorting makes the row order reproducible,
    # because os.listdir returns the names in arbitrary order
    for file in tqdm(sorted(os.listdir(directory))):

        # only extract from the xml files, as load_cptdata does; this
        # also skips hidden or stray files in the data folder
        if os.path.splitext(file)[1] != '.xml':
            continue

        # skip the IMBRO_A files; partition + endswith cannot fail on a
        # name that starts with a dot, unlike indexing the last character
        if file.partition('.')[0].endswith('A'):
            continue

        # use a fresh Bore instance per file so that soil layers parsed
        # from a previous file can never be picked up again
        xml = Bore()
        xml.load_xml(os.path.join(directory, file))

        # data is stored per 'lab' and 'veld' (research environment)
        for key, data in xml.soillayers.items():

            # add the metadata
            data['environment'] = key
            data['easting'] = xml.easting
            data['northing'] = xml.northing
            data['groundlevel'] = xml.groundlevel
            data['testid'] = xml.testid
            data['finaldepth'] = xml.finaldepth

            # build a one-row frame from xml.date and let pandas assemble
            # it into a single timestamp
            # NOTE(review): presumably xml.date maps year/month/day to
            # values - confirm against the reader
            data['date'] = pd.to_datetime(pd.DataFrame(xml.date, index=[0]))[0]

            frames.append(data)

    # concatenate once at the end instead of growing a dataframe in the loop
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [4]:
# Load all CPT XML files into a single dataframe.
# Takes approx 45 minutes to load (see the progress bar output of this cell).
df_cpt_init = load_cptdata()

100%|████████████████████████████████████████████████████████████████████████████| 10884/10884 [44:58<00:00,  4.03it/s]


In [5]:
# Bind a second name to the loaded CPT data. No copy is made: both names
# refer to the same dataframe object.
# NOTE(review): presumably the *_init name keeps the raw load around so that
# later cells can be rerun without repeating the slow load - confirm.
df_cpt = df_cpt_init

In [6]:
# Load all bore XML files into a single dataframe.
df_bore_init = load_boredata()

100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:12<00:00, 18.07it/s]


In [7]:
# Bind a second name to the loaded bore data. No copy is made: both names
# refer to the same dataframe object.
df_bore = df_bore_init

In [8]:
def preprocess(data, investigation, nan_threshold=0.9):
    """
    Function to do first preprocessing steps for Bore and CPT data.

    The steps are: renumber the rows, drop the columns that are (almost)
    entirely NaN, drop a fixed set of non-informative columns for bore
    data, and remove duplicate rows. The input dataframe is not modified.

    Args:
        data: Dataframe as returned by load_cptdata or load_boredata.
        investigation: Type of data. Only the value 'bore' triggers the
            removal of the non-informative bore columns; any other value
            leaves just the NaN-based column removal.
        nan_threshold: Columns whose fraction of NaN values is greater
            than or equal to this value are dropped. Defaults to 0.9.

    Returns:
        A new dataframe with the selected columns and the duplicate rows
        removed. The index contains gaps where duplicates were dropped.
    """

    # renumber the rows 0..n-1; drop=True discards the old index directly,
    # so an existing data column called 'index' is left untouched and a
    # named index does not cause an error
    data = data.reset_index(drop=True)

    # drop columns whose NaN fraction reaches the threshold (90% by default)
    nan_fraction = data.isna().sum() / len(data)
    to_drop = list(nan_fraction[nan_fraction >= nan_threshold].index)

    # remove noninformative columns from bore data
    if investigation == 'bore':
        noninformative = [
            'upperBoundaryDetermination',
            'lowerBoundaryDetermination',
            'activityType',
            'grainshape',
            'mixed',
            'components',
            'layer',
            'soil',
        ]
        # only list columns that are present and not already selected, so
        # that a dataset lacking one of them does not raise a KeyError
        to_drop += [
            col for col in noninformative
            if col in data.columns and col not in to_drop
        ]

    data = data.drop(columns=to_drop)
    data = data.drop_duplicates()

    return data

In [9]:
# Clean the bore data: renumber the rows, drop the mostly-NaN columns and the
# non-informative bore columns, and remove duplicate rows.
df_bore = preprocess(df_bore, 'bore')

In [10]:
# Write the preprocessed bore data to Bore.csv (path relative to the working
# directory); index=True stores the dataframe index as the first column.
df_bore.to_csv('Bore.csv', index=True)

In [11]:
# Clean the CPT data: renumber the rows, drop the mostly-NaN columns and
# remove duplicate rows (the bore-specific column removal is not applied).
df_cpt = preprocess(df_cpt, 'cpt')

In [12]:
# Write the preprocessed CPT data to CPT.csv (path relative to the working
# directory); index=True stores the dataframe index as the first column.
df_cpt.to_csv('CPT.csv', index=True)