# Notebook visualising the preprocessing of the 'public_nuisance' dataset.

In [1]:
from pathlib import Path

import pandas as pd

#### Reading the dataset.

In [3]:
# Load the raw CSV export; the relative path is resolved against the
# current working directory of the kernel.
public_nuisance = pd.read_csv(
    filepath_or_buffer='../../../data/raw/public_nuisance.csv',
)

#### Manually translating some columns from Dutch to English. Additionally, the following filters are applied:
1. Excluding the total summed nuisance for each year by keeping only rows where `Perioden` satisfies `str.len() > 4`.
2. Restricting the time period of the whole df to 2014-2020 inclusive.
3. Keeping only the categories which contain `Totaal` (Dutch for "total") in their name.

#### Finally, the filtered nuisance data is grouped by year and Neighbourhood, taking the mean nuisance per group.

In [41]:
# Build the per-year, per-neighbourhood mean nuisance table.
#
# Steps:
#   1. Derive English-named columns from the Dutch source columns:
#      `year` (first four characters of `Perioden`, as int),
#      `Neighbourhood` (copy of `WijkenEnBuurten`) and
#      `nuisance` (`GeregistreerdeOverlast_1`, as int).
#   2. Keep only rows whose `Perioden` is longer than four characters
#      (per the notebook text this drops the yearly-total rows), whose year
#      lies in 2014-2020 inclusive, and whose `Overlast` category contains
#      the literal text 'Totaal'.
#   3. Average `nuisance` per (year, Neighbourhood).
#
# NOTE: this cell rebinds `public_nuisance` to the aggregated result, which no
# longer has the Dutch source columns. Re-running it without re-reading the
# raw CSV first therefore fails on the missing 'Perioden' column.
public_nuisance = (
    public_nuisance
    .assign(
        year=lambda df: df['Perioden'].str[:4].astype(int),
        Neighbourhood=lambda df: df['WijkenEnBuurten'],
        nuisance=lambda df: df['GeregistreerdeOverlast_1'].astype(int),
    )
    .loc[lambda df: (
        (df['Perioden'].str.len() > 4)
        # `between` is inclusive on both ends by default.
        & df['year'].between(2014, 2020)
        # regex=False: match 'Totaal' as plain text, not as a pattern.
        # na=False: a missing category counts as "no match" instead of
        # leaking NaN into the boolean mask.
        & df['Overlast'].str.contains('Totaal', regex=False, na=False)
    )]
    .groupby(['year', 'Neighbourhood'])['nuisance']
    .mean()
    .reset_index()
)

#### Saving the preprocessed dataframe to a .csv file.

In [43]:
# Persist the aggregated frame as CSV. The path is relative to the current
# working directory. The target folder is created first so that a fresh
# checkout (where 'preprocessed/' may not exist yet) does not make `to_csv`
# fail on a missing directory.
output_path = Path('preprocessed/public_nuisance_preprocessed.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: do not write the DataFrame's row index as an extra column.
public_nuisance.to_csv(output_path, index=False)