In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import scipy.stats
import geopandas as gpd
%matplotlib inline
from IPython.display import Markdown
from functools import reduce
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

### Theming

In [10]:
# Flip this flag to switch between the dark and the default plotly theme.
use_dark_theme = True

# `style` is passed as the inline style of the dashboard's root element;
# a background colour such as '#1b1b1b' (rgb(27, 27, 27)) could be added here too.
style = {'color': 'white'} if use_dark_theme else None  # 'color' is the font colour
pio.templates.default = 'plotly_dark' if use_dark_theme else 'plotly'

### Loading
Table 40 by kraje/regions  
An age of 90 means 90 or older (the source's top age bucket is `90 a viac rokov`).

In [11]:
import sys
sys.path.append('../')

from preprocessing import rename_columns, replace_with_nan
# Table 40, broken down by region; the CSV is semicolon-separated.
table_40 = pd.read_csv('../data/RV_O_040_R_KR_SK.CSV', sep=';')
# The open-ended top age bucket is stored as text; relabel it '90' so that the
# whole age column can be converted to numbers.
table_40['Vek'] = pd.to_numeric(table_40['Vek'].replace({'90 a viac rokov': '90'}))

## Grouping education

Education levels are grouped into:
- unspecified
- without education
- primary education
- secondary education
- higher education

In [12]:
grouped_edu = 'Grouped edu'

# Start from every distinct education label. A label is removed as soon as it
# is assigned to a group, so whatever remains at the end was not classified.
education_levels = set(table_40['Vzdelanie'].unique())
display(education_levels)


def _claim(*needles):
    """Remove from `education_levels` and return the labels containing any of `needles`."""
    claimed = [x for x in education_levels if any(needle in x for needle in needles)]
    education_levels.difference_update(claimed)
    return claimed


# Order matters: several secondary-education labels also contain 'bez', so they
# have to be claimed before the 'without education' group is matched.
primary = _claim("základné")
secondary = _claim("stredné")
higher = _claim('vysokoškolské', "vyššie")
without = _claim('bez')
unspecified = ['nezistené']
education_levels.difference_update(unspecified)
display(education_levels)  # expected to be empty

groups = {
    'primary': primary,
    'secondary': secondary,
    'higher': higher,
    'without': without,
    'unspecified': unspecified,
}
for label, members in groups.items():
    table_40.loc[table_40['Vzdelanie'].isin(members), grouped_edu] = label

# table_40.sample(30)


{'bez ukončeného vzdelania – osoby vo veku 0-14 rokov',
 'bez školského vzdelania – osoby vo veku 15 rokov a viac',
 'nezistené',
 'stredné odborné (učňovské) vzdelanie bez maturity (bližšie neuvedené)',
 'stredné odborné (učňovské) vzdelanie bez maturity a bez výučného listu (zaškolenie, zaučenie)',
 'stredné odborné (učňovské) vzdelanie bez maturity s vysvedčením o záverečnej skúške',
 'stredné odborné (učňovské) vzdelanie bez maturity s výučným listom',
 'vysokoškolské vzdelanie (bližšie neuvedené)',
 'vysokoškolské vzdelanie - 1. stupeň (Bc.)',
 'vysokoškolské vzdelanie - 2. stupeň (Ing.; Mgr.; MUDr.; a i.)',
 'vysokoškolské vzdelanie - 3. stupeň (PhD.; a i.)',
 'vyššie odborné vzdelanie (bližšie neuvedené)',
 'vyššie odborné vzdelanie nadstavbové (maturita absolventov učebných odborov stredných odborných škôl)',
 'vyššie odborné vzdelanie pomaturitné (pomaturitné kvalifikačné)',
 'vyššie odborné vzdelanie vyššie odborné (absolventská skúška, absolventský diplom)',
 'základné vzdel

set()

## Computed/Display values
The following function returns a data frame with these columns:
- `number` - number of people that satisfy query filter
- `category_percent` - total/denominator is category
- `filtered_percent` - filtered / not filtered
- `age_percent` - total/denominator is all at the same age

In [25]:
def compute_age_based(data, query, filter_result=None, feature=None):
    """
    Count people per age (and optionally per category) and derive percentages.

    Parameters:
        data (DataFrame): The input DataFrame containing the data to count.
            Must have the columns 'Vek' (age) and 'abs.' (head count), plus
            `feature` and every column referenced by `query`.
        query (str): The query string to filter the data. An empty, blank or
            None query means "no filter" (all rows are used).
        filter_result (number, optional): The lower bound of the number of people to be
            displayed on the plot. 0 does nothing
        feature (str, optional): The column name to group by,
            determines categories (optional).

    Returns:
        DataFrame: one row per age (per age and category when `feature` is
        given) with the columns
            - number: people matching the query,
            - total_category (only with `feature`): matching people in the
              row's category over all ages,
            - total_unfiltered: people in the same group before filtering,
            - total_age: matching people of the row's age over all categories,
            - category_percent: number / category total * 100,
            - filtered_percent: number / total_unfiltered * 100,
            - age_percent: number / total_age * 100 (always 100 without
              `feature`, because then each age has a single row).
    """
    # DataFrame.query raises ValueError on an empty expression, which happens
    # when no filter at all is selected; treat that case as "keep everything".
    filtered = data.query(query) if query and query.strip() else data

    keys = ['Vek'] if feature is None else ['Vek', feature]
    aggregated = filtered.groupby(keys)['abs.'].sum().reset_index(name='number')
    total_unfiltered = data.groupby(keys)['abs.'].sum().rename('total_unfiltered')
    total_age = filtered.groupby(['Vek'])['abs.'].sum().rename('total_age')

    if feature is None:
        # A single category: its total is one scalar over all matching rows.
        total_category = filtered['abs.'].sum()
    else:
        per_category = filtered.groupby([feature])['abs.'].sum().rename('total_category')
        aggregated = pd.merge(aggregated, per_category, on=[feature], how='left')
        total_category = aggregated['total_category']

    aggregated = pd.merge(aggregated, total_unfiltered, on=keys, how='left')
    aggregated = pd.merge(aggregated, total_age, on=['Vek'], how='left')

    aggregated['category_percent'] = aggregated['number'] / total_category * 100
    aggregated['filtered_percent'] = aggregated['number'] / aggregated['total_unfiltered'] * 100
    aggregated['age_percent'] = aggregated['number'] / aggregated['total_age'] * 100

    # Percentages are computed before rows are dropped, so the lower bound
    # only hides small groups and does not change the remaining values.
    if filter_result is not None and filter_result > 0:
        aggregated = aggregated[aggregated['number'] >= filter_result]
    return aggregated

# Example: first-degree (Bc.) graduates by age, split by sex, shown once for
# each of the computed display values.
feature = 'Pohlavie'
data = compute_age_based(table_40, 'Vzdelanie == "vysokoškolské vzdelanie - 1. stupeň (Bc.)"', feature=feature)
for column in ('number', 'category_percent', 'filtered_percent', 'age_percent'):
    px.line(data, x='Vek', y=column, color=feature, hover_data=['number']).show()

## Interactive `dash` plot by age

Data is filtered and then optionally grouped by a feature.  
The display value can be chosen from
- Number
- Category percent - total is category
- Filtered percent - filtered / not filtered
- Age percent - total is all at the same age

The columns available for filtering and as the grouping feature are
- Vzdelanie
- Grouped edu
- Názov kraja
- Pohlavie
- Súčasná ekonomická aktivita

In [14]:
# Columns offered in the dashboard, both as filters and as the grouping feature
properties = [
    'Súčasná ekonomická aktivita',
    'Vzdelanie',
    # 'Názov oblasti',
    'Názov kraja',
    'Pohlavie',
    grouped_edu,
]
# Distinct values of each column; selections refer to these values by position
unique_data = [table_40[column].unique() for column in properties]
# Alternative default with every value preselected:
# selected_options = [list(range(len(values))) for values in unique_data]
# Default: only the first value of every column is preselected
selected_options = [[0] for _ in properties]

### Configuring
  
Because re-entering the filter selection by hand every time is tedious, the filtering options can be exported to and imported from a file. Only the filtering options are exported/imported. **Remember** to change the file name in the next cell.

`%%script true` at the top of a code cell means the cell is skipped, so that it is not run by accident. To run the cell, comment out this line. **Do not forget** to uncomment it afterwards.

The structure of `properties` has changed, so old config files no longer work; newly exported ones will.

In [15]:
# File that the export/import cells below write the filter selection to / read it from
selected_options_source = 'vyssie_vzdelanie.txt'

In [16]:
%%script true
# Export
print(*[' '.join(map(str, s)) for s in selected_options], sep='\n', file=open(selected_options_source, 'w'))

In [17]:
%%script true
# Import
with open(selected_options_source, 'r') as file:
    for i, line in enumerate(file.readlines()):
        selected_options[i] = list(map(int, line.split()))
display(selected_options)

In [18]:
%%script true
# quickly setting up options
selected_options[0] = list(range(0))
selected_options[3] = []
selected_options[1] = list(range(8))
selected_options[2] = [0, 1]

In [26]:
# %%script true # Skip
app = Dash(__name__)


def _filter_checklist(position):
    """Build the titled checklist for the filter column at `position` in `properties`."""
    return html.Div([
        html.H4(f"Select {properties[position]}"),
        dcc.Checklist(
            id=f'selected_{position}',
            # Option values are positions in unique_data[position], not the labels themselves
            options=[{'label': label, 'value': index}
                     for index, label in enumerate(unique_data[position])],
            value=selected_options[position]
        )
    ])


checklists = [_filter_checklist(position) for position in range(len(properties))]

# Shared inline styles
half_width = {'width': '50%', 'display': 'inline-block'}
panel_style = {'flex': 1, 'padding': 10}

# Left control panel: plot title, lower bound on the head count, marker toggle
title_and_bound_panel = html.Div([
    html.H4('Enter title'),
    dcc.Input(id='title', type='text'),
    html.H4('Enter lower bound'),
    dcc.Input(id='lower-bound', type='number'),
    html.Br(),
    dcc.Checklist(
        id='checkbox',
        options=[
            { 'label': 'Add markers', 'value': True }
        ],
        value=[]
    )
], style=panel_style)

# Middle control panel: which column (if any) splits the plot into coloured lines
feature_options = [{ 'label': 'No', 'value': None }]
feature_options += [{ 'label': name, 'value': position } for position, name in enumerate(properties)]
feature_panel = html.Div([
    html.H4('Select feature'),
    dcc.RadioItems(
        id='feature',
        options=feature_options,
        value=None,
    ),
], style=panel_style)

# Right control panel: which computed column goes on the y axis (by position)
display_labels = ['Number', 'Category percent', 'Filtered percent', 'Age percent']
display_value_panel = html.Div([
    html.H4('Select display value'),
    dcc.RadioItems(
        [{ 'value': position, 'label': text } for position, text in enumerate(display_labels)],
        0,
        id='display-value',
    ),
], style=panel_style)

app.layout = html.Div(
    style=style,
    children=[
        # Filter checklists in two columns: even positions left, odd positions right
        html.Div(checklists[::2], style=half_width),
        html.Div(checklists[1::2], style=half_width),
        html.Br(),
        html.Br(),
        dcc.Graph(id='line-plot'),
        html.Div(
            [title_and_bound_panel, feature_panel, display_value_panel],
            style={'display': 'flex', 'flexDirection': 'row'},
        ),
    ]
)

@app.callback(
    Output('line-plot', 'figure'),
    [
        Input('feature', 'value'),
        Input('lower-bound', 'value'),
        Input('title', 'value'),
        Input('display-value', 'value'),
        Input('checkbox', 'value'),
    ]
     + [Input(f'selected_{i}', 'value') for i in range(len(properties))]
)
def update_figure(feature, lower_bound, title, display_value, checkbox, *arg):
    """
    Rebuild the line plot whenever any dashboard control changes.

    Parameters:
        feature (int or None): Position in `properties` of the column to split
            the lines by, or None for a single line.
        lower_bound (number or None): Minimum head count for a point to be shown.
        title (str or None): Plot title.
        display_value (int): Position of the y-axis column in
            ['number', 'category_percent', 'filtered_percent', 'age_percent'].
        checkbox (list): Non-empty when markers should be drawn on the lines.
        *arg (list of int): One list per entry of `properties`, holding the
            positions (in `unique_data`) of the values ticked in its checklist.

    Returns:
        plotly figure with age on the x axis.
    """
    # Keep the module-level selection in sync so it can be exported to a file.
    selected_options[:] = list(arg)

    # One OR-group per column that has something ticked; groups are AND-ed.
    # Values are quoted with repr() rather than hard-coded single quotes so a
    # label containing a quote or a backslash still gives a valid expression.
    clauses = []
    for i, selected in enumerate(arg):
        if len(selected) > 0:
            alternatives = [
                f"`{properties[i]}` == {repr(str(unique_data[i][s]))}"
                for s in selected
            ]
            clauses.append('(' + ' or '.join(alternatives) + ')')
    query = ' and '.join(clauses)

    compute_values = ['number', 'category_percent',
                      'filtered_percent', 'age_percent']
    feature = properties[feature] if feature is not None else None
    data = compute_age_based(table_40, query, filter_result=lower_bound, feature=feature)
    y = compute_values[display_value]
    figure = px.line(data, x='Vek', y=y, color=feature, hover_data=['number'])

    # Position 0 is the raw head count; every other display value is a percentage.
    yaxis_title = 'Number of people' if display_value == 0 else 'Percent of people'
    figure.update_layout(xaxis_title='Age', yaxis_title=yaxis_title, title=title)
    figure.update_traces(mode='lines' + ('+markers' if len(checkbox) > 0 else ''))

    return figure

# Start the Dash server on port 8053; `mode='inline'` asks for the app to be shown inside the notebook.
# NOTE(review): `run_server(mode=...)` looks like the older JupyterDash-style call; newer Dash
# releases document `app.run(jupyter_mode='inline', ...)` instead — confirm against the installed version.
app.run_server(mode='inline', port=8053)