# Education in Slovakia

## Import packages

In [None]:
import numpy as np

import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import plotly.graph_objects as go

from IPython.display import display, Markdown

import preprocessing
from isco_occupation import OCCUPATION_ISCO_MAP
from categorize_education import EDUCATION_CATEGORY_MAP
from economic_sectors import ECONOMIC_SECTORS_MAP

import os
import warnings
# TODO: uncomment before publishing
#warnings.filterwarnings('ignore')

## Import data

In [None]:
# TODO: change links before publishing
# data_root = 'https://raw.githubusercontent.com/AndruKP/mathprofi-education/dev/data'
data_root = './data'

# Every census CSV lives directly under the data root.
url_010_SR = f'{data_root}/RV_O_010_R_SR_SK.CSV'
url_040_SR = f'{data_root}/RV_O_040_R_SR_SK.CSV'
url_040_OK = f'{data_root}/RV_O_040_L_OK_SK.CSV'
url_047_SR = f'{data_root}/RV_O_047_R_SR_SK.CSV'
url_047_OK = f'{data_root}/RV_O_047_L_OK_SK.CSV'
url_067_SR = f'{data_root}/RV_O_067_R_SR_SK.CSV'
url_067_KR = f'{data_root}/RV_O_067_R_KR_SK.CSV'

# District boundaries are fetched from a remote JSON file.
districts_url = 'https://bbrejova.github.io/viz/data/districts.json'

In [None]:
# Read the seven semicolon-separated census tables, in the same order as their URLs.
(
    table_010_SR,
    table_040_SR,
    table_040_OK,
    table_047_SR,
    table_047_OK,
    table_067_SR,
    table_067_KR,
) = (
    pd.read_csv(csv_url, sep=';')
    for csv_url in (
        url_010_SR,
        url_040_SR,
        url_040_OK,
        url_047_SR,
        url_047_OK,
        url_067_SR,
        url_067_KR,
    )
)

districts_table = gpd.read_file(districts_url)

# Preview the first rows of every loaded table, census tables first.
for loaded_table in (
    table_010_SR,
    table_040_SR,
    table_040_OK,
    table_047_SR,
    table_047_OK,
    table_067_SR,
    table_067_KR,
    districts_table,
):
    display(loaded_table.head())

## Preprocess data

In [None]:
# TODO: aggregate all preprocessing functions into one

# Run the project's preprocessing helpers (translate_sex, then rename_columns) on every table.
table_010_SR = preprocessing.rename_columns(preprocessing.translate_sex(table_010_SR))
table_040_OK = preprocessing.rename_columns(preprocessing.translate_sex(table_040_OK))
table_040_SR = preprocessing.rename_columns(preprocessing.translate_sex(table_040_SR))
table_047_OK = preprocessing.rename_columns(preprocessing.translate_sex(table_047_OK))
table_047_SR = preprocessing.rename_columns(preprocessing.translate_sex(table_047_SR))
table_067_SR = preprocessing.rename_columns(preprocessing.translate_sex(table_067_SR))
table_067_KR = preprocessing.rename_columns(preprocessing.translate_sex(table_067_KR))

# Each table appears exactly once, so the derived columns are computed once per table.
tables = [table_010_SR, table_040_OK, table_040_SR, table_047_OK, table_047_SR, table_067_SR, table_067_KR]
for table in tables:
    if 'age' in table.columns:
        # The open-ended top bracket is collapsed to 90 so the column becomes numeric.
        table["age"] = pd.to_numeric(table["age"].replace({"90 a viac rokov": "90"}))
    if 'education' in table.columns:
        table['education_category'] = table['education'].map(EDUCATION_CATEGORY_MAP)
    if 'ISCO_occupation' in table.columns:
        table['ISCO_group'] = table['ISCO_occupation'].map(OCCUPATION_ISCO_MAP)

# Preview every preprocessed table under its variable name.
for table_name, preview_table in [
    ('table_010_SR', table_010_SR),
    ('table_040_SR', table_040_SR),
    ('table_040_OK', table_040_OK),
    ('table_047_SR', table_047_SR),
    ('table_047_OK', table_047_OK),
    ('table_067_SR', table_067_SR),
    ('table_067_KR', table_067_KR),
]:
    display(f'{table_name}:')
    display(preview_table.head())

## Basic demographic analysis

### Population pyramid

In [None]:
# Total head count for every (sex, age) pair.
grouped_by_sex_age = table_040_SR.groupby(['sex', 'age'])['count'].sum()

# Split into one age-indexed series per sex.
women_counts, men_counts = (grouped_by_sex_age.loc[sex_label] for sex_label in ('female', 'male'))

# Per age, the part of the population that is matched by the other sex.
min_counts = np.minimum(men_counts, women_counts)

In [None]:
# TODO: legend and beauty
# Population pyramid: men extend to the left (negative weights), women to the right.
figure, axes = plt.subplots(figsize=(10, 10))
colors_ = sns.color_palette("Paired")

# Surplus
# Drawn first with the full counts; the bars drawn afterwards cover the shared part,
# so only the excess of one sex over the other stays visible in these colours.
women_surplus_graph = sns.histplot(y=women_counts.index, weights=women_counts, binwidth=1, color=colors_[5], ax=axes,
                                   label='women_s')
men_surplus_graph = sns.histplot(y=men_counts.index, weights=-1 * men_counts, binwidth=1, color=colors_[1], ax=axes,
                                 label='men_s')

# Population
# The per-age minimum of both sexes, mirrored on both sides of the axis.
women_pop_graph = sns.histplot(y=min_counts.index, weights=min_counts, binwidth=1, color=colors_[4], ax=axes,
                               label='women')
men_pop_graph = sns.histplot(y=min_counts.index, weights=-1 * min_counts, binwidth=1, color=colors_[0], ax=axes,
                             label='men')

plt.title('Population pyramid')
plt.ylabel('Age')
# Tick labels show absolute values because the men's side uses negative weights.
plt.xticks([-40000, -20000, 0, 20000, 40000], labels=['40000', '20000', '0', '20000', '40000'])

# The legend is built from explicit patches rather than from the histplot labels.
men_pop_patch = mpatches.Patch(color=colors_[0], label='Men population')
men_surplus_patch = mpatches.Patch(color=colors_[1], label='Men surplus')
women_pop_patch = mpatches.Patch(color=colors_[4], label='Women population')
women_surplus_patch = mpatches.Patch(color=colors_[5], label='Women surplus')

axes.legend(handles=[women_surplus_patch, women_pop_patch, men_surplus_patch, men_pop_patch])
pass

## Education analysis

### Education specialization

In [None]:
# Work on a copy so the columns added below do not modify table_067_KR itself.
table_067_KR_copy = table_067_KR.copy()

In [None]:
education_levels = table_067_KR_copy['education'].unique()

# Sort the raw education labels into coarse levels by the keywords they contain.
higher_edu = [
    level for level in education_levels
    if any(keyword in level for keyword in ('vysokoškolské', 'vyššie'))
]
secondary_edu = [level for level in education_levels if 'stredné' in level]
primary_edu = [level for level in education_levels if 'základné' in level]
without_edu = ['bez školského vzdelania – osoby vo veku 15 rokov a viac']

# Whatever matched none of the lists above is treated as undefined.
undefined_edu = [
    level for level in education_levels
    if not any(level in bucket for bucket in (higher_edu, secondary_edu, primary_edu, without_edu))
]

In [None]:
# Coarse education level -> raw labels that belong to it.
education_dict = {
    'higher': higher_edu,
    'secondary': secondary_edu,
    'primary': primary_edu,
    'without education': without_edu,
    'undefined': undefined_edu,
}

# Inverse lookup: raw label -> coarse education level.
main_education_dict = {
    raw_label: level_name
    for level_name, raw_labels in education_dict.items()
    for raw_label in raw_labels
}

In [None]:
table_067_KR_copy['education_level'] = table_067_KR_copy['education'].map(main_education_dict)

# Head count per coarse education level, leaving out the undefined bucket.
defined_level_rows = table_067_KR_copy.query('`education_level` != "undefined"')
education_series = defined_level_rows.groupby('education_level')['count'].sum()

# Same numbers as a two-column frame for plotting.
counts_df = education_series.reset_index(name='count')

In [None]:
# TODO: beauty
# Bar chart of head counts per education level, bars in the fixed `order` given below.
figure, axes = plt.subplots(figsize=(10, 10))
sns.barplot(data=counts_df, x='education_level', y='count', color=sns.color_palette(as_cmap=True)[0], order=['primary', 'secondary', 'higher', 'without education'], ax=axes)

# Log scale on the y-axis (also stated in the title).
axes.semilogy()
axes.set_xlabel('Achieved education')
axes.set_ylabel('Count')

plt.title('The number of people by achieved education (y-axis is logarithmic)')
plt.show()

### Geographic specialization of education

In [None]:
# TODO: Kolya's graphs
from os.path import join

data_path = "data"

# NOTE: this rebinds districts_url (previously the remote URL) to a local file path.
districts_url = join(data_path, "districts.json")
districts = gpd.read_file(districts_url)
districts_geojson_url = join(data_path, "districts.geojson")
districts_geojson = gpd.read_file(districts_geojson_url, crs="EPSG:4326")

# Index both frames by IDN3 so the column assignment below aligns rows on that key.
districts_geojson_indexed = districts_geojson.set_index("IDN3")
districts_indexed = districts.set_index("IDN3")
# Take geometry and the two area columns from the GeoJSON file, under new column names.
districts_indexed[["geometry", "Area", "AreaHA"]] = districts_geojson_indexed[
    ["geometry", "Shape_Area", "VYMERA_ha"]
]
# geo_frame is the frame plot_groups merges its data with when drawing maps.
geo_frame = districts_indexed

In [None]:
# dash_geo functions
def compute_groups(data, groupby, chosen_query="", filter_query=""):
    """Sum ``count`` per group for a selected subset and relate it to the group totals.

    Args:
        data: DataFrame with a ``count`` column and the ``groupby`` column(s).
        groupby: Column name (or list of names) to group by.
        chosen_query: ``DataFrame.query`` expression picking the rows of interest;
            an empty string selects everything.
        filter_query: ``DataFrame.query`` expression restricting the population
            that totals are computed from; an empty string keeps all rows.

    Returns:
        DataFrame with the group key(s) and the columns ``number`` (selected count),
        ``number_percent`` (share of all selected rows), ``total`` (count in the
        filtered population) and ``percent`` (``number`` as a share of ``total``).
        Groups with no selected rows are absent from the result.
    """
    population = data.query(filter_query) if filter_query != "" else data
    chosen = population.query(chosen_query) if chosen_query != "" else population

    chosen_counts = chosen.groupby(groupby, observed=True)["count"].sum()
    population_counts = population.groupby(groupby, observed=True)["count"].sum()

    result = chosen_counts.rename("number").to_frame()
    result["number_percent"] = result["number"] / chosen["count"].sum() * 100
    # Assignment aligns on the group index, so only groups present in `result` are kept.
    result["total"] = population_counts
    result["percent"] = result["number"] / result["total"] * 100
    return result.reset_index()


def plot_groups(data, groupby, value, title=""):
    """Plot the output of ``compute_groups`` as a map, a bar chart or a treemap.

    Args:
        data: DataFrame as returned by ``compute_groups``.
        groupby: Column the data was grouped by. ``NUTS2_CODE``, ``NUTS3_CODE`` and
            ``LAU1_CODE`` produce a choropleth map; any other column produces a
            chart chosen by ``value``.
        value: Column to display. For non-geographic groupings it must be
            ``"percent"`` (horizontal bar chart) or ``"number"`` (treemap).
        title: Figure title.

    Returns:
        The plotly figure.

    Raises:
        ValueError: If a non-geographic grouping is combined with a ``value``
            other than ``"percent"`` or ``"number"``.
    """
    if groupby in ["NUTS2_CODE", "NUTS3_CODE", "LAU1_CODE"]:
        # Attach the district geometries from the module-level geo_frame.
        merged = geo_frame.merge(data, on=groupby)
        figure = px.choropleth_mapbox(
            data_frame=merged,
            geojson=merged.geometry,
            locations=merged.index,
            color=value,
            mapbox_style="carto-positron",
            center={"lat": 48.6737532, "lon": 19.696058},
            zoom=6,
            opacity=0.5,
            hover_data=["LAU1", "number", "percent", "AreaHA"],
        )
    elif value == "percent":
        data = data.sort_values(by=value, ascending=False)
        figure = px.bar(
            data,
            y=groupby,
            color=groupby,
            x="percent",
            orientation="h",
            hover_data=["number", "percent"],
        )
    elif value == "number":
        data = data.sort_values(by=value, ascending=False)
        figure = px.treemap(
            data,
            path=[px.Constant("all"), groupby],
            values="number",
            hover_data=["number", "number_percent"],
        )
    else:
        # Previously no figure was created here and the call failed later with an
        # unrelated AttributeError on None.
        raise ValueError(
            f"value must be 'percent' or 'number' when grouping by {groupby!r}, got {value!r}"
        )
    figure.update_layout(title=title)
    return figure

#### Where do people live in Slovakia?

In [None]:
# Head count per district (LAU1_CODE) with no selection and no filter, drawn as a map.
data = compute_groups(table_040_OK, groupby="LAU1_CODE")
figure = plot_groups(data, groupby="LAU1_CODE", value="number")
figure.update_layout(
    coloraxis_colorbar=dict(title='Number of people'),
)
figure.show()

This map of raw population will help us interpret the maps that follow.
#### Where is the largest number of people with a degree?

In [None]:
# Select rows whose education_category is 'higher'; totals cover everyone in the district.
data = compute_groups(table_040_OK, groupby="LAU1_CODE", chosen_query="`education_category`.isin(['higher'])")
figure = plot_groups(data, groupby="LAU1_CODE", value="number", title="Number of people achieved higher education")
figure.show()

So, after all, the absolute numbers simply follow the big cities.  
#### Where is the largest *percent* of people with a degree?

In [None]:
# Reuses `data` from the previous cell; only the displayed column changes to `percent`.
figure = plot_groups(data, groupby="LAU1_CODE", value="percent", title="Percent of people achieved higher education")
figure.show()

Seen as a share of the local population, higher education is concentrated in Bratislava and Košice even more strongly than the absolute numbers suggested.
#### Where is the largest number of students?

In [None]:
# Select rows whose current_economic_activity is 'študent vysokej školy', per district.
data = compute_groups(table_040_OK, groupby='LAU1_CODE', chosen_query="`current_economic_activity`.isin(['študent vysokej školy'])")
figure = plot_groups(data, groupby='LAU1_CODE', value='number')
figure.show()

Same story here
#### Percent?

In [None]:
# Reuses `data` from the previous cell; shows the selection as a share of each district's total.
figure = plot_groups(data, groupby='LAU1_CODE', value='percent')
figure.show()

Interesting
#### Where is the largest percent of people continuing their education among people aged 22 and over?

In [None]:
# Population is first restricted to ages 22..90 (filter_query); within it, rows that both
# have education_category 'higher' and are currently university students are selected.
data = compute_groups(table_040_OK, groupby='LAU1_CODE', chosen_query="`education_category`.isin(['higher']) and `current_economic_activity`.isin(['študent vysokej školy'])", filter_query='22 <= `age` < 91')
figure = plot_groups(data, groupby='LAU1_CODE', value='percent')
figure.show()

#### Where degree means less?  
Where is your diploma less likely to get you the *type* of job you want?  
It depends on what you want. If you are aiming for the first three ISCO major groups, here is the picture:

In [None]:
# Selected: higher education AND an ISCO group from the list below (including 'Clerks').
# NOTE(review): filter_query is empty, so `percent` is relative to everyone in the district
# in table_047_OK, not to degree holders only -- confirm this matches the title.
data = compute_groups(table_047_OK, groupby='LAU1_CODE', chosen_query="`education_category`.isin(['higher']) and `ISCO_group`.isin(['Craft and related trades workers', 'Elementary occupations', 'Plant and machine operators and assemblers', 'Skilled agricultural and fishery workers', 'Armed forces', 'Service workers and shop and market sales workers', 'Clerks'])", filter_query='')
figure = plot_groups(data, groupby='LAU1_CODE', value='percent', title='Percent of people with higher education working on a simple job')
figure.show()

If working as a clerk suits you

In [None]:
# Same selection as the previous map, but 'Clerks' is no longer in the ISCO group list.
# NOTE(review): filter_query is empty, so `percent` is relative to everyone in the district
# in table_047_OK, not to degree holders only -- confirm this matches the title.
data = compute_groups(table_047_OK, groupby='LAU1_CODE', chosen_query="`education_category`.isin(['higher']) and `ISCO_group`.isin(['Craft and related trades workers', 'Elementary occupations', 'Plant and machine operators and assemblers', 'Skilled agricultural and fishery workers', 'Armed forces', 'Service workers and shop and market sales workers'])", filter_query='')
figure = plot_groups(data, groupby='LAU1_CODE', value='percent', title='Percent of people with higher education working on a simple job')
figure.show()

If a job as a service worker or a shop and market sales worker also works for you

In [None]:
# Same selection again, now also without 'Service workers and shop and market sales workers'.
# NOTE(review): filter_query is empty, so `percent` is relative to everyone in the district
# in table_047_OK, not to degree holders only -- confirm this matches the title.
data = compute_groups(table_047_OK, groupby='LAU1_CODE', chosen_query="`education_category`.isin(['higher']) and `ISCO_group`.isin(['Craft and related trades workers', 'Elementary occupations', 'Plant and machine operators and assemblers', 'Skilled agricultural and fishery workers', 'Armed forces'])", filter_query='')
figure = plot_groups(data, groupby='LAU1_CODE', value='percent', title='Percent of people with higher education working on a simple job')
figure.show()

At this point it is not so bad. Let's look at something different.  
#### What percent of people with higher education work in occupation groups in which 60% of workers do not have higher education?

In [None]:
# Here the population is restricted to degree holders via filter_query, so `percent` is the
# share of degree holders in each district who work in one of the listed ISCO groups.
data = compute_groups(table_047_OK, groupby='LAU1_CODE', chosen_query="`ISCO_group`.isin(['Armed forces', 'Clerks', 'Craft and related trades workers', 'Elementary occupations', 'Plant and machine operators and assemblers', 'Service workers and shop and market sales workers', 'Skilled agricultural and fishery workers', 'Technicians and associate professionals'])", filter_query="`education_category`.isin(['higher'])")
figure = plot_groups(data, groupby='LAU1_CODE', value='percent', title='Percent of people working on simple job among people with a degree')
figure.show()

### Gender specialization of education

#### Distribution of education by gender

In [None]:
# Head count per (sex, education category), largest groups first.
education_by_sex = (
    table_047_SR
    .groupby(['sex', 'education_category'])['count']
    .sum()
    .reset_index()
    .sort_values(by='count', ascending=False)
)

In [None]:
# TODO: beauty
# Horizontal grouped bars: one bar per sex for every education category.
figure, axes = plt.subplots(figsize=(10, 10))

# education_by_sex was grouped by 'education_category', so that is the column holding
# the category labels; it has no 'education' column.
sns.barplot(data=education_by_sex, y="education_category", x="count", hue="sex", ax=axes)

axes.set_xlabel('Count')
axes.set_ylabel('Education')
axes.set_title('Distribution of education')
pass

#### When did higher education begin to open up to women in Slovakia?

In [None]:
# dash_age functions
def compute_age_based(data, query, groupby=None, filter_result=None):
    """Aggregate ``count`` by age (optionally per group) for the rows matching ``query``.

    Args:
        data: DataFrame with ``age`` and ``count`` columns (plus ``groupby`` if given).
        query: ``DataFrame.query`` expression selecting the rows of interest; an
            empty string selects every row.
        groupby: Optional column to split every age by.
        filter_result: If not ``None`` and greater than zero, rows whose ``number``
            is below this threshold are dropped from the result.

    Returns:
        DataFrame with ``age`` (and ``groupby``), ``number``, the totals
        ``total_category`` (grouped case only), ``total_unfiltered`` and
        ``total_age``, and the shares ``category_percent``, ``filtered_percent``
        and ``age_percent`` derived from them.
    """
    selected = data.query(query) if query != "" else data
    grouped = groupby is not None

    # One output row per age, or per (age, group) pair.
    row_keys = ['age', groupby] if grouped else ['age']

    result = selected.groupby(row_keys)['count'].sum().reset_index(name='number')
    unfiltered_totals = data.groupby(row_keys)['count'].sum().rename('total_unfiltered')
    # In the ungrouped case this equals `number`, so age_percent is always 100 there.
    age_totals = selected.groupby(['age'])['count'].sum().rename('total_age')

    if grouped:
        group_totals = selected.groupby([groupby])['count'].sum().rename('total_category')
        result = pd.merge(result, group_totals, on=[groupby], how='left')
    result = pd.merge(result, unfiltered_totals, on=row_keys, how='left')
    result = pd.merge(result, age_totals, on=['age'], how='left')

    # Denominator of category_percent: per-group total, or the overall selected total.
    category_total = result['total_category'] if grouped else selected['count'].sum()
    result['category_percent'] = result['number'] / category_total * 100
    result['filtered_percent'] = result['number'] / result['total_unfiltered'] * 100
    result['age_percent'] = result['number'] / result['total_age'] * 100

    apply_threshold = filter_result is not None and filter_result > 0
    if apply_threshold:
        result = result[result['number'] >= filter_result]
    return result


def plot_age_based(
    data,
    groupby=None,
    title="",
    display_value="number",
    markers=False,
):
    """Draw a line chart of ``display_value`` over age.

    Args:
        data: DataFrame as returned by ``compute_age_based``.
        groupby: Optional column used to colour one line per group.
        title: Figure title.
        display_value: Column plotted on the y-axis; ``"number"`` is the absolute
            count, every other column is labelled as a percentage.
        markers: Whether to draw a marker at every data point.

    Returns:
        The plotly figure.
    """
    figure = px.line(
        data, x="age", y=display_value, color=groupby, hover_data=["number"]
    )
    figure.update_layout(xaxis_title="Age", title=title)
    # display_value is a column name, so compare it with "number" (the old comparison
    # with 0 never matched and every chart was labelled as a percentage).
    if display_value == "number":
        figure.update_layout(yaxis_title="Number of people")
    else:
        figure.update_layout(yaxis_title="Percent of people")
    figure.update_traces(
        mode="lines" + ("+markers" if markers else ""),
        connectgaps=True,
    )
    return figure

In [None]:
# TODO: fix colors 
# Per age and sex: share of people with education_category 'higher' among all people of that
# age and sex (filtered_percent); filter_result=1 drops rows with fewer than one person.
data = compute_age_based(table_040_SR, query="`education_category`.isin(['higher']) and 0 <= `age` < 91", groupby='sex', filter_result=1)
figure = plot_age_based(data, groupby='sex', title='Percent of people with a degree', display_value='filtered_percent', markers=False)
figure.show()

Women started to obtain higher education at the same rate as men only about (60 - 18) = 42 years ago, i.e. around the year 1970

In [None]:
# Per age (18..89) and sex: share of people whose education_category is 'primary' or 'without'.
# NOTE(review): confirm that 'without' is the exact label EDUCATION_CATEGORY_MAP produces;
# the education_level mapping elsewhere in this notebook uses 'without education'.
data = compute_age_based(table_040_OK, query="18 <= `age` < 90 and `education_category`.isin(['primary', 'without'])", groupby='sex', filter_result=1)
figure = plot_age_based(data, groupby='sex', title='Percent of people with primary education or without', display_value='filtered_percent', markers=False)
figure.show()

### Adult illiteracy

#### Geographic aspect of illiteracy

In [None]:
#TODO: maybe we should add primary (age > threshold = 18) as well? 

In [None]:
# District geometries keyed by LAU1_CODE, so they align with the grouped counts below.
districts_illiteracy = districts_table.set_index('LAU1_CODE')

# Head count of every district.
table_OK_population = table_040_OK.groupby('LAU1_CODE')['count'].sum()

# TODO: uneducated filter applies twice -- introduce func
# People without school education, leaving out current primary-school pupils.
uneducated_rows_OK = table_040_OK.query(
    'education == "bez školského vzdelania – osoby vo veku 15 rokov a viac" & current_economic_activity != "žiak základnej školy"')
table_OK_uneducated = uneducated_rows_OK.groupby(['LAU1_CODE'])['count'].sum()

table_uneducated_percent = 100 * (table_OK_uneducated / table_OK_population)

# Attach the district geometry to the absolute and the relative numbers.
district_geometry = districts_illiteracy['geometry']
geotable_uneducated = gpd.GeoDataFrame(table_OK_uneducated, geometry=district_geometry)
geotable_uneducated_percent = gpd.GeoDataFrame(table_uneducated_percent, geometry=district_geometry)

for preview in (table_OK_population, geotable_uneducated, geotable_uneducated_percent):
    display(preview.head(2))

In [None]:
# TODO: beauty
# Map 1: absolute number of people without education per district.
plot = geotable_uneducated.plot(column='count', legend=True, legend_kwds={"orientation": "horizontal"},
                                cmap=sns.color_palette("flare", as_cmap=True))
plot.set_title("Number of people without education")
plot.set_axis_off()

# Map 2: the same people as a percentage of the district population
# (the value column is also read as 'count' here).
plot = geotable_uneducated_percent.plot(column='count', legend=True, legend_kwds={"orientation": "horizontal"},
                                        cmap=sns.color_palette("flare", as_cmap=True))
plot.set_title("Percent of people without education")
plot.set_axis_off()

In [None]:
#TODO: add description and map about Romani people

#### Employment of uneducated people

In [None]:
# People without school education, counted per occupation, most common occupation first.
uneducated_rows_SR = table_067_SR.query("education == 'bez školského vzdelania – osoby vo veku 15 rokov a viac'")
table_uneducated_by_isco = (
    uneducated_rows_SR
    .groupby('ISCO_occupation')['count']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

# Split the total into unknown ("nezistené"), inapplicable ("neaplikovateľné") and the rest.
unknown_rows = table_uneducated_by_isco.query('ISCO_occupation == "nezistené"')
inapplicable_rows = table_uneducated_by_isco.query('ISCO_occupation == "neaplikovateľné"')
applicable_occupation = table_uneducated_by_isco.query(
    'ISCO_occupation != "nezistené" & ISCO_occupation != "neaplikovateľné"')

unknown_occupation_count = unknown_rows['count'].sum()
inapplicable_occupation_count = inapplicable_rows['count'].sum()
applicable_occupation_count = applicable_occupation['count'].sum()

display(applicable_occupation.head())

In [None]:
# TODO: beauty

# Share of uneducated people whose occupation is unknown, inapplicable, or a concrete occupation.
plt.pie([unknown_occupation_count, inapplicable_occupation_count, applicable_occupation_count],
        labels=['unknown', 'inapplicable', 'applicable'], autopct='%1.1f%%')

plt.show()

In [None]:
# Ten most common concrete occupations (the table is already sorted by count, descending).
plot = sns.barplot(data=applicable_occupation.head(10), x='count', y='ISCO_occupation', color=sns.color_palette(as_cmap=True)[0])
# Print the count next to every bar.
plot.bar_label(plot.containers[0], fontsize=8, padding=3)
# NOTE(review): the x label is in Slovak ("number of people") while other plots use English.
plot.set_xlabel('počet ľudí')
plot.set_ylabel('')
# Widen the margins so the occupation names on the y-axis fit.
plt.subplots_adjust(left=0.5, right=1.6)

#### Age distribution of illiteracy

In [None]:
# Rows of people without school education, leaving out current primary-school pupils
# (the same query as used for the district table).
table_SR_uneducated = (table_040_SR
.query(
    'education == "bez školského vzdelania – osoby vo veku 15 rokov a viac" & current_economic_activity != "žiak základnej školy"'))

# Leftover, disabled code for 5-year age groups; it refers to column names not used elsewhere here.
#table = table[table['Vek'] != '90 a viac rokov']
#table['5-rocne skupiny'] = table['Vek'].apply(int).apply(f)

#table = pd.DataFrame(table.groupby('5-rocne skupiny')['abs.'].sum()).reset_index()
# Last expression of the cell: displays the filtered table.
table_SR_uneducated

In [None]:
# TODO: beauty
# Histogram over age with one-year bins; each row contributes its `count` as weight.
plot = sns.histplot(data=table_SR_uneducated, x="age", weights="count", binwidth=1)

plot.set_xlabel('Age')
plot.set_ylabel('Number of people')
plot.set_title('Number of people without education')

plt.xticks(rotation=45)
plt.show()

## Employment analysis

### Employment specialization

In [None]:
# Map every ISCO occupation to its group via OCCUPATION_ISCO_MAP; unmapped values become NaN.
table_067_KR_copy['occupation_type'] = table_067_KR_copy['ISCO_occupation'].map(OCCUPATION_ISCO_MAP)

In [None]:
# Head count per occupation type.
# NOTE(review): dropna() removes rows with a missing value in ANY column, not only occupation_type.
occupation_bar = table_067_KR_copy.dropna().groupby('occupation_type')['count'].sum()
# Last expression of the cell: the horizontal bar chart is what gets displayed.
px.bar(y=occupation_bar.index, x=occupation_bar.values, labels={'x': 'count', 'y': 'occupation'},
       title='The number of people by type of occupation', orientation='h')

In [None]:
# Head count for every (education level, occupation type) pair with a defined level.
rows_with_defined_level = table_067_KR_copy.query('`education_level` != "undefined"')
help_for_par1 = rows_with_defined_level.groupby(['education_level', 'occupation_type'])['count'].sum()

# One parallel-categories dimension per index level.
education_dimension = {
    'label': 'education',
    'values': [pair[0] for pair in help_for_par1.index.values],
}
occupation_dimension = {
    'label': 'occupation',
    'values': [pair[1] for pair in help_for_par1.index.values],
}

fig = go.Figure(go.Parcats(
    dimensions=[education_dimension, occupation_dimension],
    counts=help_for_par1.values,
))

fig.update_layout(height=600, width=800)

fig.show()

In [None]:
# help_for_par2 = table_067_KR_copy.groupby(['region_name', 'education_level'])['count'].sum()

# fig = go.Figure(go.Parcats(
#     dimensions=[{'label': 'region',
#          'values': [i[0] for i in help_for_par2.index.values]},
#         {'label': 'education',
#          'values': [i[1] for i in help_for_par2.index.values]}],

#     counts=help_for_par2.values
# ))

# fig.update_layout(height=600, width=800)

# fig.show()

### Sectoral structure of the economy

In [None]:
# Map every NACE section to its economic sector.
table_067_KR_copy['economic_sector'] = table_067_KR_copy['NACE_section'].map(ECONOMIC_SECTORS_MAP)

# Keep only rows with a defined education level and a defined economic sector.
table_067_KR_copy = (
    table_067_KR_copy
    .query('`education_level` != "undefined"')
    .query('`economic_sector` != "undefined"')
)

In [None]:
# Head count per (sector, education level) and per sector.
sector_sum_edu = table_067_KR_copy.groupby(['economic_sector', 'education_level'])['count'].sum()
sector_sum = table_067_KR_copy.groupby('economic_sector')['count'].sum()


def map_func(index1, index2):
    """Return the head count stored in ``sector_sum_edu`` for (sector, education level)."""
    return sector_sum_edu.loc[(index1, index2)]

In [None]:
# Total head count of the row's sector.
table_067_KR_copy['count_for_sector'] = table_067_KR_copy['economic_sector'].map(sector_sum)

# Head count of the row's (sector, education level) pair, looked up row by row.
table_067_KR_copy['count_edu_sector'] = [
    map_func(sector, level)
    for sector, level in zip(table_067_KR_copy['economic_sector'], table_067_KR_copy['education_level'])
]

# Share of the education level within its sector, in percent.
table_067_KR_copy['percentage'] = (
    table_067_KR_copy['count_edu_sector'] * 100 / table_067_KR_copy['count_for_sector']
)

In [None]:
# TODO: beauty
# Stacked horizontal bars: one bar per economic sector, one segment per education level
# ('without education' is left out of the segments).
tmp_sectors = table_067_KR_copy['economic_sector'].unique()
tmp_education = table_067_KR_copy[table_067_KR_copy['education_level']!='without education']['education_level'].unique()

# Running left edge of every sector's bar, keyed by sector name so that the segments
# are stacked per sector regardless of the row order inside each education level.
starts = pd.Series(0.0, index=tmp_sectors)

figure, axes = plt.subplots(figsize=(10, 6))

for group in tmp_education:
    # One percentage per sector for this education level. De-duplicating on the sector
    # (not on the count value) keeps sectors that happen to share the same count, and
    # reindexing puts every level in the same sector order (0 where a level is absent).
    tmp_data = (table_067_KR_copy.query("`education_level` == @group")
                .drop_duplicates('economic_sector')
                .set_index('economic_sector')['percentage']
                .reindex(tmp_sectors, fill_value=0))
    rectangles = axes.barh(y=tmp_sectors, width=tmp_data.to_numpy(), left=starts.to_numpy(), label=group)
    axes.bar_label(rectangles, label_type='center', fmt="%.0f%%")
    starts = starts + tmp_data

axes.set_title('Economic sectors + Level of education')
axes.legend(bbox_to_anchor=(1, 1), loc=2)
axes.xaxis.set_visible(False)
axes.set_frame_on(False)
pass

In [None]:
# Keep the first row of every (education level, sector, sector total) combination for the pie charts.
table_for_pie = table_067_KR_copy.drop_duplicates(['education_level', 'economic_sector', 'count_for_sector'])

In [None]:
# Rows of people without education, one slice per economic sector.
without_education_rows = table_for_pie.query('`education_level` == "without education"')

figure, axes = plt.subplots(figsize=(8, 5))
axes.pie(
    x=without_education_rows['count_edu_sector'],
    labels=without_education_rows['economic_sector'],
    autopct="%.1f%%",
)
axes.set_title('People without education by economic sectors')
pass

In [None]:
# Rows of people with higher education, one slice per economic sector.
higher_education_rows = table_for_pie.query('`education_level` == "higher"')

figure, axes = plt.subplots(figsize=(8, 5))
axes.pie(
    x=higher_education_rows['count_edu_sector'],
    labels=higher_education_rows['economic_sector'],
    autopct="%.1f%%",
)
axes.set_title('People with higher education by economic sectors')
pass

In [None]:
# Rows of people with primary education, one slice per economic sector.
primary_education_rows = table_for_pie.query('`education_level` == "primary"')

figure, axes = plt.subplots(figsize=(8, 5))
axes.pie(
    x=primary_education_rows['count_edu_sector'],
    labels=primary_education_rows['economic_sector'],
    autopct="%.1f%%",
)
axes.set_title('People with primary education by economic sectors')
pass

### Geographic specialization of economy

In [None]:
# TODO: data

In [None]:
# TODO: graph (asi Kolia)

### Analysis of unemployment

In [None]:
def calculate_counts(data):
    """Summarise ``data`` per 5-year age group.

    Returns a DataFrame with the columns ``year_5_age_groups``, ``NaN count``
    (number of rows whose ``ISCO_occupation`` is missing) and ``Total count``
    (sum of ``count``).
    """
    def count_missing(values):
        return values.isna().sum()

    per_group = data.groupby('year_5_age_groups').agg(
        {'ISCO_occupation': count_missing, 'count': 'sum'}
    )
    per_group = per_group.rename(
        columns={'ISCO_occupation': 'NaN count', 'count': 'Total count'}
    )
    return per_group.reset_index()

In [None]:
# Rows whose occupation is recorded as "nezistené" are treated as unemployed in this section.
occupation_unknown = table_047_SR['ISCO_occupation'] == 'nezistené'
nan_age_groups = table_047_SR[occupation_unknown]
not_nan_age_groups = table_047_SR[~occupation_unknown]

unemployed_table = calculate_counts(nan_age_groups)
employed_table = calculate_counts(not_nan_age_groups)

# Side-by-side totals per 5-year age group; column suffixes tell the two sources apart.
merged_table = employed_table.merge(
    unemployed_table,
    on='year_5_age_groups',
    suffixes=('_employed', '_unemployed'),
)

In [None]:
# NOTE: set_style/set_context change seaborn's global settings, not just this figure.
sns.set_style("whitegrid")
sns.set_context("notebook")
plt.figure(figsize=(10, 6))

# Both series are drawn on the same axes, so the red bars are painted over the blue ones.
sns.barplot(x='year_5_age_groups', y='Total count_employed', data=merged_table, color='blue', label='Employed')
sns.barplot(x='year_5_age_groups', y='Total count_unemployed', data=merged_table, color='red', label='Unemployed')

plt.xticks(rotation='vertical')
plt.xlabel('5-year age groups')
plt.ylabel('Total count')
plt.title('Total count for each 5-year age group (Employed vs Unemployed)')
plt.legend()

# The figure is also written to a PNG file in the working directory.
plt.savefig('Total_count_Employed_vs_Unemployed.png')
plt.show()

In [None]:
# Two pastel colours, one per slice.
colors = sns.color_palette('pastel')[:2]
# Head count per sex among the rows with an unknown occupation.
nan_age_groups_gender = nan_age_groups.groupby('sex')['count'].sum().reset_index()

plt.figure(figsize=(6, 6))
plt.pie(x=nan_age_groups_gender['count'], labels=nan_age_groups_gender['sex'], autopct='%1.1f%%', colors=colors)
plt.title('Sex Distribution over Unemployment', fontweight='bold')
plt.show()

In [None]:
app = Dash(__name__)

# One dropdown entry per 5-year age group found among the unknown-occupation rows.
age_group_options = [
    {'label': group_name, 'value': group_name}
    for group_name in nan_age_groups['year_5_age_groups'].unique()
]

# Multi-select of age groups; the first group is preselected.
age_group_dropdown = dcc.Dropdown(
    id='age-group-dropdown',
    options=age_group_options,
    value=[age_group_options[0]['value']],
    multi=True,
)

# Single checkbox, ticked by default.
no_education_checklist = dcc.Checklist(
    id='include-no-education',
    options=[{'label': 'Include No education', 'value': 'no_education'}],
    value=['no_education'],
)

app.layout = html.Div([
    html.H1("Education vs. Unemployment by Age Group"),
    html.Label("Select Age Group(s)"),
    age_group_dropdown,
    no_education_checklist,
    dcc.Graph(id='education-vs-unemployment-plot', style={'height': '800px'}),
])


@app.callback(
    Output('education-vs-unemployment-plot', 'figure'),
    [Input('age-group-dropdown', 'value'),
     Input('include-no-education', 'value')]
)
def update_plot(selected_age_groups, include_no_education):
    """Rebuild the education bar chart for the chosen age groups.

    Args:
        selected_age_groups: Values picked in the age-group dropdown.
        include_no_education: Ticked values of the checklist; rows without school
            education are kept only if it contains ``'no_education'``.

    Returns:
        Grouped bar chart of ``count`` per education label, coloured by age group,
        with the education labels ordered by their total count (descending).
    """
    # Guard against a cleared control delivering None instead of an empty list.
    selected_age_groups = selected_age_groups or []
    include_no_education = include_no_education or []

    filtered_data = nan_age_groups[nan_age_groups['year_5_age_groups'].isin(selected_age_groups)]
    if 'no_education' not in include_no_education:
        # The education column holds the Slovak labels used throughout this notebook,
        # so match on their "bez školského vzdelania" ("without school education") prefix.
        has_no_education = filtered_data['education'].str.startswith('bez školského vzdelania', na=False)
        filtered_data = filtered_data[~has_no_education]

    aggregated_data = filtered_data.groupby(['education', 'year_5_age_groups'])['count'].sum().reset_index()

    sorted_education = aggregated_data.groupby('education')['count'].sum().sort_values(ascending=False).index

    # category_orders is given a plain list rather than a pandas Index.
    fig = px.bar(aggregated_data, x='education', y='count', color='year_5_age_groups', barmode='group',
                 title='Education vs. Unemployment by Age Group', labels={'count': 'Total Count'},
                 category_orders={'education': list(sorted_education)})
    return fig


# Start the Dash server for the app defined above, in debug mode on port 8051.
# NOTE(review): the interactive section further down starts its app on the same port.
if __name__ == '__main__':
    app.run_server(debug=True, port=8051)

## Interactive section

In [None]:
# NOTE: rebinds `app`; the earlier unemployment app object is no longer reachable by this name.
app = Dash(__name__)

app.layout = html.Div([
    # Dropdown selecting the education level; 'Primary education' is the default.
    html.Div(children=[
        html.Label('Education: '),
        dcc.Dropdown(['Without education', 'Primary education', 'Secondary education', 'Higher education'],
                     'Primary education', id='education-type')]),

    # Radio buttons toggling colouring by economic sector; default is 'no'.
    html.Div(children=[
        html.Label('Color by economic sector: '),
        dcc.RadioItems(['yes', 'no'], 'no', id='color-choice')
    ]),

    # Figure filled in by the update_figure callback.
    dcc.Graph(id='graph-content')
])


@app.callback(
    Output('graph-content', 'figure'),
    [
        Input('education-type', 'value'),
        Input('color-choice', 'value')
    ]
)
def update_figure(selected_education, color_c):
    """Build the occupation histogram for the selected education level.

    Args:
        selected_education: Label chosen in the education dropdown.
        color_c: ``'yes'`` to colour the bars by economic sector, anything else
            for a single colour.

    Returns:
        Horizontal histogram of ``count`` per occupation type.
    """
    level_by_label = {
        'Without education': 'without education',
        'Primary education': 'primary',
        'Secondary education': 'secondary',
        'Higher education': 'higher',
    }

    # The local name `education` is referenced by the query string below.
    education = level_by_label.get(selected_education)

    matching_rows = table_067_KR_copy.query('`education_level` == @education')
    education_subset = matching_rows.groupby(['occupation_type', 'economic_sector'])['count'].sum()

    # The index holds (occupation type, economic sector) pairs.
    occupations = [pair[0] for pair in education_subset.index.values]
    sector_colors = [pair[1] for pair in education_subset.index.values] if color_c == 'yes' else None

    figure = px.histogram(
        education_subset,
        x='count',
        y=occupations,
        width=900,
        height=600,
        color=sector_colors,
        orientation='h',
        labels={'y': 'occupation', 'sum of count': 'count', 'color': 'economic sector'},
    )

    figure.update_layout(title_text='Occupations by level of education')

    return figure


# Start the Dash server for the interactive app, in debug mode on port 8051.
# NOTE(review): the unemployment app earlier in the notebook uses the same port.
if __name__ == '__main__':
    app.run_server(debug=True, port=8051)

In [None]:
# TODO: powerful graph Nikolai i <3 u

In [None]:
# TODO: powerful graph Nikolai i <2 u

In [None]:
# TODO: powerful graph Nikolai i <1 u