In [3]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
warnings.filterwarnings('ignore')

In [4]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","655.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","700.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))
df = df.filter(pl.col("rok") >= 1800)

df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["008","100_a","245_a","245_p"], keep="first")
print(len(df))

716107
705250


In [5]:
aut = pl.read_parquet(os.path.join("data","aut_vyber.parquet"))
cesi = aut.explode("370_c").filter(pl.col("370_c").str.contains("Česk")).explode("100_7").select(pl.col("100_7")).to_series().to_list()
len(cesi)

363993

In [6]:
df_700 = pl.read_parquet(os.path.join("data","podil_lidi_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))

In [7]:
alt.Chart(alt_friendly(df_700)).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_ill_10"))

In [8]:
alt.Chart(alt_friendly(df_700)).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_trl_10"))

## Autorstvo s pauzou a končící autorstvo

In [10]:
df_100 = pl.read_parquet(os.path.join("data","podil_autorstva_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))

In [11]:
alt.Chart(alt_friendly(df_100), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_15"))

In [12]:
df_koncici = pl.read_parquet(os.path.join('data','podily_koncicich_lidi_cs.parquet'))

In [13]:
df_koncici

rok,skoncilo,celkem,podil,podil_ill,podil_trl,podil_aut
i64,u32,u32,f64,f64,f64,f64
1884,8,82,0.097561,0.076923,0.04878,0.375
1897,11,124,0.08871,0.0625,0.055556,0.125
1899,11,124,0.08871,0.071429,0.086957,0.071429
1900,10,148,0.067568,0.066667,0.061728,0.125
1901,9,131,0.068702,0.066667,0.046875,0.05
1902,10,159,0.062893,0.05,0.089552,0.054054
1904,11,141,0.078014,0.076923,0.112903,0.064516
1908,10,161,0.062112,0.055556,0.044776,0.088235
1914,15,147,0.102041,0.05,0.157895,0.058824
1918,11,145,0.075862,0.041667,0.051724,0.085714


In [14]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_aut:Q"))

In [15]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_ill:Q"))

In [16]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_trl:Q"))

In [17]:
df_koncici_autorstvo = pl.read_parquet(os.path.join('data','podily_konciciho_autorstva_cs.parquet'))
df_koncici_autorstvo.columns

['rok', 'skoncilo', 'celkem', 'podil']

In [18]:
smytec = alt.Chart(
    alt_friendly(df_koncici_autorstvo.filter(pl.col("rok").is_between(1900,2000))), width=300, title=alt.Title(["Jak velká část českých autorů vyšla naposledy…"])
).mark_line().encode(
    alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
    alt.Y("podil:Q", title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6', labelExpr="datum.label * 100 + ' %'"))
)

pauza15 = alt.Chart(
    alt_friendly(df_100), width=300, title=alt.Title('…a jak velká část se odmlčela na 15 a více let')
).mark_line().encode(
    alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
    alt.Y("podil_cechu_15", title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6', labelExpr="datum.label * 100 + ' %'"), 
          scale=alt.Scale(domain=[0,0.2]))
)

alt.vconcat(smytec, pauza15).configure_view(stroke='transparent').resolve_scale(color='independent',x="shared",y='shared')

## Kdo zmizel

In [20]:
pred_unorem = set(df.filter(pl.col("rok") <= 1948).select(pl.col("100_7")).to_series().to_list())
po_revoluci = set(df.filter(pl.col("rok") >= 1988).select(pl.col("100_7")).to_series().to_list())
v_sedesatkach = set(df.filter(pl.col("rok").is_between(1960,1969)).select(pl.col("100_7")).to_series().to_list())
behem_socialismu = set(df.filter(pl.col("rok").is_between(1949,1987)).select(pl.col("100_7")).to_series().to_list())
behem_padesatek = set(df.filter(pl.col("rok").is_between(1949,1959)).select(pl.col("100_7")).to_series().to_list())
pred_normalizaci = set(df.filter(pl.col("rok").is_between(1965,1970)).select(pl.col("100_7")).to_series().to_list())
po_normalizaci = set(df.filter(pl.col("rok").is_between(1990,1995)).select(pl.col("100_7")).to_series().to_list())
behem_normalizace = set(df.filter(pl.col("rok").is_between(1972,1988)).select(pl.col("100_7")).to_series().to_list())

znormalizovani = pred_normalizaci.intersection(po_normalizaci).difference(behem_normalizace)
zunorovani = pred_unorem.intersection(po_revoluci).difference(behem_socialismu)

In [21]:
len(znormalizovani)

631

In [76]:
len(df.filter(pl.col("100_7").is_in(set(cesi).intersection(znormalizovani))).unique(subset=['100_7']))

499

In [22]:
df = df.with_columns(pl.col("100_a").map_elements(hezke_jmeno).alias("jmeno"))

In [23]:
kolik = 11
znormalizovani_top_domaci = df.filter(pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("100_7")).to_series().to_list()
znormalizovani_top_domaci_razeni = df.filter(pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("jmeno")).to_series().to_list()
znormalizovani_top_zahranicni = df.filter(~pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("100_7")).to_series().to_list()
znormalizovani_top_zahranicni_razeni = df.filter(~pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("jmeno")).to_series().to_list()

In [24]:
norm_dom = alt.Chart(alt_friendly(df.filter(pl.col("rok").is_between(1900,2000)).filter(pl.col("100_7").is_in(znormalizovani_top_domaci))), title=alt.TitleParams(["Normalizační průrva domácí literaturou…"], subtitle=["Komu z českých autorů autorek toho vyšlo nejvíc v druhé","půlce 60. a první půlce 90. let a zároveň nic mezi lety 1973 a 1987."]), width=300).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
            y=alt.Y("jmeno:N", sort=znormalizovani_top_domaci_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=znormalizovani_top_domaci_razeni).legend(None)) \
        .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")
norm_zahr = alt.Chart(alt_friendly(df.filter(pl.col("rok").is_between(1900,2000)).filter(pl.col("100_7").is_in(znormalizovani_top_zahranicni))), title=alt.TitleParams("…a překladovou literaturou"), width=300).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
            y=alt.Y("jmeno:N", sort=znormalizovani_top_zahranicni_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=znormalizovani_top_zahranicni_razeni).legend(None)) \
        .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")

alt.vconcat(norm_dom, norm_zahr).configure_view(stroke='transparent').resolve_scale(color='independent',x="shared")

## Stopky

In [26]:
nechceme = ['jn19990210621']
df = df.filter(~pl.col('100_7').is_in(nechceme))

In [27]:
def nevysli_po(rok, obdobi, nasobek):
    vysli_pred = df.filter(pl.col("rok").is_between(rok-obdobi,rok)).group_by('100_7').len().filter(pl.col('len') > 3).select(pl.col("100_7")).to_series().to_list()
    vysli_po = set(df.filter(pl.col("rok") > rok).select(pl.col("100_7")).to_series().to_list())
    ukonceni = [v for v in vysli_pred if v not in vysli_po]
    print(f"Filtru vyhovuje {len(ukonceni)} lidí.")
    return df.filter(pl.col("100_7").is_in(ukonceni)).filter(pl.col("rok").is_between(rok-(obdobi * nasobek),rok)).group_by(["jmeno","100_7"]).len().sort(by="len",descending=True).head(100)

In [28]:
osmnact = nevysli_po(1918, 4, 10).head(3)
osmnact

Filtru vyhovuje 23 lidí.


jmeno,100_7,len
str,str,u32
"""Albert Vojtěch Velflík""","""jk01141974""",26
"""Josef Flekáček""","""jk01031426""",26
"""J. E Marel""","""jx20040622007""",18


In [29]:
ctyricetosm = nevysli_po(1948, 4, 10).head(3)
ctyricetosm

Filtru vyhovuje 237 lidí.


jmeno,100_7,len
str,str,u32
"""Quido Maria Vyskočil""","""jk01151235""",149
"""Ladislav Riedl Německobrodský""","""jk01102414""",100
"""František Pulec""","""jk01101720""",75


In [30]:
nevysli_po(1968, 2, 10).head(3)

Filtru vyhovuje 24 lidí.


jmeno,100_7,len
str,str,u32
"""Miluše Nohejlová""","""jx20040805028""",23
"""Jitka Voříšková""","""jx20040721068""",22
"""Antonín Václavovič""","""jk01141051""",21


In [31]:
osmdesatdevet = nevysli_po(1989, 4, 10).head(3)
osmdesatdevet

Filtru vyhovuje 260 lidí.


jmeno,100_7,len
str,str,u32
"""Jarmila Šteflíčková""","""jk01131190""",96
"""Antonín Zápotocký""","""jk01152141""",95
"""František Buriánek""","""jk01020403""",68


In [32]:
df = df.with_columns(pl.col("jmeno").replace({"Ladislav Riedl Německobrodský": "L. R. Německobrodský"}))

In [33]:
#fertig = pl.concat([osmnact, ctyricetosm, osmdesatdevet])

fertig_18_kdo = osmnact.select(pl.col('100_7')).to_series().to_list()
fertig_18_razeni = osmnact.select(pl.col('jmeno')).to_series().to_list()
fertig_18_do_grafu = df.filter(pl.col('100_7').is_in(fertig_18_kdo))

fertig_48_kdo = ctyricetosm.select(pl.col('100_7')).to_series().to_list()
fertig_48_razeni = ctyricetosm.select(pl.col('jmeno')).to_series().to_list()
fertig_48_do_grafu = df.filter(pl.col('100_7').is_in(fertig_48_kdo))

fertig_89_kdo = osmdesatdevet.select(pl.col('100_7')).to_series().to_list()
fertig_89_razeni = osmdesatdevet.select(pl.col('jmeno')).to_series().to_list()
fertig_89_do_grafu = df.filter(pl.col('100_7').is_in(fertig_89_kdo))

In [34]:
fertig_48_razeni

['Quido Maria Vyskočil', 'Ladislav Riedl Německobrodský', 'František Pulec']

In [35]:
fertig_18_graf = alt.Chart(alt_friendly(fertig_18_do_grafu.filter(pl.col('rok').is_between(1900,2000))), title=alt.TitleParams(["Čeští autoři, kteří přestali vycházet","po vyhlášení Československa…"], 
)).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6'), scale=alt.Scale(domain=[{'year': 1900},{'year':2000}])), 
            y=alt.Y("jmeno:N", sort=fertig_18_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=fertig_18_razeni).legend(None)).transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")

fertig_48_graf = alt.Chart(alt_friendly(fertig_48_do_grafu.filter(pl.col('rok').is_between(1900,2000))), title=alt.TitleParams(["…po únorovém převratu…"], 
)).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6'), scale=alt.Scale(domain=[{'year': 1900},{'year':2000}])), 
            y=alt.Y("jmeno:N", sort=fertig_48_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=fertig_48_razeni).legend(None)).transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")

fertig_89_graf = alt.Chart(alt_friendly(fertig_89_do_grafu.filter(pl.col('rok').is_between(1900,2000))), title=alt.TitleParams(["…a po revoluci"], 
)).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6'), scale=alt.Scale(domain=[{'year': 1900},{'year':2000}])), 
            y=alt.Y("jmeno:N", sort=fertig_89_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=fertig_89_razeni).legend(None)).transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")

alt.vconcat(fertig_18_graf, fertig_48_graf, fertig_89_graf).resolve_axis(x="shared")