In [213]:
import os
import datetime
import polars as pl
import pandas as pd
import altair as alt
from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')

ThemeRegistry.enable('irozhlas')

In [135]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","655.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","700.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))
df = df.filter(pl.col("rok") >= 1800)

df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["008","100_a","245_a","245_p"], keep="first")
print(len(df))

716107
705250


In [317]:
aut = pl.read_parquet(os.path.join("data","aut_vyber.parquet"))
cesi = aut.explode("370_c").filter(pl.col("370_c").str.contains("Česk")).explode("100_7").select(pl.col("100_7")).to_series().to_list()
len(cesi)

363993

In [23]:
df_700 = pl.read_parquet(os.path.join("data","podil_lidi_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("rok").map_elements(lambda x: ...)
with this one instead:
  + pl.col("rok").dt.year()

  df_700 = pl.read_parquet(os.path.join("data","podil_lidi_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))
  df_700 = pl.read_parquet(os.path.join("data","podil_lidi_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))


In [35]:
df

rok,podil_vsech_5,podil_cechu_5,podil_cechu_aut_5,podil_cechu_trl_5,podil_vsech_10,podil_cechu_10,podil_cechu_aut_10,podil_cechu_trl_10,podil_vsech_15,podil_cechu_15,podil_cechu_aut_15,podil_cechu_trl_15,podil_vsech_20,podil_cechu_20,podil_cechu_aut_20,podil_cechu_trl_20,podil_cechu_ill_5,podil_cechu_ill_10,podil_cechu_ill_15,podil_cechu_ill_20
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1820,0.4,0.4,0.0,0.318182,0.28,0.28,0.0,0.272727,0.28,0.28,0.0,0.272727,0.12,0.12,0.0,0.136364,,,,
1821,0.357143,0.357143,0.0,0.333333,0.285714,0.285714,0.0,0.291667,0.107143,0.107143,0.0,0.166667,0.071429,0.071429,0.0,0.125,,,,
1822,0.344828,0.344828,0.0,0.346154,0.275862,0.275862,0.0,0.307692,0.103448,0.103448,0.0,0.153846,0.068966,0.068966,0.0,0.115385,,,,
1823,0.354839,0.354839,0.0,0.333333,0.290323,0.290323,0.0,0.296296,0.129032,0.129032,0.0,0.148148,0.096774,0.096774,0.0,0.111111,,,,
1824,0.363636,0.363636,0.5,0.310345,0.272727,0.272727,0.5,0.241379,0.121212,0.121212,0.5,0.137931,0.060606,0.060606,0.5,0.103448,,,,
1825,0.368421,0.342857,0.333333,0.266667,0.342105,0.314286,0.333333,0.233333,0.210526,0.171429,0.333333,0.133333,0.131579,0.085714,0.333333,0.1,,,,
1826,0.409091,0.410256,0.333333,0.323529,0.272727,0.25641,0.333333,0.205882,0.181818,0.153846,0.333333,0.147059,0.113636,0.076923,0.333333,0.117647,,,,
1827,0.466667,0.45,0.333333,0.323529,0.333333,0.3,0.333333,0.205882,0.222222,0.175,0.333333,0.117647,0.177778,0.125,0.333333,0.117647,,,,
1828,0.58,0.534884,0.333333,0.351351,0.4,0.325581,0.333333,0.189189,0.32,0.232558,0.333333,0.108108,0.26,0.162791,0.333333,0.108108,,,,
1829,0.592593,0.543478,0.25,0.375,0.407407,0.326087,0.25,0.225,0.333333,0.23913,0.25,0.15,0.296296,0.195652,0.25,0.15,,,,


In [25]:
alt.Chart(alt_friendly(df_700)).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_ill_10"))

In [27]:
alt.Chart(alt_friendly(df_700)).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_trl_10"))

In [33]:
df_100 = pl.read_parquet(os.path.join("data","podil_autorstva_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("rok").map_elements(lambda x: ...)
with this one instead:
  + pl.col("rok").dt.year()

  df_100 = pl.read_parquet(os.path.join("data","podil_autorstva_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))
  df_100 = pl.read_parquet(os.path.join("data","podil_autorstva_s_pauzou.parquet")).with_columns(pl.col("rok").map_elements(lambda x: x.year)).filter(pl.col("rok").is_between(1900,2000))


In [19]:
df.columns

['rok',
 'podil_vsech_5',
 'podil_cechu_5',
 'podil_vsech_10',
 'podil_cechu_10',
 'podil_vsech_15',
 'podil_cechu_15',
 'podil_vsech_20',
 'podil_cechu_20']

In [127]:
alt.Chart(alt_friendly(df_100), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_cechu_15"))

In [111]:
df_koncici = pl.read_parquet(os.path.join('data','podily_koncicich_lidi_cs.parquet'))

In [113]:
df_koncici

rok,skoncilo,celkem,podil,podil_ill,podil_trl,podil_aut
i64,u32,u32,f64,f64,f64,f64
1884,8,82,0.097561,0.076923,0.04878,0.375
1897,11,124,0.08871,0.0625,0.055556,0.125
1899,11,124,0.08871,0.071429,0.086957,0.071429
1900,10,148,0.067568,0.066667,0.061728,0.125
1901,9,131,0.068702,0.066667,0.046875,0.05
1902,10,159,0.062893,0.05,0.089552,0.054054
1904,11,141,0.078014,0.076923,0.112903,0.064516
1908,10,161,0.062112,0.055556,0.044776,0.088235
1914,15,147,0.102041,0.05,0.157895,0.058824
1918,11,145,0.075862,0.041667,0.051724,0.085714


In [117]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_aut:Q"))

In [119]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_ill:Q"))

In [121]:
alt.Chart(alt_friendly(df_koncici.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil_trl:Q"))

In [123]:
df_koncici_autorstvo = pl.read_parquet(os.path.join('data','podily_konciciho_autorstva_cs.parquet'))
df_koncici_autorstvo.columns

['rok', 'skoncilo', 'celkem', 'podil']

In [125]:
alt.Chart(alt_friendly(df_koncici_autorstvo.filter(pl.col("rok").is_between(1900,2000))), width=300).mark_line().encode(alt.X("rok:T"), alt.Y("podil:Q"))

In [293]:
pred_unorem = set(df.filter(pl.col("rok") <= 1948).select(pl.col("100_7")).to_series().to_list())
po_revoluci = set(df.filter(pl.col("rok") >= 1988).select(pl.col("100_7")).to_series().to_list())
v_sedesatkach = set(df.filter(pl.col("rok").is_between(1960,1969)).select(pl.col("100_7")).to_series().to_list())
behem_socialismu = set(df.filter(pl.col("rok").is_between(1949,1987)).select(pl.col("100_7")).to_series().to_list())
behem_padesatek = set(df.filter(pl.col("rok").is_between(1949,1959)).select(pl.col("100_7")).to_series().to_list())
pred_normalizaci = set(df.filter(pl.col("rok").is_between(1965,1970)).select(pl.col("100_7")).to_series().to_list())
po_normalizaci = set(df.filter(pl.col("rok").is_between(1990,1995)).select(pl.col("100_7")).to_series().to_list())
behem_normalizace = set(df.filter(pl.col("rok").is_between(1972,1988)).select(pl.col("100_7")).to_series().to_list())

znormalizovani = pred_normalizaci.intersection(po_normalizaci).difference(behem_normalizace)
zunorovani = pred_unorem.intersection(po_revoluci).difference(behem_socialismu)

In [217]:
df = df.with_columns(pl.col("100_a").map_elements(hezke_jmeno).alias("jmeno"))

  df = df.with_columns(pl.col("100_a").map_elements(hezke_jmeno).alias("jmeno"))


In [365]:
kolik = 11
znormalizovani_top_domaci = df.filter(pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("100_7")).to_series().to_list()
znormalizovani_top_domaci_razeni = df.filter(pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("jmeno")).to_series().to_list()
znormalizovani_top_zahranicni = df.filter(~pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("100_7")).to_series().to_list()
znormalizovani_top_zahranicni_razeni = df.filter(~pl.col("100_7").is_in(cesi)).filter(pl.col("rok").is_between(1965,1995)).filter(pl.col("100_7").is_in(znormalizovani)).group_by(["jmeno",'100_7']).len().sort(by='len',descending=True).head(kolik).select(pl.col("jmeno")).to_series().to_list()

In [367]:
norm_dom = alt.Chart(alt_friendly(df.filter(pl.col("rok").is_between(1960,2000)).filter(pl.col("100_7").is_in(znormalizovani_top_domaci))), title=alt.TitleParams(["Normalizační průrva domácí literaturou…"], subtitle=["Komu z českých autorů autorek toho vyšlo nejvíc v druhé","půlce 60. a první půlce 90. let a zároveň nic mezi lety 1973 a 1987."]), width=300).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
            y=alt.Y("jmeno:N", sort=znormalizovani_top_domaci_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=znormalizovani_top_domaci_razeni).legend(None)) \
        .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")
norm_zahr = alt.Chart(alt_friendly(df.filter(pl.col("rok").is_between(1960,2000)).filter(pl.col("100_7").is_in(znormalizovani_top_zahranicni))), title=alt.TitleParams("…a překladovou literaturou"), width=300).mark_circle(size=8) \
        .encode(
            x=alt.X("rok:T", title=None, axis=alt.Axis(domainOpacity=0, tickColor='#DCDDD6')), 
            y=alt.Y("jmeno:N", sort=znormalizovani_top_zahranicni_razeni, title=None, axis=alt.Axis(orient='left', domainOpacity=0, tickColor='#DCDDD6')), 
            yOffset=alt.YOffset("jitter:Q", scale=alt.Scale(range=[3, 15])), 
            color=alt.Color('jmeno:N', scale=alt.Scale(range=['#D6534B', '#445B78', '#DB842F', '#70871E']), 
                            sort=znormalizovani_top_zahranicni_razeni).legend(None)) \
        .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")

alt.vconcat(norm_dom, norm_zahr).configure_view(stroke='transparent').resolve_scale(color='independent')

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df