In [7]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
warnings.filterwarnings('ignore')

In [8]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","655.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","700.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode("020_q").with_columns(pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba'))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
df = df.filter(pl.col("stran") > 15)
print(len(df))
df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["100_a","245_a"], keep="first")
print(len(df))

727498
525308


In [9]:
aut = pl.read_parquet(os.path.join("data","aut_vyber.parquet"))
cesi = aut.explode("370_c").filter(pl.col("370_c").str.contains("Česk")).explode("100_7").select(pl.col("100_7")).to_series().to_list()
len(cesi)

364420

In [10]:
df = df.filter(pl.col("100_7").is_in(cesi))

In [11]:
vek = pl.read_parquet(os.path.join("data","narozeni-umrti-gender.parquet"))
df = df.join(vek, on="100_7", how="left").with_columns((pl.col("rok") - pl.col("narozeni")).alias("vek"))

In [35]:
historicky_podil = df.filter(
    pl.col('gender') == 'm').group_by("rok").len().rename({'len':'m'}).join(
    df.filter(pl.col('gender') == 'f').group_by("rok").len().rename({'len':'f'}), on='rok'
).with_columns(((pl.col('f') / (pl.col('m') + pl.col('f'))).alias('podil'))).sort(by='rok').filter(
    pl.col("rok").is_between(1800,2024)).with_columns(
    pl.col("podil").rolling_mean(window_size=2)
    )

In [37]:
historicky_podil.head(20)

rok,m,f,podil
i64,u32,u32,f64
1819,13,1,
1825,33,1,0.05042
1826,18,2,0.064706
1827,17,2,0.102632
1828,20,1,0.076441
1830,25,2,0.060847
1834,17,1,0.064815
1838,20,1,0.051587
1840,27,1,0.041667
1841,24,1,0.037857


In [39]:
historicky_podil.tail(20)

rok,m,f,podil
i64,u32,u32,f64
2005,3651,1361,0.26482
2006,3654,1510,0.281979
2007,3809,1560,0.291483
2008,3864,1614,0.292595
2009,3554,1569,0.300449
2010,3720,1615,0.304492
2011,3292,1463,0.305197
2012,3123,1467,0.313642
2013,3534,1695,0.321881
2014,3683,1813,0.327015


In [59]:
historicky_podil_graf = alt.Chart(
    alt_friendly(historicky_podil.filter(pl.col("rok") >= 1850)), width=300, height=100,
    title=["Podíl ženských autorek na nově vydaných","původních českých knihách všech žánrů"]
).mark_line().encode(
    alt.X('rok:T', title=None, axis=alt.Axis(tickCount=6)),
    alt.Y('podil:Q', axis=alt.Axis(tickCount=5, labelExpr="datum.label * 100 + ' %'", orient='right'),
          title=None, scale=alt.Scale(domain=[0,0.45]))
).configure_view(stroke='transparent')

historicky_podil_graf

In [16]:
from src.me_to_neurazi import me_to_neurazi

In [29]:
me_to_neurazi(historicky_podil_graf, ["data: Národní knihovna, Wikidata ~ vizualizace: iROZHLAS.cz ~ 2025"], "historicky_podil_zen")

ahoj


In [18]:
df_do_sta = df.filter(pl.col("vek") < 100) #.filter(pl.col("rok").is_between(1999,2024))

In [19]:
ceska_poezie = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a") == "česká poezie"
)

In [20]:
df_do_sta.filter(pl.col("rok") == 2012).explode("655_a").group_by("655_a").len().sort(by="len",descending=True)

655_a,len
str,u32
"""monografie""",602
"""monographs""",594
"""učebnice vysokých škol""",320
"""textbooks (higher)""",314
"""česká poezie""",298
"""Czech poetry""",292
"""publikace pro děti""",254
"""kolektivní monografie""",253
"""collective monographs""",252
"""children's literature""",250


In [21]:
ceske_romany = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a") == "české romány"
)

In [22]:
ceske_kratke = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a").is_in(["české novely","české povídky","české příběhy","české prózy"])
)

In [23]:
df_podil = ceske_romany.filter(
    pl.col("gender") == 'f'
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceske_romany.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("romány").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [24]:
df_podil_kratke = ceske_kratke.filter(
    pl.col("gender") == 'f'
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceske_kratke.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("kratší prózy").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [25]:
df_podil_poezie = ceska_poezie.filter(
    (pl.col("gender") == 'f')
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceska_poezie.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("poezie").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [71]:
podil_v_soucasnosti = alt.Chart(
    alt_friendly(pl.concat([df_podil, df_podil_kratke, df_podil_poezie]).filter(pl.col('rok') >= 2000)), 
    title=alt.Title(["Podíl ženských autorek","na nově vydaných českých knihách"], lineHeight=21), width=250, height=80
         ).mark_line(
         ).encode(
    alt.X("rok:T", title=None, axis=alt.Axis(tickCount=5)),
    alt.Y('podil_zen:Q', title=None, axis=alt.Axis(
        tickCount=5, labelExpr="datum.label * 100 + ' %'", orient='right'), scale=alt.Scale(domain=[0,0.5])
         ),
    alt.Row("zanr:N",header=alt.Header(labelAngle=0, labelAlign='left', labelAnchor='middle', labelFont='Asap'), sort=["romány","kratší prózy","poezie"], title=None)
).configure_view(stroke='transparent').resolve_scale(y='shared').resolve_axis(x="independent")
podil_v_soucasnosti

In [63]:
me_to_neurazi(podil_v_soucasnosti, ["data: Národní knihovna, Wikidata ~ vizualizace: iROZHLAS.cz ~ 2025"], "soucasny_podil_zen")

ahoj


## Věková struktura

In [77]:
cr2 = pl.concat([ceske_romany, ceske_kratke]).group_by(["gender","vek"]).len().sort(by="vek")
cr2b = pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok').is_between(2015,2024)).group_by(["gender","vek"]).len().sort(by="vek")
cr2c = pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok').is_between(2000,2010)).group_by(["gender","vek"]).len().sort(by="vek")

In [79]:
alt.Chart(cr2b.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [81]:
alt.Chart(cr2c.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [83]:
alt.Chart(cr2.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [85]:
alt.Chart(cr2.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_bar().encode(alt.X("vek"),alt.Y("len"),alt.Row("gender"))

In [87]:
cr3 = ceske_romany.group_by(["gender","rok"]).agg(pl.col('vek').median())
cr3

gender,rok,vek
str,i64,f64
"""m""",1909,51.0
"""m""",1927,47.0
"""f""",1933,37.0
"""m""",1937,46.0
"""f""",2008,48.0
,2019,61.0
"""m""",1854,35.0
"""f""",1962,50.0
"""m""",2013,56.0
"""m""",1885,24.0


In [89]:
alt.Chart(alt_friendly(cr3.filter(pl.col('gender').is_in(['m','f'])))).mark_line().encode(alt.X("rok"),alt.Y("vek"),alt.Row("gender"))

## Věková struktura, druhý pohled

In [91]:
pl.concat([ceske_romany, ceske_kratke]).group_by("rok").len().sort(by="rok")

rok,len
i64,u32
1804,1
1814,1
1815,1
1818,1
1819,1
1823,2
1824,3
1825,12
1826,1
1828,4


In [93]:
pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') >= 2000)

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,…,655_2,655_ind1,655_y,655_z,700_ind1,700_a,700_4,700_d,700_7,700_t,700_q,700_l,700_ind2,700_c,700_b,700_i,700_m,700_k,700_n,700_r,700_p,700_o,700_s,700_j,700_x,700_e,700_f,700_5,700_9,700_g,rok,stran,vazba,narozeni,umrti,gender,vek
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],i64,i64,str,i64,i64,str,i64
"""1""","""Reindl, Jane,""","""xx0188150""","[""aut""]","""1972-""",,,,,"""nkc20193148434""",""" nam a22 i 4500""","""191017s2019 xr a g 0…","""(brožováno)""",,"[""978-80-7612-120-1""]",,,,,,,"""1""","""0""","""Dar""",,"""Jane Reindl""",,,,,,"[""275 stran :""]","[""ilustrace ;""]","[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,275,"""brožovaná""",1972,,"""f""",47
"""1""","""Poberová, Slávka,""","""jk01100162""","[""aut""]","""1943-""",,,,,"""nkc20102029148""",""" nam a22 a 4500""","""100810s2010 xr g 0…","""(brož.)""",,"[""978-80-7376-182-0""]",,,,,,,"""1""","""0""","""Životy na schovávanou""",,"""Slávka Poberová""",,,,,,"[""192 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010,192,"""brožovaná""",1943,,"""f""",67
"""1""","""Durych, Václav,""","""jn20001103143""","[""aut""]","""1930-2011""",,,,,"""cpk20021139851""",""" nam a22 a 4500""","""020916s2001 xr e 0…","""(brož.)""",,"[""80-86200-56-6""]",,,,,,,"""1""","""0""","""Ouřk""",,"""Václav Durych""",,,,,,"[""135 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001,135,"""brožovaná""",1930,2011,"""m""",71
"""1""","""Drescher, Alex,""","""xx0066461""","[""aut""]","""1972-""",,,,,"""nkc20071726763""",""" cam a22 a 4500""","""070820s2007 xr g 0…","""(brož.) :""","[""Kč 219,00 (cena Klubu čtenářů SF Kč 207,00)""]","[""978-80-85951-47-9""]",,,,,,,"""1""","""0""","""Dokonalý obchod""",,"""Alex Drescher""",,,,,,"[""272 s. ;""]",,"[""16 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007,272,"""brožovaná""",1972,,"""m""",35
"""1""","""Žamboch, Miroslav,""","""xx0000241""","[""aut""]","""1972-""",,,,,"""nkc20142624769""",""" nam a22 a 4500""","""140916s2014 xr g 0…","""(brož.)""",,"[""978-80-7387-795-8""]",,,,,,,"""1""","""0""","""In nomine sanguinis""",,"""Miroslav Žamboch""",,,,,,"[""254 s. ;""]",,"[""20 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2014,254,"""brožovaná""",1972,,"""m""",42
"""1""","""Dvořáková, Petra,""","""xx0044613""","[""aut""]","""1977-""",,,,,"""nkc20182992996""",""" cam a22 i 4500""","""180206s2018 xr g 0…","""(vázáno)""",,"[""978-80-7577-476-7""]",,,,,,,"""1""","""0""","""Dědina""","""pole, závist, chtíč a otčina /""","""Petra Dvořáková""",,,,,,"[""243 stran ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018,243,"""pevná""",1977,,"""f""",41
"""1""","""Ströbinger, Rudolf,""","""jk01121482""","[""aut""]","""1931-2005""",,,,,"""cpk20051525396""",""" nam a22 a 4500""","""050621s2005 xr g 0…","""(váz.) :""","[""Kč 199,00""]","[""80-243-1599-8""]",,,,,,,"""1""","""0""","""Smrt Šípkové Růženky""",,"""Rudolf Ströbinger""",,,,,,"[""223 s. ;""]",,"[""19 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005,223,"""pevná""",1931,2005,"""m""",74
"""1""","""Urban, Miloslav,""","""jk01140821""","[""aut""]","""1940-2006""",,,,,"""cpk20000927225""",""" nam a22 a 4500""","""000315s2000 xr e 0…","""(váz.)""",,"[""80-86297-02-0""]","[""80-901667-6-8""]",,,,,,"""1""","""0""","""Stopařky už neberu""",,"""Miloslav Urban""",,,,,,"[""191 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2000,191,"""pevná""",1940,2006,"""m""",60
"""1""","""Mazurek, Jiří,""","""xx0070341""","[""aut""]","""1974-""",,,,,"""nkc20112172432""",""" nam a22 a 4500""","""110303s2011 xr g 0…","""(brož.)""",,"[""978-80-7425-089-7""]",,,,,,,"""1""","""0""","""Město čarodějů""",,"""Jiří Mazurek""",,,,,,"[""197 s. ;""]",,"[""17 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011,197,"""brožovaná""",1974,,"""m""",37
"""1""","""Dubská, Kateřina,""","""osa2013757226""","[""aut""]","""1965-""",,,,,"""nkc20233575993""",""" cam a22 i 4500""","""231128s2023 xr a g 0…","""(vázáno)""",,"[""978-80-271-3998-9""]",,,,,,,"""1""","""0""","""Třicátý kilometr""",,"""Kateřina Dubská""",,,,,,"[""207 stran :""]","[""ilustrace ;""]","[""22 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2023,207,"""pevná""",1965,,"""f""",58


In [95]:
hranicni_rok = 2010
zavedeni = df.filter(pl.col('rok').is_between(1980,hranicni_rok)).select(pl.col('100_7')).to_series().to_list()
print(len(zavedeni))
struktura_zavedenych = pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') > hranicni_rok).filter(pl.col('100_7').is_in(zavedeni)).group_by(['gender','vek']).len().sort(by='vek')
struktura_zavedenych_m = struktura_zavedenych.filter(pl.col('gender') == 'm').with_columns(pl.col('len').rolling_mean(window_size=3))
struktura_zavedenych_f = struktura_zavedenych.filter(pl.col('gender') == 'f').with_columns(pl.col('len').rolling_mean(window_size=3))
struktura_zavedenych = pl.concat([struktura_zavedenych_m, struktura_zavedenych_f])

alt.Chart(struktura_zavedenych.to_pandas()).mark_line().encode(alt.X('vek'),alt.Y('len'),alt.Color('gender'))

126207


In [97]:
struktura_zavedenych.filter(pl.col('vek').is_between(47,52)).sort(by='vek')

gender,vek,len
str,i64,f64
"""m""",47,66.0
"""f""",47,70.666667
"""m""",48,68.0
"""f""",48,73.0
"""m""",49,71.0
"""f""",49,71.0
"""m""",50,63.666667
"""f""",50,68.666667
"""m""",51,69.333333
"""f""",51,62.0


In [99]:
pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') >= 2000).group_by('vek').len()

vek,len
i64,u32
12,2
24,91
15,17
21,73
18,29
36,335
30,211
45,348
27,147
33,259
