In [121]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
warnings.filterwarnings('ignore')

In [242]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","655.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","700.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode("020_q").with_columns(pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba'))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
df = df.filter(pl.col("stran") > 15)
print(len(df))
df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["100_a","245_a"], keep="first")
print(len(df))

727498
525308


In [243]:
aut = pl.read_parquet(os.path.join("data","aut_vyber.parquet"))
cesi = aut.explode("370_c").filter(pl.col("370_c").str.contains("Česk")).explode("100_7").select(pl.col("100_7")).to_series().to_list()
len(cesi)

364420

In [244]:
df = df.filter(pl.col("100_7").is_in(cesi))

In [245]:
vek = pl.read_parquet(os.path.join("data","narozeni-umrti-gender.parquet"))
df = df.join(vek, on="100_7", how="left").with_columns((pl.col("rok") - pl.col("narozeni")).alias("vek"))

In [282]:
historicky_podil = df.filter(
    pl.col('gender') == 'm').group_by("rok").len().rename({'len':'m'}).join(
    df.filter(pl.col('gender') == 'f').group_by("rok").len().rename({'len':'f'}), on='rok'
).with_columns(((pl.col('f') / (pl.col('m') + pl.col('f'))).alias('podil'))).sort(by='rok').filter(
    pl.col("rok").is_between(1800,2024)).with_columns(
    pl.col("podil").rolling_mean(window_size=2)
    )

In [284]:
historicky_podil.head(20)

rok,m,f,podil
i64,u32,u32,f64
1819,13,1,
1825,33,1,0.05042
1826,18,2,0.064706
1827,17,2,0.102632
1828,20,1,0.076441
1830,25,2,0.060847
1834,17,1,0.064815
1838,20,1,0.051587
1840,27,1,0.041667
1841,24,1,0.037857


In [256]:
historicky_podil.tail(20)

rok,m,f,podil
i64,u32,u32,f64
2005,3651,1361,0.271548
2006,3654,1510,0.292409
2007,3809,1560,0.290557
2008,3864,1614,0.294633
2009,3554,1569,0.306266
2010,3720,1615,0.302718
2011,3292,1463,0.307676
2012,3123,1467,0.319608
2013,3534,1695,0.324154
2014,3683,1813,0.329876


In [362]:
alt.Chart(
    alt_friendly(historicky_podil.filter(pl.col("rok") >= 1900)), width=300, height=100,
    title=["Podíl ženských autorek na nově vydaných","původních českých knihách všech žánrů"]
).mark_line().encode(
    alt.X('rok:T', title=None, axis=alt.Axis(tickCount=4)),
    alt.Y('podil:Q', axis=alt.Axis(tickCount=4, labelExpr="datum.label * 100 + ' %'", orient='right'), title=None, scale=alt.Scale(domain=[0,0.5]))
).configure_view(stroke='transparent')

In [405]:
df_do_sta = df.filter(pl.col("vek") < 100) #.filter(pl.col("rok").is_between(1999,2024))

In [421]:
ceska_poezie = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a") == "česká poezie"
)

In [342]:
df_do_sta.filter(pl.col("rok") == 2012).explode("655_a").group_by("655_a").len().sort(by="len",descending=True)

655_a,len
str,u32
"""monografie""",602
"""monographs""",594
"""učebnice vysokých škol""",320
"""textbooks (higher)""",314
"""česká poezie""",298
"""Czech poetry""",292
"""publikace pro děti""",254
"""kolektivní monografie""",253
"""collective monographs""",252
"""children's literature""",250


In [407]:
ceske_romany = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a") == "české romány"
)

In [541]:
ceske_kratke = df_do_sta.sort(by="rok").unique(
        subset=["100_a","245_a"]
    ).explode("655_a").filter(
        pl.col("655_a").is_in(["české novely","české povídky","české příběhy","české prózy"])
)

In [543]:
df_podil = ceske_romany.filter(
    pl.col("gender") == 'f'
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceske_romany.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("romány").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [545]:
df_podil_kratke = ceske_kratke.filter(
    pl.col("gender") == 'f'
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceske_kratke.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("kratší prózy").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [547]:
df_podil_poezie = ceska_poezie.filter(
    (pl.col("gender") == 'f')
).group_by("rok").len().sort(by="rok").rename({"len":'zeny'}).join(
    ceska_poezie.filter(pl.col("gender") == 'm').group_by("rok").len().sort(by="rok").rename({"len":'muzi'}), on='rok'
).with_columns(
    (pl.col('zeny') + pl.col('muzi')).alias('celkem')
).with_columns(
    (pl.col('zeny') / pl.col('celkem')).alias('podil_zen')
).with_columns(pl.lit("poezie").alias("zanr")).with_columns(pl.col("podil_zen").rolling_mean(window_size=2))

In [549]:
alt.Chart(
    alt_friendly(pl.concat([df_podil, df_podil_kratke, df_podil_poezie]).filter(pl.col('rok') >= 2000)), 
    title=alt.Title(["Podíl ženských autorek","na nově vydaných českých knihách"], lineHeight=21), width=250, height=80
         ).mark_line(
         ).encode(
    alt.X("rok:T", title=None, axis=alt.Axis(tickCount=5)),
    alt.Y('podil_zen:Q', title=None, axis=alt.Axis(tickCount=4, labelExpr="datum.label * 100 + ' %'", orient='right'), scale=alt.Scale(domain=[0,0.5])),
    alt.Row("zanr:N",header=alt.Header(labelAngle=0, labelAlign='left', labelAnchor='middle', labelFont='Asap'), sort=["romány","kratší prózy","poezie"], title=None)
).configure_view(stroke='transparent').resolve_scale(y='shared').resolve_axis(x="independent")

## Věková struktura

In [142]:
cr2 = pl.concat(ceske_romany, .group_by(["gender","vek"]).len().sort(by="vek")
cr2b = ceske_romany.filter(pl.col('rok').is_between(2015,2024)).group_by(["gender","vek"]).len().sort(by="vek")
cr2c = ceske_romany.filter(pl.col('rok').is_between(2000,2010)).group_by(["gender","vek"]).len().sort(by="vek")

In [143]:
alt.Chart(cr2b.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [144]:
alt.Chart(cr2c.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [145]:
alt.Chart(cr2.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_line().encode(alt.X("vek"),alt.Y("len"),alt.Color("gender"))

In [551]:
alt.Chart(cr2.filter(pl.col('gender').is_in(['m','f'])).to_pandas()).mark_bar().encode(alt.X("vek"),alt.Y("len"),alt.Row("gender"))

In [147]:
cr3 = ceske_romany.group_by(["gender","rok"]).agg(pl.col('vek').median())
cr3

gender,rok,vek
str,i64,f64
,2015,36.0
,2002,58.5
"""f""",2018,45.0
"""m""",2024,51.0
"""m""",2023,53.0
"""m""",2002,55.0
"""m""",2004,51.0
"""f""",2004,49.0
"""m""",2005,53.5
"""f""",2007,47.0


In [148]:
alt.Chart(alt_friendly(cr3.filter(pl.col('gender').is_in(['m','f'])))).mark_line().encode(alt.X("rok"),alt.Y("vek"),alt.Row("gender"))

## Věková struktura, druhý pohled

In [429]:
pl.concat([ceske_romany, ceske_kratke]).group_by("rok").len().sort(by="rok")

rok,len
i64,u32
1804,1
1815,1
1818,1
1819,1
1823,2
1824,1
1825,10
1828,4
1829,2
1830,3


In [433]:
pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') >= 2000)

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,…,655_2,655_ind1,655_y,655_z,700_ind1,700_a,700_4,700_d,700_7,700_t,700_q,700_l,700_ind2,700_c,700_b,700_i,700_m,700_k,700_n,700_r,700_p,700_o,700_s,700_j,700_x,700_e,700_f,700_5,700_9,700_g,rok,stran,vazba,narozeni,umrti,gender,vek
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],i64,i64,str,i64,i64,str,i64
"""1""","""Strauss, Josef,""","""xx0004281""","[""aut""]","""1963-""",,,,,"""nkc20051632619""",""" nam a22 a 4500""","""051124s2005 xr g 0…","""(brož.)""",,"[""80-86362-52-3""]",,,,,,,"""1""","""0""","""Jarka, to je bůh člověk""",,"""Josef Strauss""",,,,,,"[""212 s. ;""]",,"[""19 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005,212,"""brožovaná""",1963,,"""m""",42
"""1""","""Nachtmanová, Petra,""","""xx0040379""","[""aut""]","""1972-""",,,,,"""nkc20071759405""",""" cam a22 a 4500""","""071024s2007 xr d 0…","""(váz.)""",,"[""978-80-7268-443-4""]",,,,,,,"""1""","""0""","""Prstýnkové mámení""",,"""Petra Nachtmanová""",,,,,,"[""166 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007,166,"""pevná""",1972,,"""f""",35
"""1""","""Cubeca, Karel,""","""mzk2005309135""","[""aut""]","""1960-""",,,,,"""nkc20091994734""",""" nam a22 a 4500""","""090916s2009 xr e 0…","""(váz.)""",,"[""978-80-7362-723-2""]",,,,,,,"""1""","""0""","""A Bůh mlčel""",,"""Karel Cubeca""",,,,,,"[""448 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009,448,"""pevná""",1960,,"""m""",49
"""1""","""Drescher, Alex,""","""xx0066461""","[""aut""]","""1972-""",,,,,"""nkc20081815854""",""" cam a22 i 4500""","""091103s2008 xr g 0…","""(brožováno) :""","[""Kč 249,00""]","[""978-80-85951-52-3""]",,,,,,,"""1""","""0""","""S čím kdo schází""",,"""Alex Drescher""",,,,,,"[""395 stran ;""]",,"[""16 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2008,395,"""brožovaná""",1972,,"""m""",36
"""1""","""Cimický, Jan,""","""jk01020718""","[""aut""]","""1948-""",,,,,"""nkc20172930000""",""" nam a22 i 4500""","""170830s2017 xr g 0…","""(vázáno)""",,"[""978-80-269-0678-0""]",,,,,,,"""1""","""0""","""Kamarádi""",,"""Jan Cimický""",,,,,,"[""271 stran ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017,271,"""pevná""",1948,,"""m""",69
"""1""","""Matějovský, Bohumil,""","""jo2008420235""","[""aut""]","""1961-""",,,,,"""nkc20112161824""",""" nam a22 a 4500""","""110119s2010 xr g 0…","""(váz.)""",,"[""978-80-7268-702-2""]",,,,,,,"""1""","""0""","""Dobrodružství potkanů""",,"""Bohumil Matějovský""",,,,,,"[""235 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010,235,"""pevná""",1961,,"""m""",49
"""1""","""Řeháčková, Věra,""","""jn19990209737""","[""aut""]","""1950-""",,,,,"""nkc20071723431""",""" nam a22 a 4500""","""070608s2007 xr c 0…","""(váz.)""",,"[""978-80-7301-208-3""]",,,,,,,"""1""","""0""","""Síla měsíčního náhrdelníku""","""napínavé čtení pro holky i klu…","""Věra Řeháčková""",,,,,,"[""136 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007,136,"""pevná""",1950,,"""f""",57
"""1""","""Hofmanová, Jaroslava,""","""xx0022910""","[""aut""]","""1963-""",,,,,"""nkc20061703028""",""" nam a22 a 4500""","""061228s2007 xr g 0…","""(váz.)""",,"[""978-80-7214-970-4""]",,,,,,,"""1""","""0""","""Láska není minulost""",,"""Jaroslava Hofmanová""",,,,,,"[""127 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007,127,"""pevná""",1963,,"""f""",44
"""1""","""Gruber, Václav,""","""nlk20010096082""","[""aut""]","""1953-""",,,,,"""nkc20193071368""",""" nam a22 i 4500""","""190118s2019 xr g 0…","""(vázáno)""",,"[""978-80-243-8621-8""]",,,,,,,"""1""","""0""","""Domino""",,"""Václav Gruber""",,,,,,"[""238 stran ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,238,"""pevná""",1953,,"""m""",66
"""1""","""Šumbera, Jan,""","""xx0194094""","[""aut""]","""1991-""",,,,,"""nkc20172900478""",""" cam a22 i 4500""","""170505s2017 xr g 0…","""(vázáno)""",,"[""978-80-7229-607-1""]",,,,,,,"""1""","""0""","""Lovec a motýl""",,"""Jan Šumbera""",,,,,,"[""109 stran ;""]",,"[""22 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017,109,"""pevná""",1991,,"""m""",26


4667

In [553]:
hranicni_rok = 2010
zavedeni = df.filter(pl.col('rok').is_between(1980,hranicni_rok)).select(pl.col('100_7')).to_series().to_list()
print(len(zavedeni))
struktura_zavedenych = pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') > hranicni_rok).filter(pl.col('100_7').is_in(zavedeni)).group_by(['gender','vek']).len().sort(by='vek')
struktura_zavedenych_m = struktura_zavedenych.filter(pl.col('gender') == 'm').with_columns(pl.col('len').rolling_mean(window_size=3))
struktura_zavedenych_f = struktura_zavedenych.filter(pl.col('gender') == 'f').with_columns(pl.col('len').rolling_mean(window_size=3))
struktura_zavedenych = pl.concat([struktura_zavedenych_m, struktura_zavedenych_f])

alt.Chart(struktura_zavedenych.to_pandas()).mark_line().encode(alt.X('vek'),alt.Y('len'),alt.Color('gender'))

126207


In [555]:
struktura_zavedenych.filter(pl.col('vek').is_between(47,52)).sort(by='vek')

gender,vek,len
str,i64,f64
"""m""",47,66.0
"""f""",47,70.666667
"""m""",48,68.0
"""f""",48,73.0
"""m""",49,71.0
"""f""",49,71.0
"""m""",50,63.666667
"""f""",50,68.666667
"""m""",51,69.333333
"""f""",51,62.0


In [435]:
pl.concat([ceske_romany, ceske_kratke]).filter(pl.col('rok') >= 2000).group_by('vek').len()

vek,len
i64,u32
21,50
18,23
15,12
24,57
42,191
33,164
48,179
27,93
45,187
36,208
