In [50]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

# Display: let polars print up to 100 rows of a table.
pl.Config.set_tbl_rows(100)

# Altair: lift the default row limit and switch to the project theme.
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')

# NOTE(review): this silences every warning for the whole session.
warnings.filterwarnings('ignore')

In [51]:
def _nacti_pole(pole):
    """Read ``data/cnb_sloupce/<pole>.parquet`` into a polars DataFrame."""
    return pl.read_parquet(os.path.join("data/cnb_sloupce", f"{pole}.parquet"))


# Start from field 100 and attach the leader and field 008 by record id (001).
df = _nacti_pole("100")
for pole in ("leader", "008"):
    df = df.join(_nacti_pole(pole), on="001", how="left")

# Record-level filter on fixed character positions, done natively in polars
# (no round trip through pandas). Rows with a null leader/008 are dropped,
# exactly as the pandas boolean masks did.
# NOTE(review): presumably leader/06-07 are the MARC record type and
# bibliographic level, and 008/15-17 + 008/35-37 the country and language
# codes -- confirm against the cataloguing rules.
df = df.filter(
    pl.col("leader").str.slice(6, 1).is_in(["a", "t"])
    & ~pl.col("leader").str.slice(7, 1).is_in(["b", "i", "s", " "])
    & (pl.col("008").str.slice(15, 2) == "xr")
    & (pl.col("008").str.slice(35, 3) == "cze")
)

# Attach the remaining fields to the (much smaller) filtered table.
for pole in ("020", "022", "245", "300", "655", "700"):
    df = df.join(_nacti_pole(pole), on="001", how="left")

# Keep only records that have no 022_a value.
df = df.explode("022_a").filter(pl.col("022_a").is_null())

# Derived columns computed by the project helpers.
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode("020_q").with_columns(pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba'))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))

# Restrict to 2000-2024, drop records whose 245_h mentions "grafika" and
# collapse the duplicates created by the explodes above.
df = df.filter(pl.col("rok").is_between(2000,2024))
df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["008","100_a","245_a","245_p"], keep="first")
print(len(df))

794838
294356


In [52]:
# Authority records; collect the 100_7 identifiers of every record whose
# 370_c value contains "Česk".
aut = pl.read_parquet(os.path.join("data", "aut_vyber.parquet"))
cesi = (
    aut.explode("370_c")
    .filter(pl.col("370_c").str.contains("Česk"))
    .explode("100_7")
    .get_column("100_7")
    .to_list()
)
len(cesi)

363993

In [53]:
# Keep only records whose 100_7 identifier is in the `cesi` list.
autor_je_v_cesi = pl.col("100_7").is_in(cesi)
df = df.filter(autor_je_v_cesi)

In [54]:
# Birth/death years per author id; `vek` = publication year minus birth year.
vek = pl.read_parquet(os.path.join("data", "narozeni-umrti.parquet"))
df = df.join(vek, on="100_7", how="left")
df = df.with_columns(pl.col("rok").sub(pl.col("narozeni")).alias("vek"))

In [91]:
# Drop implausible ages; rows with an unknown (null) age are dropped as well,
# because the comparison is null for them.
df = df.filter(pl.col("vek").lt(100))

In [92]:
# Author ids from the authority file, split by the 375_a value.
aut_podle_375 = aut.explode("375_a")
aut_muzi = (
    aut_podle_375.filter(pl.col("375_a") == "muž")
    .explode("100_7")
    .get_column("100_7")
    .to_list()
)
aut_zeny = (
    aut_podle_375.filter(pl.col("375_a") == "žena")
    .explode("100_7")
    .get_column("100_7")
    .to_list()
)

In [93]:
# Wikidata extract used as a second source of author gender.
cesta_wikidata = os.path.join("data", "wikidata.parquet")
wikid = pl.read_parquet(cesta_wikidata)

In [94]:
# Ids (stored in the `__index_level_0__` column) split by `w_gender`.
wikid_muzi = (
    wikid.filter(pl.col("w_gender") == "muž")
    .get_column("__index_level_0__")
    .to_list()
)
wikid_zeny = (
    wikid.filter(pl.col("w_gender") == "žena")
    .get_column("__index_level_0__")
    .to_list()
)

In [95]:
# Union of both sources per gender. An id can end up in both sets if the
# two sources disagree.
zeny = set(aut_zeny) | set(wikid_zeny)
muzi = set(aut_muzi) | set(wikid_muzi)

In [118]:
# Echo the set of author ids classified as male.
# NOTE(review): this prints the entire set; len(muzi) or a small sample
# would keep the notebook output short.
muzi

{'mub20221149097',
 'mub20241212676',
 'mub2011660015',
 'mub20221165127',
 'xx0218677',
 'xx0261005',
 'pna20211129904',
 'uk20221171706',
 'xx0055079',
 'jn20011211164',
 'uk2007318245',
 'js2018989233',
 'xx0220946',
 'hka2011631093',
 'xx0083171',
 'jk01082876',
 'js20191057361',
 'xx0075384',
 'av20241219195',
 'kup19980000098312',
 'mzk2015859686',
 'xx0267020',
 'xx0251098',
 'ctu20221148166',
 'mub2017969713',
 'mub20231176570',
 'xx0204815',
 'vse2013770753',
 'xx0173033',
 'mub2015886955',
 'xx0226250',
 'jo2018984839',
 'mub20241225795',
 'xx0221740',
 'mzk2008442840',
 'xx0278753',
 'jo2017941599',
 'kup19980000009550',
 'xx0170558',
 'ola2002153152',
 'js20181003606',
 'jx20110110009',
 'mzk2015867731',
 'jo2016914220',
 'pna2016935070',
 'skuk0003369',
 'ola2014850547',
 'hka2012690797',
 'xx0255639',
 'js2014840761',
 'mub20221154145',
 'jo2015866709',
 'xx0270136',
 'mub20231174538',
 'xx0219710',
 'jo2015865749',
 'xx0206537',
 'jk01092080',
 'xx0308842',
 'av201693380

In [122]:
# Inspect the working table after the author, age and gender-source steps.
df

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,…,655_2,655_ind1,655_x,655_z,655_y,700_ind1,700_a,700_4,700_d,700_7,700_t,700_q,700_l,700_ind2,700_c,700_b,700_i,700_m,700_n,700_k,700_r,700_p,700_o,700_s,700_j,700_6,700_x,700_e,700_f,700_5,700_g,rok,stran,vazba,narozeni,umrti,vek
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,list[str],list[str],list[str],list[str],list[str],i64,i64,str,i64,i64,i64
"""1""","""Mikolášek, Antonín,""","""jk01081566""","[""aut""]","""1927-2023""",,,,,"""cpk20000961570""",""" nam a22 a 4500""","""000418s2000 xr a e 0…","""(brož.)""",,"[""80-7169-921-7""]",,,,,,,"""1""","""0""","""Saunujeme děti, aneb, Co dokáž…",,"""Antonín Mikolášek""",,,,,,"[""104 s. :""]","[""il. ;""]","[""21 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2000,104,"""brožovaná""",1927,2023,73
"""1""","""Foldyna, Libor,""","""mzk2009502677""","[""aut""]","""1962-""",,,,,"""nkc20102087145""",""" nam a22 a 4500""","""100226s2009 xr e p 0…","""(Sdružení požárního a bezpečno…",,"[""978-80-7385-077-7""]",,,,,,,"""1""","""0""","""Nouzové přežití""",,"""Libor Foldyna""",,,,,,"[""61 s. ;""]",,"[""22 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009,61,,1962,,47
"""1""","""Kliková, Christiana,""","""xx0008643""","[""aut""]","""1944-""",,,,,"""nkc20193093382""",""" nam a22 i 4500""","""190425s2019 xr a e p 0…","""(vázáno)""",,"[""978-80-87291-23-8""]",,,,,,,"""1""","""0""","""Hospodářská a sociální politik…",,"""Christiana Kliková, Igor Kotlá…",,,,,,"[""388 stran :""]","[""ilustrace ;""]","[""26 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,"[""1""]","[""Kotlán, Igor,""]","[""aut""]","[""1974-""]","[""mzk2003171435""]",,,,,,,,,,,,,,,,,,,,,,2019,388,"""pevná""",1944,,75
"""1""","""Lemberková, Marcela,""","""mzk2008434121""","[""aut""]","""1964-""",,,,,"""nkc20102152326""",""" nam a22 a 4500""","""120120s2010 xr a e f 0…","""(brož.)""",,"[""978-80-254-8872-0""]",,,,,,,"""1""","""0""","""Přírodní rezervace a přírodní …",,"""[text M. Lemberková, J. Seidlo…",,"""Okres Svitavy""",,,,"[""11 s. :""]","[""barev. il. ;""]","[""21 cm +""]","[""samolepky ([4] l.)""]",,,…,"[""czenas"", ""eczenas""]",,,,,"[""1"", ""1"", ""1""]","[""Seidlová, Jana"", ""Svobodová, Jana"", ""Růžičková, Zuzana,""]","[""aut"", ""aut"", ""pht""]","[null, null, ""1966-""]","[""xx0145649"", ""xx0026504"", ""mzk2007381985""]",,,,,,,,,,,,,,,,,,,,,,2010,11,"""brožovaná""",1964,,46
"""1""","""Štrunc, Matěj Metoděj,""","""xx0249115""","[""aut""]","""1996-""",,,,,"""nkc20203256599""",""" cam a22 i 4500""","""200701s2020 xr g 0…","""(v knize neuvedeno ;""",,"[""978-80-11-00413-2""]",,,,,,,"""1""","""0""","""Ze Severu""",,"""Matěj Metoděj Štrunc""",,,,,,"[""105 stran :""]","[""barevné ilustrace ;""]","[""19 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020,105,,1996,,24
"""1""","""Zlámal, Jaroslav,""","""jn20010309444""","[""aut""]","""1947-""",,,,,"""cpk20010889498""",""" cam a22 a 4500""","""010808s2001 xr e p 0…","""(Univerzita Palackého ;""","[""neprodejné""]","[""80-244-0279-3""]",,,,,,,"""1""","""0""","""Účetnictví""",,"""Jaroslav Zlámal""",,,,,,"[""73 s. ;""]",,"[""29 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001,73,,1947,,54
"""1""","""Komárek, Stanislav,""","""jn19990209376""","[""aut""]","""1958 srpen 6.-""",,,,,"""nkc20243605440""",""" nam a22 i 4500""","""240510s2024 xr e 0…","""(vázáno)""",,"[""978-80-200-3572-1""]",,,,,,,"""1""","""0""","""Stíny v našich duších""","""kronika virového šílení : III.…","""Stanislav Komárek""",,,,,,"[""260 stran ;""]",,"[""20 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,"[""1""]","[""Komárek, Stanislav,""]",,"[""1958 srpen 6.-""]",,"[""Města a městečka""]",,,,,,"[""První dva díly vyšly pod názvem:""]",,,,,,,,,,,,,,,2024,260,"""pevná""",1958,,66
"""1""","""Ráb, Miloš,""","""jk01101825""","[""aut""]","""1928-""",,,,,"""cpk20041412635""",""" cam a22 a 4500""","""041022t20041989xr a e p 0…","""(brož.) :""","[""Kč 60,00""]","[""80-210-3416-5""]",,,,,,,"""1""","""0""","""Metody řešení obyčejných difer…",,"""Miloš Ráb""",,,,,,"[""96 s. :""]","[""il. ;""]","[""24 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2004,96,"""brožovaná""",1928,2007,76
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20132502013""",""" cam a22 a 4500""","""130826s2013 xr e d 0…","""(kroužková vazba)""",,"[""978-80-7457-232-6""]",,,,,,,"""1""","""0""","""Česko-španělský slovník jmen p…",,"""J. Vedral""",,,,,,"[""112 s. ;""]",,"[""21 cm""]",,,,…,"[""czenas"", ""czenas"", … ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,112,,1973,,40
"""1""","""Panuš, Jan,""","""mzk2005318068""","[""aut""]","""1976-""",,,,,"""nkc20142651534""",""" nam a22 a 4500""","""150121s2014 xr a e p 0…","""(brož.)""",,"[""978-80-7395-866-4""]",,,,,,,"""1""","""0""","""Základy algoritmizace""",,"""Jan Panuš""",,,,,,"[""98 s. :""]","[""il. ;""]","[""30 cm""]",,,,…,"[""czenas"", ""eczenas""]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2014,98,"""brožovaná""",1976,,38


In [132]:
# Label each record by author gender: "m", "f" or null when unknown.
# The "m" branch is tested first, so an id present in both sets gets "m".
id_autora = pl.col("100_7")
gender_expr = (
    pl.when(id_autora.is_in(muzi)).then(pl.lit("m"))
    .when(id_autora.is_in(zeny)).then(pl.lit("f"))
    .otherwise(None)
)
df = df.with_columns(gender_expr.alias("gender"))

In [133]:
# One row per (author, title), then keep the titles tagged "česká poezie".
# keep="first" is what makes the preceding sort matter: the default
# keep="any" gives no guarantee about which duplicate survives, so the
# retained edition (and its year) would be arbitrary.
ceska_poezie = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first")
    .explode("655_a")
    .filter(pl.col("655_a") == "česká poezie")
)

In [134]:
# One row per (author, title), then keep the titles tagged "české romány".
# keep="first" is what makes the preceding sort matter: the default
# keep="any" gives no guarantee about which duplicate survives, so the
# retained edition (and its year) would be arbitrary.
ceske_romany = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first")
    .explode("655_a")
    .filter(pl.col("655_a") == "české romány")
)

In [135]:
# One row per (author, title), then keep novellas and short stories.
# keep="first" is what makes the preceding sort matter: the default
# keep="any" gives no guarantee about which duplicate survives.
# A title tagged with BOTH genres yields two rows after explode + is_in,
# so the result is de-duplicated once more to count each title once.
ceske_kratke = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first")
    .explode("655_a")
    .filter(pl.col("655_a").is_in(["české novely", "české povídky"]))
    .unique(subset=["100_a", "245_a"], keep="first")
)

In [136]:
# Yearly counts of novels by women and by men, and the women's share.
# Both counts come from one aggregation, so a year in which one gender has
# no title keeps its row (with a 0) instead of being dropped by an inner join.
# Years with no gender-identified author at all are still left out.
df_podil = (
    ceske_romany.group_by("rok")
    .agg(
        pl.col("100_7").is_in(zeny).sum().cast(pl.UInt32).alias("zeny"),
        pl.col("100_7").is_in(muzi).sum().cast(pl.UInt32).alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .filter(pl.col("celkem") > 0)
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    .with_columns(pl.lit("romány").alias("zanr"))
    .sort(by="rok")
)

In [137]:
# Yearly counts of novellas/short stories by women and by men, and the
# women's share. Both counts come from one aggregation, so a year in which
# one gender has no title keeps its row (with a 0) instead of being dropped
# by an inner join. Years with no gender-identified author are still left out.
df_podil_kratke = (
    ceske_kratke.group_by("rok")
    .agg(
        pl.col("100_7").is_in(zeny).sum().cast(pl.UInt32).alias("zeny"),
        pl.col("100_7").is_in(muzi).sum().cast(pl.UInt32).alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .filter(pl.col("celkem") > 0)
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    .with_columns(pl.lit("novely a povídky").alias("zanr"))
    .sort(by="rok")
)

In [138]:
# Yearly counts of poetry titles by women and by men, and the women's share.
# Both counts come from one aggregation, so a year in which one gender has
# no title keeps its row (with a 0) instead of being dropped by an inner join.
# Years with no gender-identified author at all are still left out.
df_podil_poezie = (
    ceska_poezie.group_by("rok")
    .agg(
        pl.col("100_7").is_in(zeny).sum().cast(pl.UInt32).alias("zeny"),
        pl.col("100_7").is_in(muzi).sum().cast(pl.UInt32).alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .filter(pl.col("celkem") > 0)
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    .with_columns(pl.lit("poezie").alias("zanr"))
    .sort(by="rok")
)

In [139]:
# Sanity check: yearly counts and the women's share for poetry.
df_podil_poezie

rok,zeny,muzi,celkem,podil_zen,zanr
i64,u32,u32,u32,f64,str
2000,43,169,212,0.20283,"""poezie"""
2001,42,171,213,0.197183,"""poezie"""
2002,40,142,182,0.21978,"""poezie"""
2003,68,196,264,0.257576,"""poezie"""
2004,49,192,241,0.20332,"""poezie"""
2005,59,203,262,0.225191,"""poezie"""
2006,78,198,276,0.282609,"""poezie"""
2007,65,243,308,0.211039,"""poezie"""
2008,83,232,315,0.263492,"""poezie"""
2009,91,242,333,0.273273,"""poezie"""


In [140]:
# Small multiples (one row per genre) of the yearly share of women authors.
# The y labels are built from the numeric tick value and rounded: multiplying
# the label text by 100 can render float noise such as "30.000000000000004 %".
alt.Chart(
    alt_friendly(pl.concat([df_podil, df_podil_kratke, df_podil_poezie])),
    title=alt.Title(["Podíl ženských autorek","na nově vydaných českých knihách"], lineHeight=21),
    width=250,
    height=90,
).mark_area().encode(
    alt.X("rok:T", title=None, axis=alt.Axis(tickCount=5)),
    alt.Y(
        'podil_zen:Q',
        title=None,
        axis=alt.Axis(tickCount=4, labelExpr="round(datum.value * 100) + ' %'", orient='right'),
        scale=alt.Scale(domain=[0,0.5]),
    ),
    alt.Row(
        "zanr:N",
        header=alt.Header(labelAngle=0, labelAlign='left', labelAnchor='middle', labelFont='Asap'),
        sort=["romány","novely a povídky","poezie"],
        title=None,
    ),
).configure_view(stroke='transparent').resolve_scale(y='shared')

## Věková struktura

In [155]:
# Number of novels per (author gender, author age at publication).
cr2 = (
    ceske_romany
    .group_by(["gender", "vek"])
    .agg(pl.len())
    .sort("vek")
)
cr2

gender,vek,len
str,i64,u32
"""f""",13,1
,14,1
"""f""",14,1
,15,1
"""f""",15,7
"""m""",15,2
"""m""",16,1
"""f""",16,3
,16,4
"""m""",17,6


In [167]:
# Age distribution of novelists, one bar chart per known gender.
cr2_znamy_gender = cr2.filter(pl.col('gender').is_in(['m','f']))
alt.Chart(cr2_znamy_gender.to_pandas()).mark_bar().encode(
    alt.X("vek"),
    alt.Y("len"),
    alt.Row("gender"),
)

In [173]:
# Median author age of novels per (gender, publication year).
cr3 = (
    ceske_romany
    .group_by(["gender", "rok"])
    .agg(pl.median("vek"))
)
cr3

gender,rok,vek
str,i64,f64
"""m""",2024,51.0
,2017,70.0
"""f""",2021,42.0
"""m""",2012,54.0
"""m""",2015,56.0
"""m""",2011,55.0
"""m""",2016,53.5
,2000,42.5
"""m""",2021,54.5
,2005,49.0


In [181]:
# Median novelist age over time, one line chart per known gender.
cr3_znamy_gender = cr3.filter(pl.col('gender').is_in(['m','f']))
alt.Chart(alt_friendly(cr3_znamy_gender)).mark_line().encode(
    alt.X("rok"),
    alt.Y("vek"),
    alt.Row("gender"),
)