In [96]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

# Notebook-wide display and plotting configuration.
THEME_NAME = 'irozhlas'

# Show up to 100 rows when a polars table is displayed.
pl.Config.set_tbl_rows(100)

# Lift Altair's default limit on the number of rows embedded in a chart.
alt.data_transformers.disable_max_rows()

# Register the project theme callable under THEME_NAME and activate it.
alt.themes.register(THEME_NAME, kristi_promin)
alt.themes.enable(THEME_NAME)

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from polars/altair; consider narrowing it by category.
warnings.simplefilter('ignore')

In [97]:
# Build the working table `df` from per-field parquet exports that all share
# the record key "001". Field names look like MARC 21 tags (leader, 008, 245…)
# -- the positional meanings noted below assume that format; TODO confirm.
CNB_DIR = "data/cnb_sloupce"


def _nacti_pole(pole):
    """Read the parquet export of a single field (e.g. "245") from CNB_DIR."""
    return pl.read_parquet(os.path.join(CNB_DIR, f"{pole}.parquet"))


# Start from field 100 and attach the two fields needed for record filtering.
df = _nacti_pole("100")
for pole in ("leader", "008"):
    df = df.join(_nacti_pole(pole), on="001", how="left")

# Record-level filters on fixed positions, done natively in polars instead of
# converting the whole frame to pandas and back just to slice strings.
# Rows whose "leader" or "008" is null fail the positive tests and are dropped,
# exactly as in the pandas version.
df = df.filter(
    # leader[6] must be "a" or "t" (presumably the record type).
    pl.col("leader").str.slice(6, 1).is_in(["a", "t"]),
    # leader[7] must not be "b", "i", "s" or blank (presumably serial-like levels).
    ~pl.col("leader").str.slice(7, 1).is_in(["b", "i", "s", " "]),
    # 008[15:17] == "xr" and 008[35:38] == "cze" (presumably place and language).
    pl.col("008").str.slice(15, 2) == "xr",
    pl.col("008").str.slice(35, 3) == "cze",
)

# Attach the remaining descriptive fields only for the records that survived.
for pole in ("020", "022", "245", "300", "655", "700"):
    df = df.join(_nacti_pole(pole), on="001", how="left")

# Keep only records that have no 022_a value at all.
df = df.explode("022_a").filter(pl.col("022_a").is_null())

# Derived columns computed by the project helpers; the three expressions read
# different source columns, so they can share one with_columns call.
df = df.with_columns(
    pl.col("008").map_elements(najdi_rok, return_dtype=int).alias("rok"),
    pl.col("300_a").map_elements(pocet_stran, return_dtype=int).alias("stran"),
    pl.col("245_a").map_elements(bez_bordelu, return_dtype=str),
)

# One row per 020_q value, each mapped to a new "vazba" column.
df = df.explode("020_q").with_columns(
    pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias("vazba")
)
# One row per 245_p value, cleaned with the same helper as 245_a.
df = df.explode("245_p").with_columns(
    pl.col("245_p").map_elements(bez_bordelu, return_dtype=str)
)
print(len(df))

# Restrict to the studied years, drop records whose 245_h mentions "grafika"
# (rows with a null 245_h are kept), and de-duplicate repeated records.
df = df.filter(pl.col("rok").is_between(2000, 2024))
df = df.filter(
    (~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()
).unique(subset=["008", "100_a", "245_a", "245_p"], keep="first")
print(len(df))

794838
294356


In [98]:
# Authority records; used below to pick out Czech authors.
aut = pl.read_parquet(os.path.join("data", "aut_vyber.parquet"))

# Rows whose (exploded) 370_c value contains "Česk", one row per 100_7 value.
cesti_autori = (
    aut.explode("370_c")
    .filter(pl.col("370_c").str.contains("Česk"))
    .explode("100_7")
)

# Plain Python list of the matching 100_7 identifiers (duplicates are kept).
cesi = cesti_autori.get_column("100_7").to_list()
len(cesi)

363993

In [99]:
# Keep only books whose author identifier (100_7) is in the `cesi` list.
je_cesky_autor = pl.col("100_7").is_in(cesi)
df = df.filter(je_cesky_autor)

In [100]:
# Per-author table joined on 100_7; it contributes the narozeni, umrti and
# gender columns seen in df.columns.
vek = pl.read_parquet(os.path.join("data", "narozeni-umrti-gender.parquet"))

# Left join keeps books whose author has no entry in `vek` (nulls are filled in).
df = df.join(vek, on="100_7", how="left")

# "vek" = rok - narozeni (presumably the author's age in the publication year).
df = df.with_columns((pl.col("rok") - pl.col("narozeni")).alias("vek"))

In [101]:
# Keep rows with vek below 100. Rows with a null vek make the predicate null
# and are therefore dropped by filter() as well.
HORNI_MEZ_VEKU = 100
df = df.filter(pl.col("vek").lt(HORNI_MEZ_VEKU))

In [102]:
# Titles tagged "česká poezie": one row per (100_a, 245_a) pair, taken from the
# row with the lowest "rok".
# unique() defaults to keep="any", which gives no guarantee about which of the
# duplicate rows survives, so the preceding sort had no reliable effect.
# keep="first" with maintain_order=True makes the sort by "rok" decide.
ceska_poezie = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first", maintain_order=True)
    .explode("655_a")
    .filter(pl.col("655_a") == "česká poezie")
)

In [103]:
# Titles tagged "české romány": one row per (100_a, 245_a) pair, taken from the
# row with the lowest "rok".
# unique() defaults to keep="any", which gives no guarantee about which of the
# duplicate rows survives, so the preceding sort had no reliable effect.
# keep="first" with maintain_order=True makes the sort by "rok" decide.
ceske_romany = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first", maintain_order=True)
    .explode("655_a")
    .filter(pl.col("655_a") == "české romány")
)

In [104]:
# Titles tagged "české novely" or "české povídky": one row per (100_a, 245_a)
# pair, taken from the row with the lowest "rok".
# unique() defaults to keep="any", which gives no guarantee about which of the
# duplicate rows survives, so the preceding sort had no reliable effect.
# keep="first" with maintain_order=True makes the sort by "rok" decide.
# NOTE(review): a title carrying both tags yields two rows after the explode.
ceske_kratke = (
    df.sort(by="rok")
    .unique(subset=["100_a", "245_a"], keep="first", maintain_order=True)
    .explode("655_a")
    .filter(pl.col("655_a").is_in(["české novely", "české povídky"]))
)

In [105]:
# Show the column names of `df` after all joins and derived columns.
df.columns

['100_ind1',
 '100_a',
 '100_7',
 '100_4',
 '100_d',
 '100_q',
 '100_c',
 '100_b',
 '100_e',
 '001',
 'leader',
 '008',
 '020_q',
 '020_c',
 '020_a',
 '020_z',
 '022_a',
 '022_y',
 '022_z',
 '022_ind1',
 '022_l',
 '245_ind1',
 '245_ind2',
 '245_a',
 '245_b',
 '245_c',
 '245_n',
 '245_p',
 '245_h',
 '245_f',
 '245_s',
 '300_a',
 '300_b',
 '300_c',
 '300_e',
 '300_f',
 '300_3',
 '655_ind2',
 '655_a',
 '655_7',
 '655_2',
 '655_ind1',
 '655_x',
 '655_z',
 '655_y',
 '700_ind1',
 '700_a',
 '700_4',
 '700_d',
 '700_7',
 '700_t',
 '700_q',
 '700_l',
 '700_ind2',
 '700_c',
 '700_b',
 '700_i',
 '700_m',
 '700_n',
 '700_k',
 '700_r',
 '700_p',
 '700_o',
 '700_s',
 '700_j',
 '700_6',
 '700_x',
 '700_e',
 '700_f',
 '700_5',
 '700_g',
 'rok',
 'stran',
 'vazba',
 'narozeni',
 'umrti',
 'gender',
 'vek']

In [106]:
# Yearly share of women among authors of Czech novels.
# Both gender counts come from a single aggregation, so a year in which only
# one gender appears is kept with a zero count instead of vanishing in an inner
# join of two separately filtered tables. Rows with any other (or null) gender
# are excluded, and the result is explicitly sorted by year.
df_podil = (
    ceske_romany.filter(pl.col("gender").is_in(["f", "m"]))
    .group_by("rok")
    .agg(
        pl.col("gender").filter(pl.col("gender") == "f").len().alias("zeny"),
        pl.col("gender").filter(pl.col("gender") == "m").len().alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    # Genre label used to facet the combined chart.
    .with_columns(pl.lit("romány").alias("zanr"))
    .sort(by="rok")
)

In [107]:
# Yearly share of women among authors of Czech novellas and short stories.
# Both gender counts come from a single aggregation, so a year in which only
# one gender appears is kept with a zero count instead of vanishing in an inner
# join of two separately filtered tables. Rows with any other (or null) gender
# are excluded, and the result is explicitly sorted by year.
df_podil_kratke = (
    ceske_kratke.filter(pl.col("gender").is_in(["f", "m"]))
    .group_by("rok")
    .agg(
        pl.col("gender").filter(pl.col("gender") == "f").len().alias("zeny"),
        pl.col("gender").filter(pl.col("gender") == "m").len().alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    # Genre label used to facet the combined chart.
    .with_columns(pl.lit("novely a povídky").alias("zanr"))
    .sort(by="rok")
)

In [108]:
# Yearly share of women among authors of Czech poetry.
# Both gender counts come from a single aggregation, so a year in which only
# one gender appears is kept with a zero count instead of vanishing in an inner
# join of two separately filtered tables. Rows with any other (or null) gender
# are excluded, and the result is explicitly sorted by year.
df_podil_poezie = (
    ceska_poezie.filter(pl.col("gender").is_in(["f", "m"]))
    .group_by("rok")
    .agg(
        pl.col("gender").filter(pl.col("gender") == "f").len().alias("zeny"),
        pl.col("gender").filter(pl.col("gender") == "m").len().alias("muzi"),
    )
    .with_columns((pl.col("zeny") + pl.col("muzi")).alias("celkem"))
    .with_columns((pl.col("zeny") / pl.col("celkem")).alias("podil_zen"))
    # Genre label used to facet the combined chart.
    .with_columns(pl.lit("poezie").alias("zanr"))
    .sort(by="rok")
)

In [109]:
# Display the per-year counts and share of women for poetry.
df_podil_poezie

rok,zeny,muzi,celkem,podil_zen,zanr
i64,u32,u32,u32,f64,str
2000,47,176,223,0.210762,"""poezie"""
2001,44,181,225,0.195556,"""poezie"""
2002,43,154,197,0.218274,"""poezie"""
2003,73,214,287,0.254355,"""poezie"""
2004,57,212,269,0.211896,"""poezie"""
2005,70,217,287,0.243902,"""poezie"""
2006,101,219,320,0.315625,"""poezie"""
2007,80,261,341,0.234604,"""poezie"""
2008,100,258,358,0.27933,"""poezie"""
2009,110,256,366,0.300546,"""poezie"""


In [110]:
# Area chart of the yearly share of women, one facet row per genre.
podily = pl.concat([df_podil, df_podil_kratke, df_podil_poezie])

nadpis = alt.Title(
    ["Podíl ženských autorek", "na nově vydaných českých knihách"],
    lineHeight=21,
)

# x: year on a temporal axis -- assumes alt_friendly makes "rok" date-like; TODO confirm.
osa_x = alt.X("rok:T", title=None, axis=alt.Axis(tickCount=5))

# y: share on a fixed 0–0.5 domain, labelled as percentages on the right side.
osa_y = alt.Y(
    'podil_zen:Q',
    title=None,
    axis=alt.Axis(
        tickCount=4,
        labelExpr="datum.label * 100 + ' %'",
        orient='right',
    ),
    scale=alt.Scale(domain=[0, 0.5]),
)

# Facet rows in a fixed genre order with horizontal, left-aligned labels.
radky = alt.Row(
    "zanr:N",
    header=alt.Header(
        labelAngle=0,
        labelAlign='left',
        labelAnchor='middle',
        labelFont='Asap',
    ),
    sort=["romány", "novely a povídky", "poezie"],
    title=None,
)

(
    alt.Chart(alt_friendly(podily), title=nadpis, width=250, height=90)
    .mark_area()
    .encode(osa_x, osa_y, radky)
    .configure_view(stroke='transparent')
    .resolve_scale(y='shared')
)

## Věková struktura

In [112]:
# Number of novels per (gender, vek) combination over the whole period,
# ordered by vek; the resulting count column is named "len".
cr2 = (
    ceske_romany
    .group_by(["gender", "vek"])
    .agg(pl.len())
    .sort(by="vek")
)
cr2

gender,vek,len
str,i64,u32
"""f""",13,1
"""f""",14,2
"""f""",15,7
,15,1
"""m""",15,2
"""m""",16,2
"""f""",16,6
"""m""",17,7
"""f""",17,13
"""f""",18,17


In [153]:
# Same (gender, vek) counts as cr2, restricted to novels with rok in 2015–2024.
romany_2015_2024 = ceske_romany.filter(pl.col('rok').is_between(2015, 2024))
cr2b = (
    romany_2015_2024
    .group_by(["gender", "vek"])
    .agg(pl.len())
    .sort(by="vek")
)
cr2b

gender,vek,len
str,i64,u32
"""f""",15,2
"""m""",15,1
"""f""",16,3
"""m""",17,4
"""f""",17,6
"""f""",18,5
"""m""",18,1
"""f""",19,9
"""m""",19,2
"""m""",20,2


In [155]:
# Line chart of novel counts by vek for the 2015–2024 subset, one colour per
# gender; rows with a gender other than 'm'/'f' (including null) are left out.
cr2b_mf = cr2b.filter(pl.col('gender').is_in(['m', 'f'])).to_pandas()
(
    alt.Chart(cr2b_mf)
    .mark_line()
    .encode(alt.X("vek"), alt.Y("len"), alt.Color("gender"))
)

In [137]:
# Line chart of novel counts by vek for the whole period, one colour per
# gender; rows with a gender other than 'm'/'f' (including null) are left out.
cr2_mf = cr2.filter(pl.col('gender').is_in(['m', 'f'])).to_pandas()
(
    alt.Chart(cr2_mf)
    .mark_line()
    .encode(alt.X("vek"), alt.Y("len"), alt.Color("gender"))
)

In [113]:
# Bar chart of novel counts by vek for the whole period, one facet row per
# gender; rows with a gender other than 'm'/'f' (including null) are left out.
cr2_sloupce = cr2.filter(pl.col('gender').is_in(['m', 'f'])).to_pandas()
(
    alt.Chart(cr2_sloupce)
    .mark_bar()
    .encode(alt.X("vek"), alt.Y("len"), alt.Row("gender"))
)

In [114]:
# Median vek of novel authors for every (gender, rok) combination.
# The groups come back in no particular order.
cr3 = (
    ceske_romany
    .group_by(["gender", "rok"])
    .agg(pl.median('vek'))
)
cr3

gender,rok,vek
str,i64,f64
"""f""",2022,43.0
"""m""",2001,62.0
"""m""",2004,55.0
"""m""",2007,58.5
"""m""",2020,57.0
"""f""",2014,47.0
"""f""",2002,51.0
"""f""",2010,50.0
"""f""",2005,51.0
"""f""",2011,48.0


In [157]:
# Median vek per year as a line, one facet row per gender; rows with a gender
# other than 'm'/'f' (including null) are left out before plotting.
cr3_mf = cr3.filter(pl.col('gender').is_in(['m', 'f']))
(
    alt.Chart(alt_friendly(cr3_mf))
    .mark_line()
    .encode(alt.X("rok"), alt.Y("vek"), alt.Row("gender"))
)