In [1]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

# Show up to 100 rows when a polars table is displayed.
pl.Config(tbl_rows=100)
# Lift Altair's default row limit so large frames can be charted.
alt.data_transformers.disable_max_rows()
# Register the project theme callable (kristi_promin) under the name
# 'irozhlas', then make it the active theme.
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used here — consider a narrower filter.
warnings.filterwarnings('ignore')

In [47]:
# Assemble the working table `df` from the per-field parquet exports.
# Every export is joined on the shared key column "001".
slozka = "data/cnb_sloupce"

# Base table (field 100) plus the two fields needed for the record filters.
df = pl.read_parquet(os.path.join(slozka, "100.parquet"))
for soubor in ("leader.parquet", "008.parquet"):
    df = df.join(pl.read_parquet(os.path.join(slozka, soubor)), on="001", how="left")

# The positional string filters are evaluated in pandas, one after another.
# NOTE(review): presumably MARC 21 semantics — leader[6] record type, leader[7]
# bibliographic level, 008[15:17] place of publication, 008[35:38] language;
# confirm against the cataloguing documentation.
df = pl.from_pandas(
    df.to_pandas()
    .loc[lambda t: t["leader"].str[6].isin(["a", "t"])]
    .loc[lambda t: ~t["leader"].str[7].isin(["b", "i", "s", " "])]
    .loc[lambda t: (t["008"].str[15:17] == "xr") & (t["008"].str[35:38] == "cze")]
)

# Remaining descriptive fields, attached only to the records that survived.
for soubor in (
    "020.parquet",
    "022.parquet",
    "245.parquet",
    "300.parquet",
    "655.parquet",
    "700.parquet",
):
    df = df.join(pl.read_parquet(os.path.join(slozka, soubor)), on="001", how="left")

# Keep only rows whose 022_a is null once the list column is exploded.
df = df.explode("022_a").filter(pl.col("022_a").is_null())

# Derived columns; the three expressions read different source columns,
# so they can be computed in a single pass.
df = df.with_columns(
    pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'),
    pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'),
    pl.col('245_a').map_elements(bez_bordelu, return_dtype=str),
)

# One row per 020_q value, each mapped through zjisti_vazbu into 'vazba'.
df = df.explode("020_q")
df = df.with_columns(
    pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba')
)

# One row per 245_p value, cleaned with the same helper as 245_a.
df = df.explode('245_p')
df = df.with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))

# Drop rows with 15 pages or fewer (rows with a null page count go too).
df = df.filter(pl.col("stran") > 15)
print(df.height)

# Require a known year, order chronologically, drop rows whose 245_h mentions
# "grafika", and keep the first row per (100_a, 245_a) pair.
df = df.filter(pl.col('rok').is_not_null()).sort(by='rok')
df = df.filter(
    pl.col("245_h").is_null() | ~pl.col("245_h").str.contains("grafika")
).unique(subset=["100_a", "245_a"], keep="first")
print(df.height)

727498
522219


In [48]:
# Load the table that the later cells filter and join on 100_7.
# NOTE(review): presumably the selection of authority records — confirm.
aut_soubor = os.path.join("data", "aut_vyber.parquet")
aut = pl.read_parquet(aut_soubor)

In [49]:
# Collect the 100_7 identifiers of every `aut` row whose 370_c contains "Česk",
# then keep only the rows of `df` whose 100_7 is one of them.
cesi = (
    aut.explode("370_c")
    .filter(pl.col("370_c").str.contains("Česk"))
    .explode("100_7")
    .get_column("100_7")
    .to_list()
)
df = df.filter(pl.col("100_7").is_in(cesi))

In [50]:
# Preview ten random rows of `aut`; the fixed seed makes the preview
# reproducible across kernel restarts.
aut.sample(10, seed=42)

024_2,024_a,046_f,046_g,100_7,100_a,100_d,100_ind1,110_a,370_a,370_b,370_c,370_f,372_a,373_a,374_a,375_a,377_a,400_a,400_d,400_i,400_ind1,410_a,411_a,430_a,450_a,500_a,500_i,500_ind1,550_7,678_a,856_u,leader,001
list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str
,,,,,,,,"[""Česko.""]",,,"[""Česko""]",,,,,,"[""cze""]",,,,,,,,,,,,,"[""Zákon č. 513/1991 Sb., Obchodní zákoník, změny 1994-2011 (novely 351/2011 Sb., 355/2011 Sb., 420/2011 Sb.), zrušeno k 1.1.2014 (89/2012 Sb.).""]",,""" nz a22 n 4500""","""kon20221155018"""
,,"[""1792""]","[""1876""]","[""jn20030822005""]","[""Capponi, Gino,""]","[""1792-1876""]","[""1""]",,,,,,,,,,,"[""Capponi, Gino Alessandro Giuseppe Gaspero,""]","[""1792-1876""]",,"[""1""]",,,,,,,,,,,""" cz a22 n 4500""","""jn20030822005"""
,,"[""1953""]",,"[""mzk2011655987""]","[""Arnold, Norbert,""]","[""1953-""]","[""1""]",,,,,,,,,,,,,,,,,,,,,,,"[""Německý molekulární biolog, též odborník na zdravotnickou politiku.""]",,""" nz a22 n 4500""","""mzk2011655987"""
,,,,"[""vut2010599342""]","[""Schaefer, Hans""]",,"[""1""]",,,,,,,,,,,"[""Schaefer, H.""]",,,"[""1""]",,,,,,,,,"[""Informatik působící v Norsku. Specializace v oblasti testování softwaru.""]",,""" nz a22 n 4500""","""vut2010599342"""
,,,,,,,,,,,"[""Česko""]",,"[""film""]",,,,"[""cze""]",,,,,,"[""International Film Festival Karlovy Vary"", ""MFF KV (festival)""]",,,,,,,,,""" cz a22 n 4500""","""kv2017950902"""
,,,,,,,,"[""Státní odborná škola drogistická při Obchodní akademii Obchodní a živnostenské komory v Brně (Brno, Česko)""]",,,"[""Česko""]",,,,,,,,,,,,,,,,,,,,,""" nz a22 n 4500""","""ko2010595964"""
,,"[""1922""]","[""1998""]","[""xx0145807""]","[""Stuart, Alan,""]","[""1922-1998""]","[""1""]",,,,,,,,,,,,,,,,,,,,,,,"[""Britský ekonom, specializace na statistiku.""]",,""" nz a22 n 4500""","""xx0145807"""
"[""isni""]","[""0000000370394896""]",,,"[""jo20211134306""]","[""Kempara-Sobowska, Kamila""]",,"[""1""]",,,,,,"[""překlady""]",,"[""překladatelky""]","[""žena""]","[""pol"", ""eng""]","[""Sobowska, Kamila Kempara-""]",,,"[""1""]",,,,,,,,,"[""Polská překladatelka se zaměřením na ezoterickou literaturu a knihy pro děti.""]",,""" nz a22 n 4500""","""jo20211134306"""
,,"[""1919""]","[""1991""]","[""pna20211122201""]","[""Boyle, Andrew,""]","[""1919-1991""]","[""1""]",,,,"[""Velká Británie""]","[""Skotsko""]","[""žurnalistika"", ""literární činnost"", ""biografie""]",,"[""novináři"", ""spisovatelé""]","[""muž""]","[""eng""]",,,,,,,,,,,,,"[""Britský (skotský) novinář a autor biografií.""]",,""" nz a22 n 4500""","""pna20211122201"""
,,,,"[""xx0201103""]","[""Maine, Trevor""]",,"[""1""]",,,,"[""Belgie""]",,"[""teologie""]",,"[""teologové""]","[""muž""]","[""eng""]",,,,,,,,,,,,,"[""Belgický teolog.""]",,""" nz a22 n 4500""","""xx0201103"""


In [51]:
# Attach the `aut` columns to each row of `df` through the shared 100_7 key.
# NOTE(review): 370_f and 370_b are both exploded, so a key with several values
# in each column yields one row per combination; later row counts per year
# count those combinations, not distinct titles — confirm this is intended.
# NOTE(review): the cell rebinds `df` to the joined result, so it should be run
# exactly once on a freshly built `df`.
autori = aut.explode("100_7").explode("370_f")
df = df.join(autori, on="100_7", how="left")
df = df.explode('370_b')

In [52]:
# No-op: rebinds `df` to itself, so nothing changes. This cell can be deleted.
df = df

In [77]:
# Rows per year ('rok') that have a place recorded in at least one of the two
# place columns. 370_b is tested alongside 370_f because both columns are
# searched when a specific city is matched; testing 370_f twice would drop
# rows that only carry 370_b.
df.filter(
    pl.col('370_f').is_not_null() | pl.col('370_b').is_not_null()
).group_by("rok").len().sort(by='rok')

rok,len
i64,u32
1803,1
1805,1
1806,3
1807,1
1808,13
1811,2
1812,5
1813,4
1814,9
1815,2


In [99]:
# Yearly share of rows tied to one city among all rows that have any place
# recorded. Resulting columns: 'len' (rows with a place), 'len_right' (rows
# matching the city) and 'podil' (len_right / len).
hledane_mesto = "Ústí nad Labem"

# Denominator: a place is present in either column. Both 370_f and 370_b are
# tested so that the denominator covers the same columns as the city match.
ma_misto = pl.col('370_f').is_not_null() | pl.col('370_b').is_not_null()
# Numerator: the city name occurs in either place column.
je_z_mesta = (
    pl.col("370_f").str.contains(hledane_mesto)
    | pl.col("370_b").str.contains(hledane_mesto)
)

# NOTE(review): the left join leaves 'len_right' (and therefore 'podil') null
# for years without any match; those years appear as gaps, not zeros.
mesto = (
    df.filter(ma_misto).group_by("rok").len()
    .join(df.filter(je_z_mesta).group_by("rok").len(), how="left", on="rok")
    .sort(by='rok')
    .with_columns((pl.col('len_right') / pl.col('len')).alias('podil'))
)

# NOTE(review): 'rok' is an integer year but is encoded as temporal (':T');
# presumably alt_friendly converts it to a date — confirm.
alt.Chart(alt_friendly(mesto.filter(pl.col('rok').is_between(1900,2000)))).mark_line().encode(
    alt.X('rok:T'),
    alt.Y('podil:Q')
)

In [83]:
# Show the per-year counts and the share column ('podil') held in `mesto`.
mesto

rok,len,len_right,podil
i64,u32,u32,f64
1803,1,,
1805,1,,
1806,3,,
1807,1,,
1808,13,,
1811,2,,
1812,5,,
1813,4,,
1814,9,,
1815,2,,
