In [1]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
warnings.filterwarnings('ignore')

In [3]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","041.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode("020_q").with_columns(pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba'))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))
df = df.sort(by="rok")
df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["100_a","245_a"], keep="first")
print(len(df))

795736
566175


In [7]:
vedral = df.filter(pl.col("100_7") == "mzk2003169026")

In [9]:
vedral

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,041_ind1,041_a,041_h,041_b,041_k,041_g,041_f,041_d,041_e,041_j,041_n,041_m,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,rok,stran,vazba
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,i64,i64,str
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061684664""",""" nam a22 a 4500""","""060807s2006 xr e d 0…","""(brož.)""",,"[""80-7374-025-7""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Česko-německý slovník jmen hub""",,"""J. Vedral""",,,,,,"[""12 s. ;""]",,"[""21 cm""]",,,,2006,12,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191583""",""" nam a22 a 4500""","""030722s2002 xr e d 0…","""(brož.)""",,"[""80-86261-80-8""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český hudební slovník""",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2002,44,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142611740""",""" cam a22 a 4500""","""140620s2014 xr f e 0…","""(brož.)""",,"[""978-80-7457-327-9""]",,,,,,,"""0""","[""cze"", ""fre""]",,,,,,,,,,,"""1""","""0""","""Francouzsko-český slovník nádo…","""(Classification internationale…","""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2014,20,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176273""",""" nam a22 a 4500""","""110503s2011 xr f e 0…","""(brož.)""",,"[""978-80-7457-014-8""]",,,,,,,"""0""","[""cze"", ""tha""]",,,,,,,,,,,"""1""","""0""","""Česko-thajský lékařský slovník""",,"""J. Vedral""",,,,,,"[""136 s. ;""]",,"[""21 cm""]",,,,2011,136,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142611724""",""" cam a22 a 4500""","""140620s2014 xr e e 0…","""(brož.)""",,"[""978-80-7457-303-3""]",,,,,,,"""0""","[""cze"", ""gre""]",,,,,,,,,,,"""1""","""0""","""Řecko-český slovník třídění od…",,"""J. Vedral""",,,,,,"[""24 s. ;""]",,"[""21 cm""]",,,,2014,24,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176258""",""" nam a22 a 4500""","""110503s2011 xr f e 0…","""(kroužková vazba)""",,"[""978-80-7457-009-4""]",,,,,,,"""0""","[""cze"", ""est""]",,,,,,,,,,,"""1""","""0""","""Estonsko-český lékařský slovní…",,"""J. Vedral""",,,,,,"[""188 s. ;""]",,"[""21 cm""]",,,,2011,188,
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636466""",""" cam a22 a 4500""","""140925s2014 xr e d 0…","""(brož.)""",,"[""978-80-7457-360-6""]",,,,,,,"""0""","[""cze"", ""por""]",,,,,,,,,,,"""1""","""0""","""Portugalsko-český slovník ekon…",,"""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2014,20,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636448""",""" cam a22 i 4500""","""140925s2014 xr e d 0…","""(kroužková vazba)""",,"[""978-80-7457-387-3""]",,,,,,,"""0""","[""cze"", ""lav""]",,,,,,,,,,,"""1""","""0""","""Lotyšsko-český slovník PRODCOM""",,"""J. Vedral""",,,,,,"[""164 stran ;""]",,"[""21 cm""]",,,,2014,164,
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112164448""",""" nam a22 a 4500""","""120425s2012 xr e e 0…","""(brož.)""",,"[""978-80-87345-98-6""]",,,,,,,"""0""","[""cze"", ""spa""]",,,,,,,,,,,"""1""","""0""","""Španělsko-český slovník jmen m…",,"""J. Vedral""",,,,,,"[""16 s. ;""]",,"[""21 cm""]",,,,2012,16,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191598""",""" nam a22 a 4500""","""030703s2002 xr e d 0…","""(brož.)""",,"[""80-86261-93-X""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Německo-český rybářský slovník""",,"""J. Vedral""",,,,,,"[""28 s. ;""]",,"[""22 cm""]",,,,2002,28,"""brožovaná"""


In [57]:
def jazyk(retezec):
    for x in retezec.split(" "):
        if "-" in x:
            return x.lower()

In [31]:
def ceho(retezec):
    slova = retezec.split(" ")
    temata = []
    for s in slova:
        if "-" not in s:
            if s.lower() != "slovník":
                temata.append(s)
    return ' '.join(temata)

In [33]:
vedral = vedral.with_columns(
    pl.col("245_a").map_elements(jazyk).alias('jazyk')
).with_columns(pl.col('245_a').map_elements(ceho).alias('ceho'))

In [35]:
vedral

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,041_ind1,041_a,041_h,041_b,041_k,041_g,041_f,041_d,041_e,041_j,041_n,041_m,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,rok,stran,vazba,jazyk,ceho
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,i64,i64,str,str,str
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061684664""",""" nam a22 a 4500""","""060807s2006 xr e d 0…","""(brož.)""",,"[""80-7374-025-7""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Česko-německý slovník jmen hub""",,"""J. Vedral""",,,,,,"[""12 s. ;""]",,"[""21 cm""]",,,,2006,12,"""brožovaná""","""česko-německý""","""jmen hub"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191583""",""" nam a22 a 4500""","""030722s2002 xr e d 0…","""(brož.)""",,"[""80-86261-80-8""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český hudební slovník""",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2002,44,"""brožovaná""","""anglicko-český""","""hudební"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142611740""",""" cam a22 a 4500""","""140620s2014 xr f e 0…","""(brož.)""",,"[""978-80-7457-327-9""]",,,,,,,"""0""","[""cze"", ""fre""]",,,,,,,,,,,"""1""","""0""","""Francouzsko-český slovník nádo…","""(Classification internationale…","""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2014,20,"""brožovaná""","""francouzsko-český""","""nádorů"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176273""",""" nam a22 a 4500""","""110503s2011 xr f e 0…","""(brož.)""",,"[""978-80-7457-014-8""]",,,,,,,"""0""","[""cze"", ""tha""]",,,,,,,,,,,"""1""","""0""","""Česko-thajský lékařský slovník""",,"""J. Vedral""",,,,,,"[""136 s. ;""]",,"[""21 cm""]",,,,2011,136,"""brožovaná""","""česko-thajský""","""lékařský"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142611724""",""" cam a22 a 4500""","""140620s2014 xr e e 0…","""(brož.)""",,"[""978-80-7457-303-3""]",,,,,,,"""0""","[""cze"", ""gre""]",,,,,,,,,,,"""1""","""0""","""Řecko-český slovník třídění od…",,"""J. Vedral""",,,,,,"[""24 s. ;""]",,"[""21 cm""]",,,,2014,24,"""brožovaná""","""řecko-český""","""třídění odpadů"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176258""",""" nam a22 a 4500""","""110503s2011 xr f e 0…","""(kroužková vazba)""",,"[""978-80-7457-009-4""]",,,,,,,"""0""","[""cze"", ""est""]",,,,,,,,,,,"""1""","""0""","""Estonsko-český lékařský slovní…",,"""J. Vedral""",,,,,,"[""188 s. ;""]",,"[""21 cm""]",,,,2011,188,,"""estonsko-český""","""lékařský"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636466""",""" cam a22 a 4500""","""140925s2014 xr e d 0…","""(brož.)""",,"[""978-80-7457-360-6""]",,,,,,,"""0""","[""cze"", ""por""]",,,,,,,,,,,"""1""","""0""","""Portugalsko-český slovník ekon…",,"""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2014,20,"""brožovaná""","""portugalsko-český""","""ekonomických činností (NACE)"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636448""",""" cam a22 i 4500""","""140925s2014 xr e d 0…","""(kroužková vazba)""",,"[""978-80-7457-387-3""]",,,,,,,"""0""","[""cze"", ""lav""]",,,,,,,,,,,"""1""","""0""","""Lotyšsko-český slovník PRODCOM""",,"""J. Vedral""",,,,,,"[""164 stran ;""]",,"[""21 cm""]",,,,2014,164,,"""lotyšsko-český""","""PRODCOM"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112164448""",""" nam a22 a 4500""","""120425s2012 xr e e 0…","""(brož.)""",,"[""978-80-87345-98-6""]",,,,,,,"""0""","[""cze"", ""spa""]",,,,,,,,,,,"""1""","""0""","""Španělsko-český slovník jmen m…",,"""J. Vedral""",,,,,,"[""16 s. ;""]",,"[""21 cm""]",,,,2012,16,"""brožovaná""","""španělsko-český""","""jmen mořských živočichů"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191598""",""" nam a22 a 4500""","""030703s2002 xr e d 0…","""(brož.)""",,"[""80-86261-93-X""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Německo-český rybářský slovník""",,"""J. Vedral""",,,,,,"[""28 s. ;""]",,"[""22 cm""]",,,,2002,28,"""brožovaná""","""německo-český""","""rybářský"""


In [41]:
df = vedral.filter(pl.col('ceho').is_not_null() & pl.col('jazyk').is_not_null()).select(pl.col(['ceho','jazyk']))

In [43]:
df

ceho,jazyk
str,str
"""jmen hub""","""česko-německý"""
"""hudební""","""anglicko-český"""
"""nádorů""","""francouzsko-český"""
"""lékařský""","""česko-thajský"""
"""třídění odpadů""","""řecko-český"""
"""lékařský""","""estonsko-český"""
"""ekonomických činností (NACE)""","""portugalsko-český"""
"""PRODCOM""","""lotyšsko-český"""
"""jmen mořských živočichů""","""španělsko-český"""
"""rybářský""","""německo-český"""


In [63]:
temata = df.group_by('ceho').len().sort(by='len',descending=True).head(20)
temata

ceho,len
str,u32
"""jmen ptáků""",36
"""katalog odpadů""",26
"""biologický""",26
"""ekonomických činností (NACE)""",25
"""lékařský""",24
"""PRODCOM""",19
"""celní sazebník""",19
"""veřejných zakázek CPV""",18
"""třídění odpadů""",18
"""zemědělský""",16


In [69]:
reci = df.filter(pl.col('jazyk').str.contains('český')).group_by('jazyk').len().sort(by='len',descending=True).head(20)
reci

jazyk,len
str,u32
"""anglicko-český""",129
"""německo-český""",48
"""francouzsko-český""",21
"""španělsko-český""",19
"""italsko-český""",19
"""portugalsko-český""",18
"""polsko-český""",15
"""maďarsko-český""",14
"""nizozemsko-český""",13
"""rusko-český""",13


In [None]:
dilo = []
for t in temata.select(pl.col('ceho')).to_series().to_list():
    slovnik = {}
    for r in reci:
        

In [113]:
pl.DataFrame([
    {"finsko-český":
     {"jmen hub": 0, "jmen rostlin": 1},
    },
    {"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }
])

finsko-český,rusko-český
struct[2],struct[2]
"{0,1}",
,"{1,0}"


In [107]:
[
    {"finsko-český":
     {"jmen hub": 0, "jmen rostlin": 1},
    },
    {"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }
]

[{'finsko-český': {'jmen hub': 0, 'jmen rostlin': 1}},
 {'rusko-český': {'jmen hub': 1, 'jmen rostlin': 0}}]

In [105]:
{"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }

{'rusko-český': {'jmen hub': 1, 'jmen rostlin': 0}}