In [1]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu

pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
warnings.filterwarnings('ignore')

In [2]:
df = pl.read_parquet(os.path.join("data/cnb_sloupce","100.parquet"))
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","leader.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","008.parquet")), left_on="001", right_on="001", how="left")
df = df.to_pandas()
df = df[df["leader"].str[6].isin(["a", "t"])]
df = df[~df["leader"].str[7].isin(["b", "i", "s", " "])]
df = df[(df["008"].str[15:17] == "xr") & (df["008"].str[35:38] == "cze")]
df = pl.from_pandas(df)
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","020.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","022.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","041.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","245.parquet")), left_on="001", right_on="001", how="left")
df = df.join(pl.read_parquet(os.path.join("data/cnb_sloupce","300.parquet")), left_on="001", right_on="001", how="left")
df = df.explode("022_a").filter(pl.col("022_a").is_null())
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
df = df.explode("020_q").with_columns(pl.col("020_q").map_elements(zjisti_vazbu, return_dtype=str).alias('vazba'))
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))
df = df.sort(by="rok")
df = df.filter((~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()).unique(subset=["100_a","245_a"], keep="first")
print(len(df))

795736
566175


In [3]:
vedral = df.filter(pl.col("100_7") == "mzk2003169026")

In [4]:
vedral

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,041_ind1,041_a,041_h,041_b,041_k,041_g,041_f,041_d,041_e,041_j,041_n,041_m,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,rok,stran,vazba
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,i64,i64,str
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20102127917""",""" nam a22 a 4500""","""101110s2010 xr esd 0…",,,"[""978-80-87345-61-0""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český technický slovn…",,"""J. Vedral""",,,"""[elektronický zdroj] /""",,,"[""1 CD-ROM :""]","[""čb. ;""]","[""12 cm""]",,,,2010,1,
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191530""",""" nam a22 a 4500""","""030714s2002 xr e d 0…","""(brož.)""",,"[""80-86261-41-7""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český slovník lyžován…",,"""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2002,20,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20102087617""",""" cam a22 a 4500""","""100105s2009 xr g f 0…","""(brož.)""",,"[""978-80-87345-14-6""]",,,,,,,,,,,,,,,,,,,"""1""","""0""","""Redoxní rovnice""",,"""Jiří Vedral""",,,,,,"[""32 s. ;""]",,"[""22 cm""]",,,,2009,32,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636459""",""" cam a22 a 4500""","""140925s2014 xr e d 0…","""(brož.)""",,"[""978-80-7457-352-1""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Německo-český slovník zaměstná…",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2014,44,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031248955""",""" nam a22 a 4500""","""030715s2003 xr e d 0…","""(brož.)""",,"[""80-86711-48-X""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český optický slovník""",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2003,44,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061699782""",""" nam a22 a 4500""","""070104s2006 xr e d 0…","""(v knize neuvedeno ;""",,"[""80-7374-036-2""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Česko-anglický slovník jmen ni…",,"""J. Vedral""",,,,,,"[""32 s. ;""]",,"[""21 cm""]",,,,2006,32,
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176271""",""" nam a22 a 4500""","""110503s2011 xr e e 0…","""(brož.)""",,"[""978-80-7457-012-4""]",,,,,,,"""0""","[""cze"", ""bul""]",,,,,,,,,,,"""1""","""0""","""Bulharsko-český slovník veřejn…",,"""J. Vedral""",,,,,,"[""140 s. ;""]",,"[""21 cm""]",,,,2011,140,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061684666""",""" nam a22 a 4500""","""070206s2006 xr e d 0…","""(brož.)""",,"[""80-7374-027-3""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Česko-německý slovník jmen jed…",,"""J. Vedral""",,,,,,"[""12 s. ;""]",,"[""21 cm""]",,,,2006,12,"""brožovaná"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpe20122343676""",""" nam a22 a 4500""","""120220s2006 xr esd 0…",,,,"[""80-7374-038-9""]",,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český slovník automat…",,"""J. Vedral""",,,"""[elektronický zdroj] /""",,,"[""1 CD-ROM :""]","[""barev. ;""]","[""12 cm""]",,,,2006,1,
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20051448828""",""" nam a22 a 4500""","""051021s2005 xr g d 0…","""(brož.)""",,"[""80-86711-81-1""]",,,,,,,"""0""","[""cze"", ""nor""]",,,,,,,,,,,"""1""","""0""","""Norsko-český biologický slovní…",,"""J. Vedral""",,,,,,"[""52 s. ;""]",,"[""21 cm""]",,,,2005,52,"""brožovaná"""


In [5]:
def jazyk(retezec):
    for x in retezec.split(" "):
        if "-" in x:
            return x.lower()

In [6]:
def ceho(retezec):
    slova = retezec.split(" ")
    temata = []
    for s in slova:
        if "-" not in s:
            if s.lower() != "slovník":
                temata.append(s)
    return ' '.join(temata)

In [7]:
vedral = vedral.with_columns(
    pl.col("245_a").map_elements(jazyk).alias('jazyk')
).with_columns(pl.col('245_a').map_elements(ceho).alias('ceho'))

In [8]:
vedral

100_ind1,100_a,100_7,100_4,100_d,100_q,100_c,100_b,100_e,001,leader,008,020_q,020_c,020_a,020_z,022_a,022_y,022_z,022_ind1,022_l,041_ind1,041_a,041_h,041_b,041_k,041_g,041_f,041_d,041_e,041_j,041_n,041_m,245_ind1,245_ind2,245_a,245_b,245_c,245_n,245_p,245_h,245_f,245_s,300_a,300_b,300_c,300_e,300_f,300_3,rok,stran,vazba,jazyk,ceho
str,str,str,list[str],str,str,list[str],str,str,str,str,str,str,list[str],list[str],list[str],str,list[str],list[str],str,str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str,str,str,str,str,str,str,list[str],str,str,str,str,list[str],list[str],list[str],list[str],str,str,i64,i64,str,str,str
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20102127917""",""" nam a22 a 4500""","""101110s2010 xr esd 0…",,,"[""978-80-87345-61-0""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český technický slovn…",,"""J. Vedral""",,,"""[elektronický zdroj] /""",,,"[""1 CD-ROM :""]","[""čb. ;""]","[""12 cm""]",,,,2010,1,,"""anglicko-český""","""technický na CD"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031191530""",""" nam a22 a 4500""","""030714s2002 xr e d 0…","""(brož.)""",,"[""80-86261-41-7""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český slovník lyžován…",,"""J. Vedral""",,,,,,"[""20 s. ;""]",,"[""21 cm""]",,,,2002,20,"""brožovaná""","""anglicko-český""","""lyžování"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20102087617""",""" cam a22 a 4500""","""100105s2009 xr g f 0…","""(brož.)""",,"[""978-80-87345-14-6""]",,,,,,,,,,,,,,,,,,,"""1""","""0""","""Redoxní rovnice""",,"""Jiří Vedral""",,,,,,"[""32 s. ;""]",,"[""22 cm""]",,,,2009,32,"""brožovaná""",,"""Redoxní rovnice"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20142636459""",""" cam a22 a 4500""","""140925s2014 xr e d 0…","""(brož.)""",,"[""978-80-7457-352-1""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Německo-český slovník zaměstná…",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2014,44,"""brožovaná""","""německo-český""","""zaměstnání ISCO 08"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpk20031248955""",""" nam a22 a 4500""","""030715s2003 xr e d 0…","""(brož.)""",,"[""80-86711-48-X""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český optický slovník""",,"""J. Vedral""",,,,,,"[""44 s. ;""]",,"[""21 cm""]",,,,2003,44,"""brožovaná""","""anglicko-český""","""optický"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061699782""",""" nam a22 a 4500""","""070104s2006 xr e d 0…","""(v knize neuvedeno ;""",,"[""80-7374-036-2""]",,,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Česko-anglický slovník jmen ni…",,"""J. Vedral""",,,,,,"[""32 s. ;""]",,"[""21 cm""]",,,,2006,32,,"""česko-anglický""","""jmen nižších rostlin"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20112176271""",""" nam a22 a 4500""","""110503s2011 xr e e 0…","""(brož.)""",,"[""978-80-7457-012-4""]",,,,,,,"""0""","[""cze"", ""bul""]",,,,,,,,,,,"""1""","""0""","""Bulharsko-český slovník veřejn…",,"""J. Vedral""",,,,,,"[""140 s. ;""]",,"[""21 cm""]",,,,2011,140,"""brožovaná""","""bulharsko-český""","""veřejných zakázek (CPV)"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20061684666""",""" nam a22 a 4500""","""070206s2006 xr e d 0…","""(brož.)""",,"[""80-7374-027-3""]",,,,,,,"""0""","[""cze"", ""ger""]",,,,,,,,,,,"""1""","""0""","""Česko-německý slovník jmen jed…",,"""J. Vedral""",,,,,,"[""12 s. ;""]",,"[""21 cm""]",,,,2006,12,"""brožovaná""","""česko-německý""","""jmen jednoděložných"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""cpe20122343676""",""" nam a22 a 4500""","""120220s2006 xr esd 0…",,,,"[""80-7374-038-9""]",,,,,,"""0""","[""cze"", ""eng""]",,,,,,,,,,,"""1""","""0""","""Anglicko-český slovník automat…",,"""J. Vedral""",,,"""[elektronický zdroj] /""",,,"[""1 CD-ROM :""]","[""barev. ;""]","[""12 cm""]",,,,2006,1,,"""anglicko-český""","""automatizace na CD"""
"""1""","""Vedral, Jiří,""","""mzk2003169026""","[""aut""]","""1973-""",,,,,"""nkc20051448828""",""" nam a22 a 4500""","""051021s2005 xr g d 0…","""(brož.)""",,"[""80-86711-81-1""]",,,,,,,"""0""","[""cze"", ""nor""]",,,,,,,,,,,"""1""","""0""","""Norsko-český biologický slovní…",,"""J. Vedral""",,,,,,"[""52 s. ;""]",,"[""21 cm""]",,,,2005,52,"""brožovaná""","""norsko-český""","""biologický"""


In [37]:
vedral.group_by("rok").len().sort(by="rok")

rok,len
i64,u32
2002,75
2003,39
2004,8
2005,16
2006,46
2007,1
2008,29
2009,23
2010,21
2011,40


In [9]:
df = vedral.filter(pl.col('ceho').is_not_null() & pl.col('jazyk').is_not_null()).select(pl.col(['ceho','jazyk']))

In [10]:
df

ceho,jazyk
str,str
"""technický na CD""","""anglicko-český"""
"""lyžování""","""anglicko-český"""
"""zaměstnání ISCO 08""","""německo-český"""
"""optický""","""anglicko-český"""
"""jmen nižších rostlin""","""česko-anglický"""
"""veřejných zakázek (CPV)""","""bulharsko-český"""
"""jmen jednoděložných""","""česko-německý"""
"""automatizace na CD""","""anglicko-český"""
"""biologický""","""norsko-český"""
"""PRODCOM""","""španělsko-český"""


In [25]:
temata = df.group_by('ceho').len().sort(by='len',descending=True)
temata.head(20)

ceho,len
str,u32
"""jmen ptáků""",36
"""biologický""",26
"""katalog odpadů""",26
"""ekonomických činností (NACE)""",25
"""lékařský""",24
"""PRODCOM""",19
"""celní sazebník""",19
"""třídění odpadů""",18
"""veřejných zakázek CPV""",18
"""zemědělský""",16


In [27]:
len(temata)

240

In [29]:
reci = df.filter(pl.col('jazyk').str.contains('český')).group_by('jazyk').len().sort(by='len',descending=True)
reci.head(20)

jazyk,len
str,u32
"""anglicko-český""",129
"""německo-český""",48
"""francouzsko-český""",21
"""italsko-český""",19
"""španělsko-český""",19
"""portugalsko-český""",18
"""polsko-český""",15
"""maďarsko-český""",14
"""nizozemsko-český""",13
"""slovinsko-český""",13


In [31]:
len(reci)

54

In [33]:
dilo = []
for t in temata.select(pl.col('ceho')).to_series().to_list():
    slovnik = {}
    for r in reci:
        print(r)

shape: (54,)
Series: 'jazyk' [str]
[
	"anglicko-český"
	"německo-český"
	"francouzsko-český"
	"italsko-český"
	"španělsko-český"
	"portugalsko-český"
	"polsko-český"
	"maďarsko-český"
	"nizozemsko-český"
	"slovinsko-český"
	"rusko-český"
	"finsko-český"
	"turecko-český"
	"slovensko-český"
	"latinsko-český"
	"japonsko-český"
	"estonsko-český"
	"dánsko-český"
	"bulharsko-český"
	"švédsko-český"
	"čínsko-český"
	"norsko-český"
	"lotyšsko-český"
	"arabsko-český"
	"litevsko-český"
	"rumunsko-český"
	"korejsko-český"
	"chorvatsko-český"
	"řecko-český"
	"srbsko-český"
	"maltsko-český"
	"makedonsko-český"
	"ázerbajdžánsko-český"
	"bosensko-český"
	"thajsko-český"
	"arménsko-český"
	"ukrajinsko-český"
	"vietnamsko-český"
	"albánsko-český"
	"gruzínsko-český"
	"seversko-český"
	"černohorsko-český"
	"islandsko-český"
	"kazašsko-český"
	"tegulsko-český"
	"holandsko-český"
	"farsí-český"
	"afrikánsko-český"
	"jugoslávsko-český"
	"kosovsko-albánsko-český"
	"francousko-český"
	"skotsko-český"
	"hindsk

In [None]:
pl.DataFrame([
    {"finsko-český":
     {"jmen hub": 0, "jmen rostlin": 1},
    },
    {"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }
])

In [None]:
[
    {"finsko-český":
     {"jmen hub": 0, "jmen rostlin": 1},
    },
    {"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }
]

In [None]:
{"rusko-český":
     {"jmen hub": 1, "jmen rostlin": 0},
    }