In [1]:
import os
import json
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu
from src.me_to_neurazi import me_to_neurazi

# Read the credits metadata stored next to the helper modules. The file handle
# gets its own name so it is not confused with the parsed payload.
with open(os.path.join('src', 'kredity.json'), mode='r', encoding='utf-8') as kredity_soubor:
    kredity = json.load(kredity_soubor)

# Notebook-wide display settings: longer polars table previews and no altair
# row limit.
pl.Config(tbl_rows=100)
alt.data_transformers.disable_max_rows()

# Register and activate the project chart theme.
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [20]:
# Load the per-field column dumps and attach field 008 to every leader row.
sloupce_dir = "data/cnb_sloupce"
leader_df = pl.read_parquet(os.path.join(sloupce_dir, "leader.parquet"))
pole_008 = pl.read_parquet(os.path.join(sloupce_dir, "008.parquet"))

df = (
    leader_df
    .join(pole_008, left_on="001", right_on="001", how="left")
    # `rok` is whatever najdi_rok derives from the 008 string (presumably the
    # publication year — confirm in src.najdi_rok); unparsable values stay null.
    .with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
)

In [22]:
# Total number of rows loaded.
df.height

1250624

In [24]:
# Rows for which a `rok` value could be derived.
s_rokem = df.filter(pl.col("rok").is_not_null())
s_rokem.height

1215703

In [26]:
# Share of all rows that have a non-null `rok`.
df.filter(pl.col("rok").is_not_null()).height / df.height

0.9720771390921652

In [28]:
# Convert the polars frame to pandas for the string-position filters below.
# NOTE(review): the following cell starts with the very same conversion, so
# this cell looks redundant.
panda = df.to_pandas()

In [30]:
# Start from a fresh pandas copy so the cell gives the same result on re-run.
panda = df.to_pandas()

# Step 1: keep rows whose leader character 6 is "a" or "t"
# (presumably the MARC record-type position — confirm).
typ_ok = panda["leader"].str[6].isin(["a", "t"])
panda = panda.loc[typ_ok]
print(len(panda))

# Step 2: drop rows whose leader character 7 is "b", "i", "s" or a blank
# (presumably the MARC bibliographic-level position — confirm).
uroven_vyrazena = panda["leader"].str[7].isin(["b", "i", "s", " "])
panda = panda.loc[~uroven_vyrazena]
print(len(panda))

# Step 3: keep rows whose 008 characters 15-16 read "xr".
panda = panda.loc[panda["008"].str[15:17].eq("xr")]
print(len(panda))

# Step 4: keep rows whose 008 characters 35-37 read "cze".
panda = panda.loc[panda["008"].str[35:38].eq("cze")]
print(len(panda))

1044144
995609
995609
893355


In [32]:
# Share of all loaded rows (1250624, the len(df) output above) that remain
# after the first filter step (1044144, the first count printed by the
# filtering cell). The numbers are copied by hand from those outputs.
1044144 / 1250624 

0.8348984187093803

In [9]:
# Share of all loaded rows (1250624, the len(df) output above) that remain
# after the second filter step (995609, the second count printed by the
# filtering cell). The numbers are copied by hand from those outputs.
995609 / 1250624 

0.7960897919758456

In [10]:
# Share of all loaded rows that survive the complete filter chain.
# Computed from the live frames instead of hand-copied counts, so the figure
# cannot drift from the data on a re-run.
len(panda) / len(df)

0.7143274077580472

In [34]:
# Bring the filtered pandas frame back into polars for the remaining analysis.
# NOTE(review): after the pandas round trip `rok` shows up as f64 in the sample
# output below; the later is_not_null count presumably relies on from_pandas
# turning missing values into nulls — confirm.
zpanda = pl.from_pandas(panda)

In [36]:
# Number of rows left after filtering.
zpanda.height

893355

In [38]:
# Filtered rows that carry a non-null `rok`.
zpanda_s_rokem = zpanda.filter(pl.col("rok").is_not_null())
zpanda_s_rokem.height

884197

In [14]:
# Share of all loaded rows that pass every filter and also have a non-null
# `rok`. Computed from the live frames instead of hand-copied counts, so the
# figure stays consistent with the data on a re-run.
len(zpanda.filter(pl.col("rok").is_not_null())) / len(df)

0.7070046632720945

In [18]:
# Eyeball ten random rows. A fixed seed keeps the preview identical across
# Restart & Run All.
zpanda.sample(10, seed=42)

leader,001,008,001_right,rok
str,str,str,str,f64
""" nam a22 1i 4500""","""bk197301702""","""970320s1973 xr 0…","""bk197301702""",1973.0
""" nam a22 a 4500""","""cpk20021179848""","""021016s2002 xr e 0…","""cpk20021179848""",2002.0
""" nam a22 i 4500""","""nkc20182993039""","""180419s2018 xr a f f 0…","""nkc20182993039""",2018.0
""" nam a22 1 4500""","""nos190152085""","""000619s1924 xr …","""nos190152085""",1924.0
""" nam a22 i 4500""","""nkc20193072468""","""190301s2019 xr g 0…","""nkc20193072468""",2019.0
""" nam a22 a 4500""","""nkc20091867455""","""090409s2009 xr ac e c 0…","""nkc20091867455""",2009.0
""" nam a22 4500""","""ck8702873""","""870811s1987 xr a u0…","""ck8702873""",1987.0
""" nam a22 4500""","""np9309475""","""940418s1992 xr 1…","""np9309475""",1992.0
""" cam a22 a 4500""","""cpk20041170582""","""010122s2000 xr a e 0…","""cpk20041170582""",2000.0
""" nam a22 a 4500""","""cpk19980354603""","""980611s1998 xr e 0…","""cpk19980354603""",1998.0


In [40]:
# Count the rows that have no `022_a` value once field 022 is joined in.
pole_022 = pl.read_parquet(os.path.join("data/cnb_sloupce", "022.parquet"))

# NOTE(review): the outer join also keeps rows that exist only in 022.parquet;
# the recorded result (893728) is larger than len(zpanda) (893355), so the
# count is not limited to the filtered records — confirm this is intended.
spojeno = zpanda.join(pole_022, left_on="001", right_on="001", how="outer")

# Explode the list column so that missing or empty lists become null rows.
spojeno.explode("022_a").filter(pl.col("022_a").is_null()).height

893728

In [None]:
# Load the authority selection.
aut_cesta = os.path.join("data", "aut_vyber.parquet")
aut = pl.read_parquet(aut_cesta)

In [None]:
# (rows, columns) of the authority table.
(aut.height, aut.width)

In [None]:
# One row per distinct `100_7` value (the first occurrence wins).
aut_unikatni = aut.explode("100_7").unique(subset=["100_7"], keep='first')
aut_unikatni.shape

In [None]:
# Frequency of each `370_c` value among the distinct `100_7` entries, most
# common first.
aut_stats = (
    aut.explode("100_7")
    .unique(subset=["100_7"], keep='first')
    .explode("370_c")
    .group_by("370_c")
    .len()
    .sort(by="len", descending=True)
)
aut_stats

In [None]:
# Total count over every `370_c` value that is filled in and is not exactly
# "Česko".
neni_cesko = pl.col("370_c").is_not_null() & (pl.col('370_c') != "Česko")
aut_stats.filter(neni_cesko).select("len").sum()

In [None]:
# Shape of the table reduced to distinct `100_7` values.
# NOTE(review): this repeats the same computation shown a few cells earlier.
aut_unikatni = aut.explode("100_7").unique(subset=["100_7"], keep='first')
aut_unikatni.shape

In [None]:
# Breakdown of `370_c` values containing "Česk", most frequent first.
obsahuje_cesk = pl.col("370_c").str.contains("Česk")
(
    aut.explode("370_c")
    .filter(obsahuje_cesk)
    .group_by("370_c")
    .len()
    .sort(by="len", descending=True)
)