In [1]:
import os
import json
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin
from src.zjisti_vazbu import zjisti_vazbu
from src.me_to_neurazi import me_to_neurazi

# Notebook-wide setup: read the credits JSON into `kredity`, then configure
# Polars table printing, Altair row limits/theme and warning output.
kredity_path = os.path.join('src', 'kredity.json')
with open(kredity_path, mode='r', encoding='utf-8') as kredity_file:
    kredity = json.load(kredity_file)

# Show up to 100 rows when a Polars table is printed.
pl.Config(tbl_rows=100)
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()
# Register the project theme under the name 'irozhlas' and switch to it.
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [17]:
# Build `df`: the "leader" table joined with the "008" table on column "001",
# plus an integer column "rok" computed by applying `najdi_rok` to every "008"
# value (presumably the publication year - confirm against src/najdi_rok).
# NOTE(review): newer Polars releases call this join type "full" instead of
# "outer"; confirm against the installed version.
df = (
    pl.read_parquet(os.path.join("data/cnb_sloupce", "leader.parquet"))
    .join(
        pl.read_parquet(os.path.join("data/cnb_sloupce", "008.parquet")),
        left_on="001",
        right_on="001",
        how="outer",
    )
    .with_columns(
        pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok')
    )
)

In [21]:
# Total number of rows in the joined frame.
df.height

1250624

In [23]:
# Number of rows for which "rok" is not null.
df.filter(pl.col("rok").is_not_null()).height

1215703

In [25]:
# Share of rows for which "rok" is not null.
df.filter(pl.col("rok").is_not_null()).height / df.height

0.9720771390921652

In [30]:
# Convert the Polars frame to a pandas DataFrame.
# NOTE(review): the following cell repeats this exact conversion before it
# filters, so this cell looks redundant.
panda = df.to_pandas()

In [39]:
# Narrow the records down step by step on a pandas copy of `df`, printing the
# number of surviving rows after every step.
# NOTE(review): the offsets look like MARC 21 fixed-field positions (leader/06,
# leader/07, 008/15-17, 008/35-37) - confirm against the data source.
# NOTE(review): the 15:17 slice yields two characters while the 35:38 slice
# yields three; confirm that a two-character match on "xr" is intended.
panda = df.to_pandas()
row_filters = (
    lambda frame: frame["leader"].str.get(6).isin(["a", "t"]),
    lambda frame: ~frame["leader"].str.get(7).isin(["b", "i", "s", " "]),
    lambda frame: frame["008"].str.slice(15, 17) == "xr",
    lambda frame: frame["008"].str.slice(35, 38) == "cze",
)
for keep_rows in row_filters:
    # Each predicate is evaluated on the already-narrowed frame.
    panda = panda[keep_rows(panda)]
    print(len(panda))

1044144
995609
995609
893355


In [35]:
# Share of all rows left after the first filter step. Both literals are copied
# from earlier outputs (first printed count of the filter cell / row count of
# `df`), so they go stale if the data changes.
1_044_144 / 1_250_624

0.8348984187093803

In [37]:
# Share of all rows left after the second filter step. Both literals are copied
# from earlier outputs (second printed count of the filter cell / row count of
# `df`), so they go stale if the data changes.
995_609 / 1_250_624

0.7960897919758456

In [41]:
# Share of all rows that survive the complete filter chain. Derived from the
# live frames instead of literals copied from earlier outputs, so the ratio
# stays correct when the underlying data changes.
len(panda) / len(df)

0.7143274077580472

In [43]:
# Convert the filtered pandas frame back into a Polars DataFrame.
zpanda = pl.from_pandas(panda)

In [45]:
# Number of rows in the filtered Polars frame.
zpanda.height

893355

In [47]:
# Number of filtered rows for which "rok" is not null.
zpanda.filter(pl.col("rok").is_not_null()).height

884197

In [53]:
# Share of all rows that pass the filters and also have a non-null "rok".
# Computed from `zpanda` and `df` rather than from literals copied out of
# earlier outputs, so it cannot drift from the data.
len(zpanda.filter(pl.col("rok").is_not_null())) / len(df)

0.7070046632720945

In [57]:
# Load data/aut_vyber.parquet into the Polars frame `aut`.
aut = pl.read_parquet(os.path.join("data","aut_vyber.parquet"))

In [71]:
# (rows, columns) of the `aut` frame.
(aut.height, aut.width)

(1224571, 34)

In [75]:
# Shape after exploding column "100_7" and keeping the first row for each
# distinct "100_7" value.
(
    aut
    .explode("100_7")
    .unique(subset=["100_7"], keep='first')
    .shape
)

(952206, 34)

In [77]:
# Frequency table of "370_c" values over the rows de-duplicated on "100_7",
# most frequent value first.
aut_stats = (
    aut
    .explode("100_7")
    .unique(subset=["100_7"], keep='first')
    .explode("370_c")
    .group_by("370_c")
    .len()
    .sort(by="len", descending=True)
)
aut_stats

370_c,len
str,u32
,395286
"""Česko""",267209
"""Spojené státy americké""",66250
"""Německo""",52098
"""Velká Británie""",32350
"""Francie""",20333
"""Polsko""",19250
"""Rusko""",17220
"""Slovensko""",14627
"""Itálie""",12984


In [81]:
# Sum of "len" over the rows whose "370_c" is present and differs from "Česko".
(
    aut_stats
    .filter(pl.col("370_c").is_not_null() & (pl.col("370_c") != "Česko"))
    .select("len")
    .sum()
)

len
u32
342227


In [None]:
# NOTE(review): same expression as the earlier cell that reported (952206, 34);
# this copy has no recorded output and looks like a leftover duplicate.
aut.explode("100_7").unique(subset=["100_7"],keep='first').shape

In [69]:
# Frequency of every "370_c" value that contains "Česk", most frequent first.
# Counted over all exploded rows of `aut`, without de-duplicating on "100_7".
(
    aut
    .explode("370_c")
    .filter(pl.col("370_c").str.contains("Česk"))
    .group_by("370_c")
    .len()
    .sort(by="len", descending=True)
)

370_c,len
str,u32
"""Česko""",364083
"""Československo""",113
"""Praha, Česko""",43
"""Rožnov pod Radhoštěm, Česko""",17
"""Polička, Česko""",16
"""České Budějovice, Česko""",10
"""České království""",9
"""Čechy, Česko""",8
"""Česko, Československo""",7
"""Brno, Česko""",6
