In [1]:
import os
import datetime
import warnings
import polars as pl
import pandas as pd
import altair as alt

from src.najdi_rok import najdi_rok
from src.pocet_stran import pocet_stran
from src.bez_bordelu import bez_bordelu
from src.alt_friendly import alt_friendly
from src.hezke_jmeno import hezke_jmeno
from src.kristi_promin import kristi_promin

# Notebook-wide display and plotting setup.

# Show up to 100 rows when a polars frame is printed or displayed.
pl.Config.set_tbl_rows(100)

# Remove Altair's cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

# Register the project's chart theme under the name 'irozhlas' and activate it.
alt.themes.register('irozhlas', kristi_promin)
alt.themes.enable('irozhlas')

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [5]:
# Assemble the working table `df` from the per-field parquet extracts in
# DATA_DIR, narrow it down step by step, and print row counts along the way
# as sanity checks. The final `df` is what later cells consume.
DATA_DIR = "data/cnb_sloupce"
KEY = "001"                      # join key shared by every per-field extract
CHECK_YEAR = 2012                # year whose row count is printed after each step
MIN_YEAR, MAX_YEAR = 1800, 2024  # inclusive bounds applied to `rok`
PAGE_THRESHOLD = 30              # rows need `stran` strictly greater than this


def _load_field(field):
    """Read the parquet extract named ``<field>.parquet`` from DATA_DIR."""
    return pl.read_parquet(os.path.join(DATA_DIR, f"{field}.parquet"))


def _attach(frame, *fields):
    """Left-join each field's extract onto ``frame`` by KEY, in the given order."""
    for field in fields:
        frame = frame.join(_load_field(field), on=KEY, how="left")
    return frame


def _rows_in_check_year(frame):
    """Return how many rows of ``frame`` have ``rok`` equal to CHECK_YEAR."""
    return len(frame.filter(pl.col("rok") == CHECK_YEAR))


df = _attach(_load_field("100"), "leader", "008")

# Select rows by fixed character positions in `leader` and `008`. This is done
# natively in polars; converting to pandas and back only to slice strings is
# unnecessary. Null or too-short values behave as before: they fail the
# positive tests and pass the negated one.
# NOTE(review): the positions look like MARC 21 leader/06 (record type),
# leader/07 (bibliographic level), 008/15-17 (place of publication) and
# 008/35-37 (language) — confirm against the source of the extracts.
df = (
    df
    .filter(pl.col("leader").str.slice(6, 1).is_in(["a", "t"]))
    .filter(~pl.col("leader").str.slice(7, 1).is_in(["b", "i", "s", " "]))
    .filter(
        (pl.col("008").str.slice(15, 2) == "xr")
        & (pl.col("008").str.slice(35, 3) == "cze")
    )
)

df = _attach(df, "020", "022", "245", "260")

# Sanity check on the raw `260_c` values: the 20 most frequent ones, then the
# number of exploded rows equal to the check year (compared as a string).
by_260c = df.explode("260_c")
print(by_260c.group_by("260_c").len().sort(by="len", descending=True).head(20))
print(len(by_260c.filter(pl.col("260_c") == str(CHECK_YEAR))))

df = _attach(df, "300", "655", "700")

# Keep only rows that have no `022_a` value.
df = df.explode("022_a").filter(pl.col("022_a").is_null())

# Derive the integer `rok` column from `008` via the project helper.
df = df.with_columns(pl.col('008').map_elements(najdi_rok, return_dtype=int).alias('rok'))
print(_rows_in_check_year(df))

# Derive `stran` from `300_a`, and clean `245_a` / `245_p` in place.
df = df.with_columns(pl.col('300_a').map_elements(pocet_stran, return_dtype=int).alias('stran'))
df = df.with_columns(pl.col('245_a').map_elements(bez_bordelu, return_dtype=str))
# Exploding `245_p` yields one row per list element, so the row count can grow.
df = df.explode('245_p').with_columns(pl.col('245_p').map_elements(bez_bordelu, return_dtype=str))
print(len(df))

df = df.filter(pl.col("rok").is_between(MIN_YEAR, MAX_YEAR))
print(_rows_in_check_year(df))

df = df.filter(pl.col("stran") > PAGE_THRESHOLD)
print(_rows_in_check_year(df))

# Rows without a `100_7` value are discarded.
df = df.drop_nulls(subset=['100_7'])
print(_rows_in_check_year(df))

# Drop rows whose `245_h` contains "grafika" (rows with a null `245_h` are
# kept), then keep the first row for each (`100_a`, `245_a`) pair.
not_grafika = (~pl.col("245_h").str.contains("grafika")) | pl.col("245_h").is_null()
df = df.filter(not_grafika).unique(subset=["100_a", "245_a"], keep="first")
print(len(df))

shape: (20, 2)
┌───────┬────────┐
│ 260_c ┆ len    │
│ ---   ┆ ---    │
│ str   ┆ u32    │
╞═══════╪════════╡
│ null  ┆ 158811 │
│ 2008  ┆ 11390  │
│ 2011  ┆ 11066  │
│ 2010  ┆ 11030  │
│ 2012  ┆ 10993  │
│ 2007  ┆ 10814  │
│ 2009  ┆ 10704  │
│ 2013  ┆ 10667  │
│ 2006  ┆ 10263  │
│ 2005  ┆ 9905   │
│ 2014  ┆ 9835   │
│ 2004  ┆ 9791   │
│ 2003  ┆ 9641   │
│ 2002  ┆ 9265   │
│ 2000  ┆ 9089   │
│ 2001  ┆ 8907   │
│ 1999  ┆ 8634   │
│ 1998  ┆ 8181   │
│ 1997  ┆ 8047   │
│ 1996  ┆ 7581   │
└───────┴────────┘
10993
12713
716789
12718
11282
11246
449120


In [None]:
# Row count for each `rok` value, shown in ascending `rok` order.
rows_per_rok = df.group_by("rok").len()
rows_per_rok.sort("rok")