In [1]:
import os
import polars as pl

In [2]:
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(150)
pl.Config.set_tbl_width_chars(200)
pl.Config.set_verbose(True)  

polars.config.Config

In [3]:
slovenske = [
    "S SK 1939",
    "S SR 1939-1945"
]

In [4]:
projevy = pl.read_ndjson(
    'data_raw/schuze/prepis_*.ndjson', 
    ignore_errors=True, 
    schema=pl.Schema(
        {
            'mluvci': pl.String, 
            'soubor': pl.String, 
            'mluvci_id': pl.String, 
            'text': pl.String, 
            'poradi': pl.Int32}
    ))

_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None


In [5]:
meta = pl.read_csv(
    'data_raw/schuze/meta_*.csv',
    schema=pl.Schema(
        {
            'komora': pl.String,
            'obdobi': pl.Int32,
            'schuze': pl.Int32,
            'cast': pl.Int32,
            'soubor': pl.String,
            'datum': pl.String,
            'komora_komplet': pl.String,
            'schuze_komplet': pl.String,
            'prepsano': pl.Boolean,
            'autorizovano': pl.Boolean
        }
    )
).with_columns(
    pl.col("datum").str.to_date(format="%Y-%m-%d",exact=False)
)

_init_credential_provider_builder(): credential_provider_init = None


In [6]:
try:
    os.makedirs("data")
except:
    pass

In [7]:
df = projevy.join(
    meta, on="soubor", how="left"
).filter(
    ~pl.col('komora_komplet').str.contains('SNR')
).filter(
    ~pl.col('komora_komplet').str.contains('NR SR')
).filter(
    ~pl.col('komora_komplet').is_in(slovenske)
).filter(
    ~(pl.col("text").str.starts_with("(pokračuje") | pl.col("text").str.starts_with("(Pokračuje") | pl.col("text").str.ends_with("hodin)") | pl.col("text").str.ends_with("hodin.)"))
).sort(
    by=["komora_komplet","schuze","cast","poradi"]
).with_columns(
    pl.when(pl.col('mluvci').is_null()).then(pl.lit(False)).otherwise(pl.lit(True)).alias('mluvci_jisty')
).with_columns(
    pl.col("mluvci").forward_fill()
)

In [8]:
df.write_parquet("data/projevy.parquet", use_pyarrow=True)

In [9]:
komory = df.select(pl.col("komora")).unique().to_series().to_list()

In [10]:
for k in komory:
    obdobi = df.filter(pl.col('komora') == k).select(pl.col('obdobi')).unique().to_series().to_list()
    for o in obdobi:
        df.filter(pl.col('komora') == k).filter(pl.col('obdobi') == o).sort(by=['datum','schuze','cast','poradi']).write_parquet(f"data/{k}_{o}.parquet")

_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(): credential_provider_init = None
_init_credential_provider_builder(

In [11]:
len(df)

782574

In [12]:
len(meta)

102820

In [13]:
bez_data = meta.filter(pl.col("datum").is_null())
print(len(bez_data))

1738


In [14]:
bez_data.filter(pl.col("prepsano") == True)

komora,obdobi,schuze,cast,soubor,datum,komora_komplet,schuze_komplet,prepsano,autorizovano
str,i32,i32,i32,str,date,str,str,bool,bool
"""ssr""",1939,11,2,"""1939ssr_stenprot_011schuz_s011002.htm""",,"""S SR 1939-1945""","""11. schůze""",true,true
"""ssr""",1939,126,10,"""1939ssr_stenprot_126schuz_s126010.htm""",,"""S SR 1939-1945""","""126. schůze""",true,true
"""ssr""",1939,128,2,"""1939ssr_stenprot_128schuz_s128002.htm""",,"""S SR 1939-1945""","""128. schůze""",true,true
"""snr""",1948,2,2,"""1948snr_stenprot_002schuz_s002002.htm""",,"""SNR 1948-1954""","""2. schůze""",true,true
"""snr""",1990,7,1,"""1990snr_stenprot_007schuz_s007001.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false
"""snr""",1990,7,2,"""1990snr_stenprot_007schuz_s007002.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false
"""snr""",1990,7,3,"""1990snr_stenprot_007schuz_s007003.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false
"""snr""",1990,7,4,"""1990snr_stenprot_007schuz_s007004.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false
"""snr""",1990,7,5,"""1990snr_stenprot_007schuz_s007005.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false
"""snr""",1990,7,6,"""1990snr_stenprot_007schuz_s007006.htm""",,"""SNR 1990-1992""","""7. schůze""",true,false


In [15]:
bez_data.sort(by='obdobi').tail(30)

komora,obdobi,schuze,cast,soubor,datum,komora_komplet,schuze_komplet,prepsano,autorizovano
str,i32,i32,i32,str,date,str,str,bool,bool
"""ps""",2021,83,53,"""2021ps_stenprot_083schuz_s083053.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,54,"""2021ps_stenprot_083schuz_s083054.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,55,"""2021ps_stenprot_083schuz_s083055.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,56,"""2021ps_stenprot_083schuz_s083056.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,57,"""2021ps_stenprot_083schuz_s083057.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,58,"""2021ps_stenprot_083schuz_s083058.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,59,"""2021ps_stenprot_083schuz_s083059.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,60,"""2021ps_stenprot_083schuz_s083060.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,61,"""2021ps_stenprot_083schuz_s083061.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True
"""ps""",2021,83,62,"""2021ps_stenprot_083schuz_s083062.htm""",,"""PČR, PS 2017-...""","""83. schuze""",False,True


In [16]:
bez_komory = meta.filter(pl.col("komora_komplet").is_null())
print(len(bez_komory))

1002


In [17]:
bez_obdobi = meta.filter(pl.col("obdobi").is_null())
print(len(bez_obdobi))

0


In [18]:
bez_data = meta.filter(pl.col("datum").is_null()).sample(30)
print(len(bez_data))

30


In [19]:
meta.filter(pl.col("datum").is_null()).sample(30)

komora,obdobi,schuze,cast,soubor,datum,komora_komplet,schuze_komplet,prepsano,autorizovano
str,i32,i32,i32,str,date,str,str,bool,bool
"""ps""",2013,23,148,"""2013ps_stenprot_023schuz_s0232148.htm""",,,"""23. schůze""",False,
"""snr""",1990,24,28,"""1990snr_stenprot_024schuz_s024028.htm""",,"""SNR 1990-1992""","""24. schůze""",True,False
"""ps""",2017,29,137,"""2017ps_stenprot_029schuz_s029137.htm""",,,"""29. schůze""",False,
"""snr""",1990,8,9,"""1990snr_stenprot_008schuz_s008009.htm""",,"""SNR 1990-1992""","""8. schůze""",True,False
"""snr""",1990,15,27,"""1990snr_stenprot_015schuz_s015027.htm""",,"""SNR 1990-1992""","""15. schůze""",True,False
"""ps""",2013,23,440,"""2013ps_stenprot_023schuz_s023440.htm""",,,"""23. schůze""",False,
"""snr""",1990,24,16,"""1990snr_stenprot_024schuz_s024016.htm""",,"""SNR 1990-1992""","""24. schůze""",True,False
"""ps""",2013,23,532,"""2013ps_stenprot_023schuz_s023532.htm""",,,"""23. schůze""",False,
"""ps""",2013,23,128,"""2013ps_stenprot_023schuz_s0232128.htm""",,,"""23. schůze""",False,
"""ps""",2013,23,985,"""2013ps_stenprot_023schuz_s0231985.htm""",,,"""23. schůze""",False,


In [20]:
import altair as alt

In [21]:
alt.Chart(
    df.filter(~pl.col('datum').is_null()).group_by_dynamic(index_column="datum", every="1y").agg(pl.col('text').len()).to_pandas(),
    width=1000
).mark_bar(
).encode(
    alt.X("datum:T"),
    alt.Y("text:Q")
)

InvalidOperationError: argument in operation 'group_by_dynamic' is not sorted, please sort the 'expr/series/column' first

In [None]:
df.filter(pl.col('datum').dt.year() == 1976)