In [1]:
from pathlib import Path
from tempfile import NamedTemporaryFile
from utils import prepare_interim_files, construct_TEI
from parse import compile
from concurrent.futures import ProcessPoolExecutor

# Directory holding the per-session input files. File names are expected to
# look like "<prefix>_T<term>_S<session>.<ending>" (see `pattern` below).
datadir = Path("/home/rupnik/parlamint/SRB/S_data/")

# Identify all (T,S) pairs:
files = list(datadir.glob("*"))
pattern = "{prefix}_T{term:02}_S{session}.{ending}"
# NOTE: `compile` here is parse.compile (imported above), not the builtin.
p = compile(pattern)
# Parse every file name exactly once. p.parse() returns None when a name does
# not match the pattern, so such files are skipped instead of aborting the
# whole run with an AttributeError on `.named`.
parsed_names = (p.parse(i.name) for i in files)
terms_and_sessions = {
    (result.named["term"], result.named["session"])
    for result in parsed_names
    if result is not None
}


def process(term, session) -> None:
    """Build the TEI XML file for a single (term, session) pair.

    The text and metadata inputs for the pair are looked up in ``datadir``,
    merged into a temporary interim file by ``prepare_interim_files`` and
    then handed to ``construct_TEI``, which writes
    ``ParlaMint-RS_T<term>S<session>.xml`` into the output directory.

    Args:
        term: Term number; anything accepted by ``int()`` (it is converted
            so that it can be zero-padded to two digits in file names).
        session: Session identifier, used verbatim in the file names.
    """
    term = int(term)
    text_path = datadir / f"text_T{term:02}_S{session}.txt"
    meta_path = datadir / f"meta_T{term:02}_S{session}.tsv"
    mp_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_MPs_final_20221007.xlsx")
    parties_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_parties_final_20221007.xlsx")

    # The interim file is only needed between the two steps below. Using the
    # temporary file as a context manager guarantees it is closed (and thereby
    # removed) as soon as the TEI has been built, even if a step raises,
    # instead of lingering until garbage collection or worker shutdown.
    with NamedTemporaryFile() as merged_file:
        prepare_interim_files(
            text_path=text_path,
            meta_path=meta_path,
            mp_path=mp_path,
            parties_path=parties_path,
            out_file=merged_file.name,
        )

        construct_TEI(
            pickled_file=merged_file.name,
            session_index=session,
            term_index=term,
            data_language_code="sr",
            out_file=Path("/home/rupnik/parlamint/SRB/S/") / f"ParlaMint-RS_T{term:02}S{session}.xml",
        )
    
    
# Fan the (term, session) pairs out over a pool of worker processes.
# Snapshot the set once so both argument lists are built from the same ordering.
pairs = list(terms_and_sessions)
with ProcessPoolExecutor(max_workers=25) as executor:
    # executor.map() returns a lazy iterator and only re-raises a worker's
    # exception when the corresponding result is retrieved. Materialising it
    # here makes a failed process() call surface instead of being silently
    # dropped. process() returns None, so the list itself carries no data.
    futures = list(
        executor.map(
            process,
            [term for term, session in pairs],
            [session for term, session in pairs],
        )
    )

# Serial variant for debugging on a small sample:
# for term, session in list(terms_and_sessions)[:2]:
#     process(term, session)

  0%|          | 0/129 [00:00<?, ?it/s]  0%|          | 0/121 [00:00<?, ?it/s]  0%|          | 0/461 [00:00<?, ?it/s]  0%|          | 0/500 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/952 [00:00<?, ?it/s]  0%|          | 0/274 [00:00<?, ?it/s]  0%|          | 0/1511 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/245 [00:00<?, ?it/s]  0%|          | 0/776 [00:00<?, ?it/s]  0%|          | 0/96 [00:00<?, ?it/s]  0%|          | 0/300 [00:00<?, ?it/s]  0%|          | 0/2860 [00:00<?, ?it/s]  0%|          | 0/2222 [00:00<?, ?it/s]  0%|          | 0/1714 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/562 [00:00<?, ?it/s]  0%|          | 0/628 [00:00<?, ?it/s]  0%|          | 0/123 [00:00<?, ?it/s]  0%|          | 0/7583 [00:00<?, ?it/s]  0%|          | 0/1670 [00:00<?, ?it/s]  0%|          | 0/10088 [00:00<?, ?it/s]2022