In [1]:
from pathlib import Path
from tempfile import NamedTemporaryFile
from utils import prepare_interim_files, construct_TEI
from parse import compile
from concurrent.futures import ProcessPoolExecutor

# Directory holding the per-session input files (text_*.txt / meta_*.tsv).
datadir = Path("/home/rupnik/parlamint/SRB/S_data/")

# Identify all (T,S) pairs:
files = list(datadir.glob("*"))
pattern = "{prefix}_T{term:02}_S{session}.{ending}"
p = compile(pattern)
# Parse every file name exactly once. `parse` yields None for a name that does
# not fit the pattern, so such entries (stray files, sub-directories, ...) are
# skipped instead of crashing on `.named`.
terms_and_sessions = {
    (result.named["term"], result.named["session"])
    for result in (p.parse(entry.name) for entry in files)
    if result is not None
}


def process(term, session) -> None:
    """Build the TEI XML file for one (term, session) pair.

    The session's text and metadata files are read from ``datadir`` and
    combined with the MP and party spreadsheets by ``prepare_interim_files``
    into a temporary interim file, which ``construct_TEI`` then turns into
    ``ParlaMint-RS_T<term>S<session>.xml``.

    Args:
        term: Term number; anything accepted by ``int()`` (the value parsed
            from a file name, or an integer).
        session: Session identifier exactly as it appears in the file names.
    """
    term = int(term)
    text_path = datadir / (f"text_T{term:02}_S{session}.txt")
    meta_path = datadir / (f"meta_T{term:02}_S{session}.tsv")
    mp_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_MPs_final_20221007.xlsx")
    parties_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_parties_final_20221007.xlsx")

    # The context manager guarantees the temporary interim file is closed and
    # removed as soon as both steps finish (or fail), instead of relying on
    # garbage collection of the file object.
    with NamedTemporaryFile() as merged_file:
        prepare_interim_files(
            text_path=text_path,
            meta_path=meta_path,
            mp_path=mp_path,
            parties_path=parties_path,
            out_file=merged_file.name,
        )

        # Must run inside the `with` block: the interim file disappears on exit.
        construct_TEI(
            pickled_file=merged_file.name,
            session_index=session,
            term_index=term,
            data_language_code="sr",
            out_file=Path("/home/rupnik/parlamint/SRB/S/") / f"ParlaMint-RS_T{term:02}S{session}.xml",
        )
    
    
# Fan every (term, session) pair out to a pool of worker processes.
# Tasks are submitted individually and each outcome is inspected, because the
# lazy iterator returned by `executor.map` hides worker exceptions unless it is
# consumed. Failures are reported per pair and do not abort the remaining work.
with ProcessPoolExecutor(max_workers=25) as executor:
    futures = {
        executor.submit(process, term, session): (term, session)
        for term, session in terms_and_sessions
    }
    for future, (failed_term, failed_session) in futures.items():
        # `exception()` blocks until the task is done and returns None on success.
        error = future.exception()
        if error is not None:
            print(f"Processing T{failed_term} S{failed_session} failed: {error!r}")

# for term, session in list(terms_and_sessions)[:2]:
#     process(term, session)

  0%|          | 0/340 [00:00<?, ?it/s]  0%|          | 0/445 [00:00<?, ?it/s]  0%|          | 0/149 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/120 [00:00<?, ?it/s]  0%|          | 0/874 [00:00<?, ?it/s]  0%|          | 0/946 [00:00<?, ?it/s]  0%|          | 0/1355 [00:00<?, ?it/s]  0%|          | 0/5 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/163 [00:00<?, ?it/s]  0%|          | 0/616 [00:00<?, ?it/s]  0%|          | 0/668 [00:00<?, ?it/s]  0%|          | 0/214 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/922 [00:00<?, ?it/s]  0%|          | 0/1770 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/2029 [00:00<?, ?it/s]  0%|          | 0/1327 [00:00<?, ?it/s]  0%|          | 0/2164 [00:00<?, ?it/s]  0%|          | 0/1606 [00:00<?, ?it/s]  0%|          | 0/170 [00:00<?, ?it/s]  0%|          | 0/4335 [00:00<?, ?it/s]2022-1