In [2]:
from pathlib import Path
from tempfile import NamedTemporaryFile
from utils import prepare_interim_files, construct_TEI
from parse import compile
from concurrent.futures import ProcessPoolExecutor

datadir = Path("/home/rupnik/parlamint/SRB/S_data/")

# Identify all (term, session) pairs present in the data directory.
# Entries are matched against "<prefix>_T<term>_S<session>.<ending>".
files = list(datadir.glob("*"))
pattern = "{prefix}_T{term:02}_S{session}.{ending}"
# NOTE: `compile` is parse.compile (imported at the top of the file), not the
# builtin of the same name.
p = compile(pattern)
# Parse every name exactly once. p.parse() returns None for names that do not
# fit the pattern, so such entries are skipped instead of crashing on
# `None.named`.
terms_and_sessions = {
    (match.named["term"], match.named["session"])
    for match in (p.parse(entry.name) for entry in files)
    if match is not None
}


def process(term, session) -> None:
    """Build the TEI XML output for a single (term, session) pair.

    The text and metadata files for the pair are looked up in ``datadir``
    and handed, together with the MP and party spreadsheets, to
    ``prepare_interim_files``, which is asked to write its result to a
    temporary file. That temporary file is then passed to ``construct_TEI``
    as ``pickled_file``, and the XML is written under
    ``/home/rupnik/parlamint/SRB/S/``.

    Args:
        term: Parliamentary term; anything accepted by ``int()``. It is
            zero-padded to two digits when building file names.
        session: Session identifier, interpolated into file names verbatim.
    """
    term = int(term)
    text_path = datadir / f"text_T{term:02}_S{session}.txt"
    meta_path = datadir / f"meta_T{term:02}_S{session}.tsv"
    mp_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_MPs_final_20221007.xlsx")
    parties_path = Path("/home/rupnik/parlamint/SRB") / Path("Serbia_parties_final_20221007.xlsx")

    # The context manager guarantees the temporary file is closed and removed
    # as soon as both steps are done (or one of them raises), instead of
    # whenever the garbage collector gets around to it.
    with NamedTemporaryFile() as merged_file:
        prepare_interim_files(
            text_path=text_path,
            meta_path=meta_path,
            mp_path=mp_path,
            parties_path=parties_path,
            out_file=merged_file.name,
        )

        # NOTE(review): the output name says "ParlaMint-HR" although the data
        # language code is "sr" and the inputs are Serbian -- looks like a
        # leftover from a Croatian run; confirm before renaming, since
        # changing it alters the output paths.
        construct_TEI(
            pickled_file=merged_file.name,
            session_index=session,
            term_index=term,
            data_language_code="sr",
            out_file=Path("/home/rupnik/parlamint/SRB/S/") / f"ParlaMint-HR_T{term:02}_S{session}.xml",
        )
    
    
# Fan the (term, session) pairs out over worker processes.
#
# Jobs are submitted one by one and inspected afterwards. The iterator returned
# by executor.map() only re-raises a worker's exception when it is consumed;
# it was never read, so every failure inside process() went unnoticed.
with ProcessPoolExecutor(max_workers=20) as executor:
    futures = {
        executor.submit(process, term, session): (term, session)
        for term, session in terms_and_sessions
    }

# Leaving the `with` block waits for all jobs, so exception() returns at once.
failures = {}
for future, pair in futures.items():
    error = future.exception()
    if error is not None:
        failures[pair] = error

# Report failures without aborting: one broken session should not hide the
# results of the others.
for pair, error in failures.items():
    print(f"process(term={pair[0]!r}, session={pair[1]!r}) failed: {error!r}")
if failures:
    print(f"{len(failures)} of {len(futures)} (term, session) pairs failed")

# Serial variant, handy for debugging a couple of pairs in-process:
# for term, session in list(terms_and_sessions)[:2]:
#     process(term, session)

  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/1019 [00:00<?, ?it/s]  0%|          | 0/1437 [00:00<?, ?it/s]  0%|          | 0/3 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/3756 [00:00<?, ?it/s]  0%|          | 0/897 [00:00<?, ?it/s]  0%|          | 0/142 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/76 [00:00<?, ?it/s]  0%|          | 0/1714 [00:00<?, ?it/s]  0%|          | 0/180 [00:00<?, ?it/s]  0%|          | 0/1 [00:00<?, ?it/s]  0%|          | 0/120 [00:00<?, ?it/s]  0%|          | 0/1770 [00:00<?, ?it/s]  0%|          | 0/3178 [00:00<?, ?it/s]  0%|          | 0/1897 [00:00<?, ?it/s]  0%|          | 0/616 [00:00<?, ?it/s]  0%|          | 0/2521 [00:00<?, ?it/s]2022-10-28 13:33:06 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2022-10-28 13:33:06 INFO: Use device: cpu
2022-10-28 1