In [1]:
# from pathlib import Path
# from tempfile import NamedTemporaryFile
# from utils import prepare_interim_files, construct_TEI

# directory = ".."

# for xmlid in range(6, 11):
#     text_path = Path(directory) / (str(xmlid) + "_text.txt")
#     meta_path = Path(directory) / (str(xmlid) + "_meta.tsv")
#     mp_path = Path(directory) / Path("Croatia_MPs_final_ 20220917.xlsx")
#     parties_path = Path(directory) / Path("Croatia_parties_final_20220917.xlsx")
    
#     merged_file = NamedTemporaryFile()
#     prepare_interim_files(
#         text_path = text_path,
#         meta_path = meta_path,
#         mp_path = mp_path,
#         parties_path= parties_path,
#         out_file = merged_file.name
#     )
    
#     construct_TEI(
#         # pickled_file="01_merged_data", 
#         pickled_file = merged_file.name,
#         file_index=xmlid,
#         # out_file=merged_file.name,
#         out_file = Path("../T/") / f"ParlaMint-HR_T{xmlid:02}.xml"
#         )

In [2]:
from pathlib import Path
from tempfile import NamedTemporaryFile
from utils import prepare_interim_files, construct_TEI
from concurrent.futures import ProcessPoolExecutor


def process(xmlid: int) -> None:
    """Build the TEI file for one term from its text and metadata files.

    The inputs are ``../<xmlid>_text.txt`` and ``../<xmlid>_meta.tsv`` plus the
    MP and party spreadsheets. ``prepare_interim_files`` is given a temporary
    file as its ``out_file``; the same file is then handed to
    ``construct_TEI`` as ``pickled_file``, together with the output path
    ``../T/ParlaMint-HR_T<xmlid>.xml``.

    Args:
        xmlid: Number used to locate the input files; it is also forwarded to
            ``construct_TEI`` as ``file_index`` and embedded in the output
            file name.
    """
    base_dir = Path("..")

    # The context manager closes (and thereby deletes) the scratch file as soon
    # as both steps are done, even if one of them raises, instead of leaving
    # the cleanup to garbage collection.
    with NamedTemporaryFile() as merged_file:
        prepare_interim_files(
            text_path=base_dir / f"{xmlid}_text.txt",
            meta_path=base_dir / f"{xmlid}_meta.tsv",
            mp_path=base_dir / "Croatia_MPs_final_ 20220917.xlsx",
            parties_path=base_dir / "Croatia_parties_final_20220917.xlsx",
            out_file=merged_file.name,
        )

        construct_TEI(
            pickled_file=merged_file.name,
            file_index=xmlid,
            # NOTE(review): the term number is not zero-padded here, unlike the
            # "T{term:02}" names used for the per-session files -- confirm
            # that this is intended.
            out_file=Path("../T/") / f"ParlaMint-HR_T{xmlid}.xml",
        )
    
# Process terms 5-10 in parallel worker processes. Materialising the map()
# iterator retrieves every result, so an exception raised inside a worker is
# re-raised here instead of being dropped silently.
with ProcessPoolExecutor(max_workers=10) as executor:
    futures = list(executor.map(process, range(5, 11)))

  0%|          | 0/12059 [00:00<?, ?it/s]2022-10-26 13:45:00 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2022-10-26 13:45:00 INFO: Use device: cpu
2022-10-26 13:45:00 INFO: Loading: tokenize
2022-10-26 13:45:00 INFO: Done loading processors!
  0%|          | 2/12059 [00:00<1:31:17,  2.20it/s]  0%|          | 24/12059 [00:01<06:33, 30.57it/s]   0%|          | 44/12059 [00:01<03:36, 55.55it/s]  0%|          | 58/12059 [00:01<03:28, 57.63it/s]  1%|          | 79/12059 [00:01<02:23, 83.47it/s]  1%|          | 99/12059 [00:01<01:53, 105.00it/s]  0%|          | 0/43615 [00:00<?, ?it/s]  1%|          | 115/12059 [00:01<02:18, 86.10it/s]  1%|          | 146/12059 [00:01<01:33, 126.82it/s]  1%|▏         | 172/12059 [00:02<01:17, 153.38it/s]  2%|▏         | 193/12059 [00:02<01:45, 112.17it/s]  2%|▏         | 210/12059 [00:02<01:38, 120.38it/s]2022-10-26 13:45:02 INFO: Loading these models for lang

In [29]:
from pathlib import Path
from tempfile import NamedTemporaryFile
from utils import prepare_interim_files, construct_TEI
from parse import compile
from concurrent.futures import ProcessPoolExecutor

# Directory that is scanned for the per-session input files.
datadir = Path("../S_data").resolve()

# Identify all (T,S) pairs:
files = list(datadir.glob("*"))
pattern = "{prefix}_T{term:02}_S{session:02}.{ending}"
p = compile(pattern)
# Parse every file name exactly once. parse() returns None for a name that
# does not follow the pattern, so such entries are skipped rather than
# failing with an AttributeError.
terms_and_sessions = {
    (match.named["term"], match.named["session"])
    for match in map(p.parse, (f.name for f in files))
    if match is not None
}


def process(term, session) -> None:
    """Build the TEI file for one (term, session) pair.

    The inputs are ``text_T<term>_S<session>.txt`` and
    ``meta_T<term>_S<session>.tsv`` inside ``datadir`` plus the MP and party
    spreadsheets. ``prepare_interim_files`` is given a temporary file as its
    ``out_file``; the same file is then handed to ``construct_TEI`` as
    ``pickled_file``, together with the output path
    ``../S/ParlaMint-HR_T<term>_S<session>.xml``.

    Args:
        term: Term number, as an int or a numeric string such as ``"05"``.
        session: Session number, as an int or a numeric string such as ``"03"``.
    """
    # The pairs parsed from the file names are strings such as '05'; convert to
    # int so that the "02" format spec zero-pads the numbers as intended.
    term, session = int(term), int(session)
    stem = f"T{term:02}_S{session:02}"
    base_dir = Path("..")

    # The context manager closes (and thereby deletes) the scratch file as soon
    # as both steps are done, even if one of them raises.
    with NamedTemporaryFile() as merged_file:
        prepare_interim_files(
            text_path=datadir / f"text_{stem}.txt",
            meta_path=datadir / f"meta_{stem}.tsv",
            mp_path=base_dir / "Croatia_MPs_final_ 20220917.xlsx",
            parties_path=base_dir / "Croatia_parties_final_20220917.xlsx",
            out_file=merged_file.name,
        )

        construct_TEI(
            pickled_file=merged_file.name,
            # The original passed `xmlid`, which is not defined in this
            # function. NOTE(review): the term number is used here because the
            # per-term variant of this function passes its term number as
            # file_index -- confirm what construct_TEI expects for per-session
            # files.
            file_index=term,
            out_file=Path("../S/") / f"ParlaMint-HR_{stem}.xml",
        )

{('05', '01'),
 ('05', '02'),
 ('05', '03'),
 ('05', '04'),
 ('05', '05'),
 ('05', '06'),
 ('05', '07'),
 ('05', '08'),
 ('05', '09'),
 ('05', '10'),
 ('05', '11'),
 ('05', '12'),
 ('05', '13'),
 ('05', '14'),
 ('05', '15'),
 ('05', '16'),
 ('05', '17'),
 ('05', '18'),
 ('05', '19'),
 ('05', '20'),
 ('05', '21'),
 ('05', '22'),
 ('05', '23'),
 ('05', '24'),
 ('05', '25'),
 ('05', '26'),
 ('05', '27'),
 ('06', '01'),
 ('06', '02'),
 ('06', '03'),
 ('06', '04'),
 ('06', '05'),
 ('06', '06'),
 ('06', '07'),
 ('06', '08'),
 ('06', '09'),
 ('06', '10'),
 ('06', '11'),
 ('06', '12'),
 ('06', '13'),
 ('06', '14'),
 ('06', '15'),
 ('06', '16'),
 ('06', '17'),
 ('06', '18'),
 ('06', '19'),
 ('06', '20'),
 ('06', '21'),
 ('06', '22'),
 ('06', '23'),
 ('06', '24'),
 ('07', '01'),
 ('07', '02'),
 ('07', '03'),
 ('07', '04'),
 ('07', '05'),
 ('07', '06'),
 ('07', '07'),
 ('07', '08'),
 ('07', '09'),
 ('07', '10'),
 ('07', '11'),
 ('07', '12'),
 ('07', '13'),
 ('07', '14'),
 ('07', '15'),
 ('07', '1

In [25]:
# Show the first path returned by the glob over datadir.
files[0]

PosixPath('/home/rupnik/parlamint/S_data/meta_T06_S03.tsv')

In [23]:
# Show the fields parsed from the first file name. The call is spelled out
# instead of relying on IPython's `_` (the previous cell output), which only
# works when the cells are executed in one particular order.
p.parse(files[0].name).named

{'prefix': 'meta', 'term': '06', 'session': '03', 'ending': 'tsv'}