In [1]:
import os

In [2]:
from pathlib import Path

In [3]:
path_to_data = Path('../data')

In [4]:
# Names of the per-language sub-directories of the data folder.
# Only real directories are kept, so an extension-less file cannot be mistaken
# for a language folder; names containing a dot (files, hidden folders) are
# skipped. The result is sorted because directory listing order is OS-dependent.
langdirs = sorted(
    entry.name
    for entry in path_to_data.iterdir()
    if entry.is_dir() and '.' not in entry.name
)

In [5]:
langdirs

['BN-Bangla',
 'DE-German',
 'EN-English',
 'ES-Spanish',
 'FA-Farsi',
 'HI-Hindi',
 'KO-Korean',
 'NL-Dutch',
 'RU-Russian',
 'TR-Turkish',
 'ZH-Chinese']

In [6]:
def make_new_sample(lang=None):
    """Return a fresh, empty sample record.

    Args:
        lang: Value stored under the 'lang' key. When omitted, the function
            falls back to the module-level variable `langdir` (the loop
            variable of the conversion cells), so existing zero-argument
            calls keep working.

    Returns:
        dict with new empty lists under 'tokens' and 'token_labels', the
        language under 'lang', and 'id' / 'domain' set to None.

    Raises:
        NameError: if `lang` is omitted and no global `langdir` exists yet.
    """
    if lang is None:
        # Backward-compatible fallback to the notebook-level loop variable.
        lang = langdir
    return {
        'tokens': [],
        'token_labels': [],
        'lang': lang,
        'id': None,
        'domain': None
    }

In [7]:
import json
import re

# Convert the per-language CoNLL-style files of one split into a single JSON
# file. Uses `langdirs`, `path_to_data` and `make_new_sample` from earlier cells.

# Sample header line: "# id <id> domain=<domain>".
ID_DOMAIN = re.compile(r'# id (\S+)\sdomain=(\w+)')
# Separator between a token and its label on a token line.
CONLL_PATTERN = '_ _'

TYPE = 'train'

wrong_lines = []  # non-empty lines that are neither a header nor a token line
dataset = []

for langdir in langdirs:
    # Sort so the chosen file does not depend on the OS directory order.
    candidates = sorted(f for f in os.listdir(path_to_data / langdir) if TYPE in f)
    if not candidates:
        raise FileNotFoundError(f'no {TYPE!r} file found in {path_to_data / langdir}')
    fname = candidates[0]

    # `make_new_sample()` reads the loop variable `langdir` for the 'lang' field.
    new_sample = make_new_sample()
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with (path_to_data / langdir / fname).open(encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line ends a sample. Only store samples that were
                # actually started, so repeated or leading blank lines do not
                # produce empty records.
                if new_sample['tokens'] or new_sample['id'] is not None:
                    dataset.append(new_sample)
                    new_sample = make_new_sample()
                continue

            match = ID_DOMAIN.match(line)
            if match:
                new_sample['id'] = match.group(1)
                new_sample['domain'] = match.group(2)
            elif CONLL_PATTERN in line:
                # Split on the LAST separator: the label is the final field,
                # so a token that itself contains the separator stays intact.
                token, label = line.rsplit(CONLL_PATTERN, 1)
                new_sample['tokens'].append(token.strip())
                new_sample['token_labels'].append(label.strip())
            else:
                wrong_lines.append(line)

    # Flush the last sample when the file does not end with a blank line.
    if new_sample['tokens'] or new_sample['id'] is not None:
        dataset.append(new_sample)

assert not wrong_lines, f'{len(wrong_lines)} unparsed line(s), e.g. {wrong_lines[:3]}'

# ensure_ascii=False writes raw non-ASCII characters, so pin the file encoding.
with (path_to_data / f'{TYPE}_dataset.json').open('w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

Аналогично обработаем dev-выборку: прочитаем dev-файл каждого языка и сохраним результат в `../data/dev_dataset.json`.

In [8]:
import json
import re

# Convert the per-language CoNLL-style files of the dev split into a single
# JSON file. Uses `langdirs`, `path_to_data` and `make_new_sample` from
# earlier cells.

# Sample header line: "# id <id> domain=<domain>".
ID_DOMAIN = re.compile(r'# id (\S+)\sdomain=(\w+)')
# Separator between a token and its label on a token line.
CONLL_PATTERN = '_ _'

TYPE = 'dev'

wrong_lines = []  # non-empty lines that are neither a header nor a token line
dataset = []

for langdir in langdirs:
    # Sort so the chosen file does not depend on the OS directory order.
    candidates = sorted(f for f in os.listdir(path_to_data / langdir) if TYPE in f)
    if not candidates:
        raise FileNotFoundError(f'no {TYPE!r} file found in {path_to_data / langdir}')
    fname = candidates[0]

    # `make_new_sample()` reads the loop variable `langdir` for the 'lang' field.
    new_sample = make_new_sample()
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with (path_to_data / langdir / fname).open(encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line ends a sample. Only store samples that were
                # actually started, so repeated or leading blank lines do not
                # produce empty records.
                if new_sample['tokens'] or new_sample['id'] is not None:
                    dataset.append(new_sample)
                    new_sample = make_new_sample()
                continue

            match = ID_DOMAIN.match(line)
            if match:
                new_sample['id'] = match.group(1)
                new_sample['domain'] = match.group(2)
            elif CONLL_PATTERN in line:
                # Split on the LAST separator: the label is the final field,
                # so a token that itself contains the separator stays intact.
                token, label = line.rsplit(CONLL_PATTERN, 1)
                new_sample['tokens'].append(token.strip())
                new_sample['token_labels'].append(label.strip())
            else:
                wrong_lines.append(line)

    # Flush the last sample when the file does not end with a blank line.
    if new_sample['tokens'] or new_sample['id'] is not None:
        dataset.append(new_sample)

assert not wrong_lines, f'{len(wrong_lines)} unparsed line(s), e.g. {wrong_lines[:3]}'

# ensure_ascii=False writes raw non-ASCII characters, so pin the file encoding.
with (path_to_data / f'{TYPE}_dataset.json').open('w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)