In [None]:
!pip -q install hebrew-tokenizer

In [None]:
import os
import requests
import hebrew_tokenizer as ht
import json

# URL of the YAP REST API endpoint that get_lemmas_and_pos posts to
YAP_API_URL = 'http://localhost:8000/yap/heb/joint'

# Placeholder strings that prepare_hebrew_text writes instead of the lemma
# when a token's POS tag is one of these keys.
pos_dict = {'PROPN': 'person1', 'PRON': 'pron1', 'NUM': 'ordinal1'}

def get_lemmas_and_pos(text, timeout=(5, 300)):
    """Tokenize *text*, send it to the YAP REST API and return (lemma, POS) pairs.

    The text is split with ``hebrew_tokenizer``; whitespace-only tokens are
    dropped. The remaining tokens are joined one per line, followed by two
    trailing spaces, and posted as JSON under the ``'text'`` key to
    ``YAP_API_URL``. Every tab-separated row of the ``'ma_lattice'`` field of
    the reply that has more than five columns contributes one
    ``(column 2, column 4)`` tuple.

    Args:
        text: Raw text to analyse.
        timeout: Passed straight to ``requests.post``; either a number of
            seconds or a ``(connect, read)`` tuple. Without a timeout an
            unresponsive server would block the caller forever.

    Returns:
        A list of ``(lemma, pos)`` string tuples. The list is empty when the
        text has no tokens, when the server answers with a non-200 status
        (the response body is printed in that case), or when the reply has
        no ``'ma_lattice'`` field.

    Raises:
        requests.exceptions.RequestException: connection problems and
            timeouts are not swallowed here.
    """
    tokens = [token for _, token, _, _ in ht.tokenize(text) if token.strip()]
    if not tokens:
        # Nothing to analyse, so skip the HTTP round trip entirely.
        return []

    yap_input = json.dumps({'text': '\n'.join(tokens) + '  '})
    headers = {'content-type': 'application/json'}
    response = requests.post(
        url=YAP_API_URL,
        data=yap_input,
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        print("Ответ:", response.text)
        return []

    yap_output = response.json()
    if 'ma_lattice' not in yap_output:
        return []

    lemmas_and_pos = []
    for line in yap_output['ma_lattice'].strip().split('\n'):
        parts = line.split('\t')
        # Rows with five columns or fewer (including blank lines) are skipped.
        if len(parts) > 5:
            # NOTE(review): columns 2 and 4 are taken as lemma and POS here;
            # confirm against the lattice column layout documented by YAP
            # (column 2 may be the surface form rather than the lemma).
            lemmas_and_pos.append((parts[2], parts[4]))

    return lemmas_and_pos

def prepare_hebrew_text(input_file, output_file):
    """Write the unique lemmas of *input_file* to *output_file* on one line.

    The input file is read as UTF-8 and handed to ``get_lemmas_and_pos``.
    Each lemma is emitted only the first time it is seen. For every such
    lemma:

    * if its POS tag is a key of ``pos_dict``, the mapped placeholder is
      written instead of the lemma;
    * otherwise, if the lemma consists only of digits, ``'ordinal1'`` is
      written;
    * otherwise, unless the POS tag is ``'PUNCT'``, the lemma itself is
      written;
    * punctuation is dropped.

    Every written item is followed by a single space and the line is
    terminated with a newline.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to create or overwrite.
    """
    with open(input_file, 'r', encoding='utf-8') as fin:
        text = fin.read()

    # get_lemmas_and_pos tokenizes its argument itself, so the raw text is
    # passed directly instead of tokenizing here and re-joining the tokens.
    lemmas_and_pos = get_lemmas_and_pos(text)

    seen_lemmas = set()
    pieces = []
    for lemma, pos in lemmas_and_pos:
        if lemma in seen_lemmas:
            continue
        seen_lemmas.add(lemma)

        if pos in pos_dict:
            pieces.append(pos_dict[pos])
        elif lemma.isdigit():
            pieces.append('ordinal1')
        elif pos != 'PUNCT':
            pieces.append(lemma)
        # Punctuation adds nothing, so it no longer leaves a stray separator.

    with open(output_file, 'w', encoding='utf-8') as prepared_text:
        prepared_text.write(''.join(piece + ' ' for piece in pieces))
        prepared_text.write('\n')

def process_all_files(pure_folder, processed_folder):
    """Run ``prepare_hebrew_text`` on every ``.txt`` file of *pure_folder*.

    Each result is written to *processed_folder* under the same file name.
    The output folder (including missing parent folders) is created when it
    does not exist yet. Entries whose name does not end with ``.txt`` are
    ignored. A progress line is printed for every processed file.

    Args:
        pure_folder: Folder that is listed for input ``.txt`` files.
        processed_folder: Folder that receives the processed files.
    """
    # exist_ok=True avoids the separate exists() check and the window
    # between that check and the actual creation.
    os.makedirs(processed_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # processing order (and the printed log) reproducible between runs.
    for filename in sorted(os.listdir(pure_folder)):
        if not filename.endswith('.txt'):
            continue
        input_file = os.path.join(pure_folder, filename)
        output_file = os.path.join(processed_folder, filename)
        print(f"Processing {input_file} -> {output_file}")
        prepare_hebrew_text(input_file, output_file)

In [None]:
# Folder that is scanned for input .txt files, and folder that receives the results.
pure_folder = 'test1'
processed_folder = 'test2'

# Convert every .txt file found in pure_folder.
process_all_files(pure_folder, processed_folder)

Processing test1/תקוה ופחד.txt -> test2/תקוה ופחד.txt


In [None]:
# Ad-hoc request to the YAP endpoint with a hand-written list of Hebrew tokens.
tokens = ['יש', 'וביום', 'זה', 'יתנפנפו', 'דגלים', 'שחורים', 'מעל', 'בתיהם', 'של', 'סובאי']

# NOTE(review): this payload uses a 'tokens' key, while get_lemmas_and_pos sends
# a 'text' key to the same URL; the output recorded for this cell is `{}` --
# confirm which request schema the endpoint actually accepts.
yap_input = json.dumps({'tokens': tokens})

headers = {'content-type': 'application/json'}
response = requests.post(url=YAP_API_URL, data=yap_input, headers=headers)

# On success pretty-print the JSON reply without escaping non-ASCII characters;
# otherwise print the raw response body.
if response.status_code == 200:
    yap_output = response.json()
    print(json.dumps(yap_output, ensure_ascii=False, indent=4))
else:
    print("Ответ:", response.text)

{}
