In [27]:
import wget
import tarfile
from datetime import datetime
import os
import re
import copy
import shutil
import xml.etree.ElementTree as ET
import lxml.etree as etree
from collections import defaultdict

In [28]:
"""A few files from the Knesset dataset could not be parsed properly.
We suspect that it is because they do not contain enough closing tags.
Since those problematic files do not represent a huge fraction of the
total dataset,  we preferred to set them aside instead of trying to
edit them."""

def parse_corpus_file(fpath):
    """Collect the noun and verb lexicon items found in one tagged XML file.

    The tree is walked as ``root[0] -> paragraph -> sentence -> token ->
    analysis -> base -> pos`` (these are nesting levels; only the tag of the
    innermost ``pos`` element is actually inspected). An analysis is used only
    when it carries a ``score`` attribute whose float value is strictly
    positive, and a base only when it carries a ``lexiconItem`` attribute.

    Parameters
    ----------
    fpath : str
        Path of the XML file to parse.

    Returns
    -------
    (list, list)
        ``(ns, vs)``: the ``lexiconItem`` values of bases having a ``noun``
        child and of bases having a ``verb`` child, in document order and
        with repetitions.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    """
    ns, vs = [], []
    root = ET.parse(fpath).getroot()
    if len(root) == 0:
        # A document without any child element holds no analyses at all.
        return ns, vs

    # Flatten the four outer nesting levels into a single stream of analyses.
    analyses = (
        analysis
        for paragraph in root[0]
        for sentence in paragraph
        for token in sentence
        for analysis in token
    )
    for analysis in analyses:
        score = analysis.get('score')
        # Written as "not > 0" so that a NaN score is rejected as well.
        if score is None or not float(score) > 0:
            continue
        for base in analysis:
            lexicon_item = base.get('lexiconItem')
            if lexicon_item is None:
                continue
            for pos in base:
                if pos.tag == 'verb':
                    vs.append(lexicon_item)
                elif pos.tag == 'noun':
                    ns.append(lexicon_item)
    return ns, vs

In [29]:
def parse_all_corpus_files():
    """Download (if needed) and parse every usable file of the tagged corpus.

    When ``knesset_tagged_16.tar.gz`` is not present in ``./hebrew dataset/``
    it is downloaded there and extracted. Every ``.xml`` file found in
    ``./hebrew dataset/16/`` is then parsed with ``parse_corpus_file``, except
    the files listed in ``problematic_files``.

    Returns
    -------
    (list, list)
        ``(unique_ns, unique_vs)``: the distinct noun and verb lexicon items,
        sorted so that the result is the same from one run to the next.
    """
    dataset_dir = './hebrew dataset/'
    archive_name = 'knesset_tagged_16.tar.gz'
    archive_path = dataset_dir + archive_name
    corpus_dir = dataset_dir + '16/'

    # Make sure the directory can be listed on a fresh checkout.
    os.makedirs(dataset_dir, exist_ok=True)

    if archive_name not in os.listdir(dataset_dir):
        url = 'https://yeda.cs.technion.ac.il:8443/corpus/software/corpora/knesset/tagged/knesset_tagged_16.tar.gz'
        wget.download(url, archive_path)

        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; consider an extraction filter if the source is not trusted.
        with tarfile.open(archive_path) as archive:
            archive.extractall(dataset_dir)

    # Files that could not be parsed and are deliberately left out.
    problematic_files = {'17892403.xml', '63021903.xml', '17880203.xml'}
    files = [f for f in os.listdir(corpus_dir) if f.endswith('.xml')]
    n_files = len(files)
    ns = []
    vs = []
    for i, f in enumerate(files, start=1):
        print(f'Step {i}/{n_files}')
        if f in problematic_files:
            print(f'Skipping {f}')
            continue
        print(f'Now processing {f}')
        new_ns, new_vs = parse_corpus_file(corpus_dir + f)
        ns.extend(new_ns)
        vs.extend(new_vs)

    # Sorting removes the run-to-run ordering differences of set iteration.
    unique_ns = sorted(set(ns))
    unique_vs = sorted(set(vs))
    print(f'Total number of nouns in the Knesset dataset: {len(ns)}')
    print(f'Total number of verbs in the Knesset dataset: {len(vs)}')
    print(f'Number of unique nouns in the Knesset dataset: {len(unique_ns)}')
    print(f'Number of unique verbs in the Knesset dataset: {len(unique_vs)}')
    return unique_ns, unique_vs


def write_list_to_file(fname, l):
    """Write the strings of `l` to `./<fname>`, one per line, as UTF-8.

    The encoding is fixed explicitly because the items are Hebrew strings and
    the platform default encoding is not guaranteed to be able to store them.
    """
    with open(f'./{fname}', 'w', encoding='utf-8') as f:
        f.writelines(x + '\n' for x in l)


def load_list_from_file(fname):
    """Read back a list written by `write_list_to_file`.

    Returns the lines of `./<fname>` (decoded as UTF-8) without their line
    terminators. Only the empty string produced by a trailing newline is
    dropped, so a file whose last line is not newline-terminated keeps its
    last entry.
    """
    with open(f'./{fname}', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines

In [30]:
# Reuse the cached lexicons when both files are already on disk; otherwise
# rebuild them from the corpus, cache them, and delete the downloaded archive
# together with its extracted folder.
if {'unique_ns.csv', 'unique_vs.csv'}.issubset(os.listdir('./hebrew dataset/')):
    unique_ns = load_list_from_file('./hebrew dataset/unique_ns.csv')
    unique_vs = load_list_from_file('./hebrew dataset/unique_vs.csv')
else:
    unique_ns, unique_vs = parse_all_corpus_files()
    write_list_to_file('./hebrew dataset/unique_ns.csv', unique_ns)
    write_list_to_file('./hebrew dataset/unique_vs.csv', unique_vs)
    os.remove('./hebrew dataset/knesset_tagged_16.tar.gz')
    shutil.rmtree('./hebrew dataset/16')

The cell below defines the patterns used to generate our data from Hebrew nouns. We are using two denominal, two non-denominal, and two "other" patterns. So, for each noun, we'll be able to generate at most 6 related forms.

In [None]:
class Dataset:
    """Ordered collection of datapoints.

    Besides `add_datapoint` / `remove_datapoint`, the container supports
    `len()`, iteration, and integer indexing (read and write), which is how
    the rest of this notebook handles a dataset (`len(dataset)`,
    `enumerate(dataset)`, `dataset[i]`, `dataset[i] = datapoint`).
    """

    def __init__(self):
        self.datapoints = []

    def add_datapoint(self, datapoint):
        """Append `datapoint` at the end of the collection."""
        self.datapoints.append(datapoint)

    def remove_datapoint(self, datapoint):
        """Remove the first occurrence of `datapoint`.

        Raises ValueError when `datapoint` is not in the collection.
        """
        self.datapoints.remove(datapoint)

    def __len__(self):
        return len(self.datapoints)

    def __iter__(self):
        return iter(self.datapoints)

    def __getitem__(self, index):
        return self.datapoints[index]

    def __setitem__(self, index, datapoint):
        self.datapoints[index] = datapoint
        
class Datapoint:
    """A noun together with the verb forms generated from it.

    Attributes: `noun`, `denoms`, `non_denoms`, `others`.

    The fields can also be read and assigned with the dictionary-style keys
    used throughout this notebook: 'noun', 'denom', 'non_denom', 'others'.
    Any other key raises KeyError.
    """

    # Dictionary-style key -> attribute name.
    _FIELDS = {
        'noun': 'noun',
        'denom': 'denoms',
        'non_denom': 'non_denoms',
        'others': 'others',
    }

    def __init__(self, noun, denoms, non_denoms, others):
        self.noun = noun
        self.denoms = denoms
        self.non_denoms = non_denoms
        self.others = others

    def __getitem__(self, key):
        return getattr(self, self._FIELDS[key])

    def __setitem__(self, key, value):
        setattr(self, self._FIELDS[key], value)

    def __repr__(self):
        return (
            f'Datapoint(noun={self.noun!r}, denoms={self.denoms!r}, '
            f'non_denoms={self.non_denoms!r}, others={self.others!r})'
        )
        
class Template:
    """Record holding a label, a category, a content value, and a list of
    templates registered as compatible with it."""

    def __init__(self, label, category, content):
        self.label, self.category, self.content = label, category, content
        # Starts empty; entries are added through add_compat().
        self.compat = list()

    def add_compat(self, template):
        """Append `template` to this template's `compat` list."""
        self.compat.append(template)
        

In [None]:
# Noun template strings; "ץ" is the placeholder character that the template
# matching and verb generation code below treats as an open slot.
# NOTE(review): 'אץץוץ' appears twice in this list — confirm whether the
# second occurrence was meant to be a different template.
nts = ['תץץוץת', 'ץץץן', 'מץץץ', 'תץץיץ', 'תץץוץ', 'ץץץון', 'מץץוץת', 'שץץץת', 'אץץץה', 'אץץוץ', 'ץץץני', 'מץץץת', 'מץץוץ', 'תץץוץה', 'אץץץ', 'אץץוץ']

# Wrap each template string in a Template (empty label, category 'n') and
# keep the objects; the previous loop built them and threw them away.
n_templates = [Template('', 'n', nt) for nt in nts]
    
    

In [31]:
# Noun templates. "ץ" is the placeholder character that find_template() and
# generate_dataset() treat as an open slot.
n_templates = ['תץץוץת', 'ץץץן', 'מץץץ', 'תץץיץ', 'תץץוץ', 'ץץץון', 'מץץוץת', 'שץץץת', 'אץץץה', 'אץץוץ', 'ץץץני', 'מץץץת', 'מץץוץ', 'תץץוץה', 'אץץץ', 'אץץוץ']

# Verbal patterns, indexed 0 and 1 inside each family.
dn_patterns = defaultdict(list)
ndn_patterns = defaultdict()
other_patterns = defaultdict()

# Denominal patterns: one entry per noun template, aligned by position with
# n_templates. 'NA' marks a noun template that has no form in that pattern.
# index 0: pi'el pattern for denominal
dn_patterns[0] = ['לתץץץ', 'לץץץן', 'למץץץ', 'לתץץץ', 'לתץץץ', 'לץץץן', 'למץץץ', 'לשץץץ', 'לאץץץ', 'לאץץץ', 'לץץץן', 'למץץץ', 'למץץץ', 'לתץץץ', 'לאץץץ', 'לאץץץ']
# index 1: hitpa'el pattern for denominal
dn_patterns[1] = ['NA', 'להתץץץן', 'להתמץץץ', 'NA', 'NA', 'להתץץץן', 'להתמץץץ', 'להשתץץץ', 'להתאץץץ', 'להתאץץץ', 'להתץץץן', 'להתמץץץ', 'להתמץץץ', 'NA', 'להתאץץץ', 'להתאץץץ']

# Non-denominal patterns (the same for every noun template).
# index 0: pi'el/pu'al pattern for non-denominal
ndn_patterns[0] = 'לץץץ'
# index 1: hitpa'el pattern for non-denominal
ndn_patterns[1] = 'להתץץץ'

# "Other" patterns (the same for every noun template).
# index 0: kal/nif'al pattern
other_patterns[0] = 'לץץוץ'
# index 1: hif'il/huf'al pattern
other_patterns[1] = 'להץץיץ'

# noun template -> {'denom' | 'non_denom' | 'others'} -> list of two verb templates
n_template_to_v_templates = defaultdict(lambda: defaultdict(str))

for noun_template, first_denom, second_denom in zip(n_templates, dn_patterns[0], dn_patterns[1]):
    verb_templates = n_template_to_v_templates[noun_template]
    verb_templates['denom'] = [first_denom, second_denom]
    verb_templates['non_denom'] = [ndn_patterns[0], ndn_patterns[1]]
    verb_templates['others'] = [other_patterns[0], other_patterns[1]]

In [32]:
# NOTE: an earlier, dict-based draft of generate_dataset() used to be kept
# here, "commented out" inside a triple-quoted string. The draft's own
# docstring closed that string early, so this whole cell failed with a
# SyntaxError and find_template() below was never defined.
# The draft has been removed; the generate_dataset() defined further down
# replaces it. The only behavioural difference of the draft was that it
# appended an empty string '' for every 'NA' verb template instead of
# skipping it.

def find_template(n, n_templates, cat='noun'):
    """Find every template that the word `n` fits.

    A template fits when it has the same length as `n` and every template
    character other than the placeholder "ץ" equals the character of `n` at
    the same position. The characters of `n` standing at placeholder
    positions are collected, in order, for each fitting template.

    When `cat` is 'noun', a template is rejected if any collected character
    is 'ו' or 'י'.

    Returns `(all_matched_templates, all_skipped_characters)`: two parallel
    lists, the second holding one list of collected characters per template.
    """
    all_matched_templates = []
    all_skipped_characters = []
    for n_template in n_templates:
        if len(n) != len(n_template):
            continue

        skipped_characters = []
        fixed_part_matches = True
        for word_char, template_char in zip(n, n_template):
            if template_char == "ץ":
                skipped_characters.append(word_char)
            elif word_char != template_char:
                fixed_part_matches = False
                break
        if not fixed_part_matches:
            continue

        if cat == 'noun' and ('ו' in skipped_characters or 'י' in skipped_characters):
            continue

        all_matched_templates.append(n_template)
        all_skipped_characters.append(skipped_characters)
    return all_matched_templates, all_skipped_characters

In [None]:
def _fill_verbal_template(verbal_template, skipped_characters):
    """Replace each placeholder "ץ" of `verbal_template`, left to right, with
    the next character of `skipped_characters`; other characters are kept."""
    filled = ''
    i = 0
    for character in verbal_template:
        if character == "ץ":
            filled += skipped_characters[i]
            i += 1
        else:
            filled += character
    return filled


def generate_dataset(n_template_to_v_templates, unique_ns):
    """Generate datapoints from the nouns of the Knesset annotated corpus.

    First, the nouns of `unique_ns` that match one of the noun templates
    (the keys of `n_template_to_v_templates`) are identified with
    `find_template`. The characters of a matched noun that do not belong to
    the template are then retained and inserted into the verb templates
    associated with that noun template, producing denominal, non-denominal,
    and other verb forms. Verb templates equal to 'NA' are skipped.

    One Datapoint is created per (noun, matching template) pair, so a noun
    matching several templates yields several datapoints.

    Returns the populated Dataset.
    """
    dataset = Dataset()
    n_templates = n_template_to_v_templates.keys()

    for n in unique_ns:  # we go through the noun lexicon
        all_matched_templates, all_skipped_characters = find_template(n, n_templates)

        # Nothing to iterate over when no template matched.
        for n_template, skipped_characters in zip(all_matched_templates, all_skipped_characters):
            forms = {}
            for key in ('denom', 'non_denom', 'others'):
                forms[key] = [
                    _fill_verbal_template(verbal_template, skipped_characters)
                    for verbal_template in n_template_to_v_templates[n_template][key]
                    if verbal_template != 'NA'
                ]
            # The lists are built first and handed to the constructor, so the
            # Datapoint never has to be indexed like a dictionary.
            dataset.add_datapoint(
                Datapoint(n, forms['denom'], forms['non_denom'], forms['others'])
            )
    return dataset

In [33]:
dataset = generate_dataset(n_template_to_v_templates, unique_ns)
print(f'Number of generated datapoints: {len(dataset)}')

# Flatten the per-datapoint denominal lists into one list, leaving out the
# empty-string entries.
denoms = [
    denom
    for datapoint in dataset
    for denom in datapoint['denom']
    if denom != ''
]
print(f'Total number of generated denominals: {len(denoms)}')

Number of generated datapoints: 788
Total number of generated denominals: 1435


In [40]:
def filter_dataset(dataset, unique_vs):
    """Separate the datapoints whose denominals are attested from the rest.

    Every generated denominal is looked up in `unique_vs` through
    `is_a_verb`. A denominal that is not found is removed from its datapoint
    and recorded in a junk list as a copy of the datapoint whose 'denom'
    field holds only that denominal. Empty-string denominals are removed
    without being recorded. Datapoints left without any denominal are dropped.

    `dataset` itself is not modified: the work is done on a deep copy.

    Returns `(filtered_dataset, junk)`, both lists.
    """
    # Membership is tested once or twice per denominal; a set makes each
    # test constant-time instead of a scan of the whole verb list.
    verb_lexicon = set(unique_vs)

    filtered_dataset, junk = copy.deepcopy(dataset), []
    for i, datapoint in enumerate(dataset):
        # Iterate over the untouched original while the copy is edited.
        for j, denominal in enumerate(datapoint['denom']):
            if denominal == '':
                filtered_dataset = remove_dn(filtered_dataset, denominal, i)
            elif not is_a_verb(denominal, verb_lexicon, j):
                junk.append(split_dn(copy.deepcopy(datapoint), denominal))
                filtered_dataset = remove_dn(filtered_dataset, denominal, i)
    filtered_dataset = [datapoint for datapoint in filtered_dataset if datapoint['denom']]
    return filtered_dataset, junk

def is_a_verb(dn, unique_vs, j):
    """Tell whether the generated form `dn` is attested in `unique_vs`.

    `dn` without its first character is looked up first. When `j` is 0 (the
    first denominal form), a second candidate is tried: `dn` with a 'י'
    inserted right after its first character.
    """
    if dn[1:] in unique_vs:
        return True
    # NOTE(review): this candidate keeps dn[0], whereas the lookup above drops
    # it — confirm that dn[0] + 'י' + dn[1:] (rather than
    # dn[1] + 'י' + dn[2:]) is the intended form.
    return j == 0 and (dn[0] + 'י' + dn[1:]) in unique_vs

def split_dn(datapoint, dn):
    """Set the 'denom' field of `datapoint` to the one-element list `[dn]`.

    The datapoint is changed in place and the same object is returned.
    """
    datapoint['denom'] = [dn]
    return datapoint

def remove_dn(dataset, dn, i):
    """Remove the first occurrence of `dn` from the 'denom' list of the
    datapoint at index `i`, store the datapoint back at that index, and
    return `dataset`.

    Raises ValueError (from list.remove) when `dn` is not in the list.
    """
    target = dataset[i]
    target['denom'].remove(dn)
    dataset[i] = target
    return dataset

In [41]:
filtered_dataset, junk = filter_dataset(dataset, unique_vs)
print(f'Number of remaining datapoints: {len(filtered_dataset)}')
print(f'Number of junk datapoints: {len(junk)}')

# Denominals kept after filtering (empty strings left out).
denoms = [
    denom
    for datapoint in filtered_dataset
    for denom in datapoint['denom']
    if denom != ''
]
print(f'Total number of generated denominals: {len(denoms)}')

# Denominals that ended up in the junk list (empty strings left out).
denoms = [
    denom
    for datapoint in junk
    for denom in datapoint['denom']
    if denom != ''
]
print(f'Total number of junk denominals: {len(denoms)}')

Number of remaining datapoints: 99
Number of junk datapoints: 1322
Total number of generated denominals: 113
Total number of junk denominals: 1322


In [42]:
def write_dataset(dataset, name):
    """Write `dataset` to `./<name>` as comma-separated text, encoded as UTF-8.

    The first line is the header ``Noun,Denominal,Non-denominal,Others``.
    Each datapoint then produces one line with four comma-separated columns:
    the noun, followed by the 'denom', 'non_denom' and 'others' forms. Inside
    a column every form is followed by a single space (so a non-empty column
    ends with a space). Values are written as they are, without quoting.

    The encoding is fixed explicitly because the forms are Hebrew strings and
    the platform default encoding is not guaranteed to be able to store them.
    """
    with open(f'./{name}', 'w', encoding='utf-8') as f:
        f.write('Noun,Denominal,Non-denominal,Others\n')
        for datapoint in dataset:
            columns = [datapoint['noun']]
            for key in ('denom', 'non_denom', 'others'):
                columns.append(''.join(form + ' ' for form in datapoint[key]))
            f.write(','.join(columns) + '\n')

In [44]:
# Save the rejected denominals and the retained datapoints side by side in
# the dataset folder.
write_dataset(dataset=junk, name='./hebrew dataset/junk.csv')
write_dataset(dataset=filtered_dataset, name='./hebrew dataset/filtered_dataset.csv')

In [53]:
# Manual check of the Dataset class: start from an empty instance.
ds = Dataset()

In [55]:
# Display the datapoints list of the Dataset that was just created.
ds.datapoints

[]

In [56]:
# Build a Datapoint from dummy values: a noun, two denominals, one
# non-denominal and one "other" form.
dp = Datapoint('bla', ['bli', 'blu'], ['bly'], ['ble'])

In [57]:
# Append the dummy datapoint to the dataset.
ds.add_datapoint(dp)

In [58]:
# Display the datapoints list again, now that one datapoint has been added.
ds.datapoints

[<__main__.Datapoint at 0x7f25fc043040>]