In [1]:
from xml.dom.minidom import (parse, Element)

In [3]:
# Base name (without extension) of the corpus file to load. The commented-out
# assignments below are alternative corpora that can be switched in.
corpus_filename = 'Tselina'
#corpus_filename = 'pedagogika'
#corpus_filename = 'sirija'

file_ext = '.xml'
# Parse the XML file and collect every <S> element; the rest of the notebook
# treats each of them as one sentence.
doc = parse(corpus_filename + file_ext)
sentences = doc.getElementsByTagName('S')

In [4]:
# Show how many <S> elements were found (the label reads "Number of sentences in the corpus").
f'Количество предложений в корпусе = {len(sentences)}'

'Количество предложений в корпусе = 189'

In [12]:
# Global lexicon filled as a side effect of Word.__init__: maps a FEAT attribute
# value to the set of lowercased word forms seen with that value.
d = {}

class Sentence:
    """One <S> element of the corpus, turned into a tree of Word objects.

    Attributes:
        id: value of the element's ID attribute.
        words: Word objects in document order.
        wordMap: maps a DOM attribute value (a head's ID, or '_root') to the
            list of words that name it as their head.
        rootWord: the word whose DOM is '_root', or None when the sentence
            contains no such word. If several words qualify, the last one wins.
    """

    def __init__(self, sent: Element):
        """Read the sentence ID and build the word tree from the <W> descendants."""
        self.id = sent.getAttribute('ID')
        self.words = []
        self.wordMap = {}
        # Defined up front so a sentence without a '_root' word does not leave
        # the attribute missing (printGram used to fail with AttributeError).
        self.rootWord = None

        self.parseWords(sent.getElementsByTagName('W'))

    def parseWords(self, rawWords):
        """Create a Word per raw element and attach every word to its head.

        Args:
            rawWords: iterable of <W> DOM elements, in document order.
        """
        self.wordMap = {}

        # First pass: create the words and group them by the ID of their head.
        for rawWord in rawWords:
            word = Word(rawWord, self.id)
            self.words.append(word)

            if word.dom == '_root':
                self.rootWord = word

            self.wordMap.setdefault(word.dom, []).append(word)

        # Second pass: hand each word its dependents, preserving document order.
        for word in self.words:
            for dependent in self.wordMap.get(word.id, []):
                word.addWord(dependent)

    def printGram(self):
        """Print the grammar rules of this sentence, starting from the root word.

        Prints nothing when the sentence has no '_root' word; the root word
        itself prints nothing when its projectivity check fails.
        """
        if self.rootWord is None:
            return
        self.rootWord.printGram(True)
        
        
class Word:
    """A single <W> element: its attributes plus the words that depend on it.

    Constructing a Word also records its lowercased surface form in the
    module-level dictionary ``d`` under the word's FEAT value.
    """

    def __init__(self, w: Element, sentId):
        """Copy the element's attributes and register its surface form in ``d``.

        Args:
            w: the <W> DOM element.
            sentId: ID of the sentence the word belongs to.
        """
        self.sentId = sentId
        self.id = w.getAttribute('ID')
        self.dom = w.getAttribute('DOM')
        self.link = w.getAttribute('LINK')
        self.feat = w.getAttribute('FEAT')
        self.lemma = w.getAttribute('LEMMA')
        # Dependents of this word; filled through addWord().
        self.connectedWords = []

        # Surface form: the element's direct text nodes, lowercased and trimmed,
        # with a single trailing full stop removed.
        pieces = [node.nodeValue for node in w.childNodes if node.nodeType == node.TEXT_NODE]
        form = " ".join(pieces).lower().strip()
        if form.endswith('.'):
            form = form[:-1]
        if form:
            d.setdefault(self.feat, set()).add(form)

    def addWord(self, w):
        """Register ``w`` as a dependent of this word."""
        self.connectedWords.append(w)

    def printGram(self, checkProj):
        """Print the F rule of this word, then the D rules and subtrees of its dependents.

        Args:
            checkProj: when true, run checkProjectivity on this subtree first
                and print nothing if it fails.
        """
        # Remove this early return to stop skipping sentences that fail the
        # check (the original also kept a commented-out "NOT PROJECTIVE" print here).
        if checkProj and not checkProjectivity(self, {}, 0):
            return

        head = f'[{self.feat}]'
        print(f'F{{{head}}}->', end = '')

        if not self.connectedWords:
            print(head)
            return

        # The head is emitted just before the first dependent whose ID is
        # larger than its own, or after all dependents if there is none.
        headPrinted = False
        for child in self.connectedWords:
            if not headPrinted and int(child.id) > int(self.id):
                print(head, end = '')
                headPrinted = True
            print(f';D{{[{child.feat}], {child.link}}};', end = '')
        if not headPrinted:
            print(head, end = '')

        for child in self.connectedWords:
            # To omit the "D(t, s) -> F(t)" rule, replace this print with a bare print().
            print(f'\nD{{[{child.feat}], {child.link}}}->F{{[{child.feat}]}}')
            child.printGram(False)
                
def checkProjectivity(rootWord, seenWordsIds, floor):
    """Depth-first check that, on every tree level, word IDs are visited in non-decreasing order.

    Args:
        rootWord: node to check; needs ``id`` and ``connectedWords``.
        seenWordsIds: dict mapping a depth to the list of IDs already visited
            at that depth; it is updated in place.
        floor: depth of ``rootWord`` (0 for the top-level call).

    Returns:
        False as soon as a node is reached whose ID is numerically smaller
        than an ID already recorded at the same depth, True otherwise.
    """
    visitedHere = seenWordsIds.setdefault(floor, [])
    if any(int(earlier) > int(rootWord.id) for earlier in visitedHere):
        return False
    visitedHere.append(rootWord.id)

    # all() stops at the first failing subtree, like the original loop did.
    return all(
        checkProjectivity(child, seenWordsIds, floor + 1)
        for child in rootWord.connectedWords
    )
        

In [6]:
def parseSentences():
    """Print the grammar of every element in the global ``sentences`` list, in order."""
    for sentenceElement in sentences:
        parseSentence(sentenceElement)

def parseSentence(sent: Element):
    """Build a Sentence from one <S> element and print its grammar.

    Args:
        sent: the <S> DOM element.
    """
    Sentence(sent).printGram()

In [7]:
import sys
# Output path derived from the corpus name. The single-sentence test cell
# further down reassigns `filename` to 'test/test.out'.
filename = corpus_filename + '_' + 'gram.out'

In [66]:
# For testing: writes the grammar of a single sentence to a file.

sentence_id = 119  # ID of the sentence in the corpus (<S ID='sentence_id'>); used as a 1-based index into `sentences`

filename = 'test/test.out'
filename_sorted = 'test/test_sorted.out'

# printGram() writes with print(), so stdout is pointed at the file for the
# duration of the call. The try/finally and the `with` block make sure the
# notebook gets its stdout back and the file is closed even if building or
# printing the sentence raises.
orig_stdout = sys.stdout
with open(filename, 'w', encoding = 'utf-16') as f:
    sys.stdout = f
    try:
        sentence = Sentence(sentences[sentence_id - 1])
        sentence.printGram()
    finally:
        sys.stdout = orig_stdout

In [67]:
# Sorted-output path derived from the corpus name; this replaces the
# 'test/test_sorted.out' value assigned in the single-sentence test cell.
# NOTE(review): a later cell reads 'test/test_sorted.out' — confirm which path is intended.
filename_sorted = corpus_filename + '_' + 'gram_sorted.out'

In [8]:
def printSortedNoDuplicatesFile(fname, fname_sorted):
    """Write a cleaned, duplicate-free copy of a grammar file.

    The lines of ``fname`` are sorted as read, then each one is normalised:
    surrounding whitespace and one trailing ';' are removed, ';;' becomes ';'
    and '->;' becomes '->'. Lines that are empty after normalisation are
    dropped. Duplicates are detected on the normalised text, so two input
    lines that only differ in the removed parts are written once.

    The input is read completely before the output is opened, which makes it
    safe to pass the same path for both arguments.

    Args:
        fname: path of the UTF-16 input file.
        fname_sorted: path of the UTF-16 output file (overwritten).
    """
    with open(fname, 'r', encoding = 'utf-16') as r:
        raw_lines = sorted(r)

    emitted = set()
    with open(fname_sorted, 'w', encoding = 'utf-16') as f:
        for raw in raw_lines:
            line = raw.strip()
            if line.endswith(';'):
                line = line[:-1].strip()
            line = line.replace(';;', ';').replace('->;', '->')

            if not line or line in emitted:
                continue
            emitted.add(line)
            print(line, file = f)

In [69]:
# Write the sorted, de-duplicated version of `filename` to `filename_sorted`.
printSortedNoDuplicatesFile(filename, filename_sorted)

In [70]:
def printDictionary(d, fname):
    """Write one ``[key] = form1 | form2 | ...`` line per dictionary entry.

    The forms of each entry are written in sorted order so the file is
    identical from run to run (iterating a set of strings directly gives an
    order that can change between interpreter sessions). Entries keep the
    dictionary's own order.

    Args:
        d: mapping from a key to an iterable of strings.
        fname: path of the UTF-16 output file (overwritten).
    """
    with open(fname, 'w', encoding='utf-16') as f:
        for key, value in d.items():
            s = ' | '.join(sorted(value))
            print(f'[{key}] = {s}', file = f)

In [71]:
# Dump the global lexicon `d` to '<corpus_filename>_dict.out'.
printDictionary(d, f'{corpus_filename}_dict.out')

In [86]:
# Grammar rules collected by mapGrammar, grouped by left-hand side:
# d_gram_map holds the rules whose left-hand side starts with 'D',
# f_gram_map holds all the others.
d_gram_map = {}
f_gram_map = {}

In [87]:
def addToMap(mapName, key, value):
    """Append ``value`` to the list stored under ``key``, creating the list on first use.

    Args:
        mapName: dictionary of lists, modified in place.
        key: entry to extend.
        value: item to append.
    """
    mapName.setdefault(key, []).append(value)

def mapGrammar(filename):
    """Load ``lhs->rhs`` rules from a UTF-16 file into the global rule maps.

    Rules whose left-hand side starts with 'D' are appended to ``d_gram_map``,
    all others to ``f_gram_map``. Both sides are stripped of surrounding
    whitespace. Only the first '->' separates the two sides, so a right-hand
    side that itself contains '->' is kept whole. Blank lines are skipped.

    Args:
        filename: path of the grammar file.

    Raises:
        ValueError: if a non-blank line has no '->' or an empty left-hand side.
    """
    with open(filename, 'r', encoding = 'utf-16') as r:
        for lineNo, line in enumerate(r, start = 1):
            if not line.strip():
                continue

            lhs, arrow, rhs = line.partition('->')
            lhs = lhs.strip()
            rhs = rhs.strip()
            if not arrow or not lhs:
                raise ValueError(
                    f'{filename}, line {lineNo}: expected "<lhs>-><rhs>", got {line!r}'
                )

            target = d_gram_map if lhs.startswith('D') else f_gram_map
            addToMap(target, lhs, rhs)

In [26]:
def printGrammarMap(mapName, filename_min):
    """Write one ``key->alt1|alt2|...`` line per entry of ``mapName``.

    Args:
        mapName: mapping from a rule name to its list of alternative strings.
        filename_min: path of the UTF-16 output file (overwritten).
    """
    with open(filename_min, 'w', encoding = 'utf-16') as f:
        for key in mapName:
            alternatives = '|'.join(mapName[key])
            print(f'{key}->{alternatives}', file = f)

In [27]:
def dMapFirstStep():
    """Replace every D rule by the alternatives of the F rule it points to.

    For each key of the global ``d_gram_map``, the first stored right-hand
    side is looked up in the global ``f_gram_map`` and that list becomes the
    new value (the same list object, not a copy). A right-hand side missing
    from ``f_gram_map`` raises KeyError.
    """
    for key in d_gram_map:
        firstRhs = d_gram_map[key][0]
        d_gram_map[key] = f_gram_map[firstRhs]

In [28]:
def removeRecursion(d_gram_map):
    """Substitute already-expanded D rules into the rules that mention them.

    A rule counts as expanded when the text of its alternatives list contains
    no 'D{'. Each expanded rule is processed once: in every rule whose text
    mentions its key, each alternative is replaced by one copy per alternative
    of the expanded rule, with the key textually replaced. Passes repeat until
    one finds no new expanded rule.

    The dictionary is modified in place and also returned. Rules that never
    become free of 'D{' are left with their references in place.
    """
    # Keys that have already been substituted into the other rules.
    seen = []
    t = True
    while t:
        t = False
        for key, value in d_gram_map.items():
            if 'D{' not in str(value) and key not in seen:
                t = True
                seen.append(key)
                # Only values of existing keys are reassigned below, so the
                # dictionary's size does not change while it is being iterated.
                for k, v in d_gram_map.items():
                    l = []
                    if key in str(v):
                        # NOTE(review): alternatives of v that do not contain
                        # `key` are still appended once per element of `value`,
                        # which produces repeated entries — confirm this is intended.
                        for item in v:
                            for val in value:
                                l.append(item.replace(key, val))
                    if l != []:
                        d_gram_map[k] = l
                        
    return d_gram_map
            

In [29]:
import copy

In [30]:
# Reset both maps before loading, because mapGrammar appends to them.
d_gram_map = {}
f_gram_map = {}


# Load the rules, replace each D rule by the alternatives of the F rule it
# points to, expand D references on a deep copy (d_gram_map itself is left
# as it is), then write the result and its sorted, de-duplicated version.
mapGrammar('test/test_sorted.out')
    
dMapFirstStep()

new_map = removeRecursion(copy.deepcopy(d_gram_map))

printGrammarMap(new_map, 'test/test_gram_min.out')
printSortedNoDuplicatesFile('test/test_gram_min.out', 'test/test_gram_min_sorted.out')

In [14]:
class RTNNode:
    """One transition of a linear chain: a symbol plus the state it leads to.

    Attributes:
        s: the symbol; printed with a leading '$' when it starts with 'D'.
        i: number of the state reached after the symbol.
        isEnd: True for the last transition of a chain (printed with '*'
            instead of a state number).
        connected: the next transition in the chain, or None.
    """

    def __init__(self, s, i, isEnd = False):
        self.s, self.i, self.isEnd = s, i, isEnd
        self.connected = None

    def connect(self, node):
        """Make ``node`` the transition that follows this one."""
        self.connected = node

    def printNode(self):
        """Print this transition as ``<symbol> <target>;`` on its own line."""
        label = f'${self.s}' if self.s[0] == 'D' else self.s
        target = '*' if self.isEnd else self.i
        print(label, target, end = ';\n')

    def printNodes(self, printItself = False):
        """Print the state blocks of the chain that starts at this transition.

        Args:
            printItself: when true, this transition's own line is printed
                first; the transitions that follow are always printed.
        """
        node, showLine = self, printItself
        while node is not None:
            if showLine:
                node.printNode()
            if not node.isEnd:
                # Header of the state this transition leads to.
                print(f'{node.i}:')
            node, showLine = node.connected, True
        

def _buildChains(alternatives):
    """Turn each ';'-separated alternative into a chain of RTNNode objects.

    Every non-final symbol gets its own target state. States are numbered
    consecutively starting at 1 across all alternatives, so no two chains
    share a state. Returns the first node of every chain, in input order.
    """
    heads = []
    nextState = 1
    for alternative in alternatives:
        symbols = alternative.split(';')
        # The last symbol closes the chain and has no numbered target state.
        node = RTNNode(symbols[-1], 0, isEnd = True)
        # Build backwards so each node can be linked to its successor.
        for offset in range(len(symbols) - 2, -1, -1):
            previous = RTNNode(symbols[offset], nextState + offset)
            previous.connect(node)
            node = previous
        heads.append(node)
        nextState += len(symbols) - 1
    return heads


def toRTN(new_map, fname):
    """Write every rule of ``new_map`` as a block of numbered state transitions.

    For each rule the output contains ``$<name>``, an opening '(', the state
    '0:' listing the first transition of every alternative, the remaining
    states of each alternative, a closing ')' and a blank line.

    State numbers are allocated consecutively over the alternatives of a
    rule. The earlier numbering advanced by the length of the *previous*
    alternative, so an alternative longer than the one before it reused
    state numbers and the two chains merged.

    stdout is redirected to the file while printing and is always restored,
    and the file is always closed, even when an error occurs.

    Args:
        new_map: mapping from a rule name to a list of alternatives, each a
            ';'-separated string of symbols.
        fname: path of the UTF-16 output file (overwritten).
    """
    orig_stdout = sys.stdout
    with open(fname, 'w', encoding = 'utf-16') as f:
        sys.stdout = f
        try:
            for key, alternatives in new_map.items():
                heads = _buildChains(alternatives)
                print(f'${key}')
                print('(')
                print('0:')
                for head in heads:
                    head.printNode()
                for head in heads:
                    head.printNodes()
                print(')')
                print()
        finally:
            sys.stdout = orig_stdout

In [15]:
# Hand-written sample in the shape toRTN consumes: rule name -> list of
# alternatives, each a ';'-separated sequence of symbols.
test_new_map = {
    'S': ['D1;D2;D3;t;D4;D5'],
    'S1': ['a;D1', 'D2;D1']
}

# When True, the next cell converts test_new_map instead of new_map.
t_new_map = True

In [16]:
# Convert either the grammar built above or the hand-written sample,
# depending on the t_new_map switch.
if not t_new_map:
    toRTN(new_map, 'test/test_rtn.out')
else:
    toRTN(test_new_map, 't_new_map.out')