In [11]:
import os
from copy import deepcopy
import xml.etree.ElementTree as etree

In [26]:
class Sense(object):
    def __init__(self,
                 id,
                 synset_id,
                 synt_type,
                 name,
                 lemma,
                 main_word,
                 poses,
                 meaning):
        """
        Sense class
        """
        self.id = id
        self.synset_id = synset_id
        self.synt_type = synt_type
        self.name = name
        self.lemma = lemma
        self.main_word = main_word
        self.poses = poses
        self.meaning = meaning
        self.composed_of = None
        self.derived_from = None
        
    def __str__(self):
        return str(self.__dict__)
    
# <synset id="A4790" ruthes_name="ЦИРКОВОЙ МАНЕЖ"
# definition="большая круглая площадка посреди цирка, место, где выступают артисты" part_of_speech="Adj">
#     <sense id="21311">АРЕННЫЙ</sense>
#     <sense id="21312">МАНЕЖНЫЙ</sense>
# </synset>
        
class Synset(object):
    def __init__(self,
                 id,
                 ruthes_name,
                 definition,
                 part_of_speech
                ):
        self.id = id
        self.part_of_speech = part_of_speech
        self.ruthes_name = ruthes_name
        self.definition = definition
        self.sense_list = []
        self.hypernym_for = []
        self.hyponym_for = []
        self.domain_for = []
        self.antonym = []
        self.pos_synonymy = []
        
    def __str__(self):
        return str(self.__dict__)
        
        
class RuWordNet(object):
    def __init__(self, ruwordnet_path):
        self.ruwordnet_path = ruwordnet_path
        self.id2sense = {}
        self.id2synset = {}
        
        self.__load_senses_and_synsets()
        self.__load_relations()
    
    def __load_senses_and_synsets(self):
        n_senses_path = os.path.join(self.ruwordnet_path, "senses.N.xml")
        v_senses_path = os.path.join(self.ruwordnet_path, "senses.V.xml")
        a_senses_path = os.path.join(self.ruwordnet_path, "senses.A.xml")
        n_synsets_path = os.path.join(self.ruwordnet_path, "synsets.N.xml")
        v_synsets_path = os.path.join(self.ruwordnet_path, "synsets.V.xml")
        a_synsets_path = os.path.join(self.ruwordnet_path, "synsets.A.xml")
        senses_synsets_paths = [n_senses_path, v_senses_path, a_senses_path,
                        n_synsets_path, v_synsets_path, a_synsets_path]
        
        for path_idx, path in enumerate(senses_synsets_paths):
            if not os.path.exists(path):
                print("File {} does not exist! Stop loading RuWordNet".format(path))
                break
            
            print("Loading senses/synsets from {} ...".format(path))
            tree = etree.parse(path)
            root = tree.getroot()
            for value in root:
                value_id = value.attrib["id"]
                if path_idx < 3:
                    assert value_id not in self.id2sense, 'Error: sense id {} already exist'.format(value_id)
                    self.id2sense[value_id] = Sense(**value.attrib)
                else:
                    assert value_id not in self.id2synset, 'Error: synset id {} already exist'.format(value_id)
                    new_synset = Synset(**value.attrib)
                    for sense in value:
                        new_synset.sense_list.append(sense.attrib['id'])
                    self.id2synset[value_id] = new_synset
                    
        self.get_stat()
        
    def __load_relations(self):
        """
        Only hypernymy loading today
        """
        n_relations_path = os.path.join(self.ruwordnet_path, "synset_relations.N.xml")
        v_relations_path = os.path.join(self.ruwordnet_path, "synset_relations.V.xml")
        a_relations_path = os.path.join(self.ruwordnet_path, "synset_relations.A.xml")
        
        relations_paths = [n_relations_path, v_relations_path, a_relations_path]
        
        for path_idx, path in enumerate(relations_paths):
            if not os.path.exists(path):
                print("File {} does not exist! Stop loading RuWordNet".format(path))
                break
                
            print("Loading relations from {} ...".format(path))
            tree = etree.parse(path)
            root = tree.getroot()
            for value in root:
                parent_id = value.attrib["parent_id"]
                child_id = value.attrib["child_id"]
                relation_name = value.attrib["name"]
                
                if relation_name == "hypernym":
                    self.id2synset[child_id].hypernym_for.append(child_id)
                    self.id2synset[parent_id].hyponym_for.append(child_id)
                    
                
    def get_stat(self):
        print("Number of senses: {}".format(len(self.id2sense)))
        print("Number of synsets: {}".format(len(self.id2synset)))
        
    def get_roots(self):
        root_synsets = []
        for synset_id, synset in self.id2synset.items():
            if len(synset.hyponym_for) == 0 and len(synset.hypernym_for) != 0:
                root_synsets.append(deepcopy(synset))
        return root_synsets

In [27]:
ruwordnet = RuWordNet("../rwn-xml-2017-05-13/")

Loading senses/synsets from ../rwn-xml-2017-05-13/senses.N.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/senses.V.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/senses.A.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.N.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.V.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.A.xml ...
Number of senses: 130416
Number of synsets: 49492
Loading relations from ../rwn-xml-2017-05-13/synset_relations.N.xml ...
Loading relations from ../rwn-xml-2017-05-13/synset_relations.V.xml ...
Loading relations from ../rwn-xml-2017-05-13/synset_relations.A.xml ...


In [28]:
roots = ruwordnet.get_roots()
print(len(roots))

459


In [29]:
for p in roots[:200]:
    print(p)

{'id': 'N13124', 'part_of_speech': 'N', 'ruthes_name': 'СЛАВЯНЕ', 'definition': '', 'sense_list': ['97323', '97322', '97325'], 'hypernym_for': ['N13124', 'N13124', 'N13124'], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N13642', 'part_of_speech': 'N', 'ruthes_name': 'РОЛЬ, ПОЛОЖЕНИЕ, МЕСТО', 'definition': 'положение кого-либо в обществе, роль, отведенная кому-либо в обществе, в какой-либо деятельности', 'sense_list': ['12476', '12477', '12479'], 'hypernym_for': ['N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642', 'N13642'], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N14572', 'part_of_speech': 'N', 'ruthes_name': 'КАВКАЗЦЫ', 'definition': '', 'sense_list': ['20079', '20078', '20073', '20074', '20075', '20076', '20077'], '

In [30]:
print (ruwordnet.id2synset['N12683'])

{'id': 'N12683', 'part_of_speech': 'N', 'ruthes_name': 'СИСТЕМНЫЙ АДМИНИСТРАТОР', 'definition': 'специалист, занимающийся наладкой и поддержанием работоспособности компьютерной сети, парка компьютеров', 'sense_list': ['113534'], 'hypernym_for': [], 'hyponym_for': ['N34464', 'N38221'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [91]:
list(ruwordnet.id2synset.keys())[-10:]

['A12648',
 'A12649',
 'A12650',
 'A12651',
 'A12652',
 'A12653',
 'A12654',
 'A12655',
 'A12656',
 'A12657']

In [93]:
print(ruwordnet.id2synset['A12654'])

{'id': 'A12654', 'part_of_speech': 'Adj', 'ruthes_name': 'КУРОРТ', 'definition': 'местность с целебными природными свойствами и с учреждениями для лечебных целей и для отдыха', 'sense_list': ['48528'], 'hypernym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [96]:
ruwordnet.get_stat()

Number of senses: 130416
Number of synses: 49492


In [17]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
tree = etree.parse("../rwn-xml-2017-05-13/synsets.N.xml")

In [48]:
root = tree.getroot()

In [49]:
root

<Element 'synsets' at 0x7ff849f2acc8>

In [78]:
for s in root[:10]:
    print(s)
    print(s.attrib)
    for v in s:
        print(v, v.attrib)

<Element 'synset' at 0x7ff849f2ad18>
{'id': 'N12658', 'ruthes_name': 'КОДИРОВАНИЕ ОТ ЗАВИСИМОСТИ', 'definition': '', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff849f2ad68> {'id': '115643'}
<Element 'sense' at 0x7ff849f2adb8> {'id': '115640'}
<Element 'sense' at 0x7ff849f2ae08> {'id': '115641'}
<Element 'sense' at 0x7ff849f2ae58> {'id': '115642'}
<Element 'synset' at 0x7ff849f2aea8>
{'id': 'N12659', 'ruthes_name': 'ПОДГОТОВИТЬ К ИЗВЕСТИЮ', 'definition': 'готовить, настраивать соответствующим образом для восприятия чего-либо (какого-либо известия, сообщения и т. п.)', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff849f2aef8> {'id': '117307'}
<Element 'sense' at 0x7ff849f2af48> {'id': '117313'}
<Element 'synset' at 0x7ff849f2af98>
{'id': 'N12660', 'ruthes_name': 'ПОЛОГ', 'definition': '', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff84f050048> {'id': '29834'}
<Element 'synset' at 0x7ff84f050098>
{'id': 'N12661', 'ruthes_name': 'СВЕРНУТЬ (СЛОМАТЬ)', 'definition': '', 'part_of_speec

In [11]:
len(root)

77372

In [12]:
!wc -l ../rwn-xml-2017-05-13/senses.N.xml

77374 ../rwn-xml-2017-05-13/senses.N.xml
