In [1]:
import os
from copy import deepcopy
import xml.etree.ElementTree as etree

In [10]:
class Sense(object):
    """Plain record describing one sense entry.

    The eight constructor arguments are stored unchanged under attributes of
    the same name. ``composed_of`` and ``derived_from`` always start as
    ``None``; nothing in this class fills them in.
    """

    def __init__(self, id, synset_id, synt_type, name,
                 lemma, main_word, poses, meaning):
        # Assignment order is kept on purpose: __str__ prints the instance
        # dict, whose key order follows attribute creation order.
        self.id, self.synset_id = id, synset_id
        self.synt_type, self.name = synt_type, name
        self.lemma, self.main_word = lemma, main_word
        self.poses, self.meaning = poses, meaning
        self.composed_of = self.derived_from = None

    def __str__(self):
        """Return the textual form of the instance attribute dictionary."""
        return str(vars(self))
    
        
class Synset(object):
    """Plain record describing one synset.

    Stores the four constructor arguments and creates six empty lists:
    ``sense_list``, ``hypernym_for``, ``hyponym_for``, ``domain_for``,
    ``antonym`` and ``pos_synonymy``. The lists are filled by whoever builds
    the synset, not by this class.
    """

    def __init__(self, id, ruthes_name, definition, part_of_speech):
        # Attribute creation order is significant: __str__ prints the
        # instance dict in that order.
        self.id, self.part_of_speech = id, part_of_speech
        self.ruthes_name, self.definition = ruthes_name, definition
        # Every field gets its own fresh list object.
        for list_field in ("sense_list", "hypernym_for", "hyponym_for",
                           "domain_for", "antonym", "pos_synonymy"):
            setattr(self, list_field, [])

    def __str__(self):
        """Return the textual form of the instance attribute dictionary."""
        return str(vars(self))
        
        
class RuWordNet(object):
    """In-memory index over a RuWordNet XML dump.

    The constructor reads ``senses.{N,V,A}.xml`` and ``synsets.{N,V,A}.xml``
    and then ``synset_relations.{N,V,A}.xml`` from ``ruwordnet_path``.

    Attributes:
        ruwordnet_path: directory given to the constructor.
        id2sense: maps a sense id to its ``Sense`` object.
        id2synset: maps a synset id to its ``Synset`` object.

    Link direction, exactly as the relation loader fills it in: for every
    relation element named "hypernym" or "instance hypernym", ``parent_id``
    is appended to ``id2synset[child_id].hypernym_for`` and ``child_id`` to
    ``id2synset[parent_id].hyponym_for``. All traversal methods below walk
    the ``hypernym_for`` lists.
    """

    def __init__(self, ruwordnet_path):
        """Load senses, synsets and hypernymy relations from ``ruwordnet_path``.

        Progress and a short statistic are printed while loading.
        """
        self.ruwordnet_path = ruwordnet_path
        self.id2sense = {}
        self.id2synset = {}

        self.__load_senses_and_synsets()
        self.__load_relations()

    def __load_senses_and_synsets(self):
        """Fill ``id2sense`` and ``id2synset`` from the sense/synset XML files.

        Files are read in the order senses N, V, A, then synsets N, V, A.
        If a file is missing, a message is printed and the remaining files
        are skipped (best effort, no exception). Duplicate ids trigger an
        ``AssertionError``. Statistics are printed at the end.
        """
        pos_tags = ("N", "V", "A")
        # (file name, holds_senses): all sense files first, then synset files.
        sources = [("senses.{}.xml".format(pos), True) for pos in pos_tags]
        sources += [("synsets.{}.xml".format(pos), False) for pos in pos_tags]

        for file_name, holds_senses in sources:
            path = os.path.join(self.ruwordnet_path, file_name)
            if not os.path.exists(path):
                print("File {} does not exist! Stop loading RuWordNet".format(path))
                break

            print("Loading senses/synsets from {} ...".format(path))
            for value in etree.parse(path).getroot():
                value_id = value.attrib["id"]
                if holds_senses:
                    assert value_id not in self.id2sense, 'Error: sense id {} already exist'.format(value_id)
                    # XML attributes map one-to-one onto Sense constructor arguments.
                    self.id2sense[value_id] = Sense(**value.attrib)
                else:
                    assert value_id not in self.id2synset, 'Error: synset id {} already exist'.format(value_id)
                    new_synset = Synset(**value.attrib)
                    # Child elements of a synset element carry the ids of its senses.
                    new_synset.sense_list.extend(sense.attrib['id'] for sense in value)
                    self.id2synset[value_id] = new_synset

        self.get_stat()

    def __load_relations(self):
        """Fill ``hypernym_for`` / ``hyponym_for`` from the relation XML files.

        Only relations named "hypernym" or "instance hypernym" are used for
        now; every other relation name is ignored. If a file is missing, a
        message is printed and the remaining files are skipped. A relation
        that refers to an unknown synset id raises ``KeyError``.
        """
        hypernymy_names = ("hypernym", "instance hypernym")

        for pos in ("N", "V", "A"):
            path = os.path.join(self.ruwordnet_path, "synset_relations.{}.xml".format(pos))
            if not os.path.exists(path):
                print("File {} does not exist! Stop loading RuWordNet".format(path))
                break

            print("Loading relations from {} ...".format(path))
            for value in etree.parse(path).getroot():
                parent_id = value.attrib["parent_id"]
                child_id = value.attrib["child_id"]
                relation_name = value.attrib["name"]

                if relation_name in hypernymy_names:
                    self.id2synset[child_id].hypernym_for.append(parent_id)
                    self.id2synset[parent_id].hyponym_for.append(child_id)

    def get_synsets(self, part_of_speech=None):
        """Return deep copies of the stored synsets.

        Args:
            part_of_speech: optional container of part-of-speech tags; a
                synset is kept when its ``part_of_speech`` is ``in`` it.
                ``None`` (default) keeps every synset.

        Returns:
            list of copied ``Synset`` objects in ``id2synset`` order.
        """
        return [deepcopy(synset)
                for synset in self.id2synset.values()
                if part_of_speech is None or synset.part_of_speech in part_of_speech]

    def get_stat(self):
        """Print the number of loaded senses and synsets."""
        print("Number of senses: {}".format(len(self.id2sense)))
        print("Number of synsets: {}".format(len(self.id2synset)))

    def get_roots(self):
        """Return deep copies of the root synsets.

        A root is a synset whose ``hyponym_for`` list is empty while its
        ``hypernym_for`` list is not.
        """
        return [deepcopy(synset)
                for synset in self.id2synset.values()
                if not synset.hyponym_for and synset.hypernym_for]

    def get_tree(self, root_synset_id):
        """Return the hierarchy below ``root_synset_id`` as nested dicts.

        The result is ``{ruthes_name: {child_ruthes_name: {...}, ...}}``,
        following ``hypernym_for`` links recursively. Children that share a
        ``ruthes_name`` under the same parent overwrite each other, because
        names are used as dictionary keys.
        """
        vert_name = self.id2synset[root_synset_id].ruthes_name
        subtree = {}
        for synset_id in self.id2synset[root_synset_id].hypernym_for:
            subtree.update(self.get_tree(synset_id))
        return {vert_name: subtree}

    def get_child_ids(self, root_synset_id):
        """Return ids reachable from ``root_synset_id`` via ``hypernym_for``.

        The list is in depth-first pre-order and does not contain the start
        id itself. A synset reachable along several paths is listed once
        per path, so the result may contain duplicates.
        """
        child_ids = []
        for synset_id in self.id2synset[root_synset_id].hypernym_for:
            child_ids.append(synset_id)
            child_ids.extend(self.get_child_ids(synset_id))
        return child_ids

    def get_synsets_without_relations(self):
        """Return deep copies of synsets with no hypernymy links at all.

        These are the synsets whose ``hyponym_for`` and ``hypernym_for``
        lists are both empty.
        """
        return [deepcopy(synset)
                for synset in self.id2synset.values()
                if not synset.hyponym_for and not synset.hypernym_for]

    def get_connect_components(self):
        """Group root synsets that share at least one descendant.

        Two roots land in the same component when they have a common
        descendant, directly or through a chain of other roots. The number
        of components is printed.

        Every synset below the roots is visited once and roots are merged
        with a union-find structure. The earlier approach recomputed all
        descendant sets after each single merge; this version also finishes
        when a cycle is reachable from a root.

        Returns:
            list of sets of root synset ids, ordered by the first
            appearance of any member in ``get_roots()``.
        """
        root_ids = [root.id for root in self.get_roots()]

        # Union-find forest over root ids; each root starts as its own leader.
        leader = {root_id: root_id for root_id in root_ids}

        def find(root_id):
            """Return the current leader of ``root_id`` (with path halving)."""
            while leader[root_id] != root_id:
                leader[root_id] = leader[leader[root_id]]
                root_id = leader[root_id]
            return root_id

        # descendant id -> root whose traversal reached it first
        claimed_by = {}
        for root_id in root_ids:
            pending = list(self.id2synset[root_id].hypernym_for)
            while pending:
                synset_id = pending.pop()
                first_root = claimed_by.get(synset_id)
                if first_root is None:
                    claimed_by[synset_id] = root_id
                    pending.extend(self.id2synset[synset_id].hypernym_for)
                else:
                    # Shared descendant: merge the two roots. Everything
                    # below it was already claimed during the first visit,
                    # so there is no need to descend again.
                    leader[find(root_id)] = find(first_root)

        # Collect roots per leader, keeping the order of first appearance.
        component_roots = []
        slot_of_leader = {}
        for root_id in root_ids:
            top = find(root_id)
            if top not in slot_of_leader:
                slot_of_leader[top] = len(component_roots)
                component_roots.append(set())
            component_roots[slot_of_leader[top]].add(root_id)

        print(len(component_roots))
        return component_roots

In [11]:
# Build the index from the XML dump directory (path is relative to the notebook).
# The constructor loads senses, synsets and hypernymy relations and prints progress.
ruwordnet = RuWordNet("../rwn-xml-2017-05-13/")

Loading senses/synsets from ../rwn-xml-2017-05-13/senses.N.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/senses.V.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/senses.A.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.N.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.V.xml ...
Loading senses/synsets from ../rwn-xml-2017-05-13/synsets.A.xml ...
Number of senses: 130416
Number of synsets: 49492
Loading relations from ../rwn-xml-2017-05-13/synset_relations.N.xml ...
Loading relations from ../rwn-xml-2017-05-13/synset_relations.V.xml ...
Loading relations from ../rwn-xml-2017-05-13/synset_relations.A.xml ...


In [12]:
# Group root synsets that share descendants; the call also prints the number of groups.
comps = ruwordnet.get_connect_components()

141


In [13]:
# First component: a set of root synset ids.
comps[0]

{'N13642',
 'N17250',
 'N21422',
 'N25077',
 'N31905',
 'N34096',
 'N36988',
 'N41294'}

In [5]:
# Dump every attribute of a single synset (Synset.__str__ prints its __dict__).
print(ruwordnet.id2synset['N34394'])

{'id': 'N34394', 'part_of_speech': 'N', 'ruthes_name': 'ПЕНЗЕНСКАЯ ОБЛАСТЬ', 'definition': '', 'sense_list': ['119446', '119447'], 'hypernym_for': [], 'hyponym_for': ['N30503', 'N33610'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [6]:
# Noun synsets only: part_of_speech is tested with `in`, so a list of tags is passed.
# get_synsets returns deep copies, so editing them does not touch ruwordnet.
n_synsets = ruwordnet.get_synsets(part_of_speech=["N"])
print(len(n_synsets))

29326


In [42]:
# No filter: deep copies of every loaded synset.
all_synsets = ruwordnet.get_synsets()
print (len(all_synsets))

49492


In [7]:
# Peek at the first ten noun synsets.
for s in n_synsets[:10]:
    print(s)

{'id': 'N12658', 'part_of_speech': 'N', 'ruthes_name': 'КОДИРОВАНИЕ ОТ ЗАВИСИМОСТИ', 'definition': '', 'sense_list': ['115643', '115640', '115641', '115642'], 'hypernym_for': [], 'hyponym_for': ['N37195', 'N14084'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N12659', 'part_of_speech': 'N', 'ruthes_name': 'ПОДГОТОВИТЬ К ИЗВЕСТИЮ', 'definition': 'готовить, настраивать соответствующим образом для восприятия чего-либо (какого-либо известия, сообщения и т. п.)', 'sense_list': ['117307', '117313'], 'hypernym_for': [], 'hyponym_for': ['N30586', 'N39232'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N12660', 'part_of_speech': 'N', 'ruthes_name': 'ПОЛОГ', 'definition': '', 'sense_list': ['29834'], 'hypernym_for': [], 'hyponym_for': ['N26308'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N12661', 'part_of_speech': 'N', 'ruthes_name': 'СВЕРНУТЬ (СЛОМАТЬ)', 'definition': '', 'sense_list': ['96991', '96989'], 'hypernym_for': [], 'hyponym_for': ['N

In [15]:
# Synsets whose hypernym_for and hyponym_for lists are both empty.
no_rel = ruwordnet.get_synsets_without_relations()
print(len(no_rel))

814


In [16]:
# Peek at the first ten synsets without hypernymy links.
for s in no_rel[:10]:
    print (s)

{'id': 'N12772', 'part_of_speech': 'N', 'ruthes_name': 'НОВИНКА, НОВОСТЬ', 'definition': '', 'sense_list': ['73830', '73829'], 'hypernym_for': [], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'V42089', 'part_of_speech': 'V', 'ruthes_name': 'АНОНИМНОСТЬ', 'definition': 'не имеющий указания на автора или хозяина, без имени автора или хозяина(о сочинении, письме, банковском счёте и\xa0т.\xa0п.)', 'sense_list': ['7073'], 'hypernym_for': [], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'V42121', 'part_of_speech': 'V', 'ruthes_name': 'ПРОЯСНЕНИЕ В ГОЛОВЕ', 'definition': '', 'sense_list': ['827', '830', '831', '832', '833'], 'hypernym_for': [], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'V42232', 'part_of_speech': 'V', 'ruthes_name': 'РАДИОАКТИВНЫЕ ОТХОДЫ', 'definition': '', 'sense_list': ['84496'], 'hypernym_for': [], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [44]:
# Roots: synsets with an empty hyponym_for list and a non-empty hypernym_for list.
roots = ruwordnet.get_roots()
print(len(roots))

450


In [45]:
# Total length of get_child_ids over all roots. A synset reachable along
# several paths is counted once per path, so this can exceed the synset count.
s = sum(len(ruwordnet.get_child_ids(r.id)) for r in roots)
# Trailing expression so the cell actually displays the total.
s

135836

In [10]:
# Print up to the first 200 root synsets for manual inspection.
for p in roots[:200]:
    print(p)

{'id': 'N13124', 'part_of_speech': 'N', 'ruthes_name': 'СЛАВЯНЕ', 'definition': '', 'sense_list': ['97323', '97322', '97325'], 'hypernym_for': ['N20366', 'N23820', 'N32246'], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N13642', 'part_of_speech': 'N', 'ruthes_name': 'РОЛЬ, ПОЛОЖЕНИЕ, МЕСТО', 'definition': 'положение кого-либо в обществе, роль, отведенная кому-либо в обществе, в какой-либо деятельности', 'sense_list': ['12476', '12477', '12479'], 'hypernym_for': ['N16140', 'N16946', 'N18603', 'N18767', 'N20953', 'N20996', 'N21004', 'N22320', 'N25662', 'N25833', 'N27949', 'N28331', 'N28766', 'N29047', 'N29583', 'N30759', 'N30782', 'N31966', 'N32419', 'N33837', 'N34164', 'N35382', 'N35390', 'N37961', 'N38989', 'N39235'], 'hyponym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}
{'id': 'N14572', 'part_of_speech': 'N', 'ruthes_name': 'КАВКАЗЦЫ', 'definition': '', 'sense_list': ['20079', '20078', '20073', '20074', '20075', '20076', '20077'], '

In [32]:
# Nested-dict hierarchy (keyed by ruthes_name) below synset V42860.
# NOTE: the name `tree` is rebound to an ElementTree in a later cell.
tree = ruwordnet.get_tree('V42860')

In [41]:
# Flat pre-order list of ids below V42860 (the start id itself is not included).
child_ids = ruwordnet.get_child_ids('V42860')
print(child_ids)

['V42060', 'V42541', 'V43389', 'V42355', 'V47806', 'V43621', 'V44585', 'V49277', 'V49357', 'V43694', 'V43895', 'V43875', 'V46338', 'V43734', 'V48375', 'V48847', 'V43937', 'V44032', 'V48050', 'V45947', 'V46953', 'V42106', 'V42783', 'V42499', 'V47742', 'V47713', 'V48792', 'V49478', 'V43928', 'V44864', 'V46875', 'V47592']


In [33]:
# Show the nested-dict hierarchy built by get_tree.
print(tree)

{'УТРАТИТЬ': {'МЕРКНУТЬ': {}, 'РАЗРЯДИТЬСЯ, ПОТЕРЯТЬ ЗАРЯД': {}, 'НЕМЕТЬ, ЗАНЕМЕТЬ': {'ДЕРЕВЕНЕТЬ,КОСТЕНЕТЬ (НЕМЕТЬ, ВОЗМОЖНО ОТВЕРДЕВАЯ)': {'КОЧЕНЕТЬ ОТ ХОЛОДА': {}}}, 'РАССТАТЬСЯ (ЛИШИТЬСЯ КОГО-ЧЕГО-Н.)': {'ОСТАВИТЬ ЗАНЯТИЕ': {'ЗАПУСТИТЬ ЗАНЯТИЯ': {}, 'ВЫЙТИ ИЗ СОСТАВА': {}}}, 'ПОПЛАТИТЬСЯ': {}, 'УТРАТИТЬ ИМУЩЕСТВО': {'ПОГОРЕТЬ (УТРАТИТЬ ИМУЩЕСТВО)': {}, 'ТЕРЯТЬ, УТЕРЯТЬ': {'ЗАБЫТЬ ВЗЯТЬ': {}, 'ДЕВАТЬ НЕИЗВЕСТНО КУДА': {}}, 'БЕДНЕТЬ (СТАНОВИТЬСЯ НЕИМУЩИМ)': {'НИЩАТЬ, ВПАДАТЬ В НИЩЕТУ': {}, 'РАЗОРИТЬСЯ': {'БАНКРОТСТВО, НЕСОСТОЯТЕЛЬНОСТЬ': {}}}}, 'ОТВЫКНУТЬ, УТРАТИТЬ ПРИВЫЧКУ': {}, 'УТРАТИТЬ СВОЮ ЧАСТЬ': {'РОНЯТЬ ПОКРОВ': {'ЛИНЬКА ЖИВОТНЫХ': {}}, 'ЛЫСЕТЬ, ТЕРЯТЬ ВОЛОСЫ': {}, 'ТРАВМАТИЧЕСКАЯ АМПУТАЦИЯ': {}}, 'ОСИРОТЕТЬ, ОСТАТЬСЯ СИРОТОЙ': {}, 'РАСПРОСТИТЬСЯ (ЛИШИТЬСЯ)': {}, 'ОПУСТЕТЬ, СТАТЬ ПУСТЫМ': {'ОБЕЗЛЮДЕТЬ': {}, 'ОСИРОТЕТЬ (ОПУСТЕТЬ, СТАТЬ НЕУСТРОЕННЫМ)': {}, 'ВЫМЕРЕТЬ (ОПУСТЕТЬ)': {}, 'ЗАСНУТЬ (ОПУСТЕТЬ И ЗАТИХНУТЬ)': {}}}}


In [30]:
# Dump every attribute of synset N12683.
print (ruwordnet.id2synset['N12683'])

{'id': 'N12683', 'part_of_speech': 'N', 'ruthes_name': 'СИСТЕМНЫЙ АДМИНИСТРАТОР', 'definition': 'специалист, занимающийся наладкой и поддержанием работоспособности компьютерной сети, парка компьютеров', 'sense_list': ['113534'], 'hypernym_for': [], 'hyponym_for': ['N34464', 'N38221'], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [91]:
# Last ten synset ids in insertion order (synsets.A.xml is the last file loaded).
list(ruwordnet.id2synset.keys())[-10:]

['A12648',
 'A12649',
 'A12650',
 'A12651',
 'A12652',
 'A12653',
 'A12654',
 'A12655',
 'A12656',
 'A12657']

In [93]:
# Dump every attribute of synset A12654.
# NOTE(review): the recorded output has no 'hyponym_for' key, so it looks like it
# came from an earlier version of Synset -- re-run to refresh.
print(ruwordnet.id2synset['A12654'])

{'id': 'A12654', 'part_of_speech': 'Adj', 'ruthes_name': 'КУРОРТ', 'definition': 'местность с целебными природными свойствами и с учреждениями для лечебных целей и для отдыха', 'sense_list': ['48528'], 'hypernym_for': [], 'domain_for': [], 'antonym': [], 'pos_synonymy': []}


In [96]:
# Print the number of loaded senses and synsets.
# NOTE(review): the recorded output says "synses" while get_stat prints "synsets",
# so it appears to predate the current class definition -- re-run to refresh.
ruwordnet.get_stat()

Number of senses: 130416
Number of synses: 49492


In [17]:
# Look up the ASCII punctuation characters from the standard library.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
# Parse the noun synsets file directly to inspect the raw XML.
# NOTE: this rebinds `tree`, which previously held the get_tree() result.
tree = etree.parse("../rwn-xml-2017-05-13/synsets.N.xml")

In [48]:
# Top-level element of the parsed document.
root = tree.getroot()

In [49]:
# Display the root element.
root

<Element 'synsets' at 0x7ff849f2acc8>

In [78]:
# For the first ten child elements of the root: print the element, its
# attributes, and each of its own children with their attributes.
for s in root[:10]:
    print(s)
    print(s.attrib)
    for v in s:
        print(v, v.attrib)

<Element 'synset' at 0x7ff849f2ad18>
{'id': 'N12658', 'ruthes_name': 'КОДИРОВАНИЕ ОТ ЗАВИСИМОСТИ', 'definition': '', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff849f2ad68> {'id': '115643'}
<Element 'sense' at 0x7ff849f2adb8> {'id': '115640'}
<Element 'sense' at 0x7ff849f2ae08> {'id': '115641'}
<Element 'sense' at 0x7ff849f2ae58> {'id': '115642'}
<Element 'synset' at 0x7ff849f2aea8>
{'id': 'N12659', 'ruthes_name': 'ПОДГОТОВИТЬ К ИЗВЕСТИЮ', 'definition': 'готовить, настраивать соответствующим образом для восприятия чего-либо (какого-либо известия, сообщения и т. п.)', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff849f2aef8> {'id': '117307'}
<Element 'sense' at 0x7ff849f2af48> {'id': '117313'}
<Element 'synset' at 0x7ff849f2af98>
{'id': 'N12660', 'ruthes_name': 'ПОЛОГ', 'definition': '', 'part_of_speech': 'N'}
<Element 'sense' at 0x7ff84f050048> {'id': '29834'}
<Element 'synset' at 0x7ff84f050098>
{'id': 'N12661', 'ruthes_name': 'СВЕРНУТЬ (СЛОМАТЬ)', 'definition': '', 'part_of_speec

In [11]:
# Number of direct children of the root element.
# NOTE(review): the recorded value (77372) differs from the 29326 noun synsets
# reported earlier, so `root` was probably parsed from senses.N.xml when this
# cell last ran -- re-run after the parse cell to confirm.
len(root)

77372

In [12]:
# Line count of senses.N.xml via the shell, for comparison with len(root).
!wc -l ../rwn-xml-2017-05-13/senses.N.xml

77374 ../rwn-xml-2017-05-13/senses.N.xml
