In [3]:
import os
import xml.etree.ElementTree as etree

In [65]:
class Sense(object):
    """One sense record of RuWordNet.

    The constructor parameter names double as the attribute names, so an
    instance can be built directly from a mapping with exactly these keys
    (e.g. ``Sense(**mapping)``).
    """

    def __init__(self, id, synset_id, synt_type, name,
                 lemma, main_word, poses, meaning):
        """
        Store every constructor argument on the instance under the same name.

        Values are kept exactly as passed (no conversion or validation).
        ``composed_of`` and ``derived_from`` are not constructor arguments;
        they always start out as ``None``.
        """
        # Assignment order is kept stable because __str__ exposes __dict__,
        # whose key order follows insertion order.
        supplied = (
            ("id", id),
            ("synset_id", synset_id),
            ("synt_type", synt_type),
            ("name", name),
            ("lemma", lemma),
            ("main_word", main_word),
            ("poses", poses),
            ("meaning", meaning),
        )
        for attr_name, attr_value in supplied:
            setattr(self, attr_name, attr_value)

        # Placeholders; nothing in this class fills them in.
        self.composed_of = self.derived_from = None

    def __str__(self):
        """Return the textual form of the instance's attribute dictionary."""
        return repr(vars(self))
        
class Synset(object):
    """Container for one synset; every field starts out empty."""

    def __init__(self):
        """
        Create a blank synset.

        The four descriptive fields start as ``None`` and the five
        collection fields start as separate empty lists.
        """
        # Descriptive fields (assigned left to right, so attribute order is
        # id, part_of_speech, ruthes_name, definition).
        self.id = self.part_of_speech = self.ruthes_name = self.definition = None

        # Collection fields: a fresh list per attribute so they never alias.
        for list_attr in ("sense_list",
                          "hypernym_for",
                          "domain_for",
                          "antonym",
                          "pos_synonymy"):
            setattr(self, list_attr, [])
        
        
class RuWordNet(object):
    """In-memory index over a directory of RuWordNet XML files.

    Attributes:
        ruwordnet_path: the directory path given to the constructor.
        id2sense: dict mapping each sense's ``id`` attribute to its ``Sense``.
        id2synset: dict reserved for synsets; nothing populates it yet
            (``__load_synsets`` is still a stub).
    """

    def __init__(self, ruwordnet_path):
        """
        Remember the data directory and immediately load the senses.

        Args:
            ruwordnet_path: directory expected to contain ``senses.N.xml``,
                ``senses.V.xml`` and ``senses.A.xml``.
        """
        self.ruwordnet_path = ruwordnet_path
        self.id2sense = {}
        self.id2synset = {}

        # Must match the method name defined below: double-underscore names
        # are mangled per class, so calling a differently named private
        # method fails with AttributeError at construction time.
        self.__load_senses_and_synsets()

    def __load_senses_and_synsets(self):
        """
        Fill ``id2sense`` from the noun, verb and adjective senses files.

        Despite its name, this method currently reads only the senses files;
        synsets are not loaded here.

        Files are processed in N, V, A order. If one is missing, a message is
        printed and loading stops; senses already read are kept.

        Raises:
            AssertionError: if a sense id occurs more than once.
        """
        senses_paths = [
            os.path.join(self.ruwordnet_path, "senses.{}.xml".format(pos))
            for pos in ("N", "V", "A")
        ]

        for path in senses_paths:
            if not os.path.exists(path):
                print("File {} does not exist! Stop loading RuWordNet".format(path))
                break

            print("Loading senses from {} ...".format(path))
            root = etree.parse(path).getroot()
            for sense in root:
                sense_id = sense.attrib["id"]
                assert sense_id not in self.id2sense, 'Error: sense id {} already exist'.format(sense_id)
                # The element's XML attributes are passed as the Sense
                # constructor's keyword arguments.
                self.id2sense[sense_id] = Sense(**sense.attrib)

    def get_stat(self):
        """Print how many senses are currently loaded."""
        print("Number of senses: {}".format(len(self.id2sense)))

    def __load_synsets(self):
        """Placeholder: synset loading is not implemented yet."""
        pass

    def __load_relations(self):
        """Placeholder: relation loading is not implemented yet."""
        pass

In [66]:
# Build the RuWordNet index from the 2017-05-13 XML dump directory
# (path is relative to the notebook's working directory).
ruwordnet = RuWordNet("../rwn-xml-2017-05-13/")

Loading senses from ../rwn-xml-2017-05-13/senses.N.xml ...
Loading senses from ../rwn-xml-2017-05-13/senses.V.xml ...
Loading senses from ../rwn-xml-2017-05-13/senses.A.xml ...


In [71]:
# Peek at the ten most recently inserted sense ids.
list(ruwordnet.id2sense)[-10:]

['77009',
 '74974',
 '126563',
 '48528',
 '12867',
 '12869',
 '12870',
 '115994',
 '57090',
 '57095']

In [72]:
# Show one loaded sense; Sense.__str__ prints its attribute dictionary.
print(ruwordnet.id2sense['57095'])

{'id': '57095', 'synset_id': 'A12657', 'synt_type': 'Adj', 'name': 'ХАМОВАТЫЙ', 'lemma': 'ХАМОВАТЫЙ', 'main_word': '', 'poses': '', 'meaning': '1', 'composed_of': None, 'derived_from': None}


In [38]:
# Print the number of senses currently held in the index.
ruwordnet.get_stat()

Number of senses: 130416


In [17]:
# Scratch look at the stdlib punctuation characters.
# NOTE(review): `string` is not used in any other visible cell; if it is
# needed, move the import to the top import cell, otherwise drop this cell.
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
# Parse the noun synsets file for manual inspection in the cells below.
tree = etree.parse("../rwn-xml-2017-05-13/synsets.N.xml")

In [48]:
# Top-level element of the parsed document; its children are iterated below.
root = tree.getroot()

In [49]:
# Display the root element to check its tag.
root

<Element 'synsets' at 0x7ff849f2acc8>

In [50]:
# Preview the first ten child elements: the element itself, then its XML attributes.
for synset_elem in root[:10]:
    for shown in (synset_elem, synset_elem.attrib):
        print(shown)

<Element 'synset' at 0x7ff849f2ad18>
{'id': 'N12658', 'ruthes_name': 'КОДИРОВАНИЕ ОТ ЗАВИСИМОСТИ', 'definition': '', 'part_of_speech': 'N'}
<Element 'synset' at 0x7ff849f2aea8>
{'id': 'N12659', 'ruthes_name': 'ПОДГОТОВИТЬ К ИЗВЕСТИЮ', 'definition': 'готовить, настраивать соответствующим образом для восприятия чего-либо (какого-либо известия, сообщения и т. п.)', 'part_of_speech': 'N'}
<Element 'synset' at 0x7ff849f2af98>
{'id': 'N12660', 'ruthes_name': 'ПОЛОГ', 'definition': '', 'part_of_speech': 'N'}
<Element 'synset' at 0x7ff84f050098>
{'id': 'N12661', 'ruthes_name': 'СВЕРНУТЬ (СЛОМАТЬ)', 'definition': '', 'part_of_speech': 'N'}
<Element 'synset' at 0x7ff84f050188>
{'id': 'N12662', 'ruthes_name': 'ОБЛАВА НА ЗВЕРЯ', 'definition': 'охота, при которой загонщики окружают то место, где находится зверь, и гонят его на сидящих в засаде охотников; цепь загонщиков, окружающих зверя', 'part_of_speech': 'N'}
<Element 'synset' at 0x7ff84f050368>
{'id': 'N12663', 'ruthes_name': 'РАЗВОДЫ (УЗОР)', 

In [11]:
# Number of direct children of the root element.
# NOTE(review): this cell's execution count (11) predates the cell that
# defines `root` above (48), so the recorded output may refer to a
# different file - re-run top to bottom to confirm.
len(root)

77372

In [12]:
# Line count of the noun senses file, for comparison with len(root).
# NOTE(review): this counts senses.N.xml, while the visible `root` is parsed
# from synsets.N.xml - confirm which file the comparison is meant for.
!wc -l ../rwn-xml-2017-05-13/senses.N.xml

77374 ../rwn-xml-2017-05-13/senses.N.xml
