In [1]:
!which python
# Expected: the interpreter of the `yargy` conda environment.

/Users/alexskrn/opt/anaconda3/envs/yargy/bin/python


In [2]:
import os
import json

from yargy import Parser
from yargy import rule
from yargy import predicates
from yargy import or_, and_, not_
from yargy import pipelines
from yargy.interpretation import fact, attribute
from ipymarkup import show_markup

In [3]:
def show_matches(rule, *lines):
    """Highlight every match of ``rule`` in each of the given lines.

    One ``Parser`` is built from ``rule`` and reused for all lines; for each
    line the spans of the matches returned by ``findall`` are handed to
    ``show_markup`` together with the line itself.

    Note: inside this function the ``rule`` parameter shadows the
    module-level ``rule`` imported from yargy.
    """
    rule_parser = Parser(rule)
    for text in lines:
        found_spans = []
        for found in rule_parser.findall(text):
            found_spans.append(found.span)
        show_markup(text, found_spans)

def load_lines(path, encoding='utf-8'):
    """Lazily yield the lines of a text file without their trailing newline.

    Args:
        path: path of the file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8
            so that reading does not depend on the platform's default
            encoding (the other file reader in this notebook also passes an
            explicit UTF-8 encoding).

    Yields:
        Each line with the trailing '\\n' removed; other whitespace is kept.
    """
    with open(path, encoding=encoding) as file:
        for line in file:
            yield line.rstrip('\n')


def show_json(data):
    """Print ``data`` as JSON indented by two spaces, keeping non-ASCII characters as is."""
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    print(rendered)


def join_spans(text, spans):
    """Return the fragments of ``text`` selected by ``spans``, joined by single spaces.

    ``spans`` is an iterable of ``(start, stop)`` pairs; the pairs are sorted
    first, so the fragments appear in ascending span order regardless of the
    order in which they were passed.
    """
    fragments = []
    for begin, end in sorted(spans):
        fragments.append(text[begin:end])
    return ' '.join(fragments)


class Match(object):
    """Result of one extraction: the interpreted fact and the matched spans."""

    def __init__(self, fact, spans):
        # `fact` may be None when the wrapper grammar found nothing.
        self.fact, self.spans = fact, spans
        

class Extractor(object):
    """Two-pass fact extractor.

    The first pass finds every fragment matched by the union rule; the
    second pass runs the wrapper rule over those fragments only (joined into
    one line) and takes the fact of its first match.
    """

    def __init__(self, union_rule_obj, wrapper_obj):
        # One parser per pass, built once and reused for every text.
        self.union_rule_obj_parser = Parser(union_rule_obj)
        self.wrapper_parser = Parser(wrapper_obj)

    def __call__(self, text):
        """Extract a fact from ``text``.

        Returns a ``Match`` whose ``spans`` are the first-pass spans (relative
        to the original ``text``) and whose ``fact`` is the fact of the first
        wrapper match, or None when the wrapper rule matches nothing.
        """
        spans = [found.span for found in self.union_rule_obj_parser.findall(text)]

        # The wrapper only sees the matched fragments, in sorted span order.
        condensed = join_spans(text, spans)
        wrapped = list(self.wrapper_parser.findall(condensed))
        fact = wrapped[0].fact if wrapped else None

        return Match(fact, spans)

In [4]:
# Metro station names: the distinct lines of the dictionary file.
metro_dict_path = os.path.join('dicts', 'metro.txt')
METRO_STATIONS = set(load_lines(metro_dict_path))

In [5]:
# Advertisement texts: every non-blank line of the data file, stripped.
with open(os.path.join('data', 'flats.txt'), 'r', encoding='utf8') as ads_file:
    TEXTS = [ad for ad in map(str.strip, ads_file) if ad]
print(len(TEXTS))
print(TEXTS[0])

20
text1. ann1. Арбатская м. 1-комн. кв-ру, или м.Кропоткинская 7 мин/п, Староконюшенный пер., д.30. 35/21 кв.м, кухня 10, 10/12 эт, блоч. дома, балкон, тел., хор. сост., с/у совм., окна в тихий сквер, освобождение сразу, продаю, 45 тыс.$. Торг Т 557-00-73


### COMMON RULES

In [6]:
# A single integer token (yargy token type 'INT').
INT = rule(predicates.type('INT'))

# Decimal number: integer part, ',' or '.', then exactly one non-zero digit.
# The digits are passed as a *set*: `in_` checks `token.value in <container>`,
# and with the plain string '123456789' that is a substring test, which also
# accepted multi-digit tokens such as '23' or '456' as the fractional part.
# NOTE(review): '0' is still not accepted as the fractional digit (e.g.
# '20.0'), as in the original digit list -- confirm this is intended.
FLOAT = rule(
    INT,
    predicates.in_({',', '.'}),
    predicates.in_(set('123456789'))  # no more than one digit after the separator
)

# A number with or without the one-digit fractional part.
INT_OR_FLOAT = rule(or_(INT, FLOAT))

# Punctuation predicates shared by the attribute rules below.
SLASH = predicates.eq('/')
DASH = predicates.eq('-')
DOT = predicates.eq('.')
COMMA = predicates.eq(',')

### Атрибуты объекта FLAT

In [7]:
# Fact describing one flat advertisement. `metro` and `telephone` are
# repeatable attributes; the remaining ones are plain attributes.
flat_fact = fact(
    'flat',
    [
        attribute('metro').repeatable(),
        'rooms',
        'address',
        'floor_area',
        'floor',
        'price',
        attribute('telephone').repeatable(),
    ],
)

### Атрибут METRO

In [8]:
# One or more consecutive station names from the dictionary, stored in flat.metro.
station_name = (
    pipelines.pipeline(METRO_STATIONS)
    .repeatable()
    .interpretation(flat_fact.metro)
)

# The 'м.' marker may come before the station name or after it.
marker_then_station = rule('м', '.', station_name)
station_then_marker = rule(station_name, 'м', '.')

METRO = rule(or_(marker_then_station, station_then_marker)).repeatable()

example_1 = 'text1. ann1. Арбатская м. 1-комн. кв-ру, или м.Кропоткинская 7 мин/п, Староконюшенный пер.'
example_2 = 'text2. ann2. Арбатская м. 1-комн. кв ру, 5 м/пеш., ул.Арбат дом 15/43;'

show_matches(METRO, example_1, example_2)

### Атрибут ROOMS

In [9]:
# Marker that follows the number of rooms, e.g. '1-комн.'.
rooms_pipe = pipelines.morph_pipeline(['-комн.'])

# The integer in front of the marker is stored in flat.rooms.
room_count = INT.interpretation(flat_fact.rooms)
ROOMS = rule(room_count, rooms_pipe)

example_1 = 'text1. ann1. Арбатская м. 1-комн. кв-ру, или м.Кропоткинская 7 мин/п, Староконюшенный пер.'
example_2 = 'text2. ann2. Арбатская м. 1-комн. кв ру, 5 м/пеш., ул.Арбат дом 15/43;'

show_matches(ROOMS, example_1, example_2)

### Атрибут STREET ADDRESS

In [10]:
address_fact = fact('address', ['street', 'house'])

# Street-type and house markers.
street_type_pipe = pipelines.morph_pipeline(['ул', 'пер'])
house_pipe = pipelines.morph_pipeline(['д', 'дом'])

# Optional prefix in front of the street name, as in 'ул.Б.Грузинская'
# (presumably an abbreviated adjective -- confirm).
name_prefix = predicates.in_('БМ')

# The word that carries the street name.
street_word = or_(
    predicates.gram('ADJF'),
    predicates.gram('NOUN'),
    predicates.gram('Geox'),
)

# 'ул. <name>' form; the trailing optional noun covers 'ул. Бутырский вал'.
type_then_name = rule(
    street_type_pipe,
    DOT.optional(),
    name_prefix.optional(),
    DOT.optional(),
    street_word,
    predicates.gram('NOUN').optional(),
)

# '<name> пер.' form.
name_then_type = rule(
    name_prefix.optional(),
    DOT.optional(),
    street_word,
    street_type_pipe,
    DOT.optional(),
)

street = or_(type_then_name, name_then_type).interpretation(address_fact.street)

# Optional ', корп. N' tail of the house number.
building_part = rule(COMMA, predicates.eq('корп'), DOT, INT)

# NOTE(review): the slash and the second number are independently optional,
# so sequences like '15 43' or '3/' are accepted as well -- confirm intended.
house_number = rule(
    INT,
    SLASH.optional(),
    INT.optional(),
    building_part.optional(),
).interpretation(address_fact.house)

house = rule(
    COMMA.optional(),
    house_pipe,
    DOT.optional(),
    house_number,
)

ADDRESS = (
    rule(street, house.optional())
    .interpretation(address_fact)
    .interpretation(flat_fact.address)
)

example_1 = 'Арбатская м. 1-комн. кв-ру, или м.Кропоткинская 7 мин/п, Староконюшенный пер., д.30. 35/21 кв.м, кухня 10'
example_2 = 'Арбатская м. 1-комн. кв ру, 5 м/пеш., ул.Арбат дом 15/43; 53/20/13 кв.м,'
example_3 = '1-комн. кв-ру, ул.Б.Грузинская, д.14, 36/20.2 кв.м'
example_4 = 'Аптекарский пер., д.3/22, 30/16.2 кв.м'
example_5 = '5 мин/п., Токмаков пер., 28/16 кв.м'
example_6 = 'ул. Бутырский вал, д. 34,'
example_7 = 'Б.Тишинский пер., 22/15 кв.м'

show_matches(
    ADDRESS,
    example_1, example_2, example_3, example_4,
    example_5, example_6, example_7,
)

### Атрибут FLOOR AREA

In [11]:
floor_area_fact = fact(
    'floor_area',
    ['area_1', 'area_2', 'area_3', 'kitchen', 'recessed_balcony'],
)

# Square-metre marker that closes the slash-separated list of areas.
sq_m_pipe = pipelines.morph_pipeline(['кв.м', 'кв. м'])

# Optional ', кухня N' / '. кух. N' tail.
kitchen_part = rule(
    or_(COMMA, DOT),
    predicates.in_(['кухня', 'кух']),
    DOT.optional(),
    INT_OR_FLOAT.interpretation(floor_area_fact.kitchen),
)

# Optional ', лоджия N' tail.
balcony_part = rule(
    COMMA,
    predicates.eq('лоджия'),
    INT_OR_FLOAT.interpretation(floor_area_fact.recessed_balcony),
)

# Two or three slash-separated numbers followed by the square-metre marker,
# then the optional kitchen and balcony parts.
FLOOR_AREA = (
    rule(
        INT_OR_FLOAT.interpretation(floor_area_fact.area_1),
        SLASH,
        INT_OR_FLOAT.interpretation(floor_area_fact.area_2),
        SLASH.optional(),
        INT_OR_FLOAT.interpretation(floor_area_fact.area_3).optional(),
        sq_m_pipe,
        kitchen_part.optional(),
        balcony_part.optional(),
    )
    .interpretation(floor_area_fact)
    .interpretation(flat_fact.floor_area)
)

example_1 = 'Староконюшенный пер., д.30. 35/21 кв.м, кухня 10, 10/12 эт,'
example_2 = 'ул.Арбат дом 15/43; 53/20/13 кв.м, еврорем., 4/6-эт. кирп.'
example_3 = 'ул.М.Почтовая, 10; 44/21 кв.м, кухня 13.5, 2/8-эт. '
example_4 = '34/15 кв.м. кухня 7,'

show_matches(FLOOR_AREA, example_1, example_2, example_3, example_4)

### Атрибут FLOOR NUMBER

In [12]:
floor_fact = fact('floor', ['floor_num', 'floor_ttl'])

# Words accepted right after the 'N/M' pair. Besides the floor markers the
# list also contains 'пан' / 'кирп' / '-пан' (presumably building-material
# abbreviations found in the same position -- confirm).
floor_num_pipe = pipelines.morph_pipeline(
    ['эт', '-эт', 'эт.', 'этаж', 'пан', 'кирп', '-пан']
)

# '<floor>/<total floors>' followed by one of the markers above.
floor_pair = rule(
    INT.interpretation(floor_fact.floor_num),
    SLASH,
    INT.interpretation(floor_fact.floor_ttl),
    floor_num_pipe,
)

FLOOR = rule(
    floor_pair.interpretation(floor_fact).interpretation(flat_fact.floor)
)

example_1 = 'Староконюшенный пер., д.30. 35/21 кв.м, кухня 10, 10/12 эт, блоч. дома,'
example_2 = 'ул.Арбат дом 15/43; 53/20/13 кв.м, еврорем., 4/6-эт. кирп. дома,'

show_matches(FLOOR, example_1, example_2)

### Атрибут PRICE

In [13]:
# Currency marker that follows the amount.
money_pipe = pipelines.morph_pipeline(['тыс. $'])

# The whole 'amount + marker' span is stored in flat.price.
PRICE = rule(INT_OR_FLOAT, money_pipe).interpretation(flat_fact.price)

example_1 = 'продаю, 45 тыс.$. Торг Т 557-00-73'
example_2 = 'хозяин, 130 тыс.$. Торг. Т.762-55-67, С'

show_matches(PRICE, example_1, example_2)

### Атрибут TELEPHONE

In [14]:
# Marker that introduces a telephone number ('Т' or 'Т.').
tele_pipe = pipelines.morph_pipeline(['Т'])

# Three dash-separated integers, e.g. '557-00-73'.
phone_digits = rule(INT, DASH, INT, DASH, INT)

# Marker, optional dot, then one or more numbers stored in flat.telephone;
# the whole group may itself repeat.
TELE = rule(
    tele_pipe,
    DOT.optional(),
    phone_digits.repeatable().interpretation(flat_fact.telephone),
).repeatable()

example_1 = 'продаю, 45 тыс.$. Торг Т 557-00-73'
example_2 = 'хозяин, 130 тыс.$. Торг. Т.762-55-67, С'
example_3 = '27 тыс.$. Т.737-76-67, раб., Т.265-23-31, дом.'

show_matches(TELE, example_1, example_2, example_3)

### ADVERTISEMENT -- союз всех атрибутов; WRAPPER -- обертка поверх союза атрибутов

In [15]:
# Union of all attribute rules: any single attribute is a match. The tests
# below pass it to Extractor as the first-pass (union) rule.
AD = or_(METRO.repeatable(),
         ROOMS,
         ADDRESS,
         FLOOR_AREA,
         FLOOR,
         PRICE,
         TELE
        ).interpretation(flat_fact)

# Sequence rule applied to the joined first-pass fragments; it assembles the
# attributes into one `flat` fact. The leading METRO group is the only
# element that is not optional; METRO may appear again after ROOMS.
WRAPPER = rule(
    METRO.repeatable(),
    ROOMS.optional(),
    METRO.optional().repeatable(),
    ADDRESS.optional(),
    FLOOR_AREA.optional(),
    FLOOR.optional(),
    PRICE.optional(),
    TELE.optional().repeatable()
).interpretation(flat_fact)

### Тесты

In [16]:
# tests
def check_fact(actual, expected, where):
    """Assert that ``actual`` contains every key/value listed in ``expected``.

    Only the keys present in ``expected`` are checked; nested dicts are
    compared recursively. ``where`` is a label (e.g. 'TEXTS[3]') used to
    build assertion messages that name the failing ad and field.
    """
    for key, want in expected.items():
        label = '{}[{!r}]'.format(where, key)
        assert key in actual, '{} is missing'.format(label)
        got = actual[key]
        if isinstance(want, dict):
            check_fact(got, want, label)
        else:
            assert got == want, '{}: expected {!r}, got {!r}'.format(label, want, got)


# (index into TEXTS, expected subset of the extracted fact's JSON)
EXPECTED_FACTS = [
    (0, {
        'metro': ['Арбатская', 'Кропоткинская'],
        'rooms': '1',
        'address': {'street': 'Староконюшенный пер.', 'house': '30'},
        'floor_area': {'area_1': '35', 'area_2': '21'},
        'floor': {'floor_num': '10', 'floor_ttl': '12'},
        'price': '45 тыс.$',
        'telephone': ['557-00-73'],
    }),
    (1, {
        'metro': ['Арбатская'],
        'rooms': '1',
        'address': {'street': 'ул.Арбат', 'house': '15/43'},
        'floor_area': {'area_1': '53', 'area_2': '20', 'area_3': '13'},
        'floor': {'floor_num': '4', 'floor_ttl': '6'},
        'price': '130 тыс.$',
        'telephone': ['762-55-67'],
    }),
    (2, {
        'metro': ['Баррикадная'],
        'rooms': '1',
        'address': {'street': 'ул.Б.Грузинская', 'house': '14'},
        'floor_area': {'area_1': '36', 'area_2': '20.2', 'kitchen': '8.3'},
        'floor': {'floor_num': '2', 'floor_ttl': '14'},
        'telephone': ['962-30-63'],
    }),
    (9, {
        'metro': ['Бауманская'],
        'rooms': '1',
        'address': {'street': 'Токмаков пер.'},
        'floor_area': {'area_1': '28', 'area_2': '16', 'kitchen': '5.5'},
        'floor': {'floor_num': '6', 'floor_ttl': '7'},
        'price': '17тыс.$',
        'telephone': ['978-92-94'],
    }),
    (10, {
        'metro': ['Бауманская'],
        'rooms': '1',
        'address': {'street': 'ул. Б. Почтовая', 'house': '18/20, корп. 16'},
        'floor_area': {'area_1': '34', 'area_2': '15'},
        'floor': {'floor_num': '5', 'floor_ttl': '5'},
        'price': '23.5 тыс.$',
        'telephone': ['218-59-90'],
    }),
    (16, {
        'metro': ['Белорусская'],
        'rooms': '1',
        'address': {'street': 'ул.Нижняя', 'house': '5'},
        'floor_area': {'area_1': '31,3', 'area_2': '13,4', 'kitchen': '9,2'},
        'floor': {'floor_num': '3', 'floor_ttl': '5'},
        'price': '25 тыс.$',
        'telephone': ['318-94-49'],
    }),
    (17, {
        'metro': ['Белорусская'],
        'rooms': '1',
        'address': {'street': 'ул.Верхняя', 'house': '6'},
        'floor_area': {'area_1': '43', 'area_2': '20', 'kitchen': '13'},
        'floor': {'floor_num': '2', 'floor_ttl': '14'},
        'price': '41.5 тыс.$',
        'telephone': ['405-85-64'],
    }),
    (18, {
        'metro': ['Белорусская'],
        'rooms': '1',
        'address': {'street': 'ул. Бутырский вал', 'house': '34'},
        'floor_area': {'area_1': '36', 'area_2': '20'},
        'floor': {'floor_num': '12', 'floor_ttl': '12'},
        'price': '28 тыс.$',
        'telephone': ['299-96-51'],
    }),
    (19, {
        'metro': ['Белорусская'],
        'rooms': '1',
        'address': {'street': 'Б.Тишинский пер.'},
        'floor_area': {'area_1': '22', 'area_2': '15'},
        'floor': {'floor_num': '2', 'floor_ttl': '6'},
        'price': '18 тыс.$',
        'telephone': ['253-30-92'],
    }),
]

# The extractor holds no per-text state, so one instance serves every ad.
extractor = Extractor(AD, WRAPPER)
for index, expected in EXPECTED_FACTS:
    text = TEXTS[index]
    match = extractor(text)
    assert match.fact is not None, 'TEXTS[{}]: no fact extracted'.format(index)
    check_fact(match.fact.as_json, expected, 'TEXTS[{}]'.format(index))

# Show the fact extracted from the last checked ad.
print(match.fact)

flat(metro=['Белорусская'], rooms='1', address=address(street='Б.Тишинский пер.', house=None), floor_area=floor_area(area_1='22', area_2='15', area_3=None, kitchen=None, recessed_balcony=None), floor=floor(floor_num='2', floor_ttl='6'), price='18 тыс.$', telephone=['253-30-92'])


In [17]:
# Inspect one advertisement. TEXTS[14] is marked as a known problem case;
# TEXTS[15] was another candidate for inspection.
text = TEXTS[3]
extractor = Extractor(AD, WRAPPER)
match = extractor(text)
show_markup(text, match.spans)
if match.fact:
    fact_json = match.fact.as_json
    # Round-trip through JSON before printing the one-line form.
    print(json.loads(json.dumps(fact_json, ensure_ascii=False)))
    show_json(fact_json)

{'metro': ['Бауманская'], 'rooms': '1', 'address': {'street': 'Аптекарский пер.', 'house': '3/22'}, 'floor_area': {'area_1': '30', 'area_2': '16.2', 'kitchen': '8'}, 'price': '27 тыс.$', 'telephone': ['737-76-67', '265-23-31']}
{
  "metro": [
    "Бауманская"
  ],
  "rooms": "1",
  "address": {
    "street": "Аптекарский пер.",
    "house": "3/22"
  },
  "floor_area": {
    "area_1": "30",
    "area_2": "16.2",
    "kitchen": "8"
  },
  "price": "27 тыс.$",
  "telephone": [
    "737-76-67",
    "265-23-31"
  ]
}


In [18]:
# Collect the extracted fact of every ad; ads without a fact are skipped.
extractor = Extractor(AD, WRAPPER)
parsed_ads = []
for text in TEXTS:
    match = extractor(text)
    if not match.fact:
        continue
    serialized = json.dumps(match.fact.as_json, ensure_ascii=False)
    parsed_ads.append(json.loads(serialized))

json_data = {'flat_sale_ads': parsed_ads}

print(len(json_data['flat_sale_ads']))

18


In [19]:
# Save the collected ads as UTF-8 JSON with sorted keys and 4-space indent.
output_path = os.path.join('data', 'flats_data.json')
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(json_data, out_file, ensure_ascii=False, indent=4, sort_keys=True)