In [18]:
import re

from pymongo import MongoClient
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

pattern_price = re.compile(r'(\d+\s?грн[.]?|гривен[ь]|гривен[ь])', re.I)
string_start_pattern = re.compile(r'[ .]?[A-ZА-Я]')
abbreviation_pattern_en = re.compile(r'\s?(hz|khz|mhz|ghz|kb|mb|gb|tb|\d+x\d+)+\s?', re.I)
abbreviation_pattern_ru = re.compile(r'\s?(гц|кгц|мгц|ггц|кб|мб|гб|тб)+\s?', re.I)

mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

client = MongoClient()
db = client.Overbot
collection = db.article

In [14]:
def split_text(content):
    ending_br = []
    for match in re.finditer(pattern_price, content):
        ending_br.append((match.span(), match.group()))

    start = 0
    sentences = []
    ending_br = [_ for _ in reversed(ending_br)]
    while ending_br:
        indx, price = ending_br.pop()

        article = content[start:indx[1]-(indx[1]-indx[0])]
        if article.startswith(' ') or not start:
            sentences.append((article, price))
        else:
            line_start = [(m.start(0), m.end(0)) for m in re.finditer(string_start_pattern, article)]
            if line_start:
                line_start=line_start[-1][1]
                article = content[line_start:indx[1]-(indx[1]-indx[0])]
            else:
                # another action needed
                sentences.append((article, price))  
        start=indx[1]
    return sentences


def preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords\
              and token != " " \
              and token.strip() not in punctuation]
    
    text = " ".join(tokens)
    
    return text


def replace_analog_text(text, dictionary):
    for k, v in dictionary.items():
        text = text.replace(k, v)
    return text

In [20]:
word_analogs_dict = {
    'озу': 'ram',
    'ддр': 'ddr',
    'ддр1': 'ddr',
    'ддр2': 'ddr2',
    'ддр3': 'ddr3',
    'ддр4': 'ddr4',
    'ддр5': 'ddr5',
    'ддр6': 'ddr6',
    'винчестер': 'hdd',
    'винт': 'hdd',
    'процессор': 'cpu',
    'цпу': 'cpu',
    'видяха': 'gpu',
    'видик': 'gpu',
    'видеокарта': 'gpu',
    'бокс': 'box',
    'трей': 'tray',
    'амд': 'amd',
    'интел': 'intel',
    'пень': 'pentium',
    'целерон': 'celeron',
    'кор': 'core',
    'аслон': 'athlon',
    'райзен': 'ryzen',
    'ризен': 'ryzen',
    'вт': 'w',
    'ват': 'w',
    'ватт': 'w,'
}

In [19]:
doc = collection.find_one({'article_id': 208517})
content = doc.get('post_content')

# split post content into sentences
sentences = split_text(content)

# tokenize sentences
sentences = [(replace_analog_text(preprocess_text(sentences[i][0]), word_analogs_dict), sentences[i][1]) for i in range(len(sentences))]
sentences = [(re.sub(abbreviation_pattern_en, r'\1', sentence[0]), sentence[1]) for sentence in sentences]
sentences = [(re.sub(abbreviation_pattern_ru, r'\1', sentence[0]), sentence[1]) for sentence in sentences]
sentences

[('материнский плата am3 asus m4a77td cpu amd phenom x3 720 анлок phenom b20 x4 box',
  '1300 грн'),
 ('материнский плата am2 biostar mcp6pb m2 cpu amd athlon x2 245 box ddr2 1gb',
  '800 грн'),
 ('ram ddr3 corsair 1333mhz4gb', '500 грн'),
 ('ram ddr3 amd 1600mhz4gb', '500 грн'),
 ('ram ddr3 hynix 1333mhzgb', '500 грн'),
 ('ram ddr3 1333mhz2gb', '250 грн'),
 ('gpu powercolor 7770 1gb', '1000 грн'),
 ('gpu gtx 470', '900 грн'),
 ('gpu gtx 650 msi 1gb', '900 грн'),
 ('gpu asus 6850 1gb', '850 грн'),
 ('hdd wd green 750gb', '550 грн'),
 ('hdd wd blue 500gb', '450 грн'),
 ('hdd wd2500ks 250gb', '250 грн'),
 ('блок питание goldenfield atx sh700e модульный кулер синий подсветка пломба весь модульный кабель присутствовать',
  '800 грн'),
 ('блок питание fsp 450pnr 450w', '400 грн'),
 ('coreпус aerocool pgs vx r black картридер привод двд рв хороший состояние',
  '600 грн'),
 ('монитор samsung 19 933hd встраивать тюнер', '1100грн'),
 ('монитор 19 benq g925hda хороший состояние', '900 грн')]