In [81]:
import os
import re

import numpy as np
import pandas as pd

from orv_cleanup_utils import * 
from extractor import NumberExtractor # Word-to-Number-Russian

In [82]:
# Lift pandas' display limits: None means no cap on the number of rows shown
# and no truncation of cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [88]:
# NOTE(review): absolute, machine-specific path; the trailing slash is kept in
# case other cells still concatenate onto data_dir.
data_dir = '/home/common/regulation_data/parsed_reports/'
main_df = pd.read_csv(os.path.join(data_dir, 'main_df.csv'))

# One-to-many tables: every CSV in data_dir except the main table. Entries that
# are not .csv files (checkpoint folders, editor backups, ...) are skipped so
# they can neither reach read_csv nor end up with a mangled key.
otm_tables_fnames = sorted(
    fn for fn in os.listdir(data_dir)
    if fn.endswith('.csv') and not fn.startswith('main')
)
# Each table is keyed by its file name without the extension.
otm_tables = {
    os.path.splitext(fn)[0]: pd.read_csv(os.path.join(data_dir, fn))
    for fn in otm_tables_fnames
}
otm_tables.keys()

dict_keys(['0_business', '1_business', '2_business', 'cancel_duties', 'expenses', 'goals', 'group_changes', 'group_expenses', 'groups', 'kpi', 'neccessary_measures', 'new_functions', 'notification_info', 'public_discussion', 'risks'])

## Очистка и нормализация заполнений

In [89]:
def is_empty(text):
    """Return True when `text` is blank or is exactly the string 'nan'.

    Carriage returns and newlines are removed and surrounding whitespace is
    stripped before the blank check.
    """
    remainder = text.replace('\r', '').replace('\n', '').strip()
    return not remainder or text in ('', 'nan')

def is_hyphens(text):
    """Return True when nothing but hyphens and surrounding whitespace is present."""
    return text.replace('-', '').strip() == ''

def is_underscores(text):
    """Return True when nothing but underscores and surrounding whitespace is present."""
    return text.replace('_', '').strip() == ''

def is_junk(text):
    """Return True when `text` is empty, hyphen-only or underscore-only filler."""
    junk_checks = (is_empty, is_hyphens, is_underscores)
    return any(check(text) for check in junk_checks)

def clean_up(text):
    """Normalize one raw form entry.

    The value is cast to str, stripped, lower-cased and freed of the form's
    placeholder phrase. Junk entries yield the integer 0 (not a string);
    anything else is returned as a string without trailing dots, with
    guillemets turned into straight quotes and tabs / CRLF pairs removed.
    """
    text = str(text).strip().lower()
    text = text.replace('(место для текстового описания)', '')
    if is_junk(text):
        return 0
    text = text.rstrip('.')
    for guillemet in ('«', '»'):
        text = text.replace(guillemet, '"')
    # Only tabs and the two-character sequence '\r\n' are dropped here.
    for removable in ('\t', '\r\n'):
        text = text.replace(removable, '')
    return text

In [None]:
from rnnmorph.predictor import RNNMorphPredictor
# Module-level predictor configured for Russian; tag() reads this global, so
# this cell has to run before any tagging or lemmatization cell.
predictor = RNNMorphPredictor(language="ru")

In [90]:
def tokenize(sentence, split_punct=False):
    """Split a sentence into whitespace-separated tokens.

    The original implementation ran a regex that matches the zero-width
    positions around ``. , ! ? ( )`` but substituted an empty string, so it
    never changed the input. That effective behaviour (a plain whitespace
    split with punctuation left attached to words) is kept as the default,
    because later cleaning steps are tuned to it.

    Args:
        sentence: text to split.
        split_punct: when True, a space is inserted around each of
            ``. , ! ? ( )`` first, so punctuation marks become tokens of
            their own.

    Returns:
        list of str tokens.
    """
    if split_punct:
        # Same zero-width pattern as before, now with a real replacement.
        sentence = re.sub('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', ' ', sentence)
    return sentence.split()


def tag(sentence):
    """Tokenize `sentence` and return the global predictor's output for its tokens."""
    return predictor.predict(tokenize(sentence))


def preparse(sentence):
    """Render a sentence as one flat string of lemmas and morphological tags.

    For every tagged word the output contains, separated by single spaces:
    the lemma (``normal_form``), ``pos=<POS>`` (left out for punctuation) and
    the feature tag with ``|`` turned into spaces and underscores removed.
    A ``|Degree=...`` feature is dropped from every tag; a ``|Gender=...``
    feature is dropped from adjectives only.

    Args:
        sentence: text to analyse.

    Returns:
        str with the space-joined description of all words.
    """
    # Raw literals: the previous non-raw strings contained '\|' and '\w',
    # which Python reports as invalid escape sequences. The compiled
    # patterns are unchanged.
    degree_re = re.compile(r'(.*)\|Degree=\w+(.*)')
    gender_re = re.compile(r'(.*)\|Gender=\w+(.*)')

    pieces = []
    for word in tag(sentence):
        word_tag = word.tag
        if 'Degree' in word_tag:
            word_tag = degree_re.sub(r"\1\2", word_tag)

        word_pos = 'pos=' + word.pos
        if 'PUNCT' in word_pos:
            word_pos = ''

        if 'ADJ' in word_pos and 'Gender' in word_tag:
            word_tag = gender_re.sub(r"\1\2", word_tag)

        parts = (
            word.normal_form,
            word_pos,
            word_tag.replace('|', ' ').replace('_', ''),
        )
        # Empty parts are skipped; the loop variable no longer shadows tag().
        pieces.append(' '.join(part for part in parts if part))

    return ' '.join(pieces).strip().replace('  ', ' ')

def norm(sentence):
    """Return the space-joined lemmas of `sentence`, or None when it is None."""
    if sentence is None:
        return None
    lemmas = [word.normal_form for word in tag(sentence)]
    return ' '.join(lemmas)

In [92]:
groups = otm_tables['groups']
# Free-text estimates of the number of affected participants, cleaned per entry.
groups_count = groups['Оценка количества участников отношений'].apply(clean_up)

# Every entry is cast to str before lemmatization.
groups_count_norm = [norm(str(entry)) for entry in groups_count]

## Вычленение оценок количества участников отношений

Для этого убираем из заполнений все прочие цифры.

In [93]:
def remove_trash(s):
    """Remove or rewrite hand-picked fragments whose digits are not participant counts.

    All substitutions are applied strictly in the order listed, because later
    ones operate on the output of earlier ones. List markers such as ``1) ``
    or `` 2. `` are turned into ``-``.
    """
    literal_fixes = (
        ('(72)', '72'), ('№ 839)', ''), ('(10)', '10'), ('футбол 2018', ''), ('грэс-2', ''),
        (', 2014,', ''), ('01.01.2013)', '01.01.2013'), ('– 4181', '4181'), ('теле2', 'теле'),
        ('1 свыше', ''), ('2 свыше', ''), ('3 свыше', ''), ('сотня тысяча', ''),
        ('1.свысоко', ''), ('2.свысоко', ''), ('3.свыше', ''), ('не имеется1', ''),
        ('1. 42. 853', ''), ('1. 22. 13.', ''), ('11111111неограниченный количество', ''),
        ('1)12)13)14)', ''), ('5)16)857)', ''), ('1.дши', ''), ('; 2.', ''), ('2098-р', ''),
        ('3,5 до 12 тонн', ''), ('3,5 до 12 тонна', ''), ('14,2 млн. тонна', ''), ('более 12 т ', ''),
        (' 6. ', '-'), (' 5. ', '-'), (' 4. ', '-'), (' 3. ', '-'), (' 2. ', '-'), ('1. ', '-'), (';3.', '-'),
    )
    for old, new in literal_fixes:
        s = s.replace(old, new)

    s = re.sub(r'\d*\s+обращение', '', s)
    for fragment in ('в один группа', 'которы6', '11 статья', 'лиц1'):
        s = s.replace(fragment, '')

    s = re.sub(r'^2\.\s+', '-', s)
    # Markers "1) " .. "12) " are handled one number at a time, in ascending
    # order; the single-digit passes run first, so "11) " has already become
    # "1-" by the time the two-digit patterns are tried.
    for marker in range(1, 13):
        s = re.sub(r'%d\)\s+' % marker, '-', s)
    return s

def remove_symb (s):
    """Strip digits that belong to legal references, percentages and other non-count contexts.

    Works on a single string and returns the rewritten string. The steps are
    order-dependent: each substitution sees the result of the previous ones.
    """
    s = s.replace('пунтка', 'пункт')
    # '№' becomes the Latin marker "no" so the regex below can find document numbers.
    s = s.replace('№',"no")
    s = s.replace('ст.','статья')
    # NOTE(review): the third alternative is written `d{4}` (no backslash), so it
    # matches four literal 'd' characters, not a four-digit year - confirm intent.
    s = re.sub(r'(no\s+\d*)|(no\d*)|(d{4}\,\s+no)', '', s)
    s = re.sub(r'(статья\s+\d*\.\d*)|(\bстатья\b\s+\d*)', '', s)
    # NOTE(review): in `\d.\d` the dot is unescaped and matches any character.
    s = re.sub(r'(пункт\s+\d.\d)|(пункт\s+\d*)', '', s)
    # NOTE(review): `||` introduces an empty alternative that always matches, so
    # the last group `(п\.\s+\d*\.\d*)` can never be reached.
    s = re.sub(r'(п\.\s+\d.\d)|(п\s+\d*)|(п\.\d*\.\d*)||(п\.\s+\d*\.\d*)', '', s)
    s = re.sub(r'\bчасть\b\s+\d*', '', s)
    s = re.sub(r'\bfifa\b\s+\d*', '', s)
    # A number followed by the standalone letter 'т' is dropped.
    s = re.sub(r'\d+\s*\bт\b', '', s)
    s = s.replace('масса более 12', '')
    # Glue '%' to the preceding number so the percentage patterns below can match.
    s = s.replace(' %', '%').replace('57-фз', '')
    s = re.sub(r'(\d*\,\d*%)|(\d*\.\d*%)', '', s)
    s = re.sub(r'\d*%', '', s).replace('4 и 5 категория', '').replace('46 лесной', '')
    # Hand-picked literal fragments observed in the data.
    s = s.replace('1 класс','').replace('2 класс','').replace('2 класса','').replace('30 стадо', '')
    s = s.replace('128,4 млн. га','').replace('111,1 млн. га','').replace('17,3 млн. га','')
    s = s.replace('197 млн обращение', '').replace(' от 0 до 18', '').replace('тс 001/2011', '')
    # Numbers spelled out in words (including a misspelled one) are replaced by digits.
    s = s.replace('1 (один)','1').replace('20 (двадцать)','').replace('два тысячвосемьдесить пять','2085')
    s = s.replace('шесть с половина тысяча','6500')
    return (s)

def remove_time(s):
    """Strip dates, years and durations so their digits are not taken for counts.

    Month names are first rewritten to ``.MM`` markers; afterwards a fixed
    sequence of patterns removes date-like and year-like fragments. The order
    of the substitutions is significant.
    """
    s = s.replace('года', 'г.').replace('ть 2014', '')

    month_names = (
        'январь', 'февраль', 'март', 'апрель', 'май', 'июнь',
        'июль', 'август', 'сентябрь', 'октябрь', 'ноябрь', 'декабрь',
    )
    for month_no, month_name in enumerate(month_names, start=1):
        s = s.replace(month_name, '.%02d' % month_no)

    date_patterns = (
        r'\.\d{2}\s+\d{4}',
        r'\d{4}-\d{4}\s+гг',
        r'(\d{2})\.(\d{2})\.(\d{4})',
        r'\d+\s*\bгод\b',
        r'от\s+\d*\s+\d*\s+год',
        r'к\s+\d*\s+\году',
        r'в\s+\d*\s+\году',
        r'с\s+\d*\s+по\s+\d*',
        r'\d+\s*\bмесяц\b',
    )
    for pattern in date_patterns:
        s = re.sub(pattern, '', s)

    # A day number standing in front of a ".MM" month marker.
    for month_no in range(1, 13):
        s = re.sub(r'\d+\s+\.%02d' % month_no, '', s)

    for pattern in (r'(\d+\s*г\.)|(\d+\s*г\s)', r'\d+\s*(\bг\b\.)\s+'):
        s = re.sub(pattern, '', s)
    return s.replace('в год', '')

def let_converter (s):
    """Expand magnitude words (трлн, млрд, млн, тыс) into the matching run of zeros.

    The magnitude word is first glued to the preceding token, then each
    space-separated word is processed on its own. When the number has a decimal
    part, fewer zeros are appended (one fewer per fractional digit); the decimal
    separator itself is left in place by this function.

    Returns the processed words, each followed by a single space (so the result
    ends with a space).
    """
    new_text = ''
    # NOTE(review): the second replace cannot match - 'миллиона' has already been
    # turned into 'млна' by the first one.
    s = s.replace('миллион', 'млн').replace('миллиона', 'млн')
    s = s.replace(' млрд', 'млрд')
    s = s.replace(' млн', 'млн')
    s = s.replace(' тысяча','тыс').replace(' тысяч','тыс').replace(' тыс', 'тыс')
    for word in s.split(' '):
        word = word.replace('трлн','000000000000')
        # Billions: 9 zeros, minus one per digit after the decimal separator.
        if re.search(r'\d\b[\,\.]\b(\d)млрд', word):
            word = word.replace('млрд','00000000')
        elif re.search(r'\d\b[\,\.]\b(\d\d)млрд', word):
            word = word.replace('млрд','0000000')
        elif re.search(r'\d\b[\,\.]\b(\d\d\d)млрд', word):
            word = word.replace('млрд','000000')
        # NOTE(review): this branch tests and rewrites the whole string `s`, not
        # `word`. The loop iterates over a list built before the loop started, so
        # the assignment does not reach the output; when the condition holds, the
        # `else` below is skipped and 'млрд' stays in `word` unexpanded.
        elif re.search(r'(0)[\,\.]\d\d\bмлрд\b', s):
            s = s.replace('млрд','0000000')
        else:
            word = word.replace('млрд','000000000')
        # Millions: 6 zeros, minus one per digit after the decimal separator.
        if re.search(r'\d\b[\,\.]\b(\d)млн', word):
            word = word.replace('млн','00000')
        elif re.search(r'\d\b[\,\.]\b(\d\d)млн', word):
            word = word.replace('млн','0000')
        elif re.search(r'\d\b[\,\.]\b(\d\d\d)млн', word):
            word = word.replace('млн','000')
        # NOTE(review): this pattern needs whitespace inside `word`, but words come
        # from split(' ') and so contain no space characters - likely unreachable.
        elif re.search(r'\d[\,\.](\s)\d{2}\s+млн', word):
            word = word.replace('млн','0000') 
        else:
            word = word.replace('млн','000000')
        # Ranges such as '5-10тыс': the hyphen itself is replaced by '000', which
        # glues both bounds into a single digit string ('500010000').
        if re.search(r'(\d*-\d*тыс)|(\d*-\d*\s+тыс)', word):
            word = word.replace('тыс','000').replace('-', '000')
        # Thousands: 3 zeros, minus one per digit after the decimal separator.
        if re.search(r'\d\b[\,\.]\b(\d)тыс', word):
            word = word.replace('тыс','00')
        elif re.search(r'\d\b[\,\.]\b(\d\d)тыс', word):
            word = word.replace('тыс','0')
        else:
            word = word.replace('тыс','000')
        new_text += word + ' '
    return (new_text)

def has_numb(inputString):
    """Return True when the string contains at least one digit."""
    return re.search(r'\d', inputString) is not None

def numb_to_words(s):
    """Pass `s` through a fresh NumberExtractor's replace() and return the result as str."""
    return str(NumberExtractor().replace(s))

def use_num_to_word(s):
    """Run numb_to_words on strings that contain no digits; return others unchanged."""
    if has_numb(s):
        return s
    return numb_to_words(s)

def strip_spaces(s):
    """Return `s` with every run of whitespace removed."""
    chunks = s.split()
    return "".join(chunks)

def remove_dot_com(s):
    """Delete commas and dots, then join digits that are separated only by whitespace."""
    without_separators = re.sub("[,.]", '', s)
    return re.sub(r'(\d)\s+(\d)', r'\1\2', without_separators)

def extract_num(s):
    """Return all digit runs found in `s`, joined by single spaces."""
    return " ".join(re.findall(r'\d+', s))

In [94]:
# Normalization stages, applied to every entry in exactly this order.
# NOTE(review): strip_spaces runs before extract_num, so numbers that were
# separated only by whitespace reach extract_num as one digit string.
cleaning_steps = (
    remove_trash,
    remove_symb,
    remove_time,
    let_converter,
    use_num_to_word,
    strip_spaces,
    remove_dot_com,
    extract_num,
)

gr_norm = pd.Series(groups_count_norm)
for cleaning_step in cleaning_steps:
    gr_norm = gr_norm.apply(cleaning_step)

In [95]:
# исправляем обнаруженные косяки обработки и убираем числа, не позволяющие напрямую оценить количество участников отношений
# (напр. количество заявлений, сообщений, аукционов и т.д.) 

def adjust_groups(s):
    """Apply hand-written corrections to one extracted digit string.

    Each rule is a full-string pattern with its replacement: an empty
    replacement discards the entry, any other replacement re-splits or trims
    the numbers. Rules are applied one after another in the listed order.
    """
    corrections = (
        (r'^2114$', ''),
        (r'^182 92 2114 44$', ''),
        (r'^4600 11600 6200 3300 18800 10800 3100 9500 9100$', ''),
        (r'^1074 216 858 1 998 835 163 2032 1792 1273 1757 1702 1119$', ''),
        (r'^1000000700000$', '1700000'),
        (r'^469000056500003 146000000$', '4690000 5650000 146000000'),
        (r'^46900005650000146000000$', '4690000 5650000 146000000'),
        (r'^14500037103000$', '145000 37103000'),
        (r'^7 33000000000$', '33000000000'),
        (r'^7000080000$', '70000 80000'),
        (r'^5500060000$', '55000 60000'),
        (r'^000$', '0'),
        (r'^000000$', '0'),
        (r'^44000700$', '44000 700'),
        (r'^13 260000 85 6000085$', '13 260000 85 60000 85'),
        (r'^1082019 118137875$', '118137875'),
        (r'^109213 1709 20 20$', '109213'),
        (r'^55 1200000 300 200$', ''),
        (r'^1 28$', '1'),
        (r'^700010000$', '10000'),
        (r'^2000 150 5672460 1$', '2000 150 567 2460'),
        (r'^1 247$', '247'),
        (r'^1 2000$', '2000'),
        (r'^69$', '61 69'),
        (r'^925 3589 375 1826$', '375 1826'),
        (r'^1 36000 2$', '36000'),
        (r'^92$', ''),
        (r'^10 11$', ''),
        (r'^3500000 2 24$', '3500000 24'),
        (r'^0420 123600 26823 63740 487 12186$', '23600 26823 63740 487 12186'),
        (r'^1052016 646 85$', '646 85'),
        (r'^447 715 1 87 85 79 104 108 96 12 3 30$', ''),
        (r'^31670$', '316 70'),
        (r'^2 30 25 3 4$', '30 25'),
        (r'^14000000 40 5600000$', '14000000 5600000'),
        (r'^1 2 3 4 5 6 7 8 9 10 1 12 13 14 15 16 17$', ''),
        (r'^31670$', '316 70'),  # repeated rule, kept as in the original sequence
        (r'^110 120$', ''),
        (r'^55 1200000 300 200$', ''),  # repeated rule, kept as in the original sequence
        (r'^182$', '1 82'),
        (r'^1 3949928$', '3949928'),
        (r'^4171 3507 586 2049 1351 617 679 378 235$', ''),
        (r'^39 13 8 5 20 9 11 6 3 3$', ''),
        (r'^33998$', '3998'),
        (r'^400 300000$', '400000'),
        (r'^2 58000$', '58000'),
        (r'^200 300000 400$', '200'),
        (r'^616 5501110$', '616 5 501 110'),
        (r'^816943350392712132371000371000875984364684213392712463235558138$', '816943 35039 2 7121 32 371000 371000 8759 8 4364 68 42133 927 124632 35558 138'),
        (r'^434 2 2$', '434 2'),
        (r'^2000030000$', '30000'),
        (r'^2000025001500170$', '20000 2500 1500170'),
        (r'^2000025000$', '25000'),
        (r'^2016 1000$', '1000'),
        (r'^653627290315$', '653627 290315'),
        (r'^25001500$', '2500 1500'),
        (r'^46900005650000$', '4690000 5650000'),
        (r'^9 894$', ''),
        (r'^2946812 2018$', '2946812'),
        (r'^7092015 9$', '9'),
        (r'^4 7$', ''),
        (r'^2500026000 20000$', '26000 20000'),
        (r'^10734445621162359893$', ''),  # ?
    )
    for pattern, replacement in corrections:
        s = re.sub(pattern, replacement, s)
    return s

In [96]:
groups_count = pd.DataFrame({'id': groups['id'], 'groups_count': gr_norm})
groups_count['groups_count'] = groups_count['groups_count'].apply(adjust_groups)

# Manual overrides addressed by row position.
# NOTE(review): these positions depend on the row order of the source table.
count_col = groups_count.columns.get_loc('groups_count')
for row_pos, corrected in ((4959, '3 70'), (15070, '15 3')):
    groups_count.iloc[row_pos, count_col] = corrected

In [97]:
# Long format: split each space-separated digit string and give every number
# its own row (the id is repeated for entries holding several numbers).
groups_count_otm = (
    groups_count
    .assign(groups_count=groups_count['groups_count'].str.split(' '))
    .explode('groups_count')
)
# Empty strings and the literal '0' are treated as missing. np.nan is used
# because the np.NaN alias no longer exists in NumPy 2.0.
# The detour through float is needed to parse the digit strings before the
# cast to the nullable integer dtype.
groups_count_otm['groups_count'] = (
    groups_count_otm['groups_count']
    .replace('', np.nan)
    .replace('0', np.nan)
    .astype('float')
    .astype('Int64')
)
groups_count_otm = groups_count_otm.rename(
    columns={"groups_count": "Оценка количества участников отношений (число)"}
)

In [98]:
# Number of rows in the long-format table.
len(groups_count_otm)

17050

In [101]:
# Summary statistics of the numeric estimates; describe() counts only non-missing values.
groups_count_otm['Оценка количества участников отношений (число)'].describe()

count         5441.00000
mean       3823677.60099
std       74124682.52031
min              0.00000
25%             10.00000
50%            214.00000
75%           9243.00000
max     3300000000.00000
Name: Оценка количества участников отношений (число), dtype: float64

In [100]:
# Preview of the first 20 rows of the long-format table.
groups_count_otm.head(20)

Unnamed: 0,id,Оценка количества участников отношений (число)
0,02/04/11-20/00110705,
1,02/04/11-20/00110705,
2,02/08/02-17/00062320,
3,02/08/02-17/00062320,
4,02/07/07-17/00068049,
5,02/07/07-17/00068049,
6,02/04/01-18/00077703,
7,02/04/01-18/00077703,
8,02/08/07-17/00068406,1.0
9,02/08/07-17/00068406,1.0
