In [1]:
import os
import re

import pandas as pd
# The star import is kept ahead of numpy so the explicit `np` binding below
# still takes precedence over anything the helper module exports.
from orv_cleanup_utils import *
import numpy as np

In [3]:
# Lift pandas' display limits: show every row and never truncate cell text.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
# Directory holding the parsed report tables.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_dir = '/home/common/regulation_data/parsed_reports/'
main_df = pd.read_csv(os.path.join(data_dir, 'main_df.csv'))

# Every CSV that is not the main table is a one-to-many table; only `.csv`
# files are picked up so that stray files in the directory are ignored.
otm_tables_fnames = sorted(
    fn for fn in os.listdir(data_dir)
    if fn.endswith('.csv') and not fn.startswith('main')
)
# Key each table by its file name without the extension.
otm_tables = {
    os.path.splitext(fn)[0]: pd.read_csv(os.path.join(data_dir, fn))
    for fn in otm_tables_fnames
}
otm_tables.keys()

dict_keys(['0_business', '1_business', '2_business', 'cancel_duties', 'expenses', 'goals', 'group_changes', 'group_expenses', 'groups', 'kpi', 'neccessary_measures', 'new_functions', 'notification_info', 'public_discussion', 'risks'])

In [5]:
def is_empty(text):
    """Tell whether ``text`` carries no content.

    Returns True for the empty string, for the literal string ``'nan'``,
    and for any string that is blank once carriage returns and newlines
    are removed and surrounding whitespace is stripped.
    """
    if text in ('', 'nan'):
        return True
    without_breaks = text.replace('\r', '').replace('\n', '')
    return not without_breaks.strip()

def is_hyphens(text):
    """Return True when nothing but hyphens and whitespace is present.

    All ``'-'`` characters are removed first; the string qualifies if the
    remainder strips down to the empty string (so ``''`` qualifies too).
    """
    remainder = text.replace('-', '')
    return not remainder.strip()


def is_underscores(text):
    """Return True when nothing but underscores and whitespace is present.

    All ``'_'`` characters are removed first; the string qualifies if the
    remainder strips down to the empty string (so ``''`` qualifies too).
    """
    remainder = text.replace('_', '')
    return not remainder.strip()


def is_junk(text):
    """Return True if ``text`` is empty, only hyphens, or only underscores.

    The three checks run in that order and stop at the first one that holds.
    """
    junk_checks = (is_empty, is_hyphens, is_underscores)
    return any(check(text) for check in junk_checks)


# Janitor entry point
def clean_up(text):
    """Normalise one raw cell value.

    The value is converted to ``str``, stripped, lower-cased and the form
    placeholder '(место для текстового описания)' is removed.

    Returns:
        The integer ``0`` when the remaining text is junk (see ``is_junk``);
        otherwise the text with guillemets turned into double quotes and
        tabs and CRLF sequences removed.
    """
    cleaned = str(text).strip().lower()
    cleaned = cleaned.replace('(место для текстового описания)', '')
    if is_junk(cleaned):
        return 0

    # Stripping a trailing '.' is intentionally not done here.
    # Substitutions are applied in this exact order.
    substitutions = (('«', '"'), ('»', '"'), ('\t', ''), ('\r\n', ''))
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [6]:
# Short aliases for the two expense-related tables used in the rest of the notebook.
expenses, group_expenses = otm_tables['expenses'], otm_tables['group_expenses']

## Описание и оценка видов расходов (доходов)

In [7]:
# Cleaned copy of the free-text expense/revenue description column.
# NOTE(review): a later cell rebuilds `exprev_est` from the raw column, so this
# cleaned series is not what the extraction pipeline ends up consuming.
exprev_est = group_expenses['Описание и оценка видов расходов (доходов)'].apply(clean_up)

In [9]:
def rouble_norm(s):
    """Normalise rouble notation in ``s`` to the short marker 'р.'.

    Steps, in order:
      1. non-breaking spaces become ordinary spaces;
      2. 'рублей', 'руб.' and 'руб' become 'р.';
      3. five regex rewrites replace a comma-led fragment that sits right
         before the rouble marker (typically a two-digit kopeck part) with
         'р.';
      4. 'миллиона' becomes 'млн'.

    Returns the rewritten string.
    """
    literal_rewrites = (
        (u'\xa0', u' '),
        ('рублей', 'р.'),
        ('руб.', 'р.'),
        ('руб', 'р.'),
    )
    for old, new in literal_rewrites:
        s = s.replace(old, new)

    # Inside a character class `\b` is a backspace, not a word boundary, so
    # `[\bр\b]` matches either 'р' or a backspace character.
    marker_patterns = (
        r'\d+\,(\bр\b)\.',
        r'\,+\d{2}[\bр\b]\.',
        r'\,\s\d{2}\s\bр\b',
        r'\,\d{2}\s[\bр\b]\.',
        r'\,\d{2}\s\s\bр\b\.',
    )
    for pattern in marker_patterns:
        s = re.sub(pattern, 'р.', s)

    return s.replace('миллиона', 'млн')

def let_to_num(s):
    """Replace a few spelled-out Russian numerals with digits.

    'пятидесяти' -> '50', 'десяти' -> '10', 'трехсот' -> '300' (in that
    order, so the longer word is handled before its suffix). 'одного' is
    turned into '1' only when the string contains it as a whole word
    followed by whitespace and a digit. Finally the list markers '2)' and
    '3)' are removed wherever they occur.
    """
    for word, digits in (('пятидесяти', '50'), ('десяти', '10'), ('трехсот', '300')):
        s = s.replace(word, digits)

    followed_by_digit = re.search(r'\bодного\b\s+\d', s) is not None
    if followed_by_digit:
        s = s.replace('одного', '1')

    for marker in ('2)', '3)'):
        s = s.replace(marker, '')
    return s

def remove_range(s):
    """Drop every run of digits that is immediately followed by a hyphen.

    The hyphen is removed together with the digits, so '10-20' becomes '20'.
    """
    leading_bound = re.compile(r'\d+\-')
    return leading_bound.sub('', s)

def remove_dot_com(s):
    """Strip commas and dots, then glue digits separated only by whitespace.

    Matches of the gluing pattern do not overlap, so in a run of
    single-digit groups only every other gap is closed
    ('1 2 3' -> '12 3'); groups of two or more digits merge fully
    ('1 000 000' -> '1000000').
    """
    without_punct = s.replace(',', '').replace('.', '')
    return re.sub(r'(\d)\s+(\d)', r'\1\2', without_punct)

def extract_num(s):
    """Pull out the digit runs that are directly followed by the rouble marker.

    ``s`` is converted to ``str`` and every ' р' is tightened to 'р'. Each
    digit run followed by 'р' is collected (the character class also admits
    a backspace, because `\\b` inside `[]` means backspace), the matches are
    concatenated, and the final character is dropped.

    Returns:
        A string such as '100р200' for two amounts, '100' for one, or ''
        when nothing matches.
    """
    compact = str(s).replace(' р', 'р')
    amounts = re.findall(r'\d+[\bр\b]', compact)
    return "".join(amounts)[:-1]

def let_converter(s):
    """Expand magnitude words (трлн, млрд, млн, тыс) into runs of zeros.

    A space in front of 'млрд', 'млн', 'тысяча', 'тысяч' or 'тыс' is removed
    first (the 'тысяч*' forms are shortened to 'тыс'), so the unit sticks to
    the preceding number. The string is then processed word by word, split
    on single spaces:

    * 'трлн' always becomes twelve zeros;
    * for 'млрд', 'млн' and 'тыс' the number of zeros is reduced by the
      number of fractional digits found right before the unit (for example
      '1,5млрд' gets eight zeros instead of nine), so that removing the
      decimal separator afterwards yields the full amount. 'тыс' only
      recognises one or two fractional digits.

    Args:
        s: text to convert.

    Returns:
        The converted words, each followed by a single space (so the result
        always ends with a space).
    """
    # (unit, ((pattern, zeros to append), ...), zeros when no pattern matches).
    # Patterns are tried in order; the first one found in the word wins.
    unit_rules = (
        ('млрд',
         ((r'\d\b[\,\.]\b(\d)млрд', 8),
          (r'\d\b[\,\.]\b(\d\d)млрд', 7),
          (r'\d\b[\,\.]\b(\d\d\d)млрд', 6)),
         9),
        ('млн',
         ((r'\d\b[\,\.]\b(\d)млн', 5),
          (r'\d\b[\,\.]\b(\d\d)млн', 4),
          (r'\d\b[\,\.]\b(\d\d\d)млн', 3),
          # Two fractional digits set off by non-space whitespace (tab/newline).
          (r'\d[\,\.](\s)\d{2}\s+млн', 4)),
         6),
        ('тыс',
         ((r'\d\b[\,\.]\b(\d)тыс', 2),
          (r'\d\b[\,\.]\b(\d\d)тыс', 1)),
         3),
    )

    s = s.replace(' млрд', 'млрд')
    s = s.replace(' млн', 'млн')
    s = s.replace(' тысяча', 'тыс').replace(' тысяч', 'тыс').replace(' тыс', 'тыс')

    converted = []
    for word in s.split(' '):
        word = word.replace('трлн', '0' * 12)
        # Units are handled in a fixed order; each step sees the previous result.
        for unit, patterns, default_zeros in unit_rules:
            zeros = next(
                (count for pattern, count in patterns if re.search(pattern, word)),
                default_zeros,
            )
            word = word.replace(unit, '0' * zeros)
        converted.append(word)

    # str.split always yields at least one item, so there is always a trailing space.
    return ' '.join(converted) + ' '

In [10]:
# Extract rouble amounts from the free-text expense/revenue descriptions.
# NOTE(review): unlike the other three columns, this one starts from the raw
# column and is not passed through clean_up (so it is not lower-cased) — confirm
# that this is intended.
exprev_est = (
    group_expenses['Описание и оценка видов расходов (доходов)']
    .astype(str)
    .apply(rouble_norm)      # unify rouble notation to 'р.'
    .apply(let_converter)    # magnitude words -> zeros
    .apply(let_to_num)       # spelled-out numerals -> digits
    .apply(remove_dot_com)   # drop separators, glue digit groups
    .apply(extract_num)      # keep only the amounts, joined by 'р'
)

In [30]:
exprev_df = pd.DataFrame({'id': group_expenses['id'], 'exprev': exprev_est})

# Manual corrections keyed on the exact extracted string (whole-value match);
# each one applies to every row holding that string.
exprev_df['exprev'] = (
    exprev_df['exprev']
    .replace('3500', '')
    .replace('100000р1000', '')
    .replace('300000', '')
    .replace('7500р3500', '')
    .replace('25000000000', '')
    .replace('3000р40900000', '40900000')
)

In [31]:
# One row per amount: split the 'р'-joined string and explode the pieces.
exprev_otm = exprev_df.assign(exprev=exprev_df['exprev'].str.split('р')).explode('exprev')
# Empty pieces mean "no amount": mark them as missing, then convert to the
# nullable integer dtype. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
exprev_otm['exprev'] = (
    exprev_otm['exprev']
    .replace('', np.nan)
    .astype('float')
    .astype('Int64')
)

## Количественная оценка расходов (возможных поступлений) 
### Единовременные расходы в год возникновения

In [14]:
# One-off expenses column: give a bare '0' a rouble marker so it survives
# extraction, then run the generic text clean-up.
onetime_exp = (
    expenses['Единовременные расходы в год возникновения']
    .replace('0', '0 р.')
    .apply(clean_up)
)

In [15]:
# Extract rouble amounts from the cleaned one-off expenses.
# clean_up returns the integer 0 for junk entries; astype(str) turns those into '0'.
onetime_exp = (
    onetime_exp
    .astype(str)
    .apply(rouble_norm)      # unify rouble notation to 'р.'
    .apply(let_converter)    # magnitude words -> zeros
    .apply(let_to_num)       # spelled-out numerals -> digits
    .apply(remove_dot_com)   # drop separators, glue digit groups
    .apply(extract_num)      # keep only the amounts, joined by 'р'
)

In [32]:
onetime_df = pd.DataFrame({'id': expenses['id'], 'onetime': onetime_exp})

# Manual corrections keyed on the exact extracted string (whole-value match);
# each one applies to every row holding that string.
onetime_df['onetime'] = (
    onetime_df['onetime']
    .replace('0000', '0')
    .replace('000', '0')
    .replace('0000000', '0')
    .replace('38930', '2530450')
    .replace('219024р313582', '219024р313582р367397')
    .replace('18236000000', '182360000')
    .replace('000000р136659800400р2000000р2000000р60651755700', '1000000р2000000')
    .replace('190000000', '140000000р190000000р')
    .replace('105000р40000р2100', '0')
)

In [33]:
# One row per amount: split the 'р'-joined string and explode the pieces.
onetime_exp_otm = onetime_df.assign(onetime=onetime_df['onetime'].str.split('р')).explode('onetime')
# Empty pieces mean "no amount": mark them as missing, then convert to the
# nullable integer dtype. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
onetime_exp_otm['onetime'] = (
    onetime_exp_otm['onetime']
    .replace('', np.nan)
    .astype('float')
    .astype('Int64')
)

### Периодические расходы за период

In [20]:
# Recurring expenses column: give the zero spellings a rouble marker so they
# survive extraction, then run the generic text clean-up.
period_exp = (
    expenses['Периодические расходы за период']
    .replace('0,0', '0 р.')
    .replace('0,00', '0 р.')
    .replace('0.00', '0 р.')
    .apply(clean_up)
)

In [21]:
# Extract rouble amounts from the cleaned recurring expenses.
# clean_up returns the integer 0 for junk entries; astype(str) turns those into '0'.
period_exp = (
    period_exp
    .astype(str)
    .apply(rouble_norm)      # unify rouble notation to 'р.'
    .apply(let_converter)    # magnitude words -> zeros
    .apply(let_to_num)       # spelled-out numerals -> digits
    .apply(remove_dot_com)   # drop separators, glue digit groups
    .apply(extract_num)      # keep only the amounts, joined by 'р'
)

In [36]:
period_df = pd.DataFrame({'id': expenses['id'], 'period': period_exp})

# Manual corrections keyed on the exact extracted string (whole-value match);
# each one applies to every row holding that string.
period_df['period'] = (
    period_df['period']
    .replace('552000000р29220560', '5520000р29220560')
    .replace('0000', '0')
    .replace('000', '0')
    .replace('0000000', '0')
    .replace('25000000000', '21000000000р25000000000')
    .replace('500р28595320', '28595320')
)

In [37]:
# One row per amount: split the 'р'-joined string and explode the pieces.
period_exp_otm = period_df.assign(period=period_df['period'].str.split('р')).explode('period')
# Empty pieces mean "no amount": mark them as missing, then convert to the
# nullable integer dtype. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
period_exp_otm['period'] = (
    period_exp_otm['period']
    .replace('', np.nan)
    .astype('float')
    .astype('Int64')
)

### Возможные поступления за период

In [169]:
# Possible receipts column: rewrite zero spellings and a handful of known
# free-text entries (whole-value matches) into '<amount> р.' so they survive
# extraction, then run the generic text clean-up.
poss_receipt = (
    expenses['Возможные поступления за период']
    .replace('0,0', '0 р.')
    .replace('0,00', '0 р.')
    .replace('0.00', '0 р.')
    .replace('0', '0 р.')
    .replace('12 631 000 (поступления в бюджет от штрафов за нарушения лицензионных требований)', '12631000 р.')
    .replace('50 млн.  рублей', '50000000 р.')
    .replace('478 881 000,00 (за 1 год)', '478881000 р.')
    .replace('1000000 в год', '1000000 р.')
    .replace('262906,7', '262906 р.')
    .replace('300 тысяч - 5 млн', '300000 р.')
    .apply(clean_up)
)

In [170]:
# Extract rouble amounts from the cleaned possible receipts.
# clean_up returns the integer 0 for junk entries; astype(str) turns those into '0'.
poss_receipt = (
    poss_receipt
    .astype(str)
    .apply(rouble_norm)      # unify rouble notation to 'р.'
    .apply(let_converter)    # magnitude words -> zeros
    .apply(let_to_num)       # spelled-out numerals -> digits
    .apply(remove_dot_com)   # drop separators, glue digit groups
    .apply(extract_num)      # keep only the amounts, joined by 'р'
)

In [173]:
poss_rec_df = pd.DataFrame({'id': expenses['id'], 'poss_rec': poss_receipt})

# Manual corrections keyed on the exact extracted string (whole-value match);
# each one applies to every row holding that string.
poss_rec_df['poss_rec'] = (
    poss_rec_df['poss_rec']
    .replace('000', '0')
    .replace('350', '')
    .replace('39000', '')
    .replace('3500', '')
    .replace('55600000000', '55600000000р87800000000')
    .replace('3000', '')
    .replace('118000000000291000000', '118291000000')
    .replace('1500р90000000', '90000000')
    .replace('190000000', '140000000р190000000р90000000р110000000р')
    .replace('300000', '300000р5000000р')
)

In [179]:
# One row per amount: split the 'р'-joined string and explode the pieces.
poss_rec_otm = poss_rec_df.assign(poss_rec=poss_rec_df['poss_rec'].str.split('р')).explode('poss_rec')
# Empty pieces mean "no amount": mark them as missing, then convert to the
# nullable integer dtype. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
poss_rec_otm['poss_rec'] = (
    poss_rec_otm['poss_rec']
    .replace('', np.nan)
    .astype('float')
    .astype('Int64')
)

In [187]:
# Give the numeric columns their original Russian titles, suffixed with the unit "(руб)".
exprev_otm = exprev_otm.rename(
    columns={'exprev': 'Описание и оценка видов расходов (доходов) (руб)'}
)
onetime_exp_otm = onetime_exp_otm.rename(
    columns={'onetime': 'Единовременные расходы в год возникновения (руб)'}
)
period_exp_otm = period_exp_otm.rename(
    columns={'period': 'Периодические расходы за период (руб)'}
)
poss_rec_otm = poss_rec_otm.rename(
    columns={'poss_rec': 'Возможные поступления за период (руб)'}
)