In [1]:
import pandas as pd
import re
from orv_cleanup_utils import *
import numpy as np

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [3]:
data_dir = '/home/common/regulation_data/parsed_reports/'
main_df = pd.read_csv(data_dir + 'main_df.csv')

otm_tables_fnames = sorted([fn for fn in os.listdir(data_dir) if not fn.startswith('main')])
otm_tables = {fn[:-4]: pd.read_csv(data_dir + fn) for fn in otm_tables_fnames}
otm_tables.keys()

dict_keys(['0_business', '1_business', '2_business', 'cancel_duties', 'expenses', 'goals', 'group_changes', 'group_expenses', 'groups', 'kpi', 'neccessary_measures', 'new_functions', 'notification_info', 'public_discussion', 'risks'])

## Очистка и нормализация заполнений

In [4]:
def is_empty(text):
    if text.replace('\r', '').replace('\n', '').strip() == '' \
            or text == '' or text == 'nan':
        return True
    return False

def is_hyphens(text):
    if text.replace('-', '').strip() == '':
        return True 
    return False

def is_underscores(text):
    if text.replace('_', '').strip()  == '':
        return True
    return False

def is_junk(text):
    return is_empty(text) or is_hyphens(text) or is_underscores(text)

def clean_up(text):
    text = str(text)
    text = text.strip()
    text = text.lower()
    text = text.replace('(место для текстового описания)', '')
    if is_junk(text):
         return 0
    text = text.rstrip('.')
    text = text.replace('«', '"').replace('»', '"')
    text = text.replace('\t', '').replace('\r\n', '')
    return text

In [6]:
from rnnmorph.predictor import RNNMorphPredictor
predictor = RNNMorphPredictor(language="ru")

In [None]:
def use_yandexspeller(bad_str):
    """исправление орфографических ошибок с помощью Яндекс.Спеллер"""
    post_str = re.sub(r'[^\w|\d]', '', bad_str.lower())
    if not post_str:
        return None
    
    with requests.Session() as s:
        url = 'https://speller.yandex.net/services/spellservice.json/checkText?text='
        r = s.get(url + post_str)
        
        if r.status_code != 200:
            print(f'GET error: {r.status_code}')
            return None
        
        jsn = r.content.decode('unicode-escape')
        jsn_dict = json.loads(jsn)
        
        if not jsn_dict:
            return bad_str
        
        ya_answer = jsn_dict[0]['s']
        if ya_answer:
            return ya_answer[0]
        else:
            return bad_str

def tokenize(sentence):
    sentence = re.sub('\(', ',', sentence)
    sentence = re.sub('\)', '', sentence)
    sentence = re.sub('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', r'', sentence)
    words = sentence.split()
    return words

def tag(sentence):
    words = tokenize(sentence)
    tagged_words = predictor.predict(words)
    return tagged_words

def preparse(sentence):
    tagged_words = tag(sentence)
    
    degree_re = re.compile('(.*)\|Degree=\w+(.*)')
    gender_re = re.compile('(.*)\|Gender=\w+(.*)')
    res = ''
    
    for word in tagged_words:
        word_tag = word.tag
        if 'Degree' in word_tag:
            word_tag = degree_re.sub(r"\1\2", word_tag)

        word_pos = 'pos=' + word.pos#.lower().capitalize()
        if 'PUNCT' in word_pos:
            word_pos = ''

        if 'ADJ' in word_pos:
            if 'Gender' in word_tag:
                word_tag = gender_re.sub(r"\1\2", word_tag)

        word_tags = [
                word.normal_form,
                word_pos,
                word_tag.replace('|', ' ').replace('_', ''),
            ]
        res += ' ' + ' '.join([tag for tag in word_tags if tag])

    res = res.strip().replace('  ', ' ')
    return res

def norm (sentence):
    if sentence is None:
        return
    tagged_words = tag(sentence)
    return ' '.join(word.normal_form for word in tagged_words) 

In [31]:
sup_date = main_df['supposed_date: Предполагаемая дата вступления в силу проекта акта']
sup_date = sup_date.apply(clean_up)

sup_date_norm = [norm(use_yandexspeller(str(i))) for i in sup_date]

goals = otm_tables['goals']
goals_timing = goals['Установленные сроки достижения целей предлагаемого регулирования']
goals_timing = goals_timing.apply(clean_up)

goals_timing_norm = [norm(use_yandexspeller(str(i))) for i in goals_timing]

## Вычленение дат
Как в числовом формате ('01.01.2021', '1 января 2021', '1 квартал 2021', '2021', и тд), так и в виде периода ('в течение 180 дней со дня принятия нпа').

Для этого убираем из заполнений все прочие цифры, в том числе даты, не относящиеся к срокам достижения целей регулирования. 

In [5]:
def let_to_num(s):
    s = s.replace('1 один', '1').replace('6 шесть', '6')
    if re.search(r'девяносто\s+день', s):
        s = s.replace('девяносто','90')
    if re.search(r'сто\s+восемьдесят\s+день', s):
        s = s.replace('сто восемьдесят','180')
    if re.search(r'шесть\s+месяц', s):
        s = s.replace('шесть','6')
    if re.search(r'6\s+месячный', s):
        s = s.replace('месячный','месяц')
    if re.search(r'пять\s+месяц', s):
        s = s.replace('пять','5')
    if re.search(r'триста\s+день', s):
        s = s.replace('триста','300')
    if re.search(r'тридцать\s+день', s):
        s = s.replace('тридцать','30')
    if re.search(r'пять\s+месяц', s):
        s = s.replace('пять','5')
    if re.search(r'десять\s+день', s):
        s = s.replace('десять','10')
    if re.search(r'(три\s+день)|(три\s+месяц)|(три\s+год)', s):
        s = s.replace('три','3')
    if re.search(r'(два\s+месяц)|(два\s+год)', s):
        s = s.replace('два','2')  
    if re.search(r'(один\s+месяц)|(один\s+год)', s):
        s = s.replace('один','1')
    s = s.replace('но не ранее чем по истечение один месяц', '') 
    s = s.replace('1v', 'iv').replace('четвёртый', 'четвертый').replace('1 й квартал','1 квартал')
    return(s)

def insert_dot (s):
    regex = '(\d{2})(\d{2})(20)'
    pattern = '\\1.\\2.2020'
    s = re.sub(regex, pattern , s)
    return (s)

def insert_dots (s):
    if re.search(r'(151220)|(210220)|(010920)|(151220)|(301220)|(010820)|(311220)|(010320)|(010420)', s):
        s = insert_dot(s)
    return (s)

def drop_trash (s):
    s = s.replace('70 лет', '').replace('70 год', '')
    s = s.replace('5g', '')
    s = re.sub(r'в\s+\d*\s+раз', '', s)
    s = re.sub(r'\d*\s+фз', '', s) 
    s = re.sub(r'\d*\s+%', '', s) 
    s = re.sub(r'на\s+100', '', s)
    s = re.sub(r'приказ\s+\d*', '', s)
    s = re.sub(r'г\s+\d*', '', s)
    s = re.sub(r'\d*\s+диметилгидразин', '', s)
    s = re.sub(r'\d*\s+млрд', '', s)
    s = re.sub(r'\d\s+с\s+', '',s )   
    s = re.sub(r'\d*\.\s+в', '', s)
    s = s.replace('802.11ax','')
    s = s.replace('"112"', '')
    s = s.replace('календарный день со', '') 
    s = s.replace('24.01.2019 № 903п-п9', '').replace('24.01.2019 № вм-п9-8992', '')
    s = re.sub(r'(ммв-\d*-\d*/\d*@/\d*)|(ммв-\d*-\d*/\d*@)', '', s)
    s = s.replace('пункт 1, 7.1.1.- 7.1.4., 7.1.13 - 7.1.14, 7.1.31, 7.1.37 - 7.1.38., 7.3.3, 7.4.4, 7.19.1', '')
    s = s.replace('2-й поколение', '').replace('на 3 и более поколение', '')
    s = s.replace('2387-р', '')
    s = s.replace('мм-3-09/536@', '')
    return (s) 

def norm_before_strip (s):
    s = s.replace('12 год', '1-2 год').replace('35 год', '3-5 год')
    s = s.replace('поздний 20 март года','').replace('не поздний 20 май года','').replace('в течение 5 день со день', '')
    if re.search('(ii\sкв)|(iii\sкв)|(iv\sкв)|(\d\sкв)', s):
        s = s.replace('кв','квартал')
    if re.search('\d\sдень', s):
        s = s.replace('день','day')
    if re.search('\d\sдней', s):
        s = s.replace('дней','day')
    if re.search('\d\sрабочий\sдень', s):
        s = s.replace('день','day')
    if re.search('календарный\sдень\sс', s):
        s = s.replace('день','day') 
    if re.search('\d\s+месяц', s):
        s = s.replace('месяц','month')
    if re.search(r'(\s+\d{2}\s+год)', s):
        s = s.replace('год','year')
    if re.search(r'(\s+\d{1}\s+год)', s):
        s = s.replace('год','year')
    if re.search(r'(\d{1}-\d{1}\s+год)', s):
        s = s.replace('год','year')
    if re.search(r'(\d{1}\s+лет)', s):
        s = s.replace('лет','year')
    if re.search(r'(течение\s+год)|(истечение\s+год)', s):
        s = s.replace('год','1 year')
    s = s.replace('10 год', '10 year')
    if re.search(r'(\d{2}).(\d{2}).(\d{4})', s):
        s = s.replace('/','.') 
    return (s)

def strip_spaces(s):
    s = "".join(s.split())
    return (s)

def months_num (s):
    s = s.replace('январь', '.01.')
    s = s.replace('февраль', '.02.')
    s = s.replace('март', '.03.')
    s = s.replace('апрель', '.04.')
    s = s.replace('май', '.05.')
    s = s.replace('июнь', '.06.')
    s = s.replace('июль', '.07.')
    s = s.replace('август', '.08.')
    s = s.replace('сентябрь', '.09.')
    s = s.replace('октябрь', '.10.').replace('октябярить','.10.')
    s = s.replace('ноябрь', '.11.')
    s = s.replace('декабрь', '.12.')
    s = s.replace('|', '')
    s = s.replace('2-й', '2')
    s = re.sub(r'(ivквартал)|(4квартал)|(четвертыйквартал)|(4йквартал)', '.12.', s)
    s = re.sub(r'(iiiквартал)|(3квартал)|(третийквартал)', '.09.', s)
    s = re.sub(r'(iiквартал)|(2квартал)|(2ойквартал)|(второйквартал)', '.06.', s)
    s = re.sub(r'(iквартал)|(1квартал)|(первыйквартал)', '.03.', s)
    s = re.sub(r'(iiполугодие)|(2полугодие)|(второйполугодие)', '.12.', s)
    s = re.sub(r'(iполугодие)|(1полугодие)|(первыйполугодие)', '.06.', s)
    s = s.replace('1бытьполугодие','.01.')
    return (s)

def drop_law_dates (s):
    s = re.sub(r'от(\d{1}).(\d{2}).(\d{4})', '', s)
    s = re.sub(r'от(\d{2}).(\d{2}).(\d{4})', '', s)
    s = re.sub(r'(\d{2}).(\d{2}).(\d{4}г\.\№\d+)', '', s)
    return (s)

def remove_law_attr (s):
    s = s.lower()
    s = s.replace('1-йчисло','') 
    s = s.replace('1гочисло','')
    s = re.sub(r'закон(\d*).(\d{2}).(\d{4})', '', s)
    s = re.sub(r'\№\d*-\d*', '',s)
    s = re.sub(r'\№\d+', '', s)
    s = re.sub(r'\d+\№', '', s)
    s = s.replace('ст.','статья')
    s = re.sub(r'статья\d*\.\d*', '', s)
    s = re.sub(r'(статья\d*)|(\d*статья)', '', s)
    s = re.sub(r'(стать\d*\.\d*)|(стать\d*)','', s)
    s = re.sub(r'(пункт\d\.\d*)|(пункт\d*)||(п\.\d*)', '', s) 
    s = re.sub(r'(часть\d*\.\d*)|(часть\d*)', '', s)
    s = re.sub(r'глава\d*', '', s)
    s = re.sub(r'ставка\d*', '', s)
    s = re.sub(r'\d*перечень', '', s) 
    s = re.sub(r'законопроект\d*', '', s)
    s = re.sub(r'постановление\d*', '', s)
    s = re.sub(r'\dприложение*', '', s)
    s = re.sub(r'\d*кодекс', '', s) 
    s = re.sub(r'\d*федеральный', '', s)
    s = re.sub(r'\d*"земельный','', s)
    s = re.sub(r'\d*нк', '', s) 
    s = re.sub(r'пр-\d*', '', s)
    s = re.sub(r'(\d*).(\d{2}).(\d{4})г\.\,регистрационный', '', s) 
    s = s.replace('гн2.2.5.18', '').replace('пб05-580-03', '').replace('рд-05-328-99', '').replace('рд-05-350-00', '')
    s = s.replace('пб05-619-03', '').replace('гн 2.2.5.3532-18', '').replace('01/136009-ак','').replace('гн 2.2.5.3532-18','')
    return (s)

def drop_rus (s):
    s = re.findall(r'[^А-я]+', s)
    s = "".join(s)
    return (s)

def drop_eng (s):
    s = re.sub(r'\d+[A-Za-z]+','',s)
    return (s)

def extract_num (s):
    s = re.findall(r'\d+', s)
    s = "-".join(s)
    return (s)

def extract_eng (s):
    s = re.findall('\d+[A-Za-z]', s)
    s = "".join(s)
    return (s)

def extract_period (s):
    s = re.findall(r'(\d+year)|(\d+month)|(\d+day)', s)
    s = (''.join(map(lambda x: str(x[0]) + '' + str(x[1]) + '' + str(x[2]), s)))
    return (s)

In [7]:
goals_t_num = pd.Series(goals_timing_norm)

goals_t_num = goals_t_num.apply(let_to_num) 
goals_t_num = goals_t_num.apply(insert_dots) 
goals_t_num = goals_t_num.apply(drop_trash) 
goals_t_num = goals_t_num.apply(norm_before_strip)
goals_t_num = goals_t_num.apply(strip_spaces)

goals_t_num = goals_t_num.apply(months_num) 
goals_t_num = goals_t_num.apply(drop_law_dates)  
goals_t_num = goals_t_num.apply(remove_law_attr)

goals_t_num2 = goals_t_num.apply(remove_law_attr)
goals_t_num2 = goals_t_num2.apply(extract_period)

goals_t_num = goals_t_num.apply(drop_rus)
goals_t_num = goals_t_num.apply(drop_eng)
goals_t_num = goals_t_num.apply(extract_num)

In [8]:
sup_date = pd.Series(sup_date_norm)

sup_date = sup_date.apply(let_to_num) 
sup_date = sup_date.apply(insert_dots) 
sup_date = sup_date.apply(drop_trash) 
sup_date = sup_date.apply(norm_before_strip)
sup_date = sup_date.apply(strip_spaces)

sup_date = sup_date.apply(months_num) 
sup_date = sup_date.apply(drop_law_dates)  
sup_date = sup_date.apply(remove_law_attr)

sup_date = sup_date.apply(drop_rus)
sup_date = sup_date.apply(drop_eng)
sup_date = sup_date.apply(extract_num)

sup_df = pd.DataFrame({'id': main_df['header: id'], 'sup_date_raw': main_df['supposed_date: Предполагаемая дата вступления в силу проекта акта'],  'sup_date': sup_date})

sup_df.iloc[3997, sup_df.columns.get_loc('sup_date')] = '2019'
sup_df.iloc[457, sup_df.columns.get_loc('sup_date')] = '30-06-2016'
sup_df.iloc[6048, sup_df.columns.get_loc('sup_date')] = '30-06-2019'

## Разделение дат и приведение к единому формату (гггг-мм-дд)

In [9]:
def clean_sup_date (s):
    i = len(str(s))
    if i < 3:
        s = ''
    return (s)

def drop_trash (s):
    s = s.replace('38399', '').replace('79616', '').replace('95319', '').replace('398', '')
    s = s.replace('02-04-03-19-00089665', '').replace('76953', '').replace('91271', '')
    s = re.sub(r'^06-2$', '', s)
    s = re.sub(r'^1-09$', '', s)
    s = s.replace('02-08-04-20-00101035', '')
    s = re.sub(r'^01-07-201$', '', s)
    s = re.sub(r'^1-01-20126$', '', s)
    s = re.sub(r'^06-2$', '', s) 
    s = s.replace('7-1-1-7-1-4-7-1-13-7-1-14-7-1-31-7-1-37-7-1-38-7-3-3-7-4-4-7-19-1-01-2017', '1-01-2017')
    return (s)

def transform_unique (s):
    s = s.replace('09-2107','09-2017').replace('08-1018','08-2018').replace('08-2919','08-2019')
    s = re.sub(r'^1-01$','1-01-2018', s)
    s = re.sub(r'^02017$','2017', s)
    s = re.sub(r'^22015$','31-12-2015', s)
    s = re.sub(r'^01-122017$','01-12-2017', s)
    s = re.sub(r'^22017$','31-12-2017', s)
    s = re.sub(r'^42017$','31-12-2017', s) 
    s = re.sub(r'^12021$','1-01-2021', s) 
    s = re.sub(r'^12-218$','31-12-2018', s) 
    s = re.sub(r'^1-01-219$','1-01-2019', s) 
    s = re.sub(r'^05-201$','1-05-2020', s) 
    s = re.sub(r'^01201$','01-01-2020', s)
    s = re.sub(r'^02019$','2019', s) 
    s = re.sub(r'^1-01-2018206$', '1-01-2018', s)
    s = re.sub(r'^07-218$', '07-2018', s)
    if re.search(r'^1-20\d{2}$', s):  # кварталы
        s = re.sub('^1', '01-03', s)
    if re.search(r'^2-20\d{2}$', s):  
        s = re.sub('^2', '01-06', s)
    if re.search(r'^3-20\d{2}$', s):  
        s = re.sub('^3', '01-09', s)
    if re.search(r'^4-20\d{2}$', s):  
        s = re.sub('^4', '01-12', s)
    return (s)

def split_multiple (s):
    if re.search(r'^\d{2}-\d{5}-\d*-20\d{2}$', s):  # 09-20181-01-2020
        s = re.sub('\d{1}-\d*-20\d{2}$', '', s)
    if re.search(r'^\d{2}-\d{4}-\d{1}-\d{2}-\d{4}-\d{1}-\d{2}-\d{4}$', s):  # 06-2021-1-07-2021-1-07-2021
        s = re.sub('-\d{1}-\d{2}-\d{4}-\d{1}-\d{2}-\d{4}$', '', s)    
    if re.search(r'^\d{2}-20\d{2}-\d*-\d*-20\d{2}-\d*-20\d{2}$', s):   # 06-2021-1-07-2021-07-2021
        s = re.sub('-\d*-\d*-20\d{2}-\d*-20\d{2}$', '', s)
    if re.search(r'^\d*-\d*-\d{4}-\d*-\d*-\d{4}-\d*-\d*-\d{4}-\d*-\d*-\d{4}$', s): 
        s = re.sub('-\d*-\d*-\d{4}-\d*-\d*-\d{4}-\d*-\d*-\d{4}$', '', s)
    if re.search(r'^\d*-\d*-\d{4}-\d*-\d*-\d{4}$', s): 
        s = re.sub('-\d*-\d*-\d{4}$', '', s)
    if re.search(r'^\d*-\d{4}-\d*-\d{4}$', s): 
        s = re.sub('-\d*-\d{4}$', '', s)
    if re.search(r'^\d*-\d{4}-\d*-\d{4}$', s): 
        s = re.sub('^\d*-\d{4}-', '', s)
    if re.search(r'^\d*-\d*-\d{6}-\d*-\d*$', s):   # 1-01-201821-01-2019
        s = re.sub('\d{2}-\d*-\d{4}$', '', s)
    if re.search(r'^\d*-\d*-\d{5}-\d*-\d*$', s):  
        s = re.sub('\d{1}-\d*-\d{4}$', '', s)
    if re.search(r'^20\d{2}20\d{2}', s): 
        s = re.sub('\d{4}$', '', s)
    if re.search(r'^\d{1}-\d*-20\d{2}-\d*$', s):  # 1-01-2018-2018
        s = re.sub('-\d*$', '', s)
    if re.search(r'^\d*-\d*-\d*-20\d{2}$', s):  # 4-01-07-2020
        s = re.sub('^\d*-', '', s)    
    if re.search(r'^\d{1}-\d*-20\d{2}20\d{2}$', s):  # 1-01-20202025
        s = re.sub('\d{4}$', '', s)    
    if re.search(r'^\d{4}-\d*-\d*-20\d{2}$', s):  # 2021-1-01-2025
        s = re.sub('-\d*-\d*-\d{4}$', '', s)  
    if re.search(r'^\d{2}-20\d{2}-\d*-\d*-20\d{2}$', s):   # 12-2018-1-12-2019
        s = re.sub('-\d*-\d*-\d{4}$', '', s) 
    if re.search(r'^\d{2}-20\d{2}20\d{2}$', s):   # 12-20182019
        s = re.sub('\d{4}$', '', s)
    if re.search(r'^20\d{2}-20\d{2}$', s):   # 2017-2018
        s = re.sub('-20\d{2}$', '', s)  
    if re.search(r'^\d{2}-\d{2}-20\d{2}-\d{2}-20\d{2}$', s):   # 30-04-2021-07-2021
        s = re.sub('-\d{2}-20\d{2}$', '', s)    
    return (s)

def add_sep (s):
    if re.search(r'^\d{4}20\d{2}$', s):  # 01022016
        s = s[:2] + "-" + s[2:4] + "-" + s[4:]
    if re.search(r'^\d{2}20\d{2}$', s):  # 012025
        s = '01-' + s[:2] + "-" + s[2:] 
    if re.search(r'^\d{4}2\d{1}$', s):  # 010220
        s = s[:2] + "-" + s[2:4] + "-20" + s[4:]      
    if re.search(r'^0\d{1}2\d{1}$', s):  # 0120
        s = '01-' + s[:2] + "-" + s[2:4] + "20"
        # 12-20 (?)
    if re.search(r'^\d{2}-\d{2}-\d{2}$', s):  # 01-04-18
        s = s[:6] + "20" + s[6:] 
    if re.search(r'^\d{2}-\d{2}-\d{1}-\d{3}$', s):  # 30-11-2-017
        s = s[:7] + s[8:]     
    return (s)

def correct_wrong_dm (s):
    s = re.sub(r'^30-02', '28-02', s)
    s = re.sub(r'^31-06', '30-06', s)
    s = re.sub(r'^31-09', '30-09', s)
    s = re.sub(r'^31-11', '30-11', s)
    return (s)

def add_start_day (s):
    if re.search(r'^\d{2}-20\d{2}$', s): 
        s = '01-'+ s
    if re.search(r'^20\d{2}$', s): 
        s = '01-01-'+ s
    return (s)

In [10]:
sup_df['sup_date'] = sup_df['sup_date'].apply(clean_sup_date)
sup_df['sup_date'] = sup_df['sup_date'].apply(drop_trash)
sup_df['sup_date'] = sup_df['sup_date'].apply(transform_unique)
sup_df['sup_date'] = sup_df['sup_date'].apply(split_multiple)
sup_df['sup_date'] = sup_df['sup_date'].apply(add_sep)
sup_df['sup_date'] = sup_df['sup_date'].apply(correct_wrong_dm)
sup_df['sup_date'] = sup_df['sup_date'].apply(add_start_day)

sup_df['sup_date'] = sup_df['sup_date'].replace('', np.nan)
sup_df['sup_date'] = pd.to_datetime(sup_df['sup_date'], format='%d-%m-%Y')

In [11]:
def add_end_day (s):
    if re.search(r'^02-20\d{2}$', s): 
        s = '28-'+ s
    if re.search(r'^\d{2}-20\d{2}$', s): 
        s = '30-'+ s
    if re.search(r'^20\d{2}$', s): 
        s = '31-12-'+ s
    return (s)

def clean_goalstim (s):
    i = len(str(s))
    if i < 4: 
        s = ''
    return (s)

def remove_complex (s):
    s = re.sub(r'^136009$', '', s)
    s = re.sub(r'^2-2-5-3532-18$', '', s)
    s = re.sub(r'^2387$', '', s)
    s = re.sub(r'^31-12$', '', s)
    s = re.sub(r'^7-15-319-1009$', '', s)
    s = re.sub(r'^3-09-536$', '', s)
    s = re.sub(r'^28-06$', '', s)
    s = re.sub(r'^30-30$', '', s)
    s = re.sub(r'^9-8992$', '', s)
    s = re.sub(r'^7-6-777$', '', s)
    s = re.sub(r'^1232$', '', s)
    s = re.sub(r'^24-01-2019-9-02-2019-03-2019$', '', s) 
    s = re.sub(r'^1-07$','1-07-2020', s)
    s = re.sub(r'^31-12-202019$','31-12-2019',s)
    s = re.sub(r'^1-01-20121$','1-01-2021',s)
    s = re.sub(r'^2018-2017-2018$','01-06-2017',s)
    s = re.sub(r'^01-20-2025$','01-01-2020',s)
    s = re.sub(r'^3861-11-2019$','1-11-2019',s) 
    s = re.sub(r'^06-2016-2016$','30-06-2016',s)
    s = re.sub(r'^12-1017$','31-12-2017',s)
    s = re.sub(r'^03-2019-4$','31-03-2019', s)
    s = re.sub(r'^2018-2$','31-12-2018',s)
    s = re.sub(r'^20-16-2017$','31-12-2018', s)
    return (s)
    
def split_dates (s):
    s = re.sub(r'^201620172018$', '2016 2017 2018', s)
    s = re.sub(r'^01-2021-01-09-20218$', '01-2021 01-09-2021', s)
    s = re.sub(r'^1-01-2011-01-2019$', '01-01-2019', s)
    s = re.sub(r'^09-20181-01-2020$', '09-2018 1-01-2020', s)
    s = re.sub(r'^31-12-2023-2018-20192024-2025$', '31-12-2023 2019 2024 2025', s)
    s = re.sub(r'^03-2019-03-2019-03-2019-03-2019-03-2019$', '31-03-2019', s)
    s = re.sub(r'^05-2018-11-2018-11-2018-02-2018$', '05-2018 11-2018 11-2018 02-2018 ', s)
    s = re.sub(r'^12-2019-12-20192020$', '12-2019 2020', s)
    s = re.sub(r'^1-01-20192025$', '1-01-2019 2025', s)
    s = re.sub(r'^12-2019-12-20192020$', '12-2019 2020', s)
    s = re.sub(r'^1-01-2018-20192020$', '1-01-2018 2019 2020', s)
    s = re.sub(r'^01-01-20162017-2017$', '01-01-2016 2017 2017', s)
    s = re.sub(r'^2019-2019-2020-2020-2025$', '2019 2020 2025', s)
    s = re.sub(r'^1-30-12-2017-2-1-05-2018$', '30-12-2017 1-05-2018', s)
    s = re.sub(r'^1-01-2016-07-20161-07-2017$', '1-01-2016 1-07-2016 1-07-2017', s)  
    s = re.sub(r'^1-1-01-20202-1-01-2022$', '1-01-2020 1-01-2022', s)
    s = re.sub(r'^1-01-20191-01-20201-01-2021$', '1-01-2019 1-01-2020 1-01-2021', s)
    s = re.sub(r'^201920202021-1-03-1-01-2019-2019-1-03-2019$', '1-03-2019', s)
    s = re.sub(r'^1-01-2020-7-21-271-7-21-575-1-01-2020$', '1-01-2020', s)
    s = re.sub(r'^01-2021-01-09-2021$', '01-2021 01-09-2021', s)
    s = re.sub(r'^1-01-2019-1-01-20201-01-2021$', '1-01-2019 1-01-2020 1-01-2021', s)
    
    if re.search(r'^20\d{2}-\d{2}$', s): # 2017-18
        s = s.replace('-', ' 20')
    if re.search(r'^20-\d{2}-20\d{2}$', s): # 20-24-2025
        s = s.replace('20-', '20').replace('-', ' ')
    if re.search(r'^20\d{2}-20\d{2}$', s): # 2025-2028
        s = re.sub(r'-', ' ', s)
    if re.search(r'^\d{1}-\d{2}-20\d{4}-\d{2}-20\d{2}$', s): # 1-07-201931-12-2019
        s = s[:9] + ' ' + s[9:]
    if re.search(r'^\d{2}-\d{2}-20\d{4}-\d{2}-20\d{2}$', s): # 30-08-202030-12-2020
        s = s[:10] + ' ' + s[10:]
    if re.search(r'^\d{1}-\d{2}-20\d{3}-\d{2}-20\d{2}$', s): # 1-01-20191-01-2019
        s = s[:9] + ' ' + s[9:]
    if re.search(r'^\d{2}-20\d{2}-\d{2}-20\d{2}$', s): # 12-2017-03-2017
        s = re.sub(r'(\d{2}-20\d{2})-(\d{2}-20\d{2})', r'\1 \2', s)
    if re.search(r'^\d*-\d{2}-\d*-\d{2}-20\d{2}$', s): # 20-05-31-08-2019
        s = re.sub(r'^\d*-\d{2}-', '', s)
    if re.search(r'^\d*-\d{2}-20\d{2}-\d*-\d{2}-20\d{2}$', s): # 01-01-2020-31-12-2024
        s = re.sub(r'(\d*-\d{2}-20\d{2})-(\d*-\d{2}-20\d{2})', r'\1 \2', s)
    if re.search(r'^20\d{2}20\d{2}$', s): # 20202021
        s = re.sub(r'(\d{4})(\d{4})', r'\1 \2', s)
    return (s)

In [14]:
goalstim_df = pd.DataFrame({'id': goals['id'], 'goals_timing': goals['Установленные сроки достижения целей предлагаемого регулирования'], 'output': goals_t_num, 'period': goals_t_num2})
sup_df = sup_df[['id','sup_date']]
sup_df = sup_df.drop_duplicates(subset = ['id'])

goalstim_df = pd.merge(goalstim_df, sup_df, on = ["id"], how = 'left')
goalstim_df['output'] = goalstim_df['output'].replace('1-01', '')

In [15]:
goalstim_df['output'] = goalstim_df['output'].apply(clean_sup_date)
goalstim_df['output'] = goalstim_df['output'].apply(drop_trash)
goalstim_df['output'] = goalstim_df['output'].apply(transform_unique)
goalstim_df['output'] = goalstim_df['output'].apply(add_sep)
goalstim_df['output'] = goalstim_df['output'].apply(correct_wrong_dm)
goalstim_df['output'] = goalstim_df['output'].apply(add_end_day)
goalstim_df['output'] = goalstim_df['output'].apply(clean_goalstim)
goalstim_df['output'] = goalstim_df['output'].apply(remove_complex)
goalstim_df['output'] = goalstim_df['output'].apply(split_dates)

In [16]:
### one-to-many таблица
goalstim_df['all_dates'] = goalstim_df['output']
goalstim_df_otm = goalstim_df.assign(output = goalstim_df['output'].astype(str).str.split(' ')).explode('output')
goalstim_df_otm = goalstim_df_otm.drop(['period', 'sup_date', 'all_dates','goals_timing'], axis = 1)

goalstim_df_otm['output'] = goalstim_df_otm['output'].replace('', np.nan) 
goalstim_df_otm['output'] = goalstim_df_otm['output'].astype(str).apply(add_end_day)
goalstim_df_otm['output'] =  pd.to_datetime(goalstim_df_otm['output'], format ='%d-%m-%Y')
goalstim_df_otm = goalstim_df_otm.rename(columns = {'output': 'Установленные сроки достижения целей предлагаемого регулирования (дата)'})

In [17]:
goalstim_df_otm.head(15)

Unnamed: 0,id,Установленные сроки достижения целей предлагаемого регулирования (дата)
0,02/04/11-20/00110705,NaT
1,02/08/02-17/00062320,2019-12-31
2,02/07/07-17/00068049,2018-01-01
3,02/04/01-18/00077703,2025-12-31
3,02/04/01-18/00077703,2028-12-31
4,02/08/07-17/00068406,NaT
5,02/07/11-20/00110259,NaT
6,02/04/09-16/00054845,NaT
7,02/08/07-20/00106204,2020-12-30
8,02/04/11-19/00096996,NaT


In [18]:
len(goalstim_df_otm)

8509

In [19]:
### one-to-one таблица (для расчета сроков)
goalstim_df['output'] = goalstim_df['output'].str.split().str[-1]
goalstim_df['output'] = goalstim_df['output'].astype(str).apply(add_end_day)
goalstim_df['output'] = pd.to_datetime(goalstim_df['output'], format='%d-%m-%Y')

In [20]:
len(goalstim_df)

8384

### Расчет периода (в днях) между двумя датами 
Если заполнение — дата 

In [21]:
goalstim_df['term'] = goalstim_df['output'] - goalstim_df['sup_date']
goalstim_df['term'] = (goalstim_df['term'] / np.timedelta64(1,'D')).astype('Int64')
goalstim_df['end_date'] = goalstim_df['output'] 

### Расчет конечной даты относительно указанного периода
Если заполнение —  период 

In [22]:
def split_period (s):
    s = re.sub(r'(\d)([a-z])', r'\1 \2', s)
    return (s)

In [23]:
period_calc = goalstim_df.copy()

period_calc['period'] = period_calc['period'].replace('1month1month', '1month').replace('180day2year', '2year').replace('3day', '')
period_calc['period'] = period_calc['period'].replace('6month2year', '2year').replace('1year25year20182022year', '5year').replace('70year70year', '')

period_calc['period'] = period_calc['period'].apply(split_period)
period_calc[['term', 'date_type']] = period_calc['period'].str.split(" ", expand=True)
period_calc = period_calc[(period_calc['term'] != '')]
period_calc['term'] = period_calc['term'].astype('float').astype('Int64')

period_calc['term'] *= np.where(period_calc['date_type'] =='year', 365, 1)
period_calc['term'] *= np.where(period_calc['date_type'] =='month', 30, 1)
period_calc['term'] = period_calc['term'].astype('Int64')

period_calc['end_date'] =  period_calc['sup_date'] + period_calc['term'].apply(pd.offsets.Day)
period_calc = period_calc[['id', 'period', 'term', 'end_date']]



In [24]:
def insert_rus (s):
    s = str(s)
    s = re.sub(r'1 year', '1 год', s)
    s = re.sub(r'1 month', '1 месяц', s)
    if re.search(r'(2year)|(3year)(4year)', s):
        s = re.sub(r'year', 'года', s)
    else:
        s = re.sub(r'year', 'лет', s)
    if re.search(r'(2month)|(3month)(4month)', s):
        s = re.sub(r'month', 'месяца', s)
    else:
        s = re.sub(r'month', 'месяцев', s)
    if re.search(r'day', s):
        s = re.sub(r'day', 'дней', s)
    return (s)

In [25]:
goalstim_df = goalstim_df.drop(['period'], axis = 1) 
period_calc = period_calc.drop_duplicates(subset = ['id'])
full_df = pd.merge(goalstim_df, period_calc, on = ["id"], how ='left')

full_df.end_date_x.fillna(full_df.end_date_y, inplace=True)
full_df.term_x.fillna(full_df.term_y, inplace=True)

full_df['period'] = full_df['period'].apply(insert_rus)

full_df = full_df.drop(columns = ['all_dates', 'goals_timing', 'end_date_y', 'term_y'])
full_df = full_df.rename(columns = {'output': 'Установленные сроки достижения целей предлагаемого регулирования (дата)', 
                                    'period': 'Установленные сроки достижения целей предлагаемого регулирования (период)',
                                    'sup_date': 'Предполагаемая дата вступления в силу проекта акта',
                                   'term_x': 'Срок достижения целей (дней)', 'end_date_x': 'Дата достижения'})

In [26]:
full_df = full_df[['Предполагаемая дата вступления в силу проекта акта',
                   'Установленные сроки достижения целей предлагаемого регулирования (дата)', 
                   'Установленные сроки достижения целей предлагаемого регулирования (период)',
                   'Срок достижения целей (дней)', 'Дата достижения']]

In [27]:
full_df.tail(15)

Unnamed: 0,Предполагаемая дата вступления в силу проекта акта,Установленные сроки достижения целей предлагаемого регулирования (дата),Установленные сроки достижения целей предлагаемого регулирования (период),Срок достижения целей (дней),Дата достижения
8369,2019-08-01,NaT,,,NaT
8370,2016-09-01,2017-12-31,,486.0,2017-12-31
8371,2020-10-01,NaT,,,NaT
8372,2020-01-01,2021-01-01,,366.0,2021-01-01
8373,2017-09-01,NaT,,,NaT
8374,2020-01-01,2021-01-01,,366.0,2021-01-01
8375,2021-01-01,2021-01-01,,0.0,2021-01-01
8376,2018-01-01,2018-01-01,,0.0,2018-01-01
8377,2020-12-01,2020-12-30,,29.0,2020-12-30
8378,2018-04-01,NaT,,,NaT


In [28]:
full_df['Срок достижения целей (дней)'].describe() 

count    3443.000000
mean      196.577403
std       432.395694
min      -794.000000
25%         0.000000
50%        29.000000
75%       364.000000
max      5143.000000
Name: Срок достижения целей (дней), dtype: float64

### Отрицательные значения периодов 

In [29]:
minus = full_df[full_df['Срок достижения целей (дней)'] < 0]
len(minus)

227