In [1]:
import datetime
import io
import os
import random
import re
import time
import urllib
import urllib.parse
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.metrics.distance import edit_distance

# Threshold on the Levenshtein distance that find_string() uses to decide
# whether a candidate label counts as a match for the expected one.
TYPOS = 3 # Acceptable number of typos

### Headers to navigate through the pages

In [2]:
# cookie_for_session string must be updated for each session!
# A fresh cookie can be supplied through the BANKROT_COOKIE environment variable, so the
# notebook does not have to be edited (and no live cookie has to be saved in it).
# The literal below is used only when that variable is unset or empty.
cookie_for_session = os.environ.get('BANKROT_COOKIE') or (
    '_ym_uid=1632331820855760533;'
    '_ym_d=1650113641;'
    'bankrotcookie=b25b68549ffbd22969577431e73877e5;'
    'ASP.NET_SessionId=ryu5topjnv54x3i4ihyvnwp1;'
    '_ym_isad=2;'
    '_ym_visorc=w;'
)

# The header builders append ' debtorsearch=...' directly after this string,
# so it has to end with the ';' cookie separator.
if not cookie_for_session.rstrip().endswith(';'):
    cookie_for_session = cookie_for_session.rstrip() + ';'

def get_headers_for_TIN(TIN):
    """
        Builds the Request Headers for url_debtor_search for a search by TIN
        The session cookie (cookie_for_session) must be refreshed for each session
    """

    # Fields of the 'debtorsearch' cookie, in the order the request sends them;
    # only the person's TIN (prsinn) is filled in.
    search_fields = (
        ('typeofsearch', 'Persons'),
        ('orgname', ''),
        ('orgaddress', ''),
        ('orgregionid', ''),
        ('orgogrn', ''),
        ('orginn', ''),
        ('orgokpo', ''),
        ('OrgCategory', ''),
        ('prslastname', ''),
        ('prsfirstname', ''),
        ('prsmiddlename', ''),
        ('prsaddress', ''),
        ('prsregionid', ''),
        ('prsinn', str(TIN)),
        ('prsogrn', ''),
        ('prssnils', ''),
        ('PrsCategory', ''),
        ('pagenumber', '0'),
    )
    debtor_search = '&'.join(key + '=' + value for key, value in search_fields)

    headers = {}
    headers['cookie'] = cookie_for_session + ' debtorsearch=' + debtor_search
    headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
    return headers




def get_headers_for_names(last_name, first_name, middle_name):
    """
        Returns Request Headers for url_debtor_search given name
        Headers must be updated for each session

        last_name, first_name, middle_name: strings; each one is URL-encoded with
        urllib.parse.quote_plus before being placed into the 'debtorsearch' cookie
    """

    return {
        'cookie': cookie_for_session + ' debtorsearch=typeofsearch=Persons' +\
                                       '&orgname=&orgaddress=&orgregionid=&orgogrn=&orginn=&orgokpo=&OrgCategory=' +\
                                       '&prslastname=' + urllib.parse.quote_plus(last_name) +\
                                       '&prsfirstname=' + urllib.parse.quote_plus(first_name) +\
                                       # was `middleName`, an undefined name that raised NameError on every call
                                       '&prsmiddlename=' + urllib.parse.quote_plus(middle_name) +\
                                       '&prsaddress=&prsregionid=&prsinn=&prsogrn=&prssnils=&PrsCategory=&pagenumber=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
    }

### Functions to extract the data required

In [3]:
def find_string(reqired_string, list_of_strings):
    """
        Returns the string from list_of_strings which is most similar to reqired_string
        (in terms of Levenshtein distance with transpositions, compared case-insensitively)
        If the most similar string and reqired_string have Levenshtein distance > TYPOS -> raise ValueError
        An empty list_of_strings also raises ValueError
    """

    candidates = list(list_of_strings)
    if not candidates:
        raise ValueError('list_of_strings is empty')

    # Lowercase BOTH sides: lowercasing only the candidates made every capitalised
    # label (e.g. 'Наименование') cost one typo before any real difference was counted.
    # str() keeps non-string labels (e.g. integer column names) from breaking .lower().
    target = str(reqired_string).lower()
    typos = [edit_distance(target, str(candidate).lower(), transpositions=True) for candidate in candidates]

    best = int(np.argmin(typos))
    # TYPOS is the acceptable number of typos, so a distance equal to TYPOS is still a match.
    if typos[best] > TYPOS:
        raise ValueError('No string within {} typos of {!r}'.format(TYPOS, reqired_string))
    return candidates[best]
        

        
        
def get_link_for_private_debtor_card(response):
    """
        Extracts the link (query string starting with 'ID=') to the private debtor card
        from the search response: everything from the first 'ID=' up to the next
        double quote, looking at most 100 characters ahead.
        Raises ValueError when the response text contains no 'ID='.
    """

    page = response.text
    start = page.index('ID=')
    window = page[start:start + 100]
    link, _, _ = window.partition('"')
    return link
   
    
    
    
def get_document_KAD(soup, link_private_debtor_card=None, headers=None):
    """
        Returns the name and the date of the last КАД document which name contains 'заверш'

        soup: parsed private debtor card page holding the first page of the КАД document table
        link_private_debtor_card: query string of the card (e.g. 'ID=...') used to request the
            following pages of the table; when omitted, the notebook-level variable with the
            same name is used if it exists
        headers: request headers for those extra requests; when omitted they are built with
            get_headers_for_TIN() from the notebook-level TIN if it exists

        If the link or the headers cannot be resolved, only the first page is examined.
        Returns (document name, date as 'dd.mm.yyyy'), or ('(н/д)', '(н/д)') when nothing could be extracted.
    """

    table_id = 'ctl00_cphBody_gvKadDocuments'
    pager_id = 'ctl00_cphBody_paiKadDocuments_tdPaggingAdvInfo'

    def get_form_data_for_page(num_page):
        """
            Returns data prepared for a post request to the page №num_page of the КАД document table
        """

        return {
            'ctl00$PrivateOffice1$ctl00': 'ctl00$cphBody$upKadDocuments|ctl00$cphBody$gvKadDocuments',
            '__EVENTTARGET': 'ctl00$cphBody$gvKadDocuments',
            '__EVENTARGUMENT': 'Page$' + str(num_page),
            '__ASYNCPOST': 'true'
        }

    def parse_page(page_soup):
        """
            Returns (table as DataFrame, number of the last row shown, overall number of rows)
            A page without the paging element is reported as (table, 0, 0), i.e. nothing more to fetch
        """

        table = page_soup.find('table', {'id': table_id})
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        page_df = pd.read_html(io.StringIO(str(table)))[0]
        pager = page_soup.find('td', {'id': pager_id})
        if pager is None:
            return page_df, 0, 0
        # the paging text is expected to hold exactly three numbers: shown from, shown till, overall
        _, shown_till, overall = [int(number) for number in re.findall(r'\d+', pager.text)]
        return page_df, shown_till, overall

    try:
        df_first_page, shown_till, overall = parse_page(soup)
        pages = [df_first_page]

        if shown_till < overall:
            # The original code referred to undefined names here (the NameError was silently
            # swallowed, so further pages were never read). Fall back to the variables
            # the notebook's main loop defines.
            notebook_globals = globals()
            if link_private_debtor_card is None:
                link_private_debtor_card = notebook_globals.get('link_private_debtor_card')
            if headers is None and 'TIN' in notebook_globals:
                headers = get_headers_for_TIN(notebook_globals['TIN'])

        num_page = 1
        while shown_till < overall and link_private_debtor_card and headers:
            num_page += 1
            try:
                response_next_page = requests.post(url=url_private_debtor_card + '?' + link_private_debtor_card,
                                                   data=get_form_data_for_page(num_page),
                                                   headers=headers,
                                                   timeout=30)
                time.sleep(random.random() + 1) # a pause for not to exceed the number of requests per unit of time
                soup_next_page = BeautifulSoup(response_next_page.text, 'lxml')
                df_next_page, shown_till_next, overall = parse_page(soup_next_page)
            except Exception:
                # best effort: keep the pages collected so far
                break
            pages.append(df_next_page)
            if shown_till_next <= shown_till:
                # the server did not advance to a new page; stop instead of looping forever
                break
            shown_till = shown_till_next

        # the original assigned the concatenation to a misspelled name, discarding the extra pages
        df_aux = pd.concat(pages, ignore_index=True)

        name_column = find_string('Наименование', df_aux.columns)
        date_column = find_string('Дата документа', df_aux.columns)

        # .copy() so the date conversion below does not write into a slice of df_aux
        df_completed = df_aux[df_aux[name_column].str.contains('заверш', flags=re.IGNORECASE, na=False)].copy()
        df_completed[date_column] = pd.to_datetime(df_completed[date_column], format='%d.%m.%Y')
        latest = df_completed.sort_values(date_column, ascending=False).iloc[0]
        return latest[name_column], latest[date_column].strftime('%d.%m.%Y')
    except Exception:
        return '(н/д)', '(н/д)'


    
    
    
def get_link_for_final_report(soup):
    """
        Returns the link (query string after 'aspx?') to the last final report

        soup: parsed private debtor card page containing the reports table
        Raises an exception (ValueError / IndexError) when the table, its publication dates
        or the link cannot be found; the caller is expected to handle it
    """

    table = soup.find('table', {'id': 'ctl00_cphBody_gvAuReports'})
    if table is None:
        raise ValueError('Final reports table not found')

    table_html = str(table)
    # StringIO: passing literal HTML to read_html is deprecated in recent pandas
    df_table = pd.read_html(io.StringIO(table_html))[0]

    # Look the column up once and use the result everywhere; the original sorted by the
    # literal name, which defeats the typo-tolerant lookup.
    date_column = find_string('Дата публикации (время московское)', df_table.columns)
    published = pd.to_datetime(df_table[date_column], format='%d.%m.%Y %H:%M:%S')
    latest = published.max()
    if pd.isna(latest):
        raise ValueError('No publication dates in the final reports table')

    # Prefer the full timestamp: several reports may share a date, and searching by the date
    # alone would then hit whichever of them comes first in the HTML. Fall back to the date
    # in case the markup does not contain the timestamp as one contiguous string.
    start = table_html.find(latest.strftime('%d.%m.%Y %H:%M:%S'))
    if start == -1:
        start = table_html.index(latest.strftime('%d.%m.%Y'))

    # the report link is expected within the 300 characters following the date
    return table_html[start:start + 300].split("aspx?")[1].split("'")[0]
    

    
    
def get_start_and_end_dates(response):
    """
        Returns the beginning and closing dates as ('dd.mm.yyyy', 'dd.mm.yyyy')

        Each date is the first date found within 100 characters after the labels
        'ата начала' / 'ата окончания' in response.text.
        Returns ('(н/д)', '(н/д)') if either date cannot be extracted.
    """

    # The year group used to be (19|20\d\d), i.e. "19" OR "20dd": 19xx dates never matched
    # and the next 20xx date in the window was silently returned instead.
    date_regex = r'\b(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.]((?:19|20)\d\d)\b'

    def first_date_after(label):
        text = response.text
        start = text.index(label)
        found = re.findall(date_regex, text[start:start + 100])
        # found holds (day, month, year) tuples; whatever separator was used, emit dots
        return '.'.join(found[0])

    try:
        return first_date_after('ата начала'), first_date_after('ата окончания')
    except Exception:
        return '(н/д)', '(н/д)'
    
    
    
    
def get_amount_of_accepted_claims(soup):
    """
        Returns the amount of the accepted claims

        Reads the table inside the element with id 'block15' and returns the value of the
        'Сумма удовлетворенных требований, руб.' column for the last row whose 'Требования'
        cell contains '3'. Returns '(н/д)' if it cannot be extracted.
    """

    try:
        table = soup.find('div', {'id': 'block15'}).findChildren()[0].findChildren()[0]
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        df_table = pd.read_html(io.StringIO(str(table)))[0]

        claims_column = find_string('Требования', df_table.columns)
        amount_column = find_string('Сумма удовлетворенных требований, руб.', df_table.columns)

        # na=False: an empty cell must not invalidate the whole mask
        matches = df_table[claims_column].str.contains('3', na=False)
        # .iloc[-1] instead of the deprecated positional Series[0] lookup
        return df_table.loc[matches, amount_column].iloc[-1]
    except Exception:
        return '(н/д)'
    
    
    
    
def get_manager_conclusion(soup):
    """
        Returns the conclusion of the financial manager

        Takes the last table inside the element with id 'block8' and returns the second cell
        of the first row whose first cell contains 'освобождение гражданина от обязательств'.
        Returns '(н/д)' if it cannot be extracted.
    """

    try:
        table = soup.find('div', {'id': 'block8'}).findChildren()[0]
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        df_table = pd.read_html(io.StringIO(str(table)))[-1]
        matches = df_table[0].str.contains('освобождение гражданина от обязательств', flags=re.IGNORECASE, na=False)
        # .iloc[0] = first matching row; the former [1][0] looked up index label 0 and
        # therefore only worked when the matching row happened to be the table's first row
        return df_table.loc[matches, 1].iloc[0]
    except Exception:
        return '(н/д)'
    
    
    
    
def get_completion_of_bankruptcy(soup):
    """
        Returns the status of bankruptcy proceedings

        Reads the table inside the element with id 'block22' and returns the second cell of the
        first row whose first cell contains 'Производство по делу о банкротстве'.
        Returns '(н/д)' if it cannot be extracted.
    """

    try:
        table = soup.find('div', {'id': 'block22'}).findChildren()[0].findChildren()[0]
        # StringIO: passing literal HTML to read_html is deprecated in recent pandas
        df_table = pd.read_html(io.StringIO(str(table)))[0]
        matches = df_table[0].str.contains('Производство по делу о банкротстве', flags=re.IGNORECASE, na=False)
        # .iloc[0] = first matching row; the former [1][0] looked up index label 0 and
        # therefore only worked when the matching row happened to be the table's first row
        return df_table.loc[matches, 1].iloc[0]
    except Exception:
        return '(н/д)'
    
    
    
    
def get_attachment(soup):
    """
        Returns the name of the attached file (the text of the first file link of the report)
        Returns '(н/д)' if the link is absent or soup is not a parsed page
    """

    try:
        link = soup.find('a', {'id': 'ctl00_cphBody_ucAuReportView_repFiles_ctl01_HyperLink1'})
        if link is None:
            return '(н/д)'
        return link.text
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed
        return '(н/д)'
    
    
    
    
def get_info_from_card(soup, info):
    """
        Returns the information about the client from the element 'info'

        soup: parsed private debtor card page
        info: id of the <span> element holding the value
        Returns '(н/д)' if the element is absent or soup is not a parsed page
    """

    try:
        element = soup.find('span', {'id': info})
        if element is None:
            return '(н/д)'
        return element.text
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed
        return '(н/д)'

### Input TINs

In [4]:
# Show which files are present in the input folder.
os.listdir('data/')

['TINs.xlsx']

In [5]:
# Load the input workbook from data/.
# os.listdir() gives no ordering guarantee, so sort the names to make the choice deterministic,
# and skip Excel lock files ('~$...') and hidden entries ('.ipynb_checkpoints', '.DS_Store', ...).
input_files = sorted(name for name in os.listdir('data/') if not name.startswith(('~$', '.')))
if not input_files:
    raise FileNotFoundError("No input workbook found in 'data/'")

df_tins = pd.read_excel(os.path.join('data', input_files[0]))
df_tins.head()

Unnamed: 0,TIN
0,742001227066
1,543312280693
2,100703456223


In [6]:
# The TIN column is what the main loop iterates over.
TINs = df_tins['TIN']
print(f'The number of TINs = {len(TINs)}')

The number of TINs = 3


### Iterating through the TINs

In [7]:
# The resulting DataFrame: empty, one row is appended per processed TIN

df = pd.DataFrame(
    columns=[
        # fields read from the private debtor card
        'Фамилия', 'Имя', 'Отчество',
        'Дата рождения', 'Место рождения',
        'Регион ведения дела о банкротстве',
        'ИНН', 'ОГРНИП', 'СНИЛС',
        'Ранее имевшееся ФИО',
        'Категория должника',
        'Место жительства',
        'Дополнительная информация',
        # last КАД document
        'Документ КАД', 'Дата документа КАД',
        # fields read from the final report
        'Дата начала', 'Дата окончания',
        'Сумма удовлетворенных требований',
        'Освобождение гражданина от обязательств',
        'Признак завершения производства по делу о банкротстве',
        'Прикрепленный файл',
    ]
)

# Pages of the site that are requested:
url_debtor_search = 'https://old.bankrot.fedresurs.ru/DebtorsSearch.aspx'            # search page
url_private_debtor_card = 'https://old.bankrot.fedresurs.ru/PrivatePersonCard.aspx'  # private debtor card page
url_final_report = 'https://old.bankrot.fedresurs.ru/AuReportCard.aspx'              # final report page

In [8]:
start_time = time.time()

# Seconds to wait for the site before giving up on a request; without a timeout
# a stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# The results are saved after every client; make sure the target folder exists.
os.makedirs('output', exist_ok=True)

# ids of the <span> elements on the private debtor card, in the order of the first columns of df
card_field_ids = ["ctl00_cphBody_lblLastName",
                  "ctl00_cphBody_lblFirstName",
                  "ctl00_cphBody_lblMiddleName",
                  "ctl00_cphBody_lblBirthdate",
                  "ctl00_cphBody_lblBirthplace",
                  "ctl00_cphBody_lblRegion",
                  "ctl00_cphBody_lblINN",
                  "ctl00_cphBody_lblOGRN",
                  "ctl00_cphBody_lblSNILS",
                  "ctl00_cphBody_lblNameHistory",
                  "ctl00_cphBody_lblCategoryName",
                  "ctl00_cphBody_lblAddress",
                  "ctl00_cphBody_lblAdvInfo"]

# Each stage below is best effort: on failure its results are replaced by '' and the
# extractors then report '(н/д)'. `except Exception` (not a bare except) keeps Ctrl-C working.

# for Name in Names: # for iterating through names instead of TINs
for i, TIN in enumerate(TINs):
    # 1. search page -> link to the private debtor card
    try:
        # for iterating through names instead of TINs:
        # response_debtor_search = requests.get(url_debtor_search, headers=get_headers_for_names(Name[0], Name[1], Name[2]))
        response_debtor_search = requests.get(url_debtor_search, headers=get_headers_for_TIN(TIN),
                                              timeout=REQUEST_TIMEOUT)
        link_private_debtor_card = get_link_for_private_debtor_card(response_debtor_search)
    except Exception:
        response_debtor_search, link_private_debtor_card = '', ''

    # 2. private debtor card
    try:
        response_private_debtor_card = requests.get(url=url_private_debtor_card + '?' + link_private_debtor_card,
                                                    headers=get_headers_for_TIN(TIN),
                                                    timeout=REQUEST_TIMEOUT)
        soup_private_debtor_card = BeautifulSoup(response_private_debtor_card.text, 'lxml')
    except Exception:
        response_private_debtor_card, soup_private_debtor_card = '', ''

    # 3. last final report
    try:
        link_final_report = get_link_for_final_report(soup_private_debtor_card)
        response_final_report = requests.get(url_final_report + '?' + link_final_report,
                                             headers=get_headers_for_TIN(TIN),
                                             timeout=REQUEST_TIMEOUT)
        soup_final_report = BeautifulSoup(response_final_report.text, 'lxml')
    except Exception:
        link_final_report, response_final_report, soup_final_report = '', '', ''

    card_info = [get_info_from_card(soup_private_debtor_card, field_id) for field_id in card_field_ids]

    # Call each of these once and unpack: calling them separately per column repeated the
    # parsing (and any requests get_document_KAD makes) for no benefit.
    document_KAD, date_document_KAD = get_document_KAD(soup_private_debtor_card)
    start_date, end_date = get_start_and_end_dates(response_final_report)

    df.loc[len(df)] = card_info + [document_KAD,
                                   date_document_KAD,
                                   start_date,
                                   end_date,
                                   get_amount_of_accepted_claims(soup_final_report),
                                   get_manager_conclusion(soup_final_report),
                                   get_completion_of_bankruptcy(soup_final_report),
                                   get_attachment(soup_final_report)]

    time.sleep(random.random() + 1) # a pause for not to exceed the number of requests per unit of time
    # saved after every client so that an interrupted run keeps what was already collected
    df.to_excel('output/BankruptInformation.xlsx', index=False)
    print('Processed client №{}'.format(i + 1))

elapsed = time.time() - start_time
average = elapsed / len(TINs) if len(TINs) else 0.0 # avoid ZeroDivisionError on an empty input
print(f'The process took {round(elapsed, 2)} sec; on average: {round(average, 2)}')

Processed client №1
Processed client №2
Processed client №3
The process took 8.85 sec; on average: 2.95


In [9]:
# Display the collected results (one row per processed TIN).
df

Unnamed: 0,Фамилия,Имя,Отчество,Дата рождения,Место рождения,Регион ведения дела о банкротстве,ИНН,ОГРНИП,СНИЛС,Ранее имевшееся ФИО,...,Место жительства,Дополнительная информация,Документ КАД,Дата документа КАД,Дата начала,Дата окончания,Сумма удовлетворенных требований,Освобождение гражданина от обязательств,Признак завершения производства по делу о банкротстве,Прикрепленный файл
0,Краев,Владимир,Вениаминович,14.05.1966,с. Медведево Чебаркульского района Челябинской...,Челябинская область,742001227066,(н/д),010-685-751 35,(н/д),...,"Челябинская область, г. Чебаркуль, ул. Куйбыше...",(н/д),О завершении реализации имущества гражданина и...,13.01.2021,26.05.2020,13.01.2021,"79 868,11",Применяется,завершено,Определение о завершении.pdf
1,Абабков,Валерий,Викторович,(н/д),(н/д),Новосибирская область,543312280693,305547535000013,(н/д),(н/д),...,"630523 Новосибисркая область, Новосибирский ра...",(н/д),"Удовлетворить заявление, жалобу, ходатайство (...",12.09.2014,11.11.2013,11.09.2014,000,(н/д),завершено,(н/д)
2,Синюк,Владимир,Валерьевича,(н/д),(н/д),Республика Карелия,100703456223,309103526800023,(н/д),(н/д),...,г.Сортавала п.Туоксъярви д.9,(н/д),Завершить конкурсное производство (ст.149 ФЗ О...,14.01.2013,26.10.2017,02.10.2018,000,(н/д),завершено или прекращено,(н/д)
