In [None]:
import requests
import time
import datetime
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import sys
import os

In [None]:
def log(generator, log_progress=False):
    """Drain ``generator`` while printing a live elapsed-time status line.

    Each yielded value is treated as a completion fraction. It is shown as a
    percentage (capped at 100) only when ``log_progress`` is true; otherwise
    just the elapsed time is printed. Intermediate lines end with a carriage
    return so they overwrite each other; a final line is printed once the
    generator is exhausted.
    """
    started_at = time.perf_counter()

    def render(fraction):
        elapsed = datetime.timedelta(seconds=int(time.perf_counter() - started_at))
        if not log_progress:
            return str(elapsed)
        percent = min(int(100 * fraction), 100)
        return f"{percent}% {elapsed}"

    for fraction in generator:
        print(render(fraction), end="\r", flush=True)
    # Final line reports completion (fraction 1) and ends with a newline.
    print(render(1))


def log_response(response):
    """Print the status code and URL of ``response`` on one line."""
    status, url = response.status_code, response.url
    print(f"response: status={status}, url={url}")

In [None]:
def request_procurements(okpd2_ids, publish_date_from, publish_date_to, page_number, records_per_page=100, timeout=30):
    """Request one page of the extended procurement search on zakupki.gov.ru.

    Args:
        okpd2_ids: Mapping whose keys are joined into the ``okpd2Ids`` query
            parameter and whose values are joined into ``okpd2IdsCodes``.
        publish_date_from: Value sent as ``publishDateFrom``.
        publish_date_to: Value sent as ``publishDateTo``.
        page_number: Value sent as ``pageNumber``.
        records_per_page: Value sent as ``recordsPerPage``.
        timeout: Passed to ``requests.get``. Without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The ``requests`` response object; the HTTP status is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    params = {
        "morphology": "on",
        "search-filter": "Дате размещения",
        "pageNumber": page_number,
        "sortDirection": "true",
        "recordsPerPage": records_per_page,
        "showLotsInfoHidden": "false",
        "sortBy": "PUBLISH_DATE",

        # Law
        "fz44": "on", # 44-FZ
        #"fz233": "on", # 223-FZ
        #"fz94": "on", # 94-FZ

        # Procurement stage
        "af": "on", # Applications are being submitted
        "ca": "on", # Commission is working
        "pc": "on", # Procurement completed
        "pa": "on", # Procurement cancelled

        # Price
        "priceFromGeneral": 100000, # Minimum
        #"priceToGeneral": 100000000, # Maximum
        "currencyIdGeneral": '-1', # Currency
        "contractPriceCurrencyId": '-1',

        # Date
        "publishDateFrom": publish_date_from, # Published from
        "publishDateTo": publish_date_to, # Published to

        # OKPD2
        "okpd2Ids": ",".join(okpd2_ids.keys()), # e.g. "8890631,8890630,8890629"
        "okpd2IdsCodes": ",".join(okpd2_ids.values()), # e.g. "62.02.30.000,62.02.20.190,62.02.20.140"

        "OrderPlacementSmallBusinessSubject": "on",
        "OrderPlacementExecutionRequirement": "on",
        "orderPlacement94_0": "0",
        "orderPlacement94_1": "0",
        "orderPlacement94_2": "0",
    }
    return requests.get(
        "https://zakupki.gov.ru/epz/order/extendedsearch/results.html",
        headers=headers,
        params=params,
        timeout=timeout,
    )

In [None]:
def parse_procurement(content):
    """Extract the fields of one procurement search-result entry.

    Args:
        content: Parsed element of a single search-result block; it is queried
            with ``find`` for the header, body, price and date sub-blocks.

    Returns:
        dict with ``reg_number``, ``href``, ``stage``, ``law``,
        ``description``, ``organization``, ``organization_href`` and
        ``start_price``, plus one entry per date row: ``start_time``,
        ``update_time`` or ``end_time`` for the known labels. A date row with
        an unrecognised label is stored under the label text itself instead of
        raising ``KeyError``, so one unexpected entry cannot abort a scrape.
    """
    header = content.find("div", class_="registry-entry__header")
    body = content.find("div", class_="registry-entry__body")

    number_link = header.find("div", class_="registry-entry__header-mid__number").a
    stage = header.find("div", class_="registry-entry__header-mid__title")
    law = header.find("div", class_="col-9 p-0 registry-entry__header-top__title text-truncate")
    description = body.find("div", class_="registry-entry__body-value")
    organization = body.find("div", class_="registry-entry__body-href")
    price = content.find("div", class_="price-block__value")

    # Key order is kept stable because it becomes the CSV column order.
    procurement = {
        "reg_number": number_link.text.strip().strip("№ "),
        "href": number_link["href"],
        "stage": stage.text.strip(),
        "law": law.text.strip(),
        "description": description.text.strip(),
        "organization": organization.text.strip(),
        "organization_href": organization.a["href"],
        # Non-breaking spaces inside the price are replaced by ordinary spaces.
        "start_price": price.text.strip().replace("\xa0", " "),
    }

    title_to_name = {
        "Размещено": "start_time",
        "Обновлено": "update_time",
        "Окончание подачи заявок": "end_time"
    }
    data = content.find("div", class_="data-block mt-auto")
    titles = data.find_all("div", class_="data-block__title") if data is not None else []
    for title in titles:
        label = title.text.strip()
        # The value is the sibling block that shares the title's parent.
        value = title.parent.find("div", class_="data-block__value", recursive=False).text.strip()
        procurement[title_to_name.get(label, label)] = value

    return procurement


def parse_procurements(content):
    """Parse a search-results page into a list of procurement dicts.

    Only the search-result entry blocks are parsed out of ``content``; each
    one is handed to ``parse_procurement``.
    """
    entry_filter = SoupStrainer("div", class_="search-registry-entry-block box-shadow-search-input")
    soup = BeautifulSoup(content, "html.parser", parse_only=entry_filter)
    procurements = []
    for entry in soup:
        procurements.append(parse_procurement(entry))
    return procurements

In [None]:
def select_procurements(procurements, okpd2, publish_date_from, publish_date_to, min_count):
    """Page through the search results for one OKPD2 entry, filling ``procurements``.

    Generator: after each page that added new entries it yields
    ``len(procurements) / min_count``. ``procurements`` is updated in place,
    keyed by ``reg_number``, and every stored entry is tagged with
    ``okpd2[0]``. Paging stops when ``min_count`` entries are collected, a
    request fails, a page is empty, or a page adds nothing new.
    """
    okpd2_id = okpd2[0]
    okpd2_ids = {okpd2_id: okpd2[1]}
    page_number = 0
    while len(procurements) < min_count:
        page_number += 1
        time.sleep(1)  # throttle: one request per second
        response = request_procurements(okpd2_ids, publish_date_from, publish_date_to, page_number)
        if not response.ok:
            log_response(response)
            return

        page_procurements = parse_procurements(response.content)
        if not page_procurements:
            return

        for procurement in page_procurements:
            procurement["okpd2"] = okpd2_id
        by_number = dict((p["reg_number"], p) for p in page_procurements)

        count_before = len(procurements)
        procurements.update(by_number)
        count = len(procurements)
        print(f"okpd2={okpd2_id}, page={page_number}, count={count}")
        if count <= count_before:
            # The page contained only already-known entries.
            return
        yield count / min_count


def get_procurements(okpd2, publish_date_from, publish_date_to, limit):
    """Collect up to ``limit`` procurements for one OKPD2 entry.

    Runs ``select_procurements`` under ``log`` and returns the collected
    entries as a list, truncated to ``limit`` items.
    """
    collected = {}
    pages = select_procurements(collected, okpd2, publish_date_from, publish_date_to, limit)
    log(pages)
    found = list(collected.values())
    return found[:limit]

In [None]:
def concat(root, verify_integrity=True):
    """Read every CSV file directly inside ``root`` and stack them row-wise.

    Args:
        root: Directory to scan; a trailing slash is not required.
        verify_integrity: Forwarded to ``pandas.concat``. Because the index is
            rebuilt with ``ignore_index=True`` this does not detect duplicated
            rows; callers must check key uniqueness themselves.

    Returns:
        A DataFrame with a fresh 0..n-1 index, or ``None`` when ``root``
        contains no CSV files.

    Raises:
        FileNotFoundError: If ``root`` does not exist.
    """
    # Skip sub-directories and non-CSV entries (e.g. ``.ipynb_checkpoints``),
    # and sort the names so the row order does not depend on os.listdir().
    csv_paths = sorted(
        os.path.join(root, fname)
        for fname in os.listdir(root)
        if fname.lower().endswith(".csv")
    )
    dfs = [pd.read_csv(path) for path in csv_paths if os.path.isfile(path)]
    if not dfs:
        return None
    return pd.concat(dfs, axis=0, join="outer", ignore_index=True, verify_integrity=verify_integrity)

In [None]:
# Search for procurements
publish_date_from = "01.01.2021"
publish_date_to = "01.01.2022"
# OKPD2 entries to scrape, as {search id: classifier code}.
# Uncomment the entries that should be collected; with none enabled the loop
# below does nothing.
okpd2_ids = {
    #"8890631": "62.02.30.000",
    #"8890630": "62.02.20.190",
    #"8890629": "62.02.20.140",
    #"8890628": "62.02.20.130",
    #"8890627": "62.02.20.120",
    #"8890626": "62.02.20.110",
    #"8890639": "62.09.20.120",
    #"8890638": "62.09.20.110",
    #"8890644": "63.11.19.000",
    #"8890643": "63.11.13.000",
    #"8890641": "63.11.11.000",
    #"8890640": "62.09.20.190",
    #"8890622": "62.01.12.000",
    #"8890621": "62.01.11.000",
    #"8890648": "63.12.10.000"
}
# Create the output directory up front so a long scrape cannot fail at the
# very end because the CSV has nowhere to be written.
os.makedirs("./procurements/", exist_ok=True)
for okpd2 in okpd2_ids.items():
    procurements = get_procurements(okpd2, publish_date_from, publish_date_to, 1_000_000)
    df = pd.DataFrame.from_records(procurements)
    # One CSV per OKPD2 entry, named after its id and the date range.
    path = f"./procurements/{okpd2[0]}_{publish_date_from}-{publish_date_to}.csv"
    df.to_csv(path, index=False)
print("Successfully!")

In [None]:
# Build the combined procurements dataset from the per-OKPD2 CSV files.
# NOTE: concat() returns None when ./procurements/ holds no files, in which
# case the .to_csv call below raises AttributeError.
concat("./procurements/").to_csv("./procurements.csv", index=False)

In [None]:
# Reload the combined procurements dataset from disk.
# NOTE(review): read_csv infers column dtypes; if reg_number values can start
# with zeros they would be parsed as integers here — confirm against the data.
df_procurements = pd.read_csv("./procurements.csv")
# The uniqueness check below is disabled. Each per-OKPD2 file is deduplicated
# on its own (get_procurements keys its results by reg_number), but concat()
# does not deduplicate across files, so the same reg_number may appear more
# than once here — presumably why the assertion is commented out.
#FIXME: assert df_procurements.reg_number.nunique() == len(df_procurements)
df_procurements.info()

In [None]:
# Preview the first rows of the procurements dataset.
df_procurements.head()

In [None]:
def request_contracts(order_number, timeout=30):
    """Search zakupki.gov.ru for the contracts that belong to one procurement.

    Args:
        order_number: Procurement number sent as the ``orderNumber`` parameter.
        timeout: Passed to ``requests.get``. Without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The ``requests`` response object; the HTTP status is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    params = {
        "searchString": "",
        "orderNumber": order_number,
        "openMode": "USE_DEFAULT_PARAMS",
        "fz44": "on",
        "priceFrom": 0,
        "priceTo": 200_000_000_000,
        # Base64 of the URL-encoded list "0%2C1%2C2%2C3" (i.e. 0,1,2,3).
        "contractStageList": r"MCUyQzElMkMyJTJDMw==",
        "budgetaryFunds": "on",
        "extraBudgetaryFunds": "on"
    }
    return requests.get(
        "https://zakupki.gov.ru/epz/contract/search/results.html",
        headers=headers,
        params=params,
        timeout=timeout,
    )

In [None]:
def parse_contract(content):
    """Extract the fields of one contract search-result entry.

    Returns a dict with ``reestr_number``, ``href``, ``stage``,
    ``description`` and ``price`` read from the entry's header, body and
    price blocks.
    """
    header = content.find("div", class_="registry-entry__header")
    body = content.find("div", class_="registry-entry__body")

    number_link = header.find("div", class_="registry-entry__header-mid__number").a
    stage_block = header.find("div", class_="registry-entry__header-mid__title")
    description_block = body.find("div", class_="registry-entry__body-value")
    price_block = content.find("div", class_="price-block__value")

    # Key order is significant: it becomes the CSV column order.
    return {
        "reestr_number": number_link.text.strip().strip("№ "),
        "href": number_link["href"],
        "stage": stage_block.text.strip(),
        "description": description_block.text.strip(),
        # Non-breaking spaces inside the price are replaced by ordinary spaces.
        "price": price_block.text.strip().replace("\xa0", " "),
    }


def parse_contracts(content):
    """Parse a contract search-results page into a list of contract dicts.

    Only the search-result entry blocks are parsed out of ``content``; each
    one is handed to ``parse_contract``.
    """
    entry_filter = SoupStrainer("div", class_="search-registry-entry-block box-shadow-search-input")
    soup = BeautifulSoup(content, "html.parser", parse_only=entry_filter)
    contracts = []
    for entry in soup:
        contracts.append(parse_contract(entry))
    return contracts

In [None]:
def select_contracts(df_procurements):
    """Fetch the contracts of every procurement in ``df_procurements``.

    One search request is made per row (throttled to one per second) using the
    row's ``reg_number``. Each contract found is tagged with that number as
    ``order_number``. Fetching stops at the first failed request; contracts
    collected up to that point are still returned.

    Returns:
        list of contract dicts, unique by ``reestr_number``.
    """
    def select(contracts):
        total = len(df_procurements)
        # Count from 1 so the reported fraction covers the row just finished
        # (the previous ``i / total`` showed 0% after the first row was done).
        for done, (_, procurement) in enumerate(df_procurements.iterrows(), start=1):
            order_number = procurement["reg_number"]
            time.sleep(1)
            response = request_contracts(order_number)
            if not response.ok:
                log_response(response)
                break

            procurement_contracts = parse_contracts(response.content)
            if procurement_contracts:
                for contract in procurement_contracts:
                    contract["order_number"] = order_number
                contracts.update({c["reestr_number"]: c for c in procurement_contracts})
            yield done / total

    contracts = {}
    log(select(contracts), True)
    return list(contracts.values())


def select_and_cache_contracts(df_procurements, prefix):
    """Fetch contracts for not-yet-cached procurements and save them as a CSV.

    Only procurements whose ``stage`` is "Определение поставщика завершено"
    (supplier determination completed) are considered. A procurement counts
    as cached when its ``reg_number`` already appears as ``order_number`` in
    one of the CSV files under ``./contracts/``.

    Args:
        df_procurements: DataFrame with ``reg_number`` and ``stage`` columns.
        prefix: Leading part of the new cache file name.

    Returns:
        DataFrame of the newly fetched contracts, or ``None`` when nothing new
        was found (no file is written in that case).
    """
    cache_dir = "./contracts/"
    # On the very first run the cache directory does not exist yet; without
    # this both the listing in concat() and the final to_csv() would fail.
    os.makedirs(cache_dir, exist_ok=True)

    can_contracts = df_procurements.stage == "Определение поставщика завершено"
    df_procurements = df_procurements[can_contracts]
    df_uncached_procurements = df_procurements

    df_cached_contracts = concat(cache_dir, False)
    if df_cached_contracts is not None:
        cached_numbers = df_cached_contracts["order_number"].unique()
        # Vectorised membership test instead of scanning the array per row.
        is_cached = df_procurements["reg_number"].isin(cached_numbers)
        df_uncached_procurements = df_procurements[~is_cached]
    print(f"procurements: count={len(df_procurements)}, uncached={len(df_uncached_procurements)}")

    new_contracts = select_contracts(df_uncached_procurements)
    print(f"contracts: new={len(new_contracts)}")
    if not new_contracts:
        return None

    df = pd.DataFrame.from_records(new_contracts)
    now = datetime.datetime.now()
    # The timestamp keeps repeated runs over the same chunk from overwriting
    # each other's cache files.
    path = f"{cache_dir}{prefix}{now.month}.{now.day}_{now.hour}_{now.minute}_{now.second}_{now.microsecond}.csv"
    df.to_csv(path, index=False)
    return df

In [None]:
# Search for contracts and save them
def collect_contracts(start, stop, step):
    """Fetch and cache contracts for rows ``start``..``stop - 1`` in chunks.

    Reads the module-level ``df_procurements`` frame and hands each slice of
    at most ``step`` rows to ``select_and_cache_contracts``. The cache-file
    prefix is ``"{first}-{last}_"`` with inclusive row positions.

    Raises:
        ValueError: If ``step`` is not positive (the loop could never finish).
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")
    while start < stop:
        # Clamp the final chunk so it never runs past ``stop``.
        end = min(start + step, stop)
        print(start, end-1)
        select_and_cache_contracts(df_procurements[start:end], f"{start}-{end-1}_")
        start = end


# Process procurement rows 0..999 in chunks of 50; each chunk is cached in its
# own CSV file by select_and_cache_contracts.
collect_contracts(0, 1000, 50)
print("Successfully!")

In [None]:
# Build the combined contracts dataset from the cached chunk files.
# NOTE: concat() returns None when ./contracts/ holds no files, in which case
# the .to_csv call below raises AttributeError.
concat("./contracts/").to_csv("./contracts.csv", index=False)

In [None]:
# Reload the combined contracts dataset and check that every contract
# (reestr_number) appears exactly once before it is used further.
df_contracts = pd.read_csv("./contracts.csv")
assert df_contracts.reestr_number.nunique() == len(df_contracts)
df_contracts.info()

In [None]:
# Preview the first rows of the contracts dataset.
df_contracts.head()

In [None]:
def request_contract_info(href, timeout=30):
    """Download the page of one contract from zakupki.gov.ru.

    Args:
        href: Site-relative link; it is appended to ``https://zakupki.gov.ru``.
        timeout: Passed to ``requests.get``. Without it a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The ``requests`` response object; the HTTP status is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    return requests.get("https://zakupki.gov.ru"+href, headers=headers, timeout=timeout)

In [None]:
def parse_contract_supplier(prefix, content):
    """Extract a supplier's name and INN (taxpayer number) from one table body.

    Args:
        prefix: String prepended to the produced keys.
        content: Parsed table-body element; only its first ``<td>`` is read.

    Returns:
        dict that may contain ``prefix + "organization"`` (the first non-blank
        string of the cell) and ``prefix + "inn"``. Anything that cannot be
        found is simply left out; an element without a ``<td>`` yields ``{}``.
    """
    def parse_supplier_inn(cell):
        # The INN value is the <span> that follows the "ИНН:" label span.
        for section in cell.find_all("section", class_="section", recursive=False):
            for span in section.find_all("span"):
                if span.text.strip().startswith("ИНН:"):
                    value = span.find_next_sibling("span")
                    if value is not None:
                        return value.text.strip()
        return None

    supplier = {}
    # Elements without a <td> (or plain text nodes) carry no supplier data.
    cell = getattr(content, "td", None)
    if cell is None:
        return supplier

    organization = next(cell.stripped_strings, None)
    if organization is not None:
        supplier[prefix+"organization"] = organization

    inn = parse_supplier_inn(cell)
    if inn:
        supplier[prefix+"inn"] = inn
    return supplier


def parse_contract_info(content):
    """Collect supplier fields from every ``tableBlock__body`` table body.

    The n-th table body (counting from 1) contributes keys prefixed with
    ``supplier{n}_``, as produced by ``parse_contract_supplier``.
    """
    table_filter = SoupStrainer("tbody", class_="tableBlock__body")
    soup = BeautifulSoup(content, "html.parser", parse_only=table_filter)
    contract_info = {}
    for position, supplier_content in enumerate(soup, start=1):
        supplier_fields = parse_contract_supplier(f"supplier{position}_", supplier_content)
        contract_info.update(supplier_fields)
    return contract_info

In [None]:
def select_extended_contracts(df_contracts):
    """Fetch supplier details for every contract in ``df_contracts``.

    One page is requested per row (throttled to one per second) using the
    row's ``href``. Fetching stops at the first failed request; details
    collected up to that point are still returned.

    Returns:
        dict mapping ``reestr_number`` to the supplier fields parsed from the
        contract page. Contracts that were not reached are absent.
    """
    def select(extended_contracts):
        total = len(df_contracts)
        # Count from 1 so the reported fraction covers the row just finished
        # (the previous ``i / total`` showed 0% after the first row was done).
        for done, (_, contract) in enumerate(df_contracts.iterrows(), start=1):
            time.sleep(1)
            response = request_contract_info(contract["href"])
            if not response.ok:
                log_response(response)
                break

            extended_contracts[contract["reestr_number"]] = parse_contract_info(response.content)
            yield done / total

    extended_contracts = {}
    log(select(extended_contracts), True)
    return extended_contracts


def select_and_cache_extended_contracts(df_contracts, prefix):
    """Fetch supplier details for not-yet-cached contracts and save them.

    A contract counts as cached when its ``reestr_number`` already appears in
    one of the CSV files under ``./extended_contracts/``. Only contracts whose
    details were actually fetched in this call are written. Writing every row
    of the chunk would duplicate already cached contracts and would mark
    contracts that were never fetched (e.g. after an aborted run) as cached,
    so they would never be retried.

    Args:
        df_contracts: DataFrame with ``reestr_number`` and ``href`` columns.
        prefix: Leading part of the new cache file name.

    Returns:
        None. When nothing new was fetched, no file is written.
    """
    cache_dir = "./extended_contracts/"
    # On the very first run the cache directory does not exist yet; without
    # this both the listing in concat() and the final to_csv() would fail.
    os.makedirs(cache_dir, exist_ok=True)

    df_uncached_contracts = df_contracts
    df_cached_ex_contracts = concat(cache_dir, False)
    if df_cached_ex_contracts is not None:
        cached_numbers = df_cached_ex_contracts["reestr_number"].unique()
        # Vectorised membership test instead of scanning the array per row.
        is_cached = df_contracts["reestr_number"].isin(cached_numbers)
        df_uncached_contracts = df_contracts[~is_cached]
    print(f"contracts: count={len(df_contracts)}, uncached={len(df_uncached_contracts)}")

    new_ex_contracts = select_extended_contracts(df_uncached_contracts)
    print(f"extended contracts: new={len(new_ex_contracts)}")
    if not new_ex_contracts:
        return

    contracts_by_number = {
        c["reestr_number"]: c for c in df_uncached_contracts.to_dict(orient="records")
    }
    # Merge the base contract columns with the fetched supplier fields, for
    # the fetched contracts only.
    records = [
        {**contracts_by_number[number], **supplier_info}
        for number, supplier_info in new_ex_contracts.items()
    ]

    df = pd.DataFrame.from_records(records)
    now = datetime.datetime.now()
    path = f"{cache_dir}{prefix}{now.month}.{now.day}_{now.hour}_{now.minute}_{now.second}_{now.microsecond}.csv"
    df.to_csv(path, index=False)

In [None]:
# Search for additional contract information and save it
def collect_extended_contracts(df_contracts, start, stop, step):
    """Fetch and cache supplier details for rows ``start``..``stop - 1`` in chunks.

    Each slice of at most ``step`` rows of ``df_contracts`` is handed to
    ``select_and_cache_extended_contracts``. The cache-file prefix is
    ``"{first}-{last}_"`` with inclusive row positions.

    Raises:
        ValueError: If ``step`` is not positive (the loop could never finish).
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")
    while start < stop:
        # Clamp the final chunk so it never runs past ``stop``.
        end = min(start + step, stop)
        print(start, end-1)
        select_and_cache_extended_contracts(df_contracts[start:end], f"{start}-{end-1}_")
        start = end


# Process contract rows 0..999 in chunks of 50; each chunk is cached in its
# own CSV file by select_and_cache_extended_contracts.
collect_extended_contracts(df_contracts, 0, 1000, 50)
print("Successfully!")

In [None]:
# Build the extended contracts dataset from the cached chunk files.
# NOTE: concat() returns None when ./extended_contracts/ holds no files, in
# which case the .to_csv call below raises AttributeError.
concat("./extended_contracts/").to_csv("./extended_contracts.csv", index=False)

In [None]:
# Reload the extended contracts dataset and check that every contract
# (reestr_number) appears exactly once.
df_extended_contracts = pd.read_csv("extended_contracts.csv")
assert df_extended_contracts.reestr_number.nunique() == len(df_extended_contracts)
df_extended_contracts.info()

In [None]:
# Preview the first rows of the extended contracts dataset.
df_extended_contracts.head()

In [None]:
#TODO: Merge procurements.csv and extended_contracts.csv into a single zakupki.csv dataset,
# joining on the reg_number and order_number columns