# Parsing House Management Data from GosUslugi portal

## This code parses house-level data from the GosUslugi government portal. It uses the portal's API together with other manually developed tools to collect the data. The collected data is then used in a regression analysis comparing the effectiveness of SOE (state-owned enterprise) and non-SOE house management companies.

In [None]:
pip install gosuslugi-api

In [32]:
#sw0 = HTTPClient()
#sw1 = GosUslugiAPIClient(keep_alive=True)
#sw1.get_organizations('3507007915')

In [33]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [34]:
import json
import time
import logging
from io import BytesIO
from typing import Union
from urllib.parse import urlencode
#from zipfile import Zipfile

In [35]:
import requests
from openpyxl import load_workbook

In [36]:
from gosuslugi_api.consts import REGION_CODES_AND_NAMES
from gosuslugi_api.exceptions import RegionCodeIsAbsentError

In [37]:
logger = logging.getLogger(__name__)

In [38]:
def _get_body_for_logging(body: Union[bytes, str]) -> str:
    try:
        if isinstance(body,bytes):
            return(b' BODY:' + body).decode('utf-8')
        elif isinstance(body, str):
            return 'BODY: ' + body
        else:
            return ''
    except UnicodeDecodeError:
        return ''

In [39]:
def _get_duration_for_logging(duration: str) -> str:
    if duration is not None:
        return '{0:.6f}s'.format(duration)
    else:
        return ''

In [40]:
class HTTPClient:
    """Small HTTP client on top of ``requests`` that logs every exchange.

    Args:
        timeout: Default timeout (seconds) passed to ``Session.send``; can be
            overridden per call with the ``timeout`` keyword.
        keep_alive: When true, one ``requests.Session`` is created lazily and
            reused for all requests; otherwise each request gets its own
            session, which is closed after the request.
        default_headers: Headers added to every request; per-call ``headers``
            take precedence.
    """

    GET_HTTP_METHOD = 'GET'
    POST_HTTP_METHOD = 'POST'
    PATCH_HTTP_METHOD = 'PATCH'
    PUT_HTTP_METHOD = 'PUT'

    BODY_LESS_METHODS = [GET_HTTP_METHOD]
    # Method and URL are separated by a space. The body and duration
    # fragments come from the module-level helpers.
    LOG_REQUEST_TEMPLATE = '%(method)s %(url)s%(request_body)s%(duration)s'
    # Spelled out instead of extending LOG_REQUEST_TEMPLATE so that the
    # duration is printed once, at the end of the line.
    LOG_RESPONSE_TEMPLATE = (
        '%(method)s %(url)s%(request_body)s'
        ' - HTTP %(status_code)s%(response_body)s%(duration)s')

    def __init__(self, timeout=3, keep_alive=False, default_headers=None):
        self.timeout = timeout
        self.keep_alive = keep_alive
        self.default_headers = default_headers or {}
        self._session = None

    def _log_request(
            self, method, url, body, duration=None, log_method=logger.info):
        """Log an outgoing request (or a request that failed without a response)."""
        message_params = {
            'method': method,
            'url': url,
            'request_body': _get_body_for_logging(body),
            'duration': _get_duration_for_logging(duration),
        }
        log_method(self.LOG_REQUEST_TEMPLATE, message_params)

    def _log_response(self, response, duration, log_method=logger.info):
        """Log a received response together with the request that caused it."""
        message_params = {
            'method': response.request.method,
            'url': response.request.url,
            'request_body': _get_body_for_logging(response.request.body),
            # The key must match %(status_code)s in LOG_RESPONSE_TEMPLATE.
            'status_code': response.status_code,
            'response_body': _get_body_for_logging(response.content),
            'duration': _get_duration_for_logging(duration),
        }
        log_method(self.LOG_RESPONSE_TEMPLATE, message_params)

    def _make_request(self, method, url, **kwargs) -> requests.Response:
        """Send one request and log the request, the response and the timing.

        Args:
            method: HTTP method name.
            url: Target URL.
            **kwargs: ``timeout`` and ``headers`` are handled here; everything
                else is forwarded to ``requests.Request``.

        Returns:
            The ``requests.Response``; error statuses are logged, not raised.

        Raises:
            requests.exceptions.RequestException: re-raised after logging.
        """
        timeout = kwargs.pop('timeout', self.timeout)
        headers = self.default_headers.copy()
        # ``or {}`` tolerates an explicit headers=None.
        headers.update(kwargs.pop('headers', None) or {})

        session = self.session
        try:
            request = requests.Request(method, url, headers=headers, **kwargs)
            prepared_request = request.prepare()
            self._log_request(method, url, prepared_request.body)
            start_time = time.monotonic()
            try:
                response = session.send(prepared_request, timeout=timeout)
            except requests.exceptions.RequestException as e:
                duration = time.monotonic() - start_time
                # Compare with None: a Response is falsy for 4xx/5xx statuses,
                # so a plain truth test would skip exactly the error responses.
                if e.response is not None:
                    self._log_response(
                        e.response, duration=duration,
                        log_method=logger.error)
                else:
                    self._log_request(
                        method, url, prepared_request.body,
                        duration=duration, log_method=logger.exception)
                raise
            duration = time.monotonic() - start_time
            if response.status_code >= 400:
                log_method = logger.error
            else:
                log_method = logger.debug
            self._log_response(
                response, duration=duration, log_method=log_method)
            return response
        finally:
            # Also runs when preparing the request fails, so a one-off
            # session is never left open.
            if not self.keep_alive:
                session.close()

    @property
    def session(self) -> requests.Session:
        """Return the shared session (keep-alive) or a fresh one-off session."""
        if self.keep_alive:
            if not self._session:
                self._session = requests.Session()
            return self._session
        return requests.Session()

    def get(self, url, params=None, **kwargs) -> requests.Response:
        """Send a GET request, appending ``params`` to the URL as a query string."""
        if params:
            # Extend an existing query string instead of adding a second '?'.
            separator = '&' if '?' in url else '?'
            url_with_query_params = url + separator + urlencode(params)
        else:
            url_with_query_params = url

        return self._make_request(
            self.GET_HTTP_METHOD, url_with_query_params, **kwargs)

    def post(self, url, **kwargs) -> requests.Response:
        """Send a POST request; ``kwargs`` are forwarded to ``_make_request``."""
        return self._make_request(self.POST_HTTP_METHOD, url, **kwargs)


In [62]:
class GosUslugiAPIClient:
    """Client for several public JSON endpoints of dom.gosuslugi.ru.

    All requests are sent through an ``HTTPClient`` so that they share its
    timeout, keep-alive and logging behaviour.

    Args:
        timeout: Request timeout in seconds, passed to ``HTTPClient``.
        keep_alive: Whether the underlying ``HTTPClient`` reuses one session.
    """

    REGION_CODES_AND_NAMES = REGION_CODES_AND_NAMES

    BASE_URL = 'https://dom.gosuslugi.ru/'
    LICENSE_UID_URL = (
        f'{BASE_URL}licenses/api/rest/services/public/'
        'licenses/region-license-xls/{}')
    HOUSE_CODE_URL = (
        f'{BASE_URL}nsi/api/rest/services/nsi/fias/v4/houses?'
        'houseCodes={}&includeDuplicates=false&actual={}')

    ORGANIZATIONS_URL = (
        f'{BASE_URL}ppa/api/rest/services/ppa/'
        'organizations/chooser/search;page=1;itemsPerPage=11')

    ORGANIZATION_URL = (
        f'{BASE_URL}ppa/api/rest/services/ppa/public/organizations'
        '/orgByGuid?organizationGuid={}')

    # NOTE(review): get_home_managements() needs a HOME_MANAGEMENTS_URL
    # template with {page_number} and {elems_per_page} placeholders, but this
    # class does not define one. The endpoint cannot be determined from this
    # file; presumably it should be copied from the upstream gosuslugi_api
    # client - TODO confirm and add it here.

    ORGANIZATION_PAYLOAD_PART_1 = (
        '{"sortCriteriaList":[{"sortedBy":"organizationType",'
        '"ascending":false},'
        '{"sortedBy":"shortName","ascending":true},{"sortedBy":"fullName",'
        '"ascending":true},{"sortedBy":"parentKpp","ascending":true},'
        '{"sortedBy":"kpp","ascending":true}],"organizationStatuses":'
        '{"coll":["REGISTERED"], "operand":"OR"}, "organizationTypes":'
        '{"coll":["B", "L", "A"], "operand": "OR"}, "subordinationOrgTypeList":'
        '{"coll":["HEAD", "BRANCH"], "operand":"OR"}, "commonSearchString":"')
    # Starts with the closing quote of the search string followed by one
    # comma. The former '",",' prefix produced an invalid JSON document.
    ORGANIZATION_PAYLOAD_PART_2 = (
        '","roleConstraints":{"coll":[{"roleCode":"1",'
        '"roleStatuses":["APPROVED"]},{"roleCode":"19","roleStatuses":'
        '["APPROVED"]},{"roleCode":"20", "roleStatuses":["APPROVED"]},'
        '{"roleCode":"22", "roleStatuses":["APPROVED"]},'
        '{"roleCode":"21", "roleStatuses":["APPROVED"]}],"operand":"OR"}}')

    def __init__(self, timeout=5, keep_alive=False):
        self._region_codes = set(self.REGION_CODES_AND_NAMES)
        self._http_client = HTTPClient(timeout=timeout, keep_alive=keep_alive)

    def _get_response_body(self, response: requests.Response):
        """Return the decoded JSON body of ``response``.

        Returns:
            The parsed JSON, or ``''`` when the response has no content.

        Raises:
            requests.exceptions.HTTPError: via ``raise_for_status`` for
                error statuses.
        """
        if response.status_code >= 400:
            response.raise_for_status()
            return None
        if not response.content:
            return ''
        return response.json()

    def get_organizations(self, inn):
        """Search organizations using ``inn`` as the common search string.

        Args:
            inn: Value placed into ``commonSearchString``; converted to str.

        Returns:
            The decoded JSON response body.
        """
        # json.dumps escapes quotes/backslashes; the surrounding quotes it
        # adds are dropped because the payload parts already contain them.
        search_string = json.dumps(str(inn))[1:-1]
        payload = (
            self.ORGANIZATION_PAYLOAD_PART_1
            + search_string
            + self.ORGANIZATION_PAYLOAD_PART_2)

        headers = {'Content-Type': 'application/json'}
        response = self._http_client.post(
            self.ORGANIZATIONS_URL, data=payload, headers=headers)
        return self._get_response_body(response)

    def get_organization(self, guid):
        """Fetch one organization by its guid and return the decoded JSON."""
        url = self.ORGANIZATION_URL.format(guid)
        return self._get_response_body(self._http_client.get(url))

    def get_actual_houses(self, house_code):
        """Fetch houses for ``house_code`` with the ``actual`` flag set to true."""
        url = self.HOUSE_CODE_URL.format(house_code, 'true')
        return self._get_response_body(self._http_client.get(url))

    def get_home_managements(self, org_guid, start_page=1, per_page=1):
        """Yield the decoded JSON of each result page for an organization.

        This is a generator: no request is sent until it is iterated.

        Args:
            org_guid: Organization guid sent as ``organizationGuid``.
            start_page: Number of the first page to request.
            per_page: Number of elements requested per page.

        Yields:
            The decoded body of ``start_page`` and then of each following
            page.

        Raises:
            ValueError: if ``per_page`` is smaller than 1.
            AttributeError: if ``HOME_MANAGEMENTS_URL`` is not defined.
        """
        if per_page < 1:
            raise ValueError('per_page must be a positive integer')
        url_template = getattr(self, 'HOME_MANAGEMENTS_URL', None)
        if url_template is None:
            raise AttributeError(
                f'{type(self).__name__}.HOME_MANAGEMENTS_URL is not defined; '
                'set it to the paginated home-managements URL template '
                '(placeholders: page_number, elems_per_page)')

        payload = json.dumps({'organizationGuid': org_guid, 'calcCount': True})
        headers = {'Content-Type': 'application/json'}

        def fetch_page(page_number):
            # Go through the shared HTTP client so the configured timeout,
            # keep-alive and logging also apply to these requests.
            url = url_template.format(
                page_number=page_number, elems_per_page=per_page)
            response = self._http_client.post(
                url, data=payload, headers=headers)
            return self._get_response_body(response)

        json_body = fetch_page(start_page)
        yield json_body

        # An empty response body is returned as '' and has no 'total'.
        total = json_body.get('total') if isinstance(json_body, dict) else None
        # 'total' is taken to be the number of matching objects (the name
        # objects_number suggests so) - TODO confirm against the API. The
        # page count is therefore ceil(total / per_page); with the default
        # per_page=1 and start_page=1 this requests the same pages as before.
        objects_number = total or 0
        pages_number = -(-objects_number // per_page)
        for page_num in range(start_page + 1, pages_number + 1):
            yield fetch_page(page_num)
   

    

In [95]:
GO1=GosUslugiAPIClient()
# works: the guid is taken from dom.gosuslugi.ru/#!/licensee-houses-info/
#GO1.get_organization('c3f46dba-f446-4d07-9122-bcc566434c93')

In [57]:
GO1=GosUslugiAPIClient()
# works:
#GO1.get_actual_houses(house_code='5a541513-b462-449f-98f4-7dd660b49c9e')

In [93]:
GO1=GosUslugiAPIClient()
# works:
#GO1.get_actual_houses(house_code='e4c272a8-63c3-45df-aa7f-76fdd1b622f9')

In [99]:
GO1=GosUslugiAPIClient()
# Organization search by INN (call left disabled):
#GO1.get_organizations('3435120482')


In [67]:
GO1=GosUslugiAPIClient()
# NOTE: get_home_managements() is a generator function, so this call only
# creates a generator object (see the cell output below). No request is sent
# until it is iterated, e.g. with next(...) or list(...).
GO1.get_home_managements('6d2367eb-92dc-493e-b38e-902b9ce32269')

<generator object GosUslugiAPIClient.get_home_managements at 0x7f956a743900>

We check all licence-holding functions by management companies:

In [68]:
# Identifier substituted into the region-license-xls URL in the next cell
# (presumably a licence UID - TODO confirm).
LIZ_UID='050000924'


In [69]:

# Build the region-license-xls endpoint URL for the chosen identifier.
license_uid_url = (
    'https://dom.gosuslugi.ru/licenses/api/rest/services/public/'
    f'licenses/region-license-xls/{LIZ_UID}'
)


In [70]:
# Use a timeout so the cell cannot hang on a stalled connection, and fail on
# an HTTP error status instead of trying to parse an error page as JSON.
_license_response = requests.get(license_uid_url, timeout=10)
_license_response.raise_for_status()
Lic_results = _license_response.json()


We also parse directly:

In [83]:
# NOTE(review): everything after '#' is a URL fragment, which HTTP clients do
# not send to the server, so a plain GET of this URL requests only
# https://dom.gosuslugi.ru/ - verify that the downloaded page actually
# contains the house data.
url='https://dom.gosuslugi.ru/#!/house-view?guid=a5893115-8e9d-4b4c-9fd6-9aa07fd59053&typeCode=1'

In [84]:
# A timeout keeps the cell from hanging indefinitely if the portal stalls.
response = requests.get(url, timeout=10)
#print(response.text)

In [85]:
from bs4 import BeautifulSoup

# Download the page and keep a local copy for offline inspection.
r = requests.get(url, timeout=10)
# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent the Cyrillic text of the page and would raise UnicodeEncodeError.
with open('test.html', 'w', encoding='utf-8') as output_file:
    output_file.write(r.text)
    


...and indirectly via the gosuslugi-api client (as suggested by Greg Eremeev, the developer of the API client):

In [101]:
from gosuslugi_api.clients import GosUslugiAPIClient

In [102]:
# After the import in the previous cell, GosUslugiAPIClient refers to the
# class from gosuslugi_api.clients, not to the class defined in this notebook.
client=GosUslugiAPIClient()

In [116]:
#client.get_organizations('7706724054')

In [120]:
#client.get_organization('dfef9591-f591-47ac-9d65-794a2f8114c5')

In [127]:
# Look up a single organization by its guid.
client.get_organization('4105ee91-9191-4840-b788-f3d4db7c5f1b')

{'guid': '4105ee91-9191-4840-b788-f3d4db7c5f1b',
 'createDate': None,
 'lastEventDate': '2021-03-31 08:51:13',
 'orgOid': '1063040229',
 'fullName': 'ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ "УПРАВЛЯЮЩАЯ КОМПАНИЯ ЮГСТРОЙИНВЕСТ"',
 'shortName': 'ООО "УК ЮГСТРОЙИНВЕСТ"',
 'ogrn': '1173443023830',
 'ogrnip': None,
 'inn': '3435132150',
 'kpp': '343501001',
 'okopf': {'guid': 'dc79475c-20f2-4291-aae7-43838ad8230b',
  'code': '12300',
  'rootEntityGuid': None,
  'actual': True,
  'lastUpdateDate': '2015-06-12 19:41:30',
  'createDate': '2015-06-12 19:41:30',
  'name': 'Общества с ограниченной ответственностью',
  'parent': {'guid': '659a5227-0977-493a-a1a0-5695a088bf2c',
   'code': '12000',
   'rootEntityGuid': None,
   'actual': True,
   'lastUpdateDate': '2015-02-06 11:26:36',
   'createDate': '2015-02-06 11:26:36',
   'name': 'Хозяйственные общества',
   'parent': {'guid': '50f87739-cd50-42bd-b059-c262040919a6',
    'code': '10000',
    'rootEntityGuid': None,
    'actual': True,
    'la

In [121]:
client.get_home_management('a5893115-8e9d-4b4c-9fd6-9aa07fd59053')
# argument: the guid from the URL of the house card page = home_management_guid

{'guid': 'a5893115-8e9d-4b4c-9fd6-9aa07fd59053',
 'rootGuid': None,
 'versionNumber': None,
 'lastUpdateUnixTime': None,
 'lastUpdateDate': None,
 'createDate': None,
 'readOnly': None,
 'active': None,
 'asyncProcessing': None,
 'asyncProcessType': None,
 'status': 'APPROVED',
 'cancellationReasonCode': None,
 'cancellationComment': None,
 'cancellationDate': None,
 'address': {'region': {'guid': 'af757d44-3438-4040-9b68-d95099318998',
   'code': None,
   'rootEntityGuid': None,
   'actual': True,
   'lastUpdateDate': None,
   'createDate': None,
   'aoGuid': 'da051ec8-da2e-4a66-b542-473b8d221ab4',
   'aoLevel': 1,
   'postalCode': None,
   'formalName': 'Волгоградская',
   'offName': 'Волгоградская',
   'shortName': 'обл',
   'parentGuid': None,
   'oktmo': None,
   'regionCode': '34',
   'autoCode': None,
   'areaCode': None,
   'cityCode': None,
   'ctarCode': None,
   'placeCode': None,
   'planCode': None,
   'streetCode': None,
   'extrCode': None,
   'sextCode': None,
   'updat

In [128]:
client.get_actual_houses('e4c272a8-63c3-45df-aa7f-76fdd1b622f9')
# argument: the address identification code from the house card = house code

[{'guid': 'e4c272a8-63c3-45df-aa7f-76fdd1b622f9',
  'code': None,
  'rootEntityGuid': None,
  'actual': True,
  'lastUpdateDate': '24.04.2021 16:39',
  'createDate': '23.04.2021 16:39',
  'houseGuid': 'e4c272a8-63c3-45df-aa7f-76fdd1b622f9',
  'aoGuid': '1b3c6f13-85c7-4836-867c-d2b563e64db5',
  'postalCode': '404133',
  'houseNumber': '170',
  'buildingNumber': None,
  'structNumber': None,
  'additionalName': None,
  'houseCondition': {'classType': 'ru.lanit.hcs.nsi.api.dto.HouseCondition',
   'guid': '2d3ae73e-6c72-4740-9122-9c632d1893a7',
   'code': '2',
   'rootEntityGuid': '2d3ae73e-6c72-4740-9122-9c632d1893a7',
   'actual': True,
   'lastUpdateDate': '06.02.2015 11:19',
   'createDate': '06.02.2015 11:19',
   'houseCondition': 'Исправный'},
  'propertyStateGuid': '4979fb1a-8373-4f99-b87b-0c4ee832da15',
  'oktmo': {'classType': 'ru.lanit.hcs.nsi.api.dto.Oktmo',
   'guid': 'c9d955ce-7599-409d-9813-a9cbe3346d5b',
   'code': '18710000001',
   'rootEntityGuid': None,
   'actual': True,


Once all the required data has been collected, it needs to be exported to Excel format so that it is ready for the regression analysis.