In [28]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import xml.etree.ElementTree as et

In [None]:
# Download the indicator page. The timeout stops the cell from hanging
# indefinitely if the server never answers.
html = requests.get('https://www.fedstat.ru/indicator/33459', timeout=60)
print(html.status_code)
# Stop here on an HTTP error instead of parsing an error page further down.
html.raise_for_status()

200


In [141]:
# Parse the downloaded page with the lxml backend.
soup = BeautifulSoup(html.text, features="lxml")

In [161]:
#Pull the filters out of the html code
def get_filters(soup):
    """Extract the indicator's filter definitions from the page's inline JavaScript.

    Every ``<script>`` element is scanned for a ``filters: {...}`` object
    literal that is directly followed by ``, left_columns``. The first match
    is rewritten into JSON and parsed.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed indicator page. Only ``soup.find_all("script")`` and the
        ``.text`` of each result are used.

    Returns
    -------
    dict
        The parsed ``filters`` object.

    Raises
    ------
    ValueError
        If no script contains the expected ``filters`` literal.
    json.JSONDecodeError
        If the literal is still not valid JSON after the rewriting below.
    """
    pattern = re.compile(r"filters:\s*(\{.*?\})(?=\s*,\s*left_columns)", re.DOTALL)

    # Search all scripts rather than a fixed position: the position of the
    # relevant <script> changes whenever the page adds or removes one.
    match = None
    for script in soup.find_all("script"):
        match = pattern.search(script.text)
        if match is not None:
            break
    if match is None:
        raise ValueError(
            "Could not find a 'filters: {...}, left_columns' literal in any <script> tag"
        )

    literal = match.group(1)
    # Quote bare object keys (title: -> "title":, 0: -> "0":) so JSON accepts them.
    literal = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', literal)
    # NOTE: this swaps every single quote, so a value containing an
    # apostrophe would produce invalid JSON.
    literal = literal.replace("'", '"')
    return json.loads(literal)
# Parse the filter tree out of the page and show it as the cell output.
filters = get_filters(soup=soup)
filters

{'0': {'title': 'Показатель',
  'all': True,
  'values': {'33459': {'title': 'Численность постоянного населения - женщин по возрасту на 1 января',
    'order': 0,
    'checked': True}},
  'indicator': True},
 '3': {'title': 'Год',
  'all': False,
  'values': {'1990': {'title': '1990', 'order': 0, 'checked': True},
   '1991': {'title': '1991', 'order': 1, 'checked': True},
   '1992': {'title': '1992', 'order': 2, 'checked': True},
   '1993': {'title': '1993', 'order': 3, 'checked': True},
   '1994': {'title': '1994', 'order': 4, 'checked': True},
   '1995': {'title': '1995', 'order': 5, 'checked': True},
   '1996': {'title': '1996', 'order': 6, 'checked': True},
   '1997': {'title': '1997', 'order': 7, 'checked': True},
   '1998': {'title': '1998', 'order': 8, 'checked': True},
   '1999': {'title': '1999', 'order': 9, 'checked': True},
   '2000': {'title': '2000', 'order': 10, 'checked': True},
   '2001': {'title': '2001', 'order': 11, 'checked': True},
   '2002': {'title': '2002', 'ord

In [None]:
# Find all available categories
def get_categories(filters):
    """Print the title of every filter category, one per line.

    ``filters`` is a mapping whose values are dicts with a ``'title'`` entry.
    Nothing is returned.
    """
    for category in filters.values():
        print(category['title'])
# List the category titles found on the page.
get_categories(filters=filters)

Показатель
Год
Возраст
Классификатор объектов административно-территориального деления (ОКАТО)
Единица измерения
Период
Тип поселения


In [238]:
# Collect the value ids available in each category: one single-key dict per category.
categories = [
    {category_id: list(category['values'].keys())}
    for category_id, category in filters.items()
]
# Flatten to "<category id>_<value id>" strings, the form the data endpoint expects.
ids = [
    f"{category_id}_{value_id}"
    for entry in categories
    for category_id, value_ids in entry.items()
    for value_id in value_ids
]

In [None]:
# Request every available value of every category in SDMX format.
data = {
        "lineObjectIds": ["0", "30611", "58335", "57831", "58274"],
        "columnObjectIds": ["3", "33560"],
        "selectedFilterIds": ids
}
params = {
    "format" : "sdmx",
    "id" : "33459"
}
# (connect, read) timeouts: without them a stalled server blocks the kernel forever.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (30, 300),
)
print(response.status_code, response.headers)
# Fail loudly on an HTTP error so an error page is never treated as data.
response.raise_for_status()

In [None]:
# Filter ids for every region (category 57831).
regions = [
    f"57831_{value_id}"
    for entry in categories
    for value_id in entry.get("57831", [])
]

In [254]:
# Filter ids for every year (category 3).
years = [
    f"3_{value_id}"
    for entry in categories
    for value_id in entry.get("3", [])
]

In [258]:
# Filter ids for every settlement type (category 58274).
settlement_type = [
    f"58274_{value_id}"
    for entry in categories
    for value_id in entry.get("58274", [])
]
settlement_type

['58274_1744150', '58274_1750789', '58274_1750788']

In [262]:
# Filter ids for every age group (category 58335).
age = [
    f"58335_{value_id}"
    for entry in categories
    for value_id in entry.get("58335", [])
]

In [247]:
# SDMX request: all regions and the years 1990-2025, with one fixed value
# for each of the remaining categories.
data = {
        "lineObjectIds": ["0", "30611", "58335", "57831", "58274"],
        "columnObjectIds": ["3", "33560"],
        # Keep this a flat list of strings. requests sends each item as a
        # repeated form field; nested lists are not a documented input shape,
        # so they are unpacked explicitly here.
        "selectedFilterIds": [
            "0_33459",
            *[f"3_{year}" for year in range(1990, 2026)],
            "30611_950458",
            "33560_1540248",
            *regions,
            "58274_1744150",
            "58335_1709566"
        ]
    }
params = {
    "format" : "sdmx",
    "id" : "33459"
}
# (connect, read) timeouts: without them a stalled server blocks the kernel forever.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (30, 300),
)
print(response.status_code, response.headers)
# Fail loudly on an HTTP error so an error page is never saved as data.
response.raise_for_status()

200 {'Content-Type': 'text/xml', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Fri, 11 Jul 2025 17:39:22 GMT', 'Set-Cookie': 'JSESSIONID=CAB10F10E09B3307BB468F68321DB59A; Path=/; Secure; HttpOnly, session-cookie=1851432563da909cd248bc5a204218c46e2e9294605d4d2ed8278397dcd15ae46c4f2c8b8d4fdbc36ab93695e11ec49e; Max-Age=86400; Path=/; secure; HttpOnly', 'Cache-Control': 'no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': '0', 'Pragma': 'public', 'Content-Disposition': "attachment; filename*=UTF-8''data.xml", 'vary': 'accept-encoding', 'Content-Language': 'ru', 'X-Frame-Options': 'SAMEORIGIN', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'Content-Security-Policy': 'upgrade-insecure-requests', 'Referrer-Policy': 'strict-origin'}


In [248]:
# Write the raw response body to disk unchanged.
with open("example.xml", mode="wb") as xml_file:
    xml_file.write(response.content)

In [252]:
#Transform sdmx file into pandas dataframe
def read_sdmx(filename, include_dimensions=False):
    """Load an SDMX-ML v1.0 generic data file into a pandas DataFrame.

    Each ``generic:Obs`` element under the message's ``DataSet`` becomes one row.

    Parameters
    ----------
    filename : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    include_dimensions : bool, default False
        If True, every row also gets the ``concept``/``value`` pairs from the
        ``generic:SeriesKey`` of its series, one column per concept, placed
        before ``year`` and ``population``. Without them, rows from different
        series cannot be told apart. A concept named ``year`` or
        ``population`` is overwritten by the observation fields.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` (text of ``generic:Time``) and ``population``
        (``value`` attribute of ``generic:ObsValue``), both as found in the
        file and not converted to numbers. A missing element or attribute
        gives a missing value. With no observations the frame is empty but
        still has these two columns.

    Raises
    ------
    ValueError
        If the document contains no ``message:DataSet`` element.
    """
    tree = et.parse(filename)
    root = tree.getroot()

    namespaces = {
        "generic": "http://www.SDMX.org/resources/SDMXML/schemas/v1_0/generic",
        "message": "http://www.SDMX.org/resources/SDMXML/schemas/v1_0/message"
    }

    dataset = root.find(".//message:DataSet", namespaces)
    if dataset is None:
        raise ValueError(f"No message:DataSet element found in {filename!r}")

    rows = []
    for series in dataset.findall(".//generic:Series", namespaces):
        # The series key is shared by all observations of this series.
        series_key = {}
        if include_dimensions:
            for dimension in series.findall(".//generic:SeriesKey/generic:Value", namespaces):
                concept = dimension.get("concept")
                if concept is not None:
                    series_key[concept] = dimension.get("value")

        for obs in series.findall(".//generic:Obs", namespaces):
            # Compare with None explicitly: childless Elements are falsy.
            time_el = obs.find(".//generic:Time", namespaces)
            value_el = obs.find(".//generic:ObsValue", namespaces)
            row = dict(series_key)
            row["year"] = time_el.text if time_el is not None else None
            row["population"] = value_el.get("value") if value_el is not None else None
            rows.append(row)

    if not rows:
        return pd.DataFrame(columns=["year", "population"])
    return pd.DataFrame(rows)

# Load the saved SDMX file into a DataFrame.
data = read_sdmx(filename="example.xml")

In [263]:
# Same query as an Excel download: all years, regions, settlement types and age groups.
data = {
        "lineObjectIds": ["0", "30611", "58335", "57831", "58274"],
        "columnObjectIds": ["3", "33560"],
        # Keep this a flat list of strings. requests sends each item as a
        # repeated form field; nested lists are not a documented input shape,
        # so they are unpacked explicitly here.
        "selectedFilterIds": [
            "0_33459",
            *years,
            "30611_950458",
            "33560_1540248",
            *regions,
            *settlement_type,
            *age
        ]
    }
params = {
    "format" : "excel",
    "id" : "33459"
}
# (connect, read) timeouts: without them a stalled server blocks the kernel forever.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (30, 300),
)
print(response.status_code, response.headers)
# Fail loudly on an HTTP error so an error page is never saved as a spreadsheet.
response.raise_for_status()

200 {'Content-Type': 'application/vnd.ms-excel', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Fri, 11 Jul 2025 18:05:48 GMT', 'Set-Cookie': 'JSESSIONID=E42BDC6B959407B8C835F78E51792387; Path=/; Secure; HttpOnly, session-cookie=18514445e8d8a8b8d248bc5a204218c4cfd9a940ec5954828b4d680c6602528b7b009b545b297e2ad7cdbde9861f835e; Max-Age=86400; Path=/; secure; HttpOnly', 'Cache-Control': 'no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': '0', 'Pragma': 'public', 'Content-Disposition': "attachment; filename*=UTF-8''data.xls", 'Content-Language': 'ru', 'X-Frame-Options': 'SAMEORIGIN', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'Content-Security-Policy': 'upgrade-insecure-requests', 'Referrer-Policy': 'strict-origin'}


In [264]:
# Write the Excel response body to disk unchanged.
with open("regions.xls", mode="wb") as xls_file:
    xls_file.write(response.content)