In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
import xml.etree.ElementTree as et

In [2]:
# Download the indicator page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops the notebook on an
# HTTP error instead of handing an error page to the parser below.
html = requests.get('https://www.fedstat.ru/indicator/33459', timeout=30)
html.raise_for_status()
print(html.status_code)

200


In [3]:
# Parse the downloaded page text with the lxml parser.
soup = BeautifulSoup(html.text, "lxml")

In [4]:
# Pull the filters out of the html code
def get_filters(soup):
    """Extract the indicator's filter definitions from the page's inline JavaScript.

    Every ``<script>`` element is searched for a ``filters: {...}`` object
    literal that is immediately followed by ``, left_columns``. The first
    match is converted from a JavaScript literal to JSON and parsed.

    Parameters
    ----------
    soup
        Parsed page; must provide ``find_all("script")`` returning objects
        with a ``.text`` attribute (e.g. a ``BeautifulSoup`` document).

    Returns
    -------
    dict
        The parsed ``filters`` object, keyed by filter id.

    Raises
    ------
    ValueError
        If no script on the page contains the ``filters`` object.
    json.JSONDecodeError
        If the extracted literal cannot be converted to valid JSON.
    """
    pattern = re.compile(r"filters:\s*(\{.*?\})(?=\s*,\s*left_columns)", re.DOTALL)

    # Search all scripts rather than relying on a fixed position in the page,
    # which silently breaks whenever the site adds or removes a <script> tag.
    match = None
    for script in soup.find_all("script"):
        match = pattern.search(script.text)
        if match:
            break
    if match is None:
        raise ValueError("No 'filters: {...}, left_columns' object found in any <script> element")

    raw = match.group(1)
    # JavaScript allows bare object keys; JSON does not. Quote every bare word
    # that follows '{' or ',' and precedes ':'. This is a heuristic and would
    # also touch text of that shape inside string values.
    raw = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', raw)
    # JSON strings must be double-quoted.
    raw = raw.replace("'", '"')
    return json.loads(raw)
# Extract the filter definitions from the parsed page and show them as the cell output.
filters = get_filters(soup)
filters

{'0': {'title': 'Показатель',
  'all': True,
  'values': {'33459': {'title': 'Численность постоянного населения - женщин по возрасту на 1 января',
    'order': 0,
    'checked': True}},
  'indicator': True},
 '3': {'title': 'Год',
  'all': False,
  'values': {'1990': {'title': '1990', 'order': 0, 'checked': True},
   '1991': {'title': '1991', 'order': 1, 'checked': True},
   '1992': {'title': '1992', 'order': 2, 'checked': True},
   '1993': {'title': '1993', 'order': 3, 'checked': True},
   '1994': {'title': '1994', 'order': 4, 'checked': True},
   '1995': {'title': '1995', 'order': 5, 'checked': True},
   '1996': {'title': '1996', 'order': 6, 'checked': True},
   '1997': {'title': '1997', 'order': 7, 'checked': True},
   '1998': {'title': '1998', 'order': 8, 'checked': True},
   '1999': {'title': '1999', 'order': 9, 'checked': True},
   '2000': {'title': '2000', 'order': 10, 'checked': True},
   '2001': {'title': '2001', 'order': 11, 'checked': True},
   '2002': {'title': '2002', 'ord

In [5]:
# Find all available categories
def get_categories(filters):
    """Print the title of every filter category, one title per line.

    ``filters`` maps a filter id to a dict that has a ``'title'`` entry.
    Nothing is returned.
    """
    for category in filters.values():
        print(category['title'])
# Print the title of each category found in the extracted filters.
get_categories(filters)

Показатель
Год
Возраст
Классификатор объектов административно-территориального деления (ОКАТО)
Единица измерения
Период
Тип поселения


In [6]:
# Find all available values for each category:
# one single-key dict per category, mapping its id to the list of value ids.
categories = [
    {category_id: list(spec['values'].keys())}
    for category_id, spec in filters.items()
]
# Flatten everything into "<category id>_<value id>" strings.
ids = [
    f"{category_id}_{value_id}"
    for mapping in categories
    for category_id, value_ids in mapping.items()
    for value_id in value_ids
]

# SDMX data

In [None]:
# All values: request the indicator in SDMX format with every filter value selected.
data = {
        "lineObjectIds": ["0", "30611", "58335", "57831", "58274"],
        "columnObjectIds": ["3", "33560"],
        "selectedFilterIds": ids
}
params = {
    "format" : "sdmx",
    "id" : "33459"
}
# (connect, read) timeout: never hang forever, but leave the server time to
# build a large export.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (10, 300),
)
# Stop here on an HTTP error rather than carrying an error page forward.
response.raise_for_status()
print(response.status_code, response.headers)

In [7]:
# Adding regions: "<category>_<value>" ids for every value of category "57831"
regions = [
    f"{category_id}_{value_id}"
    for mapping in categories
    for category_id, value_ids in mapping.items()
    if category_id == "57831"
    for value_id in value_ids
]

In [8]:
# Adding years: "<category>_<value>" ids for every value of category "3"
years = [
    f"{category_id}_{value_id}"
    for mapping in categories
    for category_id, value_ids in mapping.items()
    if category_id == "3"
    for value_id in value_ids
]

In [9]:
# Adding settlement type: "<category>_<value>" ids for every value of category "58274"
settlement_type = [
    f"{category_id}_{value_id}"
    for mapping in categories
    for category_id, value_ids in mapping.items()
    if category_id == "58274"
    for value_id in value_ids
]
settlement_type

['58274_1744150', '58274_1750789', '58274_1750788']

In [10]:
# Adding age categories: "<category>_<value>" ids for every value of category "58335"
age = [
    f"{category_id}_{value_id}"
    for mapping in categories
    for category_id, value_ids in mapping.items()
    if category_id == "58335"
    for value_id in value_ids
]

In [12]:
# Collect regional data in SDMX format: all years and all regions, with one
# fixed value for each of the remaining filters.
data = {
        "lineObjectIds": ["0", "30611", "58335", "57831", "58274"],
        "columnObjectIds": ["3", "33560"],
        # Unpack the id lists so this is one flat list of strings. Nested lists
        # only worked because the form encoder happens to flatten one level.
        "selectedFilterIds": [
            "0_33459",
            *years,
            "30611_950458",
            "33560_1540248",
            *regions,
            "58274_1744150",
            "58335_1709566"
        ]
    }
params = {
    "format" : "sdmx",
    "id" : "33459"
}
# (connect, read) timeout: never hang forever, but leave the server time to
# build the export.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (10, 300),
)
# Stop here on an HTTP error so an error page is never saved as data.
response.raise_for_status()
print(response.status_code, response.headers)

200 {'Content-Type': 'text/xml', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Date': 'Sat, 12 Jul 2025 10:39:48 GMT', 'Set-Cookie': 'JSESSIONID=20E85FCD89FE3297FE03E7FAF9608248; Path=/; Secure; HttpOnly, session-cookie=18517ad4dc4c6a7edcd5a505204218c4d2506ea5bd73d1c7df057e3ecf38e193bc79ec9cdd66ebf6efc4768835a5ee4b; Max-Age=86400; Path=/; secure; HttpOnly', 'Cache-Control': 'no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': '0', 'Pragma': 'public', 'Content-Disposition': "attachment; filename*=UTF-8''data.xml", 'vary': 'accept-encoding', 'Content-Language': 'ru', 'X-Frame-Options': 'SAMEORIGIN', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'Content-Security-Policy': 'upgrade-insecure-requests', 'Referrer-Policy': 'strict-origin'}


In [13]:
# Save the data in a file: write the raw response bytes to regions.xml
with open("regions.xml", "wb") as xml_file:
    xml_file.write(response.content)

In [41]:
# Transform sdmx file into pandas dataframe
def read_sdmx(filename):
    """Read the observations of an SDMX-ML (v1.0 generic) file into a DataFrame.

    Every ``generic:Obs`` element under every ``generic:Series`` of the
    ``message:DataSet`` becomes one row. Only the observation's time and value
    are extracted; the series keys are not read, so rows from different series
    cannot be told apart in the result.

    Parameters
    ----------
    filename
        Path (or file object) accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` (text of ``generic:Time``) and ``population``
        (``value`` attribute of ``generic:ObsValue``), both as strings.
        A missing element or attribute yields ``None`` in that cell.

    Raises
    ------
    ValueError
        If the document has no ``message:DataSet`` element.
    """
    namespaces = {
        "generic": "http://www.SDMX.org/resources/SDMXML/schemas/v1_0/generic",
        "message": "http://www.SDMX.org/resources/SDMXML/schemas/v1_0/message",
        "structure": 'http://www.SDMX.org/resources/SDMXML/schemas/v1_0/structure'
    }

    root = et.parse(filename).getroot()
    dataset = root.find(".//message:DataSet", namespaces)
    if dataset is None:
        # Typically means the server returned something other than SDMX data.
        raise ValueError(f"No message:DataSet element found in {filename!r}")

    records = []
    for series in dataset.findall(".//generic:Series", namespaces):
        for obs in series.findall(".//generic:Obs", namespaces):
            time_element = obs.find(".//generic:Time", namespaces)
            value_element = obs.find(".//generic:ObsValue", namespaces)
            records.append({
                "year": time_element.text if time_element is not None else None,
                "population": value_element.get("value") if value_element is not None else None,
            })
    # Fix the column names so an empty data set still has the expected schema.
    return pd.DataFrame(records, columns=["year", "population"])

# Load the saved SDMX file into a DataFrame and show it as the cell output.
# Note: this rebinds `data`, which earlier cells used for the request payload dict.
data = read_sdmx("regions.xml")
data

Unnamed: 0,year,population
0,1990,78549601
1,1991,78817695
2,1992,78914788
3,1993,78899303
4,1994,78770458
...,...,...
3502,1996,3833075
3503,1999,3700204
3504,2000,3620686
3505,2001,3620686


# Excel data

In [None]:
# Let's get the data in Excel format: all regions, years, settlement types and ages.
data = {
        "lineObjectIds": ["0",  "57831", "58335", "58274", "30611"],
        "columnObjectIds": ["3", "33560"],
        # Unpack the id lists so this is one flat list of strings. Nested lists
        # only worked because the form encoder happens to flatten one level.
        "selectedFilterIds": [
            *regions,
            "0_33459",
            *years,
            "30611_950458",
            "33560_1540248",
            *settlement_type,
            *age
        ]
    }
params = {
    "format" : "excel",
    "id" : "33459"
}
# (connect, read) timeout: never hang forever, but leave the server time to
# build a large export.
response = requests.post(
    "https://www.fedstat.ru/indicator/data.do?",
    params = params,
    data = data,
    timeout = (10, 300),
)
# Stop here on an HTTP error so an error page is never saved as a spreadsheet.
response.raise_for_status()
print(response.status_code, response.headers)

In [23]:
# Save the excel data: write the raw response bytes to regions.xls
with open("regions.xls", "wb") as xls_file:
    xls_file.write(response.content)

In [46]:
# Load the saved spreadsheet; header=2 takes the column names from the
# third row of the sheet (0-based row index 2).
df = pd.read_excel("regions.xls", header = 2)

In [47]:
# Remove the first and fifth columns, then the row labelled 0.
df = (
    df
    .drop(columns=[df.columns[0], df.columns[4]])
    .drop(index=0)
)

In [48]:
# Give the unnamed descriptor columns meaningful names.
column_names = {
    'Unnamed: 1': "region",
    "Unnamed: 2": "age",
    "Unnamed: 3": "settlement",
}
df = df.rename(columns=column_names)

In [50]:
# Preview ten random rows; the fixed seed makes the preview reproducible
# across kernel restarts.
df.sample(10, random_state=42)

Unnamed: 0,region,age,settlement,1990,1991,1992,1993,1994,1995,1996,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
17330,Республика Дагестан,90-94 года,все население,1648.0,2491.0,2592.0,2796.0,2853.0,2701.0,2448.0,...,1881.0,2232.0,2335.0,2818.0,2825.0,3708.0,2757.0,3480.0,3800.0,4279.0
31144,Красноярский край,50 лет,сельское население,5548.0,4995.0,4350.0,2761.0,1770.0,1551.0,1976.0,...,4285.0,3976.0,3805.0,3664.0,3748.0,3530.0,3808.0,3805.0,3789.0,4001.0
29659,Республика Алтай,57 лет,сельское население,881.0,664.0,718.0,549.0,665.0,767.0,748.0,...,1166.0,1206.0,1166.0,1149.0,1084.0,992.0,950.0,842.0,868.0,807.0
14120,Республика Калмыкия,40-44 лет,все население,7042.0,8581.0,9621.0,10565.0,11463.0,12074.0,12548.0,...,8204.0,8363.0,8511.0,8666.0,8885.0,9139.0,9371.0,9645.0,10080.0,10422.0
37702,Приморский край,89 лет,сельское население,121.0,82.0,117.0,169.0,178.0,186.0,168.0,...,263.0,280.0,271.0,232.0,226.0,225.0,335.0,262.0,245.0,352.0
34919,Дальневосточный федеральный округ (с 03.11...,51 лет,все население,,,,,,,,...,54215.0,52852.0,50359.0,47982.0,49184.0,50404.0,53912.0,53694.0,55974.0,54951.0
14504,Республика Крым,42 лет,все население,,,,,,,,...,12756.0,13414.0,13536.0,13985.0,14017.0,14287.0,14601.0,15032.0,15158.0,15638.0
41779,Центрально-Черноземный район,70-74 лет,сельское население,,94810.0,,,,,103725.0,...,,,,,,,,,,
40658,Северо-Западный район,72 года,все население,,28493.0,,,,,46032.0,...,,,,,,,,,,
6327,Тверская область,73 года,сельское население,2763.0,2516.0,2676.0,2886.0,2976.0,3169.0,4188.0,...,1218.0,581.0,714.0,853.0,1512.0,1600.0,1618.0,2167.0,2180.0,2020.0


In [58]:
# List the numbers contained in each distinct age label, to see which labels
# are single ages (one number) and which are ranges (two numbers).
ages = df.age.unique()
# The loop variable must not be called `age`: that name holds the list of age
# filter ids used to build the Excel request, and rebinding it here would
# break that cell if it were re-run afterwards.
for age_label in ages:
    numbers = re.findall(r'\d+', age_label)
    print(numbers, len(numbers))

[] 0
['0'] 1
['0', '17'] 2
['0', '4'] 2
['1'] 1
['10'] 1
['100'] 1
['10', '14'] 2
['11'] 1
['12'] 1
['13'] 1
['14'] 1
['15'] 1
['15', '19'] 2
['16'] 1
['16', '29'] 2
['17'] 1
['18'] 1
['19'] 1
['2'] 1
['20'] 1
['20', '24'] 2
['20', '29'] 2
['20', '39'] 2
['21'] 1
['22'] 1
['23'] 1
['24'] 1
['25'] 1
['25', '29'] 2
['26'] 1
['27'] 1
['28'] 1
['29'] 1
['3'] 1
['30'] 1
['30', '34'] 2
['31'] 1
['32'] 1
['33'] 1
['34'] 1
['35'] 1
['35', '39'] 2
['36'] 1
['37'] 1
['38'] 1
['39'] 1
['4'] 1
['40'] 1
['40', '44'] 2
['41'] 1
['42'] 1
['43'] 1
['44'] 1
['45'] 1
['45', '49'] 2
['46'] 1
['47'] 1
['48'] 1
['49'] 1
['5'] 1
['50'] 1
['50', '54'] 2
['51'] 1
['52'] 1
['53'] 1
['54'] 1
['55'] 1
['55', '59'] 2
['56'] 1
['57'] 1
['58'] 1
['59'] 1
['5', '9'] 2
['6'] 1
['60'] 1
['60', '64'] 2
['61'] 1
['62'] 1
['63'] 1
['64'] 1
['65'] 1
['65', '69'] 2
['66'] 1
['67'] 1
['68'] 1
['69'] 1
['7'] 1
['70'] 1
['70', '74'] 2
['71'] 1
['72'] 1
['73'] 1
['74'] 1
['75'] 1
['75', '79'] 2
['76'] 1
['77'] 1
['78'] 1
['79'