In [1]:
import requests
import json

import pandas as pd

from selectolax.parser import HTMLParser
from typing import *
from tqdm import tqdm
from w3lib.html import remove_tags

In [2]:
def parse_header(raw_header: str) -> dict:
    """Parse a block of raw HTTP header lines into a dict.

    Every non-blank line must have the form ``name: value``. Pseudo-headers
    written with a leading colon (``:authority: example.com``) keep that
    colon as part of the key. Names and values are stripped of surrounding
    whitespace; if a name occurs more than once, the last value wins.

    Args:
        raw_header: Header text, one header per line.

    Returns:
        Dict mapping each header name to its value.

    Raises:
        ValueError: If a non-blank line has no ``:`` separating name and value.
    """
    header = dict()
    for raw_line in raw_header.split("\n"):
        line = raw_line.strip()
        if not line:
            # Blank lines (empty input, trailing newline after copy-paste)
            # carry no header and must not abort the whole parse.
            continue
        # A leading ":" belongs to the header name, so the name/value
        # separator has to be searched for only after it.
        prefix = ":" if line.startswith(":") else ""
        name, separator, value = line[len(prefix):].partition(":")
        if not separator:
            raise ValueError(f"Malformed header line (no ':' separator): {line!r}")
        header[f"{prefix}{name}".strip()] = value.strip()
    return header

In [3]:
# Raw request headers, one "name: value" pair per line; this text is fed to
# parse_header() to build the headers dict used for the API requests.
# NOTE(review): looks like it was copied from a browser session (the user-agent
# names Chrome 111 on macOS) — confirm it is still accepted by the server.
# NOTE(review): content-length is fixed at 193 although the JSON body built for
# each page can differ in size; presumably the HTTP client recomputes it — verify.
header_str = """accept: application/json, text/plain, */*
accept-encoding: gzip, deflate, br
accept-language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7
content-length: 193
content-type: application/json;charset=UTF-8
origin: https://ces.hse.ru
referer: https://ces.hse.ru/
sec-ch-ua: "Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "macOS"
sec-fetch-dest: empty
sec-fetch-mode: cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"""

In [4]:
# Turn the raw header text into the dict that is handed to HSEPublish below.
headers = parse_header(header_str)

In [5]:
class HSEPublish:
    """Client that pages through the HSE publications search endpoint.

    Each request is a JSON POST to ``API_URL``; the responses are flattened
    into plain dicts with the keys ``title``, ``type``, ``year``,
    ``description`` and ``authors``.
    """

    # Search endpoint that every page request is posted to.
    API_URL = "https://publications.hse.ru/api/searchPubs"
    # Value sent as "publsCount" in the pagination parameters.
    PAGE_SIZE = 21

    def __init__(self, headers, unit_id: int = 38477612, timeout: float = 30.0):
        """Store the request settings.

        Args:
            headers: HTTP headers sent with every request.
            unit_id: Value placed in the ``pubsUnit`` filter of the query.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook forever.
        """
        self.headers = headers
        self.unit_id = unit_id
        self.timeout = timeout

    def update_data(self, pageId: int) -> None:
        """Build the JSON request body for page ``pageId`` and keep it in ``self.data``."""
        self.data = json.dumps({
            "type": "ANY",
            # The API takes its filters as one "|"-separated string of
            # JSON-like fragments, not as a nested object.
            "filterParams": (
                '"acceptLanguage":"ru"'
                f'|"pubsUnit": {self.unit_id}'
                '|"widgetName": "Search"'
            ),
            "paginationParams": {
                "publsSort": ["YEAR_DESC", "TITLE_ASC"],
                "publsCount": self.PAGE_SIZE,
                "pageId": pageId,
            },
        })

    def post_request(self) -> dict:
        """POST the body prepared by ``update_data`` and return the decoded JSON.

        ``update_data`` must have been called first, because the request body
        is read from ``self.data``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.post(
            self.API_URL,
            data=self.data,
            headers=self.headers,
            timeout=self.timeout,
        )
        # Fail here with the HTTP status instead of later with a confusing
        # KeyError/JSON error on an error page.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _parse_item(item: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten one search-result item into the record stored per publication."""
        annotation = item.get("annotation") or {}
        authors = (item.get("authorsByType") or {}).get("author") or []
        return {
            "title": item["title"],
            "type": item["type"],
            "year": item["year"],
            # Prefer the English annotation, fall back to Russian, else None.
            # Always stored under the same "description" key so every record
            # has an identical set of columns.
            "description": annotation.get("en", annotation.get("ru")),
            "authors": [author["title"]["ru"] for author in authors],
        }

    def parse_publications(self) -> List[Dict[str, Any]]:
        """Fetch all result pages and return one record per publication.

        Pages are requested starting from 1 until the response reports that
        nothing remains (or a page comes back empty).

        Returns:
            List of dicts with the keys ``title``, ``type``, ``year``,
            ``description`` (``None`` when no annotation exists) and
            ``authors`` (list of author names, possibly empty).
        """
        publications = []
        page = 1

        while True:
            self.update_data(page)
            result = self.post_request()["result"]
            items = result["items"] or []

            publications.extend(self._parse_item(item) for item in items)

            # An empty page or a non-positive counter both mean there is
            # nothing more to fetch; testing only "== 0" could loop forever.
            if not items or result["remaining"] <= 0:
                break
            page += 1

        return publications

In [6]:
# Run the scrape: parse_publications() issues one HTTP POST per result page,
# so this cell needs network access.
hse = HSEPublish(headers)
data = hse.parse_publications()

In [7]:
# Save the scraped records as JSON so the next cell can load them with pandas.
with open("data.json", mode="w") as json_file:
    json_file.write(json.dumps(data))

In [8]:
# Load the records saved in data.json into a DataFrame.
df = pd.read_json("data.json")

In [9]:
def clear_description(value):
    """Return ``value`` with its HTML tags removed, or ``None`` if it holds no text.

    Args:
        value: A description cell; expected to be text, but may be ``None``,
            an empty string or NaN when the description is missing.

    Returns:
        The tag-free text, or ``None`` for empty and non-text values.
    """
    # NaN (how pandas marks a missing value) is truthy, so a plain truthiness
    # check would forward it to remove_tags, which only accepts text.
    if isinstance(value, (str, bytes)) and value:
        return remove_tags(value)
    return None

In [10]:
# Run clear_description over every description, then store the result back
# in the same column.
cleaned_descriptions = df.description.apply(clear_description)
df.description = cleaned_descriptions

In [11]:
# Export the table to CSV; index=False keeps the row index out of the file.
df.to_csv("publications.csv", index=False)