In [58]:
import requests
import pandas as pd
import numpy as np
import re

from selectolax.parser import HTMLParser
from tqdm import tqdm

In [15]:
# Download the MarketScreener financials page for China Railway Group.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page be parsed as if it were the financials table.
resp = requests.get(
    "https://www.marketscreener.com/quote/stock/CHINA-RAILWAY-GROUP-LIMIT-6500027/financials/",
    timeout=30,
)
resp.raise_for_status()

In [16]:
# Parse the downloaded page body so it can be queried with CSS selectors.
dom = HTMLParser(resp.text)

In [80]:
# Grab the <tbody> of the table that sits directly inside the
# div with id "Tableau_Histo_ECR_a".
# NOTE(review): presumably css_first yields None when nothing matches — confirm
# before relying on `table` in later cells.
table = dom.css_first("div#Tableau_Histo_ECR_a > table > tbody")

In [81]:
def clear_value(el):
    """Normalise the text of one table cell into a numeric-looking string.

    Parameters
    ----------
    el : str
        Raw cell text, e.g. ``"1 070 417"``, ``"3,88%"`` or ``"-"``.

    Returns
    -------
    float or str
        ``np.nan`` when the cell holds only the ``"-"`` placeholder (ignoring
        surrounding whitespace); otherwise a string that keeps digits, ``.``
        and ``/``, has every ``,`` turned into ``.``, and retains a leading
        minus sign so negative figures do not silently become positive.
    """
    text = el.strip()
    if text == "-":
        return np.nan

    # Drop everything except digits, separators and "/"; raw string avoids the
    # invalid-escape warning that "\d" triggers in a normal string literal.
    cleaned = re.sub(r"[^\d,./]", "", text).replace(",", ".")

    # The character class above also removes "-", so re-attach the sign when
    # the original text was negative and something numeric is left.
    if cleaned and text.startswith("-"):
        return "-" + cleaned
    return cleaned

In [None]:
# NOTE(review): unfinished stub — there is no parameter list, colon or body, so
# this is not valid Python as written, and the cell has never been executed.
# Complete it or delete the cell.
def transform

In [91]:
# The second-to-last table row holds the footnote legend: one <div> per
# footnote, each starting with a <sup> marker. Map marker -> remaining text
# (the first two characters of the div text are dropped).
legend_divs = rows[-2].css_first("td").css("div")
dict(
    (div.css_first("sup").text(), div.text()[2:])
    for div in legend_divs
)

{'1': 'CNY in Million', '2': 'CNY'}

In [90]:
rows = table.css("tr")

# Header row: the first cell is the label column, the remaining cells are the
# year headings.
years = [heading.text() for heading in rows[0].css("td")[1:]]
data = {}

# Walk the body rows, leaving out the header and the final two rows.
for tr in rows[1:-2]:
    cells = tr.css("td")
    label_cell = cells[0]

    row_name = label_cell.text()
    # A <sup> footnote marker is part of the cell text; drop that last character.
    if label_cell.css_first("sup"):
        row_name = row_name[:-1]

    # One "<row name>_<year>" entry per cell, paired with the year headings.
    data.update({
        f"{row_name}_{year}": value
        for year, value in zip(
            years, [clear_value(cell.text()) for cell in cells[1:]]
        )
    })

data

{'Net sales_2018': '740383',
 'Net sales_2019': '848440',
 'Net sales_2020': '971405',
 'Net sales_2021': '1070417',
 'Net sales_2022': '1151501',
 'Net sales_2023': '1288012',
 'Net sales_2024': '1413030',
 'Net sales_2025': '1502822',
 'EBITDA_2018': '38588',
 'EBITDA_2019': '41709',
 'EBITDA_2020': '42926',
 'EBITDA_2021': '48559',
 'EBITDA_2022': '53324',
 'EBITDA_2023': '60303',
 'EBITDA_2024': '68657',
 'EBITDA_2025': '69803',
 'Operating profit (EBIT)_2018': '28713',
 'Operating profit (EBIT)_2019': '31882',
 'Operating profit (EBIT)_2020': '33578',
 'Operating profit (EBIT)_2021': '38782',
 'Operating profit (EBIT)_2022': '43049',
 'Operating profit (EBIT)_2023': '49554',
 'Operating profit (EBIT)_2024': '56368',
 'Operating profit (EBIT)_2025': '58379',
 'Operating Margin_2018': '3.88',
 'Operating Margin_2019': '3.76',
 'Operating Margin_2020': '3.46',
 'Operating Margin_2021': '3.62',
 'Operating Margin_2022': '3.74',
 'Operating Margin_2023': '3.85',
 'Operating Margin_2024

In [116]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

In [94]:
# Wait up to 10 seconds for an element with id "myDynamicElement" to be present.
# NOTE(review): `driver` is not defined in any cell visible here — confirm
# this cell is still needed.
locator = (By.ID, "myDynamicElement")
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

'/Users/borokoko/HSE/Thesis/collection'

In [99]:
# Load the input file and pull its `company_name` column out as a plain list.
# NOTE(review): the name `df` is rebound to the scraped table further down the
# notebook, so this frame is lost once that later cell has run.
df = pd.read_csv("data.csv")
companies = df.company_name.tolist()

In [118]:
class LinkCollector:
    """Look companies up through the MarketScreener search box with Selenium.

    Creating an instance starts a Chrome session and loads ``base_url``.
    Call :meth:`close` when finished, or use the instance as a context manager
    (``with LinkCollector() as collector: ...``), so the browser is shut down.

    Parameters
    ----------
    base_url : str
        Page to open on start-up.
    timeout : float
        Seconds each explicit wait may take before giving up.
    """

    # CSS selectors for the search input and the first link of the results table.
    SEARCH_INPUT_SELECTOR = "input[data-submit-form='recherche_menu']"
    FIRST_MATCH_SELECTOR = "table.table--hover tr.table-child--pointer > td > a"

    def __init__(self, base_url="https://www.marketscreener.com/", timeout=10):
        self.timeout = timeout
        self.driver = webdriver.Chrome()
        self.driver.get(base_url)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Quit the browser session started in ``__init__``."""
        self.driver.quit()

    def _read_first_match(self, driver):
        """Return ``(text, href)`` of the first result link.

        Used as the condition of a ``WebDriverWait``: locating the link and
        reading it happen in the same poll, so a link that is re-rendered in
        between is simply retried instead of raising.
        """
        link = driver.find_element(By.CSS_SELECTOR, self.FIRST_MATCH_SELECTOR)
        return link.text, link.get_attribute("href")

    def collect_links(self, companies, skip_missing=False):
        """Type each company name into the search box and record the first hit.

        Parameters
        ----------
        companies : iterable of str
            Names to search for.
        skip_missing : bool
            When False (default) a search that produces no result within the
            timeout raises ``TimeoutException``, as before. When True the
            company is recorded with ``matched_to`` and ``url`` set to None and
            the loop carries on, so earlier results are not lost.

        Returns
        -------
        list of dict
            One dict per company with keys ``company_name``, ``matched_to``
            and ``url``.
        """
        # Imported here so the method does not depend on another notebook cell.
        from selenium.common.exceptions import (
            StaleElementReferenceException,
            TimeoutException,
        )

        search_bar = WebDriverWait(self.driver, self.timeout).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, self.SEARCH_INPUT_SELECTOR)
            )
        )

        # No implicit wait is configured: implicitly_wait() sets a driver-wide
        # lookup timeout (it is not a pause), and Selenium's documentation
        # warns against mixing implicit and explicit waits.
        match_wait = WebDriverWait(
            self.driver,
            self.timeout,
            ignored_exceptions=(StaleElementReferenceException,),
        )

        search_res = []

        for company in tqdm(companies):
            search_bar.send_keys(company)

            # NOTE(review): if the results of the previous query are still in
            # the DOM when this wait starts, they may be picked up again —
            # confirm the site replaces the table on every keystroke.
            try:
                matched_to, url = match_wait.until(self._read_first_match)
            except TimeoutException:
                if not skip_missing:
                    raise
                matched_to, url = None, None

            search_res.append({
                "company_name": company,
                "matched_to": matched_to,
                "url": url
            })

            search_bar.clear()

        return search_res

In [112]:
# Creating the collector starts Chrome and loads the MarketScreener home page.
collector = LinkCollector()

In [3]:
# result = collector.collect_links(companies)

In [24]:
# Re-download the China Railway Group financials page and parse it.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error
# page be parsed as if it were the financials table.
resp = requests.get(
    "https://www.marketscreener.com/quote/stock/CHINA-RAILWAY-GROUP-LIMIT-6500027/financials/",
    timeout=30,
)
resp.raise_for_status()

dom = HTMLParser(resp.text)

In [41]:
# <tbody> found anywhere below the div with id "Tableau_Histo_ECR_a".
table_val = dom.css_first("div#Tableau_Histo_ECR_a tbody")

# Year headings: every cell of the first row except the leading label cell,
# with surrounding whitespace removed.
header_row = table_val.css_first("tr:nth-child(1)")
years = [cell.text().strip() for cell in header_row.css("td")[1:]]

In [61]:
# Long-format records: one dict per (row label, year) pair.
data = []

# Skip the header row and the final three rows of the table body.
for tr in table_val.css("tr")[1:-3]:
    label_cell = tr.css_first("td:nth-child(1)")
    footnote = label_cell.css_first("sup")

    # The <sup> footnote marker is part of the cell text; cut that last
    # character off the stripped label when a marker is present.
    label = label_cell.text().strip()
    if footnote:
        label = label[:-1]
    marker = footnote.text() if footnote else None

    # Pair every value cell with its year heading.
    for cell, year in zip(tr.css("td")[1:], years):
        data.append({
            "index_name": f"{label}_{year}",
            "adjustment": marker,
            "value": cell.text().strip()
        })

df = pd.DataFrame(data)
df.head()

Unnamed: 0,index_name,adjustment,value
0,Net sales_2018,1,740 383
1,Net sales_2019,1,848 440
2,Net sales_2020,1,971 405
3,Net sales_2021,1,1 070 417
4,Net sales_2022,1,1 151 501


In [57]:
# Footnote legend from the last table row: one <div> per footnote, keyed by
# its <sup> marker; the value is the stripped div text minus its first two
# characters.
legend_cell = table_val.css_first("tr:last-child > td")
mapping = dict(
    (note.css_first("sup").text(), note.text().strip()[2:])
    for note in legend_cell.css("div")
)

In [62]:
# Look each row's footnote marker up in `mapping` and display the result;
# `df` itself is not modified because the mapped Series is not assigned back.
df.adjustment.map(mapping)

0     CNY in Million
1     CNY in Million
2     CNY in Million
3     CNY in Million
4     CNY in Million
           ...      
91               CNY
92               CNY
93               CNY
94               CNY
95               CNY
Name: adjustment, Length: 96, dtype: object