In [1]:
from lxml import etree
import sympy as sp

def localname(e):
    return etree.QName(e).localname

def parse_element(elem):
    tag = localname(elem)

    if tag == "mn":
        text = elem.text.strip()
        
        # Заменяем запятую на точку для десятичных дробей
        text = text.replace(',', '.')
        
        # rational like 1/2
        if "/" in text:
            num, den = text.split("/")
            return sp.Rational(num, den)

        # integer or float
        try:
            return sp.Integer(text)
        except Exception:
            return sp.Float(text)

    if tag == "mi":
        return sp.Symbol(elem.text.strip())

    if tag == "msup":
        base = parse_element(elem[0])
        exp = parse_element(elem[1])
        return base ** exp

    if tag == "msqrt":
        return sp.sqrt(parse_element(elem[0]))

    if tag == "mfrac":
        num = parse_element(elem[0])
        den = parse_element(elem[1])
        return num/den

    if tag == "mo":
        text = elem.text.strip()
        # Обрабатываем различные математические операторы
        if text in ('|', '∣', '∥'):
            return '|'
        elif text == ',':
            return ','  # возвращаем запятую как есть
        return text

    if tag == "mrow":
        items = []
        ops = []

        for ch in elem:
            if localname(ch) == "mo":
                op_text = ch.text.strip()
                # Пропускаем некоторые служебные операторы, но не запятую
                if op_text not in ('(', ')', '[', ']', '{', '}', '|', '∣', '∥'):
                    ops.append(op_text)
            else:
                items.append(parse_element(ch))

        # Обрабатываем случай, когда запятая используется как разделитель в числах
        # Объединяем числа, разделенные запятыми
        new_items = []
        new_ops = []
        i = 0
        
        while i < len(items):
            if i < len(ops) and ops[i] == ',':
                # Если следующий оператор - запятая, объединяем числа
                if (i + 1 < len(items) and 
                    isinstance(items[i], (sp.Number, sp.Float, sp.Integer, sp.Rational)) and 
                    isinstance(items[i + 1], (sp.Number, sp.Float, sp.Integer, sp.Rational))):
                    
                    # Объединяем два числа в одно десятичное
                    left_str = str(items[i])
                    right_str = str(items[i + 1])
                    combined = float(left_str + '.' + right_str)
                    new_items.append(sp.Float(combined))
                    i += 2  # пропускаем следующий элемент
                else:
                    new_items.append(items[i])
                    new_ops.append(ops[i])
                    i += 1
            else:
                new_items.append(items[i])
                if i < len(ops):
                    new_ops.append(ops[i])
                i += 1
        
        items = new_items
        ops = new_ops

        # no operators → implicit multiplication
        if not ops:
            expr = items[0]
            for it in items[1:]:
                expr = expr * it
            return expr

        # left-to-right parse
        expr = items[0]
        idx = 1
        for op in ops:
            if idx >= len(items):
                break
            right = items[idx]
            idx += 1

            if op == "+":
                expr = expr + right
            elif op in ("-", "−"):
                expr = expr - right
            elif op in ("*", "×", "·"):
                expr = expr * right
            elif op == "/":
                expr = expr / right
            elif op in ("⁢", "\u2062", ""):
                expr = expr * right  # invisible times
            elif op == "|":
                # Обработка оператора модуля или других использований '|'
                expr = expr * right
            elif op == ",":
                # Для запятой используем неявное умножение или просто пропускаем
                expr = expr * right
            else:
                raise NotImplementedError(f"Unsupported operator {op!r}")

        return expr

    if len(elem):
        return parse_element(elem[0])

    raise NotImplementedError(f"Tag {tag}")

def mathml_to_sympy(s):
    root = etree.fromstring(s.encode("utf-8"))
    if localname(root) == "math":
        return parse_element(root[0])
    return parse_element(root)

In [2]:
%pip install lxml -qU

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import os
import requests

from bs4 import BeautifulSoup
from pathlib import Path


root = Path(os.getcwd()).parent.parent
excel_path = root / r"src/notebooks/index_links.xlsx"
root_url = "https://www.indexdatabase.de//"


links = pd.read_excel(excel_path)

# Извлечение литературы

In [29]:

article_page_url = "https://www.indexdatabase.de/db/r-single.php?id=441"

def parse_article_page(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        table = soup.select("div#content div.table.vektor tr")
        article_link = table[4].select("a")[0]['href']
        article_doi = table[3].get_text().split("\n")[3]
        tags = table[11].get_text().split(";")[1:]
    except:
        article_link=" "
        article_doi=" "
        tags=[]

    return {
        "link": article_link,
        "doi": article_doi,
        "tags": tags
    }

parse_article_page(article_page_url)

{'link': 'http://www.sciencedirect.com/science/article/pii/S0176161796802859',
 'doi': '10.1016/s0176-1617(96)80285-9',
 'tags': [' reflectance spectra of leaves',
  ' red edge position',
  ' vegetation indices\r\n      \n']}

# Извлечение формулы

In [32]:
root_url = "https://www.indexdatabase.de"

def parse_index_page(url: str):
    response = requests.get(url)
    response.raise_for_status()
    html = response.text
    
    soup= BeautifulSoup(html, "html.parser")
    basic_information = soup.select("div#content div.table.vektor tr")
    
    name = basic_information[0].get_text().split("\n")[3] 
    abbreviation = basic_information[1].get_text().split("\n")[3] 
    try:
        formula = mathml_to_sympy(str(basic_information[2].select("math")[0]))
    except:
        formula = "Can't parse"
    articles = []
    try:
        references = soup.select("div#content div.table.matrix")[2].select("table.matrix tbody tr")
        for r in references:
            title = r.select("td")[1].get_text()
            aricle_page_link = root_url + r.select("td")[1].select("a")[0]['href']
            
            article_data = parse_article_page(aricle_page_link)
            #print(aricle_page_link)
            link = article_data['doi']
            doi = article_data['link']
            
            articles.append({
                "article_name": title,
                "article_link": link,
                "article_doi": doi
            })
    except:
        pass
    
    return {
        "idb_link": url,
        "name": name,
        "abbreviation": abbreviation,
        "formula": str(formula),
        "references": articles
    }

In [33]:
links[0][10]

'https://www.indexdatabase.de//db/i-single.php?id=264'

In [34]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

index_set = []
with tqdm(total=len(links[0])) as pbar:
    for l in links[0]:
        pbar.desc = l
        # print(l)
        index_set.append(parse_index_page(l))
        pbar.update(1)

https://www.indexdatabase.de//db/i-single.php?id=400: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 338/338 [08:55<00:00,  1.58s/it]


In [37]:
import json 

with open("index_data.json", "w", encoding="utf8") as f:
    json.dump(index_set, f, ensure_ascii=False,indent=4)

In [203]:
index_set

[{'name': 'Modified Normalized Difference 734/747/715/720\r',
  'abbreviation': 'MD734/747/715/72\r',
  'formula': (734nm - 747nm)/(715nm - 720nm),
  'references': [{'article_name': 'Gitelson, Anatoly A.; Merzlyak, Mark N.; Lichtenthaler, Hartmut K. - Detection of Red Edge Position and Chlorophyll Content by Reflectance Measurements Near 700 nm\n',
    'article_link': '10.1016/s0176-1617(96)80285-9',
    'article_doi': 'http://www.sciencedirect.com/science/article/pii/S0176161796802859'},
   {'article_name': 'le Maire, G.; Francois, C.; Dufrene, E. - Towards universal broad leaf chlorophyll indices using PROSPECT simulated database and hyperspectral reflectance measurements\n',
    'article_link': '10.1016/j.rse.2003.09.004',
    'article_doi': ''},
   {'article_name': 'Moss, D.M.; Rock, B.N. - Analysis of red edge spectral characteristics and total chlorophyll values for red spruce (Picea rubens) branch segments from Mt. Moosilauke, N.H., U.S.A. 11th\n',
    'article_link': '',
    'a

In [38]:
[f"{i}. {x['formula']}" for i, x in enumerate(index_set)]

['0. (734nm - 747nm)/(715nm - 720nm)',
 '1. 735nm/[700:710]',
 '2. [8475:8825]**2/([8125:8475]*[8925:9275])',
 '3. [700]*([670]*a + [670] + b)/([670]*(a**2 + 1)**0.5)',
 '4. (1094nm - 1205nm)/(1094nm + 1205nm)',
 '5. 900nm*(680nm + 800nm)/(970nm*(-680nm + 800nm))',
 '6. 0.666666666666667*700nm*(0.2*550nm - 670nm + 0.8*700nm)*sqrt(5*sqrt(670nm) - 6*800nm + (2*800nm + 1)**2 + 0.5)/(670nm*(1.3*550nm - 2.5*670nm + 1.2*800nm))',
 '7. (-BLUE - GREEN + NIR - RED)/(BLUE + GREEN + NIR + RED)',
 '8. [2145:2185]/[2185:2225]',
 '9. (531nm - 570nm)/(531nm + 570nm)',
 '10. Averagereflectancebetween750nmand850nm',
 '11. 2250nm',
 '12. (-550nm + 800nm)/(550nm + 800nm)',
 '13. 0',
 '14. NIR - RED*a',
 '15. 703nm',
 '16. (NIR - RED + y*(-BLUE + RED))/(NIR + RED - y*(-BLUE + RED))',
 '17. [2145:2185]/[2185:2225]',
 '18. 683nm/510nm',
 '19. 760nm*r470/(470nm*r760)',
 '20. [2185:2225]/[2295:2365]',
 '21. (-1/700nm + 1/550nm)/NIR',
 '22. 700nm*(0.2*550nm - 670nm + 0.8*700nm)/670nm',
 '23. 2080nm',
 '24. (-6