In [5]:
import sys
import typing

import spacy
import numpy as np
import lxml.etree
import requests

# python -m spacy download en_core_web_md
english: spacy.lang.en.English = spacy.load("en_core_web_md")


class Node:
    def __init__(self) -> None:
        self.xpath: str = '/'
        self.element: lxml.etree._Element = lxml.etree._Element()
        self.vector: typing.Optional[np.array] = None
        self.position: int = 0

    def __repr__(self) -> str:
        return "<Node: {}>".format(self.element)

    def get_parent(self) -> typing.Optional[lxml.etree._Element]:
        return self.element.getparent()

    def get_children(self) -> typing.Generator[lxml.etree._Element, None, None]:
        yield from self.element

    def __get_tag(self) -> str:
        return self.element.tag

    def __get_text(self) -> str:
        return ' '.join([
            self.element.text or '',
            self.element.tag or '',
        ]).strip()

    def __get_attributes(self) -> dict:
        return self.element.attrib

    def get_shape(self) -> tuple:
        return (5, 300)

    def get_vector(self) -> np.array:
        if self.vector is None:
            tag: str = self.__get_tag()
            text: str = self.__get_text()
            x1: np.array = english(tag).vector
            x2: np.array = english(text).vector
            x3: np.array = np.zeros(x1.shape)
            x4: np.array = np.array([self.position, ] * self.get_shape()[1])
            x5: np.array = english(' '.join([
                name.split('[')[0]
                for name in self.xpath.split('/')
            ])).vector
            for key, value in self.__get_attributes().items():
                x3 += english(tag).vector * english(value).vector
            self.vector: np.array = np.array([
                x1,  # Tag type.
                x2,  # Text vector.
                x3,  # Numeric representation of attributes.
                x4,  # Indicator of vertical position.
                x5,  # Numeric representation of xpath.
            ])
            assert self.vector.shape == self.get_shape()
        return self.vector

    def __add__(self, node: 'Node') -> 'Node':
        assert isinstance(node, self.__class__)
        self.vector = self.get_vector() + node.get_vector()
        return self


class Html2Vec:
    def __init__(self) -> None:
        self.relatives: int = 5

    def __repr__(self) -> str:
        return "<Model: {}>".format(self.__class__.__name__)

    def fit(self, text: str) -> typing.Generator[Node, None, None]:
        assert isinstance(text, str)
        assert text
        html: lxml.etree.HTML = lxml.etree.HTML(text)
        root: lxml.etree._ElementTree = html.getroottree()
        total_nodes: int = len(root.xpath(".//*"))
        index: dict = {}
        for i, element in enumerate(html.iter()):
            xpath: str = root.getpath(element)
            node: Node = Node()
            node.position = i / total_nodes
            node.element = element
            node.xpath = xpath
            index[xpath] = node
        for level in range(self.relatives):
            for node in index.values():
                if node.get_parent() is not None:
                    xpath: str = root.getpath(node.get_parent())
                    parent: Node = index[xpath]
                    node += parent
                for element in node.get_children():
                    xpath: str = root.getpath(element)
                    child: Node = index[xpath]
                    node += child
        yield from index.values()

In [7]:
url = "https://investors.3m.com/governance/corporate-officers/default.aspx"

In [12]:
html = requests.get(url).text

In [None]:
requests.Response = requests.get(url)

In [13]:
model = Html2Vec()
model.relatives = 5
for node in model.fit(html):
    print(node)
    print(node.get_vector())

TypeError: sequence item 1: expected str instance, cython_function_or_method found