In [1]:
import re
import pickle
import networkx as nx
from uuid import uuid4
from PIL import Image
from typing import Iterator
from src import TypeReturn
from itertools import combinations
from tqdm.notebook import tqdm

In [2]:
# Maps a regular expression to the TypeReturn it identifies.
# get_type_return walks this dict in insertion order and applies re.fullmatch,
# so the first pattern that matches the entire line wins.
re2enum = {
    # One or more '#' characters, a space, then the heading text.
    r'^#+ .+': TypeReturn.HEADING,
    # One or more "| cell |" groups, i.e. a single table row.
    r'(\| .+ \| ?)+': TypeReturn.TABLE,
    # Numbered item such as "1. text" or "1.2. text".
    r'^(\d+\.)+ .+': TypeReturn.LIST,
    # Bullet item written as "* text".
    r'^\* .+': TypeReturn.LIST,
    # Markdown image: ![alt](path).
    r'!\[.+\]\(.+\)': TypeReturn.IMAGE,
    # Footnote reference of the form [[n]](#footnote-n).
    r'\[\[\d+\]\]\(#footnote-\d+\)': TypeReturn.FOOTNOTE
}
# Types whose consecutive lines collate_f accumulates and emits as one
# newline-joined element instead of one element per line.
multiline_types = {
    TypeReturn.TABLE,
    TypeReturn.LIST
}


def get_type_return(s:  str) -> TypeReturn:
    """Classify a single markdown line.

    Every pattern of ``re2enum`` is tried, in the table's order, against the
    line as given and against a copy with leading/trailing asterisks removed.
    The first pattern that fully matches either form decides the type.

    Args:
        s: One line of markdown (callers pass it already stripped of
            surrounding whitespace).

    Returns:
        The ``TypeReturn`` of the first matching pattern, or
        ``TypeReturn.TEXT`` when no pattern matches.
    """
    # Emphasis wrapped around the whole line (e.g. ``**![alt](path)**``) would
    # otherwise hide the element from the full-match patterns.
    n = re.sub(r'(^[\*]+)|([\*]+$)', '', s)
    for k, v in re2enum.items():
        # The raw line must be tested as well: stripping asterisks turns
        # ``* item`` into `` item``, so the bullet-list pattern could never
        # match on the stripped copy alone.
        if re.fullmatch(k, s) is not None or re.fullmatch(k, n) is not None:
            return v
    return TypeReturn.TEXT


def collate_f(
    reader: Iterator[str],
    image_root: str = './__output__',
) -> Iterator[tuple[TypeReturn, str, None | Image.Image]]:
    """Turn raw markdown lines into typed elements, merging multi-line blocks.

    Blank lines are skipped. Consecutive lines of the same multi-line type
    (see ``multiline_types``) are joined with ``'\\n'`` and emitted as a single
    element; every other line is emitted on its own, in document order.

    Args:
        reader: Iterable of markdown lines, e.g. an open text file.
        image_root: Directory that image paths found in the markdown are
            resolved against. Defaults to the previously hard-coded location.

    Yields:
        ``(type, text, image)`` tuples. ``image`` is an opened PIL image for
        ``TypeReturn.IMAGE`` elements (``text`` is then the alt text) and
        ``None`` otherwise.
    """
    pending_type = None   # type of the multi-line block being accumulated
    pending_lines = []    # lines collected for that block so far

    for el in reader:
        el = el.strip()
        if el == '':
            continue

        type_ = get_type_return(el)

        # The open block simply continues.
        if pending_type is not None and type_ == pending_type:
            pending_lines.append(el)
            continue

        # Any other element closes the open block. Emitting it *before* the
        # current element keeps document order, and falling through afterwards
        # ensures the current element itself is not lost.
        if pending_type is not None:
            yield pending_type, '\n'.join(pending_lines), None
            pending_type = None
            pending_lines = []

        if type_ in multiline_types:
            # Start a new block (also covers a table directly followed by a
            # list, which must not be merged into one element).
            pending_type = type_
            pending_lines = [el]
            continue

        if type_ == TypeReturn.IMAGE:
            # Alt text and path are captured in one pass. Removing the alt text
            # from the line first would also remove it from the path whenever
            # the path contains it (e.g. ``![logo](logo.png)``).
            # The IMAGE classification guarantees this pattern is present.
            img_text, img_path = re.search(r'!\[(.+)\]\((.+)\)', el).groups()
            yield type_, img_text, Image.open(f'{image_root}/{img_path}')
            continue

        yield type_, el, None

    # A table or list that runs to the end of the input still has to be emitted.
    if pending_type is not None:
        yield pending_type, '\n'.join(pending_lines), None


def get_level(s: str | Image.Image) -> tuple[int, str | Image.Image]:
    """Split a markdown heading into its depth and its text.

    Returns ``(number of leading '#', text without the '#... ' prefix)`` for a
    string that starts with a heading marker. Images and strings without a
    heading marker are returned unchanged with a level of ``-1``.
    """
    level_str = r'^#+ '

    if not isinstance(s, Image.Image):
        heading = re.match(level_str, s)
        if heading is not None:
            # The prefix is the hashes plus the single space that follows them.
            hashes = heading.group().rstrip()
            return len(hashes), s[heading.end():]

    return -1, s


# Synthetic root node that every top-level element hangs from.
root_id = str(uuid4())
# Stack of (node id, heading level) for the headings currently "open";
# the last entry is the parent for the next element. The root uses level -1.
last_seen_ids = [
    (root_id, -1)
]

G = nx.Graph()
G.add_node(
    root_id,
    position=-1,
    text='',
    node_type=TypeReturn.HEADING,
    image=None,
    level=-1
)

with open('./__output__/study_fies_no_uri.md', 'r', encoding='utf-8') as f:
    for i, (type_, line, img) in enumerate(collate_f(f)):
        # current_level is -1 unless the text starts with a '#... ' marker;
        # for headings `line` comes back without that marker.
        current_level, line = get_level(line)

        node_id = str(uuid4())
        G.add_node(
            node_id,
            position=i,
            text=line,
            node_type=type_,
            image=img,
            # Non-heading elements take the level of the heading on top of the
            # stack; this is read before the stack is unwound below.
            level=current_level if current_level != -1 else last_seen_ids[-1][1]
        )

        # A new heading closes every open heading of the same or deeper level.
        # The root (level -1) is never popped because it is below any match.
        while current_level != -1 and last_seen_ids[-1][1] >= current_level:
            last_seen_ids.pop()
        
        # Attach the element to the innermost heading that is still open.
        G.add_edge(last_seen_ids[-1][0], node_id)

        # Headings become the parent of whatever follows them.
        if current_level != -1:
            last_seen_ids.append((node_id, current_level))

In [3]:
# Drop any edge that joins a node to itself. Every edge added while building
# the graph links an existing node to a freshly generated uuid, so this is
# expected to be a no-op safeguard.
G.remove_edges_from(nx.selfloop_edges(G))

In [4]:
# Inspect the direct neighbours of the root together with their attributes.
[f'{id_} -> {G.nodes[id_]}' for id_ in G.neighbors(root_id)]

["476c33e3-22df-4d93-a81c-24b3c2f74d0f -> {'position': 0, 'text': '**МИНИСТЕРСТВО НАУКИ И ВЫСШЕГО ОБРАЗОВАНИЯ РФ**', 'node_type': <TypeReturn.TEXT: 'text'>, 'image': None, 'level': -1}",
 "c083b887-7689-46cf-a074-2ad36fdf7c4e -> {'position': 1, 'text': '**УЧЕБНО-МЕТОДИЧЕСКИЙ КОМПЛЕКС МОДУЛЯ (ДИСЦИПЛИНЫ)', 'node_type': <TypeReturn.TEXT: 'text'>, 'image': None, 'level': -1}",
 "3d54f9b8-019a-4513-b847-43e6d076746d -> {'position': 2, 'text': 'ДЛЯ ОБРАЗОВАТЕЛЬНЫХ ОРГАНИЗАЦИЙ ВЫСШЕГО ОБРАЗОВАНИЯ**', 'node_type': <TypeReturn.TEXT: 'text'>, 'image': None, 'level': -1}",
 "2b5b7066-e0bc-4556-83e3-041757b5455b -> {'position': 3, 'text': 'ФИЛОСОФИЯ', 'node_type': <TypeReturn.TEXT: 'text'>, 'image': None, 'level': -1}",
 "17e40c8b-4a84-485f-85ed-67f4aefae80c -> {'position': 4, 'text': 'ДНК РОССИИ БИБЛИОТЕКА ПРОЕКТА', 'node_type': <TypeReturn.IMAGE: 'image'>, 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=414x156 at 0x25593F39010>, 'level': -1}",
 "ae5a1ec3-c6a7-49e7-aa3a-f171d2349

In [5]:
# Maximum difference in document position for two elements to be linked.
windows_size = 6
# Nodes with more than one neighbour.
with_ancestors = {i for i in G.nodes if len(G[i]) > 1}
new_edge = []

for id_ in tqdm(with_ancestors):
    # Only neighbours that are not themselves in `with_ancestors` may be paired.
    nodes = [n for n in G[id_] if n not in with_ancestors]
    for first_id, second_id in combinations(nodes, 2):
        distance = abs(G.nodes[first_id]['position'] - G.nodes[second_id]['position'])
        if distance <= windows_size:
            new_edge.append((first_id, second_id))

# Edges are added only after the scan so the neighbourhoods stay stable while iterating.
G.add_edges_from(new_edge)

  0%|          | 0/165 [00:00<?, ?it/s]

In [6]:
# Drop any edge that joins a node to itself. The window edges come from
# combinations() over a node's neighbours, which pairs distinct ids, so this
# is expected to be a no-op safeguard.
G.remove_edges_from(nx.selfloop_edges(G))

In [7]:
# Persist the complete graph object.
with open('./__output__/binaries/graph_1.pkl', 'wb') as f:
    pickle.dump(G, f)

# Persist the nodes as a flat list of dicts: the node id under 'id' plus all
# attributes stored on the node.
with open('./__output__/binaries/nodes_data.pkl', 'wb') as f:
    pickle.dump([{'id': k, **v} for k, v in G.nodes(data=True)], f)