In [2]:
import json
from objectbox import (
    Entity,
    Id,
    String,
    Store,
    Box,
    Float32Vector,
    HnswIndex,
    VectorDistanceType,
    
)
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from pathlib import Path
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
rules_chunks_pth = Path("../data/iki/chunks.json")
rules_chunks: list[dict] = json.loads(rules_chunks_pth.read_text())
term_ner_pth = Path("../data/iki/terms.json")
term_ner: list[dict] = json.loads(term_ner_pth.read_text())

In [4]:
rules_chunks[0]

{'id': 'iki_001',
 'type': 'main_title',
 'section': 'title',
 'content': 'Iki ‚Äî –∏–≥—Ä–∞ –æ –º–∞—Å—Ç–µ—Ä–∞—Ö —ç–ø–æ—Ö–∏ –≠–¥–æ. –ü–æ–≥—Ä—É–∑–∏—Ç–µ—Å—å –≤ –º–∏—Ä –Ø–ø–æ–Ω–∏–∏ 1603‚Äì1868 –≥–≥. –í —Ä–∞–π–æ–Ω–µ –ù–∏—Ö–æ–Ω–±–∞—Å–∏ –Ω–∞—Ö–æ–¥–∏—Ç—Å—è —Å–∞–º—ã–π –æ–∂–∏–≤–ª—ë–Ω–Ω—ã–π —Ä—ã–Ω–æ–∫ –≥–æ—Ä–æ–¥–∞. –ù–∞ –≥–ª–∞–≤–Ω–æ–π —É–ª–∏—Ü–µ —Ä–∞–π–æ–Ω–∞ —Ä–∞—Å–ø–æ–ª–æ–∂–∏–ª–∏—Å—å —Å–∞–º—ã–µ —Ä–∞–∑–Ω—ã–µ –ª–∞–≤–∫–∏ –∏ —Ç–æ—Ä–≥–æ–≤—Ü—ã, –ø—Ä–∏–≥–ª–∞—à–∞—é—â–∏–µ –ø–æ–∫—É–ø–∞—Ç–µ–ª–µ–π –∑–∞–≥–ª—è–Ω—É—Ç—å –∫ –Ω–∏–º –Ω–∞ –æ–≥–æ–Ω—ë–∫. –ù–µ–ø–æ–¥–∞–ª—ë–∫—É –æ—Ç –≥–ª–∞–≤–Ω–æ–π —É–ª–∏—Ü—ã –Ω–∞—Ö–æ–¥–∏—Ç—Å—è —Ä—ã–Ω–æ–∫ –º–æ—Ä–µ–ø—Ä–æ–¥—É–∫—Ç–æ–≤, –≥–¥–µ —Ä—ã–±–∞–∫–∏ —Å–±—ã–≤–∞—é—Ç —Å–≤–æ–π —É–ª–æ–≤ ‚Äî –æ—Ç—Ç—É–¥–∞ —Å–ª—ã—à–∞—Ç—Å—è –≥–æ–ª–æ—Å–∞ –ø–æ–∫—É–ø–∞—Ç–µ–ª–µ–π –∏ —Ç–æ—Ä–≥—É—é—â–µ–µ —Ä—ã–±–æ–π. –†–µ–º–µ—Å–ª–µ–Ω–Ω–∏–∫–∏ –∏ –º–∞—Å—Ç–µ—Ä–∞ –∂–∏–≤—É—Ç –≤ –¥–æ–º–∞—Ö-–Ω–∞–≥–∞—è –≤–æ–∫—Ä—É–≥ –≥–ª–∞–≤–Ω–æ–π —É–ª–∏—Ü—ã. –ò—Ö —É–º–µ–Ω–∏—è –∏ —Ç–æ–≤–∞—Ä—ã –ø–æ–º–æ–≥–∞—é—Ç –ø—Ä–æ—Å—Ç—ã–º –ª—é–¥—è–º –≤ –≠–¥–

In [5]:
term_ner[0]

{'id': 'term_iki_001',
 'name': '–û—è–∫–∞—Ç–∞',
 'slug': 'oyakata',
 'kind': 'TERM',
 'path': 'iki/game_components/figures/oyakata',
 'group': 'game_components',
 'definition': '–§–∏–≥—É—Ä–∫–∞ –∏–≥—Ä–æ–∫–∞, –∫–æ—Ç–æ—Ä–æ–π –æ–Ω –ø–µ—Ä–µ–º–µ—â–∞–µ—Ç—Å—è –ø–æ –≥–ª–∞–≤–Ω–æ–π —É–ª–∏—Ü–µ –ù–∏—Ö–æ–Ω–±–∞—Å–∏, –∑–∞–∫–ª—é—á–∞–µ—Ç —Å–¥–µ–ª–∫–∏ –≤ –º–∞–≥–∞–∑–∏–Ω–∞—Ö –∏ –∞–∫—Ç–∏–≤–∏—Ä—É–µ—Ç –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –≤ –∫–æ–º–Ω–∞—Ç–∞—Ö –Ω–∞–¥ –º–∞–≥–∞–∑–∏–Ω–∞–º–∏.',
 'extra': {'used_in': ['—Ñ–∞–∑–∞ B', '–∑–∞–∫–ª—é—á–µ–Ω–∏–µ —Å–¥–µ–ª–æ–∫', '–¥–≤–∏–∂–µ–Ω–∏–µ'],
  'symbols': ['üßë\u200dü¶∞']}}

In [6]:
for rule in rules_chunks:
    tags = " ".join(
        [
            f"#section:{rule['section']}",
            f"#type:{rule['type']}",
        ]
    )
    rule["scenario"] = f"{tags}\n---\n{rule["scenario"]}"
    rule["req_term"] = yaml.safe_dump(rule["req_term"], allow_unicode=True)

In [7]:
print(rules_chunks[2]["content"])

–°–æ—Å—Ç–∞–≤ –∏–≥—Ä—ã: 1 –¥–≤—É—Å—Ç–æ—Ä–æ–Ω–Ω–µ–µ –ø–æ–ª–µ –∏ 4 –¥–≤—É—Å—Ç–æ—Ä–æ–Ω–Ω–∏—Ö –ø–ª–∞–Ω—à–µ—Ç–∞ –∏–≥—Ä–æ–∫–∞ (–¥–ª—è 2 –∏–≥—Ä–æ–∫–æ–≤ –∏ 3-4 –∏–≥—Ä–æ–∫–æ–≤). –ö–∞—Ä—Ç—ã: 4 —Å—Ç–∞—Ä—Ç–æ–≤—ã—Ö –∫–∞—Ä—Ç—ã –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π, 56 –∫–∞—Ä—Ç –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π (14 –∫–∞—Ä—Ç –¥–ª—è –∫–∞–∂–¥–æ–≥–æ —Å–µ–∑–æ–Ω–∞): –í–µ—Å–Ω–∞ üå∏, –õ–µ—Ç–æ ‚òÄÔ∏è, –û—Å–µ–Ω—å üçÇ, –ó–∏–º–∞ ‚ùÑÔ∏è. –î–µ—Ä–µ–≤—è–Ω–Ω—ã–µ —ç–ª–µ–º–µ–Ω—Ç—ã: 4 —Ñ–∏–≥—É—Ä–∫–∏ –û—è–∫–∞—Ç–∞ üßë‚Äçü¶∞ (—Ñ–∏–æ–ª–µ—Ç–æ–≤–∞—è, —Å–∏–Ω—è—è, –∂—ë–ª—Ç–∞—è, –∫—Ä–∞—Å–Ω–∞—è), 4 —Ñ–∏–≥—É—Ä–∫–∏ –ò–∫–∏–∑–∞–º–∞ üßë‚Äçü¶± (—Ñ–∏–æ–ª–µ—Ç–æ–≤–∞—è, —Å–∏–Ω—è—è, –∂—ë–ª—Ç–∞—è, –∫—Ä–∞—Å–Ω–∞—è), 16 —Ñ–∏–≥—É—Ä–æ–∫ –ö–æ–±—É–Ω üßë‚Äçü¶≥ (–ø–æ 4 —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã—Ö, 4 —Å–∏–Ω–∏—Ö, 4 –∂—ë–ª—Ç—ã—Ö, 4 –∫—Ä–∞—Å–Ω—ã—Ö), 4 –º–∞—Ä–∫–µ—Ä–∞ –¥–ª—è –ø–æ–¥—Å—á—ë—Ç–∞ –æ—á–∫–æ–≤ –ò–∫–∏ üå∏ (–ø–æ –æ–¥–Ω–æ–º—É —Ü–≤–µ—Ç—É: —Ä–æ–∑–æ–≤—ã–π, —Ñ–∏–æ–ª–µ—Ç–æ–≤—ã–π, –∂—ë–ª—Ç—ã–π, –∫—Ä–∞—Å–Ω—ã–π), 4 –º–∞—Ä–∫–µ—Ä–∞ –æ–≥–Ω–µ—É–ø–æ—Ä–Ω–æ—Å—Ç–∏ üî• (–ø–æ –æ–¥–Ω–æ–º—É —Ü–≤–µ

In [8]:
for term in term_ner:
    tags =  f"#group:{term["group"]}"
    term["content"] = f"{tags}\n---\n{term["name"]}"
    term["extra"] = yaml.safe_dump(term["extra"] if "extra" in term else [], allow_unicode=True)

In [9]:
term_ner[0]

{'id': 'term_iki_001',
 'name': '–û—è–∫–∞—Ç–∞',
 'slug': 'oyakata',
 'kind': 'TERM',
 'path': 'iki/game_components/figures/oyakata',
 'group': 'game_components',
 'definition': '–§–∏–≥—É—Ä–∫–∞ –∏–≥—Ä–æ–∫–∞, –∫–æ—Ç–æ—Ä–æ–π –æ–Ω –ø–µ—Ä–µ–º–µ—â–∞–µ—Ç—Å—è –ø–æ –≥–ª–∞–≤–Ω–æ–π —É–ª–∏—Ü–µ –ù–∏—Ö–æ–Ω–±–∞—Å–∏, –∑–∞–∫–ª—é—á–∞–µ—Ç —Å–¥–µ–ª–∫–∏ –≤ –º–∞–≥–∞–∑–∏–Ω–∞—Ö –∏ –∞–∫—Ç–∏–≤–∏—Ä—É–µ—Ç –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π –≤ –∫–æ–º–Ω–∞—Ç–∞—Ö –Ω–∞–¥ –º–∞–≥–∞–∑–∏–Ω–∞–º–∏.',
 'extra': 'symbols:\n- üßë\u200dü¶∞\nused_in:\n- —Ñ–∞–∑–∞ B\n- –∑–∞–∫–ª—é—á–µ–Ω–∏–µ —Å–¥–µ–ª–æ–∫\n- –¥–≤–∏–∂–µ–Ω–∏–µ\n',
 'content': '#group:game_components\n---\n–û—è–∫–∞—Ç–∞'}

In [10]:
from objectbox import Int16


@Entity()
class Rule:
    id = Id
    internal_id = String
    content = String
    section = String
    game= String
    req_term = String
    scenario = String
    priority = Int16
    zone = String
    vector = Float32Vector(
        index=HnswIndex(dimensions=768, distance_type=VectorDistanceType.EUCLIDEAN)
    )
   
@Entity()
class Terminology:
    id = Id
    internal_id = String
    game = String
    content = String
    name = String
    slug = String
    kind = String
    path = String
    group = String
    definition = String
    extra = String
    vector = Float32Vector(
        index = HnswIndex(dimensions=768)
    )

@Entity()
class Game:
    id = Id
    name = String
    latin_name = String
    vector = Float32Vector(
        index = HnswIndex(dimensions=768)
    )

In [11]:
store = Store(directory="../db")
rules_box = Box(store=store, entity=Rule)
term_ner_box = Box(store=store, entity=Terminology)
game_box = Box(store=store, entity=Game)

---

In [12]:
model = SentenceTransformer("../model").to("cpu")

In [13]:
box_objects = [
    Rule(
        internal_id=obj["id"],
        content=obj["content"],
        section=obj["section"],
        game="Iki",
        req_term=obj["req_term"],
        scenario=obj["scenario"],
        priority=obj["priority"],
        zone=obj["zone"],
        vector=model.encode(obj["scenario"]).tolist(),
    )
    for obj in tqdm(rules_chunks)
]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52/52 [00:10<00:00,  4.94it/s]


In [14]:
box_terms = [
    Terminology(
        internal_id=obj["id"],
        content=obj["content"],
        name=obj["name"],
        slug=obj["slug"],
        kind=obj["kind"],
        path=obj["path"],
        group=obj["group"],
        definition=obj["definition"],
        extra=obj["extra"],
        vector=model.encode(obj["content"]).tolist(),
    )
    for obj in tqdm(term_ner)
]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:06<00:00,  5.05it/s]


In [15]:
box_games = [
    Game(
        name="–ü–æ–¥–∑–µ–º–µ–ª—å–µ –∏ –ø—ë—Å–∏–∫–∏",
        latin_name="Podzemelie i pesiki",
        vector=model.encode("–ü–æ–¥–∑–µ–º–µ–ª—å–µ –∏ –ø—ë—Å–∏–∫–∏").tolist(),
    ),
    Game(
        name="Iki",
        latin_name="Iki",
        vector=model.encode("Iki").tolist(),
    )
]

In [16]:
rules_box.put(box_objects)
term_ner_box.put(box_terms)
game_box.put(box_games)

In [17]:
store.close()