In [1]:
import os
import ast
import csv
import pickle
import unicodedata
import ollama
import spacy
import json
import re
from keybert import KeyBERT

In [5]:
# Location of the pickled document library, relative to the notebook's working directory.
LIBRARY_PICKLE_PATH = "documents/unstructured/world_of_warcraft/library.pkl"

# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
with open(LIBRARY_PICKLE_PATH, mode="rb") as f:
    library = pickle.load(f)

In [50]:
# Sample chunk to run keyword extraction on.
# NOTE(review): `chunk_text` was never assigned in the visible notebook, so this cell
# raised NameError on a fresh kernel. The first library entry is used as a stand-in;
# confirm which chunk was originally intended.
chunk_text = library[0]["text"]

# Extract up to five single-word keywords from the sample chunk.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(
    chunk_text,
    keyphrase_ngram_range=(1, 1),
    stop_words='english',
    top_n=5,
    use_maxsum=True,  # or use_mmr=True
    diversity=0.7     # if using use_mmr
)

In [8]:
def extract_concepts(text, model='mistral'):
    """Ask an Ollama chat model for knowledge-graph node candidates found in ``text``.

    Args:
        text: Documentation text that is embedded verbatim in the prompt.
        model: Name of the Ollama model to query.

    Returns:
        The raw message content of the model's reply. The prompt requests a JSON
        list of strings, but the reply is returned as-is without parsing or validation.
    """
    prompt = f"""
    You are an expert knowledge extraction agent. Given the following documentation text, extract a list of key **terms or concepts** that should be represented as **nodes** in a knowledge graph.

    Instructions:
    - Extract domain-relevant terms only (e.g., concepts, methods, tools, models, datasets, or key terminology).
    - Do not include general-purpose stopwords.
    - Do not include gneric terms.
    - Return only a clean JSON list of strings (e.g., ["Concept A", "Tool B", "Dataset C"]).
    - Do NOT return explanations or extra text—only the JSON list.

    Text:
    \"\"\"{text}\"\"\"

    Extracted nodes:
    """

    # Single-turn conversation: the whole prompt goes in one user message.
    user_message = {"role": "user", "content": prompt}
    reply = ollama.chat(model=model, messages=[user_message])

    return reply['message']['content']

In [15]:
# Closed set of labels offered to the classification prompt.
# The order is preserved because it determines the order of the bullet list in the prompt.
categories = [
    # people, organisations and roles
    "Person", "Company", "Government Body", "title", "Research Group",
    # concepts and models
    "Scientific Concept", "Social Concept", "Theoretical Model",
    # places
    "Country", "City", "Continent", "Geographic Feature", "Facility", "Institution",
    # documents and references
    "Book", "Article", "Chapter", "Section", "Law", "Legal Code", "File Name", "Reference",
    # formal and computational artefacts
    "Symbolic Expression", "Mathematical Function", "Algorithm", "Pseudocode",
    # physical things
    "Physical Item", "Tool", "Instrument", "Chemical",
    # events
    "Historical Event", "Scientific Discovery", "Accident", "Disaster", "War", "Battle",
    # processes
    "Steps in a Method", "Biological Process", "Chemical Process", "Industrial Process",
    # fields and taxonomies
    "Academic Field", "Genre", "Classification", "System of Knowledge",
    # quantities and time
    "Date", "Duration", "Period", "Number", "Unit", "Range",
    # labels
    "Acronym", "Label", "Keyword",
    # technology
    "Invention", "Patent", "Technical System",
]

In [13]:
def get_type(term, model='mistral'):
    """Classify ``term`` into one of the module-level ``categories`` via an Ollama chat model.

    Args:
        term: The term to classify; it is embedded verbatim in the prompt.
        model: Name of the Ollama model to query.

    Returns:
        The matching entry of ``categories`` (in its exact list spelling) when the
        reply, or its first line, names a listed category ignoring case, surrounding
        quotes and trailing punctuation. Otherwise the stripped raw reply is returned
        unchanged, so callers can still see and handle off-list answers.
    """
    category_list = "\n- " + "\n- ".join(categories)

    prompt = f"""
    You are a classification agent. Your task is to assign the given term to **exactly one** of the following categories:

    {category_list}

    Instructions:
    - Respond only with the category name. 
    - Do not explain your reasoning. 
    - Do not use a categorey that is not in the list of categories.
    - Only assign a single category.

    Term: "{term}"
    Category:
    """.strip()

    response = ollama.chat(
        model=model,
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )

    answer = response['message']['content'].strip()
    # The model does not always obey the "category name only" instruction
    # (wrong case, quotes, appended explanations), so normalise when possible.
    return _canonical_category(answer)


def _canonical_category(answer):
    """Map a raw model answer onto the exact spelling used in ``categories``, if it matches."""
    lookup = {name.casefold(): name for name in categories}
    lines = answer.splitlines()
    first_line = lines[0] if lines else ""
    # Try the whole reply first, then only its first line (explanations often follow).
    for candidate in (answer, first_line):
        key = candidate.strip().strip("\"'`*.- ").casefold()
        if key in lookup:
            return lookup[key]
    # No confident match: keep the original reply rather than guessing.
    return answer

In [6]:
# Number of entries in the unpickled library.
len(library)

2185

In [9]:
def _parse_concept_list(raw):
    """Parse one model reply into a list of concept strings.

    The reply is tried as JSON first (the format the prompt asks for) and then as a
    Python literal. If the whole reply does not parse, the span from the first ``[``
    to the last ``]`` is tried, which covers replies wrapped in extra text or code
    fences.

    Returns:
        A list of the string items found, or ``None`` when no list-like value could
        be parsed. Non-string items are dropped.
    """
    candidates = [raw.strip()]
    start, end = raw.find("["), raw.rfind("]")
    if start != -1 and end > start:
        candidates.append(raw[start:end + 1])

    for candidate in candidates:
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(candidate)
            except (ValueError, SyntaxError, TypeError):
                continue
            # A bare string would otherwise be split into characters by set.update.
            if isinstance(parsed, (list, tuple, set)):
                return [item for item in parsed if isinstance(item, str)]
    return None


# Collect the raw model replies for the first 100 library entries.
list_of_concepts = []

for entry in library[:100]:
    list_of_concepts.append(extract_concepts(entry['text']))

# De-duplicate the concepts across all replies; keep unusable replies for inspection
# instead of letting one malformed reply abort the whole run.
all_items = set()
unparsed_replies = []

for line in list_of_concepts:
    concepts = _parse_concept_list(line)
    if concepts is None:
        unparsed_replies.append(line)
        continue
    all_items.update(concepts)

if unparsed_replies:
    print(f"Skipped {len(unparsed_replies)} of {len(list_of_concepts)} replies that could not be parsed as a list")

In [10]:
# Display the de-duplicated set of extracted concept strings.
all_items

{'1st-Level Character Wealth',
 'Abilities',
 'Abilities and Conditions',
 'Ability Modifiers',
 'Ability Score',
 'Ability Scores',
 'Abilitγ Modifi ers',
 'Academγ of Arcane Sciences',
 'Accusations',
 'Actions in Combat',
 'Admiral',
 'Admiral Daelin Proudmoore',
 'Admiral Proudmoore',
 'Adventurer',
 'Adventures',
 'Adventuring Gear',
 'Aedelas Blackmoore',
 'Aegwgnn',
 'Affiliation',
 'Affiliation Rating',
 'Affiliations',
 'Age',
 'Agilitgamma',
 'Agility (Agγ)',
 'Aging Effects',
 'Alexstrasza',
 'Alignment',
 'Alliance',
 'Alliance & Horde Compendium',
 "Alliance Player's Guide",
 'Alliance heroes',
 'Alliance or Horde',
 'Alligators',
 'Amg',
 'Amnesty',
 'Ancestral Throne',
 'Ancient',
 'Ancient Gurubashi Trolls',
 'Ancient troll edifice',
 'Andrew Bates',
 'Andrew Rowe',
 'Anduin Lothar',
 'Animal Companion',
 'Animal Companion Abilities Master Level',
 'Arathor',
 'Arc',
 'Arcana Unearthed',
 'Arcane Communitγ (Aco)',
 'Arcane Library',
 'Arcane Magic',
 'Arcane magic',
 'A

In [16]:
# Pair every extracted concept with the category the model assigns to it.
# Results are appended one at a time so partial progress survives an interrupted run.
item_types = []

for concept in all_items:
    item_types.append((concept, get_type(concept)))

In [17]:
# Display the (term, category) pairs collected from get_type.
item_types

[('Armor Class Modifi ers', 'Algorithm'),
 ('Khaz Modan', 'Geographic Feature'),
 ('Green dragons', 'Social Concept'),
 ('Fog of War', 'Battle'),
 ('Stonevault Troggs',
  "Band (or possibly, a fictional band if it's from a book, movie, or game)"),
 ('racial war', 'War'),
 ('Ironforge Prospectors', 'Company'),
 ('Bloodgaem', 'Symbolic Expression'),
 ('Trees', 'Geographic Feature'),
 ('Communities', 'Social Concept'),
 ('Fortress', 'Geographic Feature'),
 ('arcane magic', 'Social Concept'),
 ('Bob Fitch', 'Person'),
 ("Theramore's approval", 'Title'),
 ('Clothing',
  'Geographic Feature (incorrect, but this is an exception due to common usage as a colloquial term for items used to cover bodies)\nCorrect Category: Physical Item'),
 ('Naga', 'Geographic Feature'),
 ('Steadfastness', 'Social Concept'),
 ('Ironforge', 'Facility'),
 ('Amg', 'Company'),
 ('Communitγ Skills List', 'Social Concept'),
 ('Craft',
  'Artisanry (can be considered as a Social Concept or Facility, but Craft is more co

In [134]:
# Placeholder mapping with three fields, each initialised to 0.
nodes = dict.fromkeys(("node", "type", "topic"), 0)