Antonio van Dijck

studentnumber: 12717673

Email: antonio.van.dijck@student.uva.nl

The Context-Aware Knowledge Extraction Notebook


# imports 
The imports are also provided in the requirements.txt file. 

Some of the imports used in this notebook are macOS-specific and may need to be altered to support other operating systems.

In [None]:
!pip uninstall -y llama-cpp
!pip install "chonkie[semantic]"
!pip install faiss-cpu
!pip install sentence_transformers
!pip install transformers
!pip install llama-cpp-python tqdm nltk



## MACOS ONLY, SEE LLAMA.CPP GITHUB FOR OTHER PLATFORMS

In [None]:
!CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir

In [None]:
!CMAKE_ARGS="-DGGML_METAL=on" FORCE_CMAKE=1 python3 -m pip install "git+https://github.com/abetlen/llama-cpp-python.git@refs/pull/1901/head" --force-reinstall --upgrade --no-cache-dir

## download model

In [None]:
from llama_cpp import Llama

# Fetch the Q4_K_M GGUF build of Qwen2.5-7B-Instruct from the Hugging Face repo
# below and place it in the local "models" directory. The later cells load the
# model from "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf".
llm = Llama.from_pretrained(
    repo_id="bartowski/Qwen2.5-7B-Instruct-GGUF",
    filename="*Q4_K_M.gguf",  # presumably a wildcard pattern matched against the repo's file names
    verbose=False,
    local_dir="models",
)

# Knowledge Extraction
The knowledge extraction is done using the Qwen 2.5 model. The model is downloaded from the Hugging Face Hub.

The model extracts knowledge from each semantically chunked sentence(s) and stores it in a dictionary.

The resulting dictionary is then used to generate vector representations of the knowledge extracted from the text.


In [None]:
import json
import os
import pickle
import threading
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from json import JSONDecodeError

# llama singleton to ensure one model is used for ram usage efficiency
class LlamaSingleton:
    """Holder that keeps at most one loaded ``Llama`` model per process.

    Calling the class returns the cached holder when the requested
    configuration matches the one already loaded. When a different
    ``model_path``, ``chat_format`` or ``n_ctx`` is requested, the cached
    holder is replaced by a freshly loaded model instead of silently
    returning the old one.

    Attributes:
        llm: the loaded ``Llama`` object.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_path="models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", chat_format="chatml", n_ctx=2048):
        """Return the holder for the requested model, loading it if needed.

        Args:
            model_path: path of the GGUF file handed to ``Llama``.
            chat_format: chat template name handed to ``Llama``.
            n_ctx: context size handed to ``Llama``.
        """
        requested = (model_path, chat_format, n_ctx)
        with cls._lock:
            cached = cls._instance
            if cached is not None and getattr(cached, "_config", None) == requested:
                return cached
            # Drop this class's references to the previous holder before
            # loading the next model, so the old one is not kept alive by the
            # singleton while the new one is being loaded.
            cls._instance = None
            del cached
            holder = super().__new__(cls)
            holder.llm = Llama(model_path=model_path, chat_format=chat_format, n_ctx=n_ctx)
            holder._config = requested
            # Publish only after a successful load: a failed load must not
            # leave behind a singleton without an ``llm`` attribute.
            cls._instance = holder
            return holder

# chatbot class to extract knowledge from chunks
class Chatbot:
    """Extract subject/predicate/object triplets from text and index them.

    Triplets are persisted to a JSON file; embeddings of the new triplets are
    added to a FAISS index that is pickled together with the triplet list.
    """

    def __init__(self,
                 messages_file='messages.json',
                 knowledge_file='knowledge.json',
                 faiss_index_file='faiss_index.pkl',
                 model_name='all-MiniLM-L6-v2'):
        """Load the LLM and embedding model, create missing JSON files and
        restore a previously pickled index if one exists.

        Args:
            messages_file: JSON file created (as an empty list) if missing.
            knowledge_file: JSON file holding the list of triplets.
            faiss_index_file: pickle file holding ``(index, knowledge_data)``.
            model_name: name passed to ``SentenceTransformer``.
        """
        self.messages_file = messages_file
        self.knowledge_file = knowledge_file
        self.faiss_index_file = faiss_index_file
        self.llm = LlamaSingleton().llm
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.knowledge_data = []
        self.initialize_files()
        self.load_faiss_index()

    def initialize_files(self):
        """Create the messages and knowledge files as empty JSON lists if absent."""
        for file in [self.messages_file, self.knowledge_file]:
            if not os.path.exists(file):
                with open(file, 'w') as f:
                    json.dump([], f)

    def load_json_data(self, file_path):
        """Return the decoded JSON content of ``file_path``."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def save_json_data(self, file_path, data):
        """Overwrite ``file_path`` with ``data`` serialized as indented JSON."""
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)

    def extract_valuable_knowledge(self, message):
        """Ask the model for the triplets contained in ``message``.

        Sends the chunk text to the model and asks it to return JSON with
        subject/predicate/object. No timestamps are generated by the model.

        Args:
            message: the chunk text to analyse.

        Returns:
            A list of dicts that each carry string ``subject``, ``predicate``
            and ``object`` entries; empty when the reply cannot be parsed or
            contains no well-formed triplet.
        """
        response = self.llm.create_chat_completion(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a knowledge extractor. Try to extract any knowledge.\n"
                        "Return ONLY JSON with the following schema:\n"
                        "{\n"
                        "  \"valuable_knowledge\": [\n"
                        "    {\n"
                        "      \"subject\": \"...\",\n"
                        "      \"predicate\": \"...\",\n"
                        "      \"object\": \"...\"\n"
                        "    }\n"
                        "  ]\n"
                        "}\n"
                        "If no knowledge can be extracted, return:\n"
                        "{\"valuable_knowledge\": []}"
                    )
                },
                {"role": "user", "content": message},
            ],
            response_format={
                # llama-cpp-python's JSON schema mode is selected with
                # "json_object"; with any other type the schema is not applied.
                "type": "json_object",
                "schema": {
                    "type": "object",
                    "properties": {
                        "valuable_knowledge": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "subject": {"type": "string"},
                                    "predicate": {"type": "string"},
                                    "object": {"type": "string"}
                                },
                                "required": ["subject", "predicate", "object"]
                            }
                        }
                    },
                    "required": ["valuable_knowledge"],
                },
            },
            temperature=0.5,
        )
        try:
            knowledge_data = json.loads(response['choices'][0]['message']['content'])
        except (JSONDecodeError, KeyError, IndexError, TypeError) as err:
            # Best effort: a bad reply for one chunk must not stop the run,
            # but it should be visible in the output.
            print(f"Could not parse model output as JSON: {err}")
            return []
        print("Extracted knowledge from a chunk:", knowledge_data)
        return self._valid_triplets(knowledge_data)

    @staticmethod
    def _valid_triplets(payload):
        """Return the well-formed triplets found in a decoded model reply.

        Anything that is not a dict with a ``valuable_knowledge`` list yields
        an empty list. Items are kept only when they are dicts whose
        ``subject``, ``predicate`` and ``object`` are strings, so they can be
        used as a deduplication key and formatted for embedding.
        """
        if not isinstance(payload, dict):
            return []
        candidates = payload.get("valuable_knowledge")
        if not isinstance(candidates, list):
            return []
        required = ("subject", "predicate", "object")
        return [
            item for item in candidates
            if isinstance(item, dict)
            and all(isinstance(item.get(key), str) for key in required)
        ]

    def save_knowledge(self, triplets):
        """Persist unseen triplets and add them to the FAISS index.

        Triplets whose (subject, predicate, object) already occur in the
        knowledge file are skipped. We do not add any timestamps here.

        Args:
            triplets: list of dicts with ``subject``, ``predicate``, ``object``.
        """
        if not triplets:
            return
        knowledge = self.load_json_data(self.knowledge_file)
        existing_set = {(t['subject'], t['predicate'], t['object']) for t in knowledge}
        new_triplets = []
        for triplet in triplets:
            key = (triplet['subject'], triplet['predicate'], triplet['object'])
            if key not in existing_set:
                knowledge.append(triplet)
                new_triplets.append(triplet)
                existing_set.add(key)
        self.save_json_data(self.knowledge_file, knowledge)
        if new_triplets:
            self.update_faiss_index(new_triplets)

    def update_faiss_index(self, triplets):
        """Embed ``triplets`` as "subject predicate object" and index them.

        The index is created on first use with the embedding width; the
        triplets are appended to ``knowledge_data`` in the same order as their
        vectors, and the pair is pickled afterwards.
        """
        texts = [f"{t['subject']} {t['predicate']} {t['object']}" for t in triplets]
        embeddings = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(np.array(embeddings, dtype=np.float32))
        self.knowledge_data.extend(triplets)
        self.save_faiss_index()

    def save_faiss_index(self):
        """Pickle ``(index, knowledge_data)`` to ``faiss_index_file``."""
        with open(self.faiss_index_file, 'wb') as f:
            pickle.dump((self.index, self.knowledge_data), f)

    def load_faiss_index(self):
        """Restore ``index`` and ``knowledge_data`` from the pickle, if present.

        Without a pickle file both are reset to their empty state.
        """
        if os.path.exists(self.faiss_index_file):
            # NOTE(security): unpickling executes code embedded in the file;
            # only load index files this notebook produced itself.
            with open(self.faiss_index_file, 'rb') as f:
                self.index, self.knowledge_data = pickle.load(f)
        else:
            self.index = None
            self.knowledge_data = []

def main():
    """Extract knowledge from every chunk in ``output_chunks.json``.

    Each chunk's text is sent to the chatbot; the triplets it returns are
    tagged with the chunk's ``start``/``end`` values and then saved.
    """
    with open('output_chunks.json', 'r', encoding='utf-8') as chunk_file:
        payload = json.load(chunk_file)
    chunks = payload.get("chunks", [])
    print(f"Total chunks loaded: {len(chunks)}")

    extractor = Chatbot()

    for number, chunk in enumerate(chunks, 1):
        text = chunk.get("text", "")
        begin = chunk.get("start")
        finish = chunk.get("end")

        print(f"\nProcessing chunk {number} (Start: {begin}, End: {finish})")

        triplets = extractor.extract_valuable_knowledge(text)
        if not triplets:
            # Nothing extracted for this chunk; move on to the next one.
            continue

        # Stamp every triplet with the video timestamps of its source chunk.
        for triplet in triplets:
            triplet.update(start=begin, end=finish)

        extractor.save_knowledge(triplets)

    print("\nKnowledge extraction complete.")
    print("Please check 'knowledge.json' for the extracted valuable knowledge.")


# Run the extraction when this cell/script is executed as the main module.
if __name__ == "__main__":
    main()

# Test generated knowledge base with LLM with the integrated knowledge base

The knowledge base is tested with a LLM that uses the vector database to answer questions.

The extracted knowledge is stored in a vector database and the LLM uses the vector database to answer questions.

The LLM is implemented using the python library `llama_cpp` and `torch`.

The LLM has access to five knowledge items from the vector database; this is specified with a top-k parameter.



In [None]:
import json
import os
import pickle
from datetime import datetime
from llama_cpp import Llama
import threading
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from json import JSONDecodeError

class LlamaSingleton:
    """Holder that keeps at most one loaded ``Llama`` model per process.

    A call with the configuration that is already loaded returns the cached
    holder. A call with a different ``model_path``, ``chat_format`` or
    ``n_ctx`` loads that model and replaces the cached holder, rather than
    silently handing back the previously loaded model.

    Attributes:
        llm: the loaded ``Llama`` object.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_path="models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", chat_format="chatml", n_ctx=2048):
        """Return the holder for the requested model, loading it if needed.

        Args:
            model_path: path of the GGUF file handed to ``Llama``.
            chat_format: chat template name handed to ``Llama``.
            n_ctx: context size handed to ``Llama``.
        """
        requested = (model_path, chat_format, n_ctx)
        with cls._lock:
            cached = cls._instance
            if cached is not None and getattr(cached, "_config", None) == requested:
                return cached
            # Let go of the previous holder before loading the next model so
            # the singleton itself does not keep two models referenced.
            cls._instance = None
            del cached
            holder = super().__new__(cls)
            holder.llm = Llama(model_path=model_path, chat_format=chat_format, n_ctx=n_ctx)
            holder._config = requested
            # Only a fully loaded holder is published as the singleton.
            cls._instance = holder
            return holder

class Chatbot:
    """Interactive chatbot that answers with help of the stored triplets.

    User questions are embedded, the closest triplets are looked up in the
    FAISS index and listed in the system prompt sent to the LLM. Messages
    and triplets are persisted to JSON files.
    """

    def __init__(self,
                 messages_file='messages.json',
                 knowledge_file='knowledge.json',
                 faiss_index_file='faiss_index.pkl',
                 model_name='all-MiniLM-L6-v2'):
        """Load the LLM and embedding model, create missing JSON files and
        restore a previously pickled index if one exists.

        Args:
            messages_file: JSON file holding the list of chat messages.
            knowledge_file: JSON file holding the list of triplets.
            faiss_index_file: pickle file holding ``(index, knowledge_data)``.
            model_name: name passed to ``SentenceTransformer``.
        """
        self.messages_file = messages_file
        self.knowledge_file = knowledge_file
        self.faiss_index_file = faiss_index_file
        self.llm = LlamaSingleton().llm
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.knowledge_data = []
        self.initialize_files()
        self.load_faiss_index()

    def initialize_files(self):
        """Create the messages and knowledge files as empty JSON lists if absent."""
        for file in [self.messages_file, self.knowledge_file]:
            if not os.path.exists(file):
                with open(file, 'w') as f:
                    json.dump([], f)

    def load_json_data(self, file_path):
        """Return the decoded JSON content of ``file_path``."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def save_json_data(self, file_path, data):
        """Overwrite ``file_path`` with ``data`` serialized as indented JSON."""
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)

    def save_message(self, role, content):
        """Append a message with a UTC ISO timestamp to the messages file."""
        messages = self.load_json_data(self.messages_file)
        message = {"role": role, "content": content, "timestamp": datetime.utcnow().isoformat()}
        messages.append(message)
        self.save_json_data(self.messages_file, messages)

    def save_knowledge(self, triplets):
        """Timestamp ``triplets``, persist the unseen ones and index them.

        Triplets whose (subject, predicate, object) already occur in the
        knowledge file are skipped. Note that triplets stored through this
        method carry a ``timestamp`` but not necessarily ``start``/``end``.
        """
        if not triplets:
            return
        knowledge = self.load_json_data(self.knowledge_file)
        existing_set = {(t['subject'], t['predicate'], t['object']) for t in knowledge}
        new_triplets = []
        for triplet in triplets:
            triplet['timestamp'] = datetime.utcnow().isoformat()
            key = (triplet['subject'], triplet['predicate'], triplet['object'])
            if key not in existing_set:
                knowledge.append(triplet)
                new_triplets.append(triplet)
                existing_set.add(key)
        self.save_json_data(self.knowledge_file, knowledge)
        if new_triplets:
            self.update_faiss_index(new_triplets)

    def update_faiss_index(self, triplets):
        """Embed ``triplets`` as "subject predicate object" and index them.

        The index is created on first use with the embedding width; the
        triplets are appended to ``knowledge_data`` in vector order and the
        pair is pickled afterwards.
        """
        texts = [f"{t['subject']} {t['predicate']} {t['object']}" for t in triplets]
        embeddings = self.model.encode(texts)
        if self.index is None:
            self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(np.array(embeddings, dtype=np.float32))
        self.knowledge_data.extend(triplets)
        self.save_faiss_index()

    def save_faiss_index(self):
        """Pickle ``(index, knowledge_data)`` to ``faiss_index_file``."""
        with open(self.faiss_index_file, 'wb') as f:
            pickle.dump((self.index, self.knowledge_data), f)

    def load_faiss_index(self):
        """Restore ``index`` and ``knowledge_data`` from the pickle, if present."""
        if os.path.exists(self.faiss_index_file):
            # NOTE(security): unpickling executes code embedded in the file;
            # only load index files this notebook produced itself.
            with open(self.faiss_index_file, 'rb') as f:
                self.index, self.knowledge_data = pickle.load(f)
        else:
            self.index = None
            self.knowledge_data = []

    def search_knowledge(self, query, top_k=5):
        """Return up to ``top_k`` stored triplets closest to ``query``.

        Returns an empty list when nothing is indexed or ``top_k`` is not
        positive. Result slots the index reports as -1, or positions that
        have no entry in ``knowledge_data``, are skipped instead of raising.
        """
        if self.index is None or not self.knowledge_data or top_k <= 0:
            return []
        query_embedding = self.model.encode([query])
        _distances, indices = self.index.search(np.array(query_embedding, dtype=np.float32), top_k)
        total = len(self.knowledge_data)
        return [self.knowledge_data[pos] for pos in map(int, indices[0]) if 0 <= pos < total]

    @staticmethod
    def _format_triplet(t):
        """Render one triplet as a bullet line for the system prompt.

        ``start``/``end`` are read with ``.get`` because triplets stored by
        ``save_knowledge`` are not guaranteed to carry video timestamps.
        """
        return (
            f"- {t['subject']} {t['predicate']} {t['object']} "
            f"(Videotimestamps: start: {t.get('start')}, end: {t.get('end')})\n"
        )

    def generate_response(self, conversation_history, user_message):
        """Answer ``user_message`` using the five best-matching triplets.

        Args:
            conversation_history: accepted for interface compatibility; it is
                currently NOT forwarded to the model (the prompt consists of
                the system message and the user message only).
            user_message: the question to answer.

        Returns:
            The assistant's reply text.
        """
        knowledge_matches = self.search_knowledge(user_message, top_k=5)
        current_time = datetime.utcnow().isoformat()
        system_message = f"Current date and time: {current_time}\n"
        if knowledge_matches:
            system_message += "Answer based on retrieved knowledge:\n"
            system_message += "".join(self._format_triplet(t) for t in knowledge_matches)
        else:
            system_message += "No direct related knowledge found. Proceeding with general reasoning.\n"
        enriched_history = [
            {"role": "system", "content": f"You are a helpful assistent; {system_message}"},
            {"role": "user", "content": user_message},
        ]
        print(enriched_history)
        response = self.llm.create_chat_completion(
            messages=enriched_history,
            temperature=0.7,
        )['choices'][0]['message']['content']
        return response

    def chat(self):
        """Run the interactive loop until the user types 'exit' or 'quit'.

        Every user and assistant message is appended to the messages file.
        """
        print("Chatbot is ready! Type 'exit' to end the conversation.")
        while True:
            user_message = input("You: ")
            if user_message.lower().strip() in ['exit', 'quit']:
                print("Chatbot: Goodbye!")
                break
            self.save_message(role='user', content=user_message)
            conversation = self.load_json_data(self.messages_file)[-3:]
            assistant_response = self.generate_response(conversation, user_message)
            print(f"Assistant: {assistant_response}")
            self.save_message(role='assistant', content=assistant_response)


# Start the interactive chat when this cell/script is executed as the main module.
if __name__ == "__main__":
    chatbot = Chatbot()
    chatbot.chat()

# EVALUATION PIPELINE
The evaluation pipeline is implemented using the `llama_cpp` library.

The evaluation pipeline is used to evaluate the performance of the LLM.

Gives each LLM a score based on the number of correct answers given by the LLM.

Generates an automatic result by answering questions from the test set with or without the knowledge base.

The pipeline can take multiple models as input and evaluate them all at once, and generate a score for each model. 

The pipeline can also be used to evaluate the performance of the LLM on a specific dataset.

## Initial version of the pipeline

In [None]:
import json
import os
import pickle
from datetime import datetime
from llama_cpp import Llama
import threading
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from json import JSONDecodeError

class LlamaSingleton:
    """Holder that keeps at most one loaded ``Llama`` model per process.

    The cached holder is returned as long as the same ``model_path``,
    ``chat_format`` and ``n_ctx`` are requested. A request for a different
    configuration loads that model and replaces the cached holder instead of
    silently returning the model that was loaded first.

    Attributes:
        llm: the loaded ``Llama`` object.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_path="models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", chat_format="chatml", n_ctx=2048):
        """Return the holder for the requested model, loading it if needed.

        Args:
            model_path: path of the GGUF file handed to ``Llama``.
            chat_format: chat template name handed to ``Llama``.
            n_ctx: context size handed to ``Llama``.
        """
        requested = (model_path, chat_format, n_ctx)
        with cls._lock:
            cached = cls._instance
            if cached is not None and getattr(cached, "_config", None) == requested:
                return cached
            # Release the previous holder first so this class does not keep
            # the old model referenced while the new one is loaded.
            cls._instance = None
            del cached
            holder = super().__new__(cls)
            holder.llm = Llama(model_path=model_path, chat_format=chat_format, n_ctx=n_ctx)
            holder._config = requested
            # Publish after the load succeeded, never a half-built holder.
            cls._instance = holder
            return holder

class Chatbot:
    """Read-only chatbot that answers questions using the stored triplets.

    The pickled FAISS index is loaded at start-up; for each question the
    closest triplets are listed in the system prompt sent to the LLM.
    """

    def __init__(self,
                 messages_file='messages.json',
                 knowledge_file='knowledge.json',
                 faiss_index_file='faiss_index.pkl',
                 model_name='all-MiniLM-L6-v2'):
        """Load the LLM and embedding model, create missing JSON files and
        restore a previously pickled index if one exists.

        Args:
            messages_file: JSON file created (as an empty list) if missing.
            knowledge_file: JSON file created (as an empty list) if missing.
            faiss_index_file: pickle file holding ``(index, knowledge_data)``.
            model_name: name passed to ``SentenceTransformer``.
        """
        self.messages_file = messages_file
        self.knowledge_file = knowledge_file
        self.faiss_index_file = faiss_index_file
        self.llm = LlamaSingleton().llm
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.knowledge_data = []
        self.initialize_files()
        self.load_faiss_index()

    def initialize_files(self):
        """Create the messages and knowledge files as empty JSON lists if absent."""
        for file in [self.messages_file, self.knowledge_file]:
            if not os.path.exists(file):
                with open(file, 'w') as f:
                    json.dump([], f)

    def load_json_data(self, file_path):
        """Return the decoded JSON content of ``file_path``."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def load_faiss_index(self):
        """Restore ``index`` and ``knowledge_data`` from the pickle, if present."""
        if os.path.exists(self.faiss_index_file):
            # NOTE(security): unpickling executes code embedded in the file;
            # only load index files this notebook produced itself.
            with open(self.faiss_index_file, 'rb') as f:
                self.index, self.knowledge_data = pickle.load(f)
        else:
            self.index = None
            self.knowledge_data = []

    def search_knowledge(self, query, top_k=5):
        """Return up to ``top_k`` stored triplets closest to ``query``.

        Returns an empty list when nothing is indexed or ``top_k`` is not
        positive. Result slots the index reports as -1, or positions that
        have no entry in ``knowledge_data``, are skipped instead of raising.
        """
        if self.index is None or not self.knowledge_data or top_k <= 0:
            return []
        query_embedding = self.model.encode([query])
        _distances, indices = self.index.search(np.array(query_embedding, dtype=np.float32), top_k)
        total = len(self.knowledge_data)
        return [self.knowledge_data[pos] for pos in map(int, indices[0]) if 0 <= pos < total]

    @staticmethod
    def _format_triplet(t):
        """Render one triplet as a bullet line for the system prompt.

        ``start``/``end`` are read with ``.get`` so a stored triplet without
        video timestamps does not abort the answer with a ``KeyError``.
        """
        return (
            f"- {t['subject']} {t['predicate']} {t['object']} "
            f"(Videotimestamps: start: {t.get('start')}, end: {t.get('end')})\n"
        )

    def generate_response(self, user_message):
        """Answer ``user_message`` using the five best-matching triplets.

        Returns:
            The assistant's reply text.
        """
        knowledge_matches = self.search_knowledge(user_message, top_k=5)
        current_time = datetime.utcnow().isoformat()
        system_message = f"Current date and time: {current_time}\n"
        if knowledge_matches:
            system_message += "Answer based on retrieved knowledge, but only if it relates to the question:\n"
            system_message += "".join(self._format_triplet(t) for t in knowledge_matches)
        else:
            system_message += "\n"
        enriched_history = [
            {"role": "system", "content": f"You are a helpful assistent; {system_message}"},
            {"role": "user", "content": user_message},
        ]
        print(enriched_history)
        response = self.llm.create_chat_completion(
            messages=enriched_history,
            temperature=0.5,
        )['choices'][0]['message']['content']
        return response

    def chat(self):
        """Run the interactive loop until the user types 'exit' or 'quit'."""
        print("Chatbot is ready! Type 'exit' to end the conversation.")
        while True:
            user_message = input("You: ")
            if user_message.lower().strip() in ['exit', 'quit']:
                print("Chatbot: Goodbye!")
                break
            assistant_response = self.generate_response(user_message)
            print(f"Assistant: {assistant_response}")

# Start the interactive chat when this cell/script is executed as the main module.
if __name__ == "__main__":
    chatbot = Chatbot()
    chatbot.chat()

## Final evaluation script

In [None]:
import json
import os
import pickle
import threading
import sys
from datetime import datetime
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from json import JSONDecodeError

class LlamaSingleton:
    """Holder that keeps at most one loaded ``Llama`` model per process.

    The cached holder is reused only while the same ``model_path``,
    ``chat_format`` and ``n_ctx`` are requested. Requesting a different
    configuration loads that model and replaces the cached holder. Without
    this, every call after the first would return the first model regardless
    of the ``model_path`` passed in, so a loop over several model files would
    evaluate the same model each time.

    Attributes:
        llm: the loaded ``Llama`` object.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, model_path="models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", chat_format="chatml", n_ctx=2048):
        """Return the holder for the requested model, loading it if needed.

        Args:
            model_path: path of the GGUF file handed to ``Llama``.
            chat_format: chat template name handed to ``Llama``.
            n_ctx: context size handed to ``Llama``.
        """
        requested = (model_path, chat_format, n_ctx)
        with cls._lock:
            cached = cls._instance
            if cached is not None and getattr(cached, "_config", None) == requested:
                return cached
            # Release the previous holder before loading the next model so
            # this class does not keep two models referenced at once; the old
            # model can then be reclaimed once callers drop their references
            # (presumably freeing its memory -- confirm for llama-cpp-python).
            cls._instance = None
            del cached
            holder = super().__new__(cls)
            holder.llm = Llama(model_path=model_path, chat_format=chat_format, n_ctx=n_ctx)
            holder._config = requested
            # Publish only after a successful load: a failed load must not
            # leave behind a singleton without an ``llm`` attribute.
            cls._instance = holder
            return holder

class Chatbot:
    """Question answerer used by the evaluation, with and without retrieval.

    ``generate_response_with_kb`` lists the closest stored triplets in the
    system prompt; ``generate_response_without_kb`` sends the question with
    a plain system prompt.
    """

    def __init__(self,
                 messages_file='messages.json',
                 knowledge_file='knowledge.json',
                 faiss_index_file='faiss_index.pkl',
                 model_name='all-MiniLM-L6-v2',
                 llm_model_path="models/Qwen2.5-7B-Instruct-Q4_K_M.gguf"):
        """Load the LLM and embedding model, create missing JSON files and
        restore a previously pickled index if one exists.

        Args:
            messages_file: JSON file created (as an empty list) if missing.
            knowledge_file: JSON file created (as an empty list) if missing.
            faiss_index_file: pickle file holding ``(index, knowledge_data)``.
            model_name: name passed to ``SentenceTransformer``.
            llm_model_path: GGUF path passed to ``LlamaSingleton``.
        """
        self.messages_file = messages_file
        self.knowledge_file = knowledge_file
        self.faiss_index_file = faiss_index_file
        # Instantiate LlamaSingleton with the provided model path
        self.llm = LlamaSingleton(model_path=llm_model_path).llm
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.knowledge_data = []
        self.initialize_files()
        self.load_faiss_index()

    def initialize_files(self):
        """Create the messages and knowledge files as empty JSON lists if absent."""
        for file in [self.messages_file, self.knowledge_file]:
            if not os.path.exists(file):
                with open(file, 'w') as f:
                    json.dump([], f)

    def load_json_data(self, file_path):
        """Return the decoded JSON content of ``file_path``."""
        with open(file_path, 'r') as f:
            return json.load(f)

    def load_faiss_index(self):
        """Restore ``index`` and ``knowledge_data`` from the pickle, if present."""
        if os.path.exists(self.faiss_index_file):
            # NOTE(security): unpickling executes code embedded in the file;
            # only load index files this notebook produced itself.
            with open(self.faiss_index_file, 'rb') as f:
                self.index, self.knowledge_data = pickle.load(f)
        else:
            self.index = None
            self.knowledge_data = []

    def search_knowledge(self, query, top_k=5):
        """Return up to ``top_k`` stored triplets closest to ``query``.

        Returns an empty list when nothing is indexed or ``top_k`` is not
        positive. Result slots the index reports as -1, or positions that
        have no entry in ``knowledge_data``, are skipped instead of raising.
        """
        if self.index is None or not self.knowledge_data or top_k <= 0:
            return []
        query_embedding = self.model.encode([query])
        _distances, indices = self.index.search(np.array(query_embedding, dtype=np.float32), top_k)
        total = len(self.knowledge_data)
        return [self.knowledge_data[pos] for pos in map(int, indices[0]) if 0 <= pos < total]

    @staticmethod
    def _format_triplet(t):
        """Render one triplet as a bullet line for the system prompt.

        ``start``/``end`` are read with ``.get`` so a stored triplet without
        video timestamps does not abort the evaluation with a ``KeyError``.
        """
        return (
            f"- {t['subject']} {t['predicate']} {t['object']} "
            f"(Videotimestamps: start: {t.get('start')}, end: {t.get('end')})\n"
        )

    def _ask(self, system_content, user_message):
        """Send one system + user exchange to the LLM and return the reply text."""
        enriched_history = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_message},
        ]
        return self.llm.create_chat_completion(
            messages=enriched_history,
            temperature=0.5,
        )['choices'][0]['message']['content']

    def generate_response_with_kb(self, user_message):
        """Answer ``user_message`` with the five closest triplets in the prompt.

        The retrieved triplets are printed before the model is called.
        """
        knowledge_matches = self.search_knowledge(user_message, top_k=5)
        current_time = datetime.utcnow().isoformat()
        system_message = f"Current date and time: {current_time}\n"
        if knowledge_matches:
            system_message += "Choose one answer based on retrieved knowledge, if it relates to the question:\n"
            system_message += "".join(self._format_triplet(t) for t in knowledge_matches)
        else:
            system_message += "\n"
        print(knowledge_matches)
        return self._ask(f"You are a helpful assistent; {system_message}", user_message)

    def generate_response_without_kb(self, user_message):
        """Answer ``user_message`` without consulting the knowledge base."""
        current_time = datetime.utcnow().isoformat()
        system_message = f"Current date and time: {current_time}\n"
        return self._ask(f"You are a helpful assistent. Choose one answer; {system_message}", user_message)

def run_evaluation(llm_model_path, questions_path):
    """Answer every question with and without the knowledge base and save the result.

    Args:
        llm_model_path: GGUF model path handed to ``Chatbot(llm_model_path=...)``;
            its file name without extension names the result file.
        questions_path: JSON file holding a list of objects with ``question``
            and ``options`` entries.

    The loaded question objects are extended with ``llm_answer_with_kb`` and
    ``llm_answer_without_kb`` and written to ``result_eval_<model>.json``.
    """
    with open(questions_path, 'r', encoding='utf-8') as source:
        questions = json.load(source)

    chatbot = Chatbot(llm_model_path=llm_model_path)

    for entry in questions:
        # Prompt layout: the question, an "Options:" header, one option per line.
        prompt = f"Question: {entry['question']}\nOptions:\n"
        prompt += "".join(f"{opt}\n" for opt in entry['options'])

        # Ask with retrieval first, then without, and record both answers.
        entry["llm_answer_with_kb"] = chatbot.generate_response_with_kb(prompt)
        entry["llm_answer_without_kb"] = chatbot.generate_response_without_kb(prompt)

    # The model's file name without its extension identifies the result file.
    llm_name, _extension = os.path.splitext(os.path.basename(llm_model_path))
    result_filename = f"result_eval_{llm_name}.json"

    with open(result_filename, 'w', encoding='utf-8') as sink:
        json.dump(questions, sink, indent=4)

    print(f"Evaluation complete. Results saved to {result_filename}")

if __name__ == "__main__":
    # Every model listed below is evaluated against the same question file.
    questions_path = "questions.json"
    model_paths = [
        "models/Qwen2.5-0.5B-Instruct-f16.gguf",
        "models/Llama-3.2-1B-Instruct-f16.gguf",
        "models/gemma-2-2b-it.F16.gguf",
        "models/Llama-3.2-3B-Instruct-f16.gguf",
        "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "models/Llama-3.2-11B-Vision-Instruct.Q4_K_M.gguf",
        "models/gemma-2-9b-it-Q4_K_M.gguf",
        "models/phi-4-14b-Q4_K_M.gguf",
    ]

    # One result file is produced per model.
    for model_path in model_paths:
        run_evaluation(model_path, questions_path)