# Init Notebook

In [None]:
# Import necessary libraries
from dotenv import load_dotenv
from openai import OpenAI
import os
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm

from utils.llm_clients.cached_client import CachedLLMClient
from utils.llm_clients.cost_monitoring import LLMClientWithCostMonitoring
from utils.llm_clients.providers.open_ai_client import OpenAIClient
from utils.llm_clients.providers import supported_models

# Populate os.environ from the local .env file before reading any keys.
load_dotenv()

# Raises KeyError when OPEN_AI_KEY is not defined in the environment.
open_ai_key = os.environ["OPEN_AI_KEY"]

# Build the GPT-4o client and wrap it twice: CachedLLMClient on the inside,
# LLMClientWithCostMonitoring on the outside (the object the notebook uses).
openai_client = LLMClientWithCostMonitoring(
    client=CachedLLMClient(
        client=OpenAIClient(
            api_key=open_ai_key,
            model_info=supported_models.GPT_4O,
        )
    )
)

import logging

# basicConfig() does nothing if the root logger already has handlers, so the
# level is also set on the root logger explicitly.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

logging.info("Logging initiated")

# Data curation
Parse the downloaded set of articles that will be used for the assessment.

In [None]:
from pathlib import Path
import utils.wiki_parser.wiki_parser as wiki_parser
from pprint import pprint
from utils.storage import ArticleStorage
import os

storage = ArticleStorage()

# os.listdir() returns entries in arbitrary order; sorting makes the files get
# parsed and saved in the same order on every run and every machine.
for filename in sorted(os.listdir("data")):
    # Only the XML dumps are parsed; anything else in the folder is skipped.
    if not filename.endswith(".xml"):
        continue
    print(filename)
    raw_pages = wiki_parser.extract_articles_from_file("data/" + filename, output_folder=Path("data"))
    storage.save_articles(raw_pages)

# Read back everything saved so far as the working frame.
pages_df = storage.load_all()

# Data filtering

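Remove sections that are not relevant to the assessment because they hold no article text: external links ("Linki zewnętrzne"), see also ("Zobacz też"), bibliography ("Bibliografia") and footnotes ("Przypisy"). Duplicate rows are dropped as well.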

In [None]:
# Reload from storage so the cell always starts from the unfiltered data, then
# drop the unwanted section titles, remove duplicate rows and renumber the index.
pages_df = (
    storage.load_all()
    .loc[lambda df: ~df['Section Title'].isin(
        ['Linki zewnętrzne', 'Zobacz też', 'Bibliografia', 'Przypisy']
    )]
    .drop_duplicates()
    .reset_index(drop=True)
)
pages_df
# pages_df.to_excel("data.xlsx")

# Augment data

In [None]:
from utils.question_generation import BASE_PROMT_PL, generate_question_for_text
from tqdm.auto import tqdm
from time import sleep

# Questions are generated only for the first few rows; raise this to cover
# more of the frame.
QUESTION_SAMPLE_SIZE = 3
sampled_pages = pages_df[:QUESTION_SAMPLE_SIZE]

pbar = tqdm(total=len(sampled_pages))
questions_column = []

# These bars have no total; they only display the running token counters.
promt_tokens_bar = tqdm(desc="Promt tokens")
completions_tokens_bar = tqdm(desc="Completions tokens")

for _, row in sampled_pages.iterrows():
    pbar.set_description(f"Generating questions for {row['Section Title']}")
    questions = generate_question_for_text(openai_client, row['Section Content'], BASE_PROMT_PL)
    questions_column.append(questions)
    pbar.update(1)
    # The client exposes cumulative counters, so advance each bar by the delta
    # between the counter and what the bar already shows.
    promt_tokens_bar.update(openai_client.promt_tokens - promt_tokens_bar.n)
    completions_tokens_bar.update(openai_client.completion_tokens - completions_tokens_bar.n)

for bar in (pbar, promt_tokens_bar, completions_tokens_bar):
    bar.close()

# Store the generated questions on the frame; the experiment cell reads
# row['questions']. Rows outside the sample get an empty list so that
# iterating over their questions is a no-op.
# NOTE(review): assumes generate_question_for_text returns an iterable of
# question strings — confirm against utils.question_generation.
pages_df['questions'] = questions_column + [
    [] for _ in range(len(pages_df) - len(questions_column))
]

# Data Ingestion

In [None]:
from utils.embedding_models.caching import CachedEmbeddingModel
from utils.embedding_models.providers import hugging_face
from utils.embedding_models.providers import open_ai
from utils.embedding_models.providers import supported_models
from utils.embedding_models.schema import EmbeddingModel
from utils.vectordb.vectordb import VectorDB
from tqdm.autonotebook import tqdm

def wrapper(model: EmbeddingModel) -> CachedEmbeddingModel:
    """Wrap ``model`` in a ``CachedEmbeddingModel`` and return the wrapped model."""
    cached_model = CachedEmbeddingModel(model)
    return cached_model

# Register one index per embedding model under test.
# NOTE(review): `wrapper` (CachedEmbeddingModel) is defined in this cell but
# the models below are registered unwrapped — confirm whether caching was
# meant to be applied here.
vector_db = VectorDB()
vector_db.add_index("OPENAI_SMALL", open_ai.init_model(api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_3_SMALL))
vector_db.add_index("OPENAI_LARGE", open_ai.init_model(api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_3_LARGE))
vector_db.add_index("OPENAI__ADA", open_ai.init_model(api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_ADA_002))
vector_db.add_index("HF_SDADAS", hugging_face.init_model(model_info=supported_models.ST_POLISH_PARAPHRASE_FROM_DISTILROBERTA))

# The section texts are the same for every index, so build the list once
# instead of on each loop iteration.
section_texts = pages_df['Section Content'].values.tolist()

for index_name in tqdm(vector_db.list_indices(), desc='Testing Embedding models'):
    vector_db.insert_texts(section_texts, index_name)

# Perform the experiment

In [None]:
from tqdm.autonotebook import tqdm

# For every index, count per row how many of its questions retrieve that
# row's own section content as the top-1 hit, and store the counts in a
# column named after the index.
index_names = vector_db.list_indices()

# The outer bar advances once per finished index only; it is not bumped
# before the loop, so it ends exactly at its total.
pbar = tqdm(total=len(index_names))
for index_name in index_names:
    pbar.set_description(f"Testing {index_name}")
    identified_matches = []
    # iterrows() has no length, so pass the total to get a percentage/ETA.
    for _, row in tqdm(pages_df.iterrows(), total=len(pages_df), desc=f"Testing rows for {index_name}"):
        matched_with_answer = 0
        for question in row['questions']:
            found_text = vector_db.find_text(text=question, top_k=1, index_name=index_name)[0]

            # A hit counts only when the retrieved text is exactly the
            # section the question belongs to.
            if found_text == row['Section Content']:
                matched_with_answer += 1
        identified_matches.append(matched_with_answer)
    pages_df[index_name] = identified_matches
    pbar.update(1)
pbar.close()

In [None]:
# Display pages_df; the experiment cell stores one match-count column per
# index in this frame.
pages_df