# Init Notebook

In [None]:
# Import necessary libraries
from dotenv import load_dotenv
from openai import OpenAI
import os
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm

from utils.llm_clients.cached_client import CachedLLMClient
from utils.llm_clients.cost_monitoring import LLMClientWithCostMonitoring
from utils.llm_clients.providers.open_ai_client import OpenAIClient
from utils.llm_clients.providers import supported_models

# Read settings from the local .env file into the process environment.
load_dotenv()

# A missing key raises KeyError here, before any client is constructed.
open_ai_key = os.environ["OPEN_AI_KEY"]

# Build the LLM client from the inside out: the GPT-4o provider client is
# wrapped in CachedLLMClient, which is wrapped in LLMClientWithCostMonitoring.
openai_client = LLMClientWithCostMonitoring(
    client=CachedLLMClient(
        client=OpenAIClient(api_key=open_ai_key, model_info=supported_models.GPT_4O),
    ),
)

import logging

root_logger = logging.getLogger()

# basicConfig does nothing when the root logger already has handlers, so the
# level is also set explicitly on the root logger.
logging.basicConfig(level=logging.INFO)
root_logger.setLevel(logging.INFO)

root_logger.info("Logging initiated")

# Data curation
Download the set of articles that will be used for the assessment.

In [None]:
from pathlib import Path
import utils.wiki_parser.wiki_parser as wiki_parser
from pprint import pprint
from utils.storage import ArticleStorage
import os

storage = ArticleStorage()

# Process every .xml file found in the data folder: extract its articles and
# hand them to the article storage.
xml_filenames = [name for name in os.listdir("data") if name.endswith(".xml")]
for filename in xml_filenames:
    print(filename)
    xml_path = f"data/{filename}"
    raw_pages = wiki_parser.extract_articles_from_file(xml_path, output_folder=Path("data"))
    storage.save_articles(raw_pages)

# Load everything saved in storage into a single frame.
pages_df = storage.load_all()

# Data filtering

Remove sections that contain no article text or are not relevant (external links, "see also" lists, bibliography, footnotes).

In [None]:
pages_df = storage.load_all()

# Polish section titles to drop: external links, see also, bibliography, footnotes.
excluded_titles = ['Linki zewnętrzne', 'Zobacz też', 'Bibliografia', 'Przypisy']
keep_mask = ~pages_df['Section Title'].isin(excluded_titles)
pages_df = pages_df[keep_mask].drop_duplicates().reset_index(drop=True)

# Each section is prefixed with its article title and section title, except
# 'Main' sections, which get the article title only.
title_prefix = pages_df['Article Title'] + '\n'
with_section_title = title_prefix + pages_df['Section Title'] + '\n' + pages_df['Section Content']
without_section_title = title_prefix + pages_df['Section Content']
is_main_section = pages_df['Section Title'] == 'Main'
pages_df['Section With Context'] = with_section_title.mask(is_main_section, without_section_title)
pages_df
# pages_df.to_excel("data.xlsx")

# Augment data

In [None]:
from utils.question_generation import BASE_PROMT_PL, generate_question_for_text
from tqdm.auto import tqdm
from time import sleep

# Raise the httpx logger threshold to WARNING (presumably so request logs do
# not flood the progress output; confirm).
logging.getLogger("httpx").setLevel(logging.WARNING)

pbar = tqdm(total=len(pages_df))
questions_column = []

promt_tokens_bar = tqdm(desc="Promt cost ($): ")
completions_tokens_bar = tqdm(desc="Completions cost ($):")

# Walk the three columns the loop needs, row by row.
row_fields = zip(
    pages_df['Article Title'],
    pages_df['Section Title'],
    pages_df['Section With Context'],
)
for article_title, section_title, section_text in row_fields:
    pbar.set_description(f"Generating questions for {article_title}/{section_title}")
    generated = generate_question_for_text(openai_client, section_text, BASE_PROMT_PL)
    questions_column.append(generated.questions)
    pbar.update(1)

    # tqdm.update() increments, so pass the difference between the client's
    # total and the bar's current counter to make the bar show the total.
    prompt_cost_total = openai_client.get_total_promt_cost()
    promt_tokens_bar.update(prompt_cost_total - promt_tokens_bar.n)
    completion_cost_total = openai_client.get_total_completion_cost()
    completions_tokens_bar.update(completion_cost_total - completions_tokens_bar.n)

# One list of generated questions per section, aligned with pages_df rows.
pages_df['questions'] = questions_column

# Data Ingestion

In [None]:
from utils.embedding_models.caching import CachedEmbeddingModel
from utils.embedding_models.providers import hugging_face
from utils.embedding_models.providers import open_ai
from utils.embedding_models.providers import supported_models
from utils.embedding_models.schema import EmbeddingModel
from utils.vectordb.vectordb import VectorDB, VectorIndex
from tqdm.autonotebook import tqdm
from time import sleep

def wrapper(model: EmbeddingModel):
    """Return ``model`` wrapped in a ``CachedEmbeddingModel``."""
    cached_model = CachedEmbeddingModel(model)
    return cached_model

# OpenAI embedding models under comparison, keyed by the index name used for each.
openai_model_infos = {
    "OPENAI_SMALL": supported_models.TEXT_EMBEDDING_3_SMALL,
    "OPENAI_LARGE": supported_models.TEXT_EMBEDDING_3_LARGE,
    "OPENAI__ADA": supported_models.TEXT_EMBEDDING_ADA_002,
}

# The Hugging Face model comes first, followed by the OpenAI models in the order above.
embedding_models = {
    "HF_SDADAS": hugging_face.init_model(model_info=supported_models.ST_POLISH_PARAPHRASE_FROM_DISTILROBERTA),
}
for model_key, model_info in openai_model_infos.items():
    embedding_models[model_key] = open_ai.init_model(api_key=open_ai_key, model_info=model_info)

# Register one vector index per embedding model, named after the model's key.
vector_db = VectorDB()
for index_name, embedding_model in embedding_models.items():
    vector_db.add_index(index_name, embedding_model)

# Raise the httpx logger threshold to WARNING (presumably so request logs do
# not flood the progress output; confirm).
logging.getLogger("httpx").setLevel(logging.WARNING)

sections = pages_df['Section With Context'].tolist()

# Insert every section text into every registered index.
for index_name in vector_db.list_indices():
    cost_bar = tqdm(desc=f'{index_name}. Total cost ($)')
    model = embedding_models[index_name]
    items_bar = tqdm(sections, desc=f'{index_name}. Processed items')
    for text in items_bar:
        vector_db.insert_text(text, index_name)
        # tqdm.update() increments, so pass the difference between the model's
        # total cost and the bar's current counter.
        total_cost = model.get_total_cost()
        cost_bar.update(total_cost - cost_bar.n)

# Perform the experiment

In [None]:
from tqdm.autonotebook import tqdm

# Evaluate on the first 400 sections. Slice first and copy second, so only the
# needed rows are copied and the result columns are written to an independent frame.
test_df = pages_df.iloc[:400].copy()

# For each index, count per section how many of its generated questions
# retrieve that same section as the top-1 result.
for index_name in vector_db.list_indices():
    model = embedding_models[index_name]
    # Cost already accumulated before this index's queries; subtracting it
    # makes the bar show only the cost of the experiment itself.
    cost = model.get_total_cost()
    embedding_tokens_bar = tqdm(desc=f"{index_name}. Embedding cost ($): ")

    identified_matches = []
    # Pass total explicitly so the rows need not be materialised into a list.
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"{index_name}. Wiki sections: "):
        matched_with_answer = 0
        for question in row['questions']:
            found_text = vector_db.find_text(text=question, top_k=1, index_name=index_name)[0]

            if found_text == row['Section With Context']:
                matched_with_answer += 1

            # tqdm.update() increments the counter, so advance it only by the
            # cost added since the last update. Passing the full delta from the
            # baseline on every question would over-count the spend.
            spent_so_far = model.get_total_cost() - cost
            embedding_tokens_bar.update(spent_so_far - embedding_tokens_bar.n)
        identified_matches.append(matched_with_answer)
    test_df[index_name] = identified_matches

In [None]:
# Mean and median of the per-section match counts for the HF_SDADAS index.
hf_sdadas_matches = test_df['HF_SDADAS']
hf_sdadas_matches.mean(), hf_sdadas_matches.median()

In [None]:
# Mean and median of the per-section match counts for the OPENAI_SMALL index.
# The median previously came from the HF_SDADAS column.
test_df['OPENAI_SMALL'].mean(), test_df['OPENAI_SMALL'].median()

In [None]:
# Mean and median of the per-section match counts for the OPENAI_LARGE index.
# The median previously came from the HF_SDADAS column.
test_df['OPENAI_LARGE'].mean(), test_df['OPENAI_LARGE'].median()

In [None]:
# Mean and median of the per-section match counts for the OPENAI__ADA index.
# The median previously came from the HF_SDADAS column.
test_df['OPENAI__ADA'].mean(), test_df['OPENAI__ADA'].median()

In [None]:
# Take the first evaluated section, its first generated question, and the
# section text that question is expected to retrieve.
sample = test_df.iloc[0]
question_0, answer = sample['questions'][0], sample['Section With Context']

In [None]:
# Query the OPENAI_LARGE index with the sample question and show the top-1 result.
found_asnwers = vector_db.find_text(
    text=question_0,
    top_k=1,
    index_name='OPENAI_LARGE',
)
found_asnwers

In [None]:
# Show the sample's own section text, for comparison with the retrieved result.
answer