# Init Google Colab

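The setup steps in this section only take effect in a Google Colab environment. The detection cell below must always be run, because later cells read `IN_COLAB`.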

In [14]:
# Detect whether the notebook is running inside Google Colab.
# Only a failed import means "not in Colab"; a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
try:
    import google.colab  # imported only to probe for availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

## Install necessary libraries

In [15]:
# Install necessary libraries

if IN_COLAB:
    !pip install python-dotenv==1.0.1
    !pip install openai==1.56.2
    !pip install requests==2.32.3
    !pip install httpx==0.28.1
    !pip install gdown
    !pip install faiss-cpu==1.9.0
    !pip install tiktoken==0.8.0

## Download utils from git repo

In [16]:
if IN_COLAB:
    user = "AndreyProfDev"
    repo = "articles_materials"

    # remove local directory if it already exists
    if os.path.isdir(repo):
        !rm -rf {repo}

    !git clone https://github.com/{user}/{repo}.git
    !rm -rf "{repo}/src/OpenAI or Open-Source? Choosing the Right Embedding Model for Polish Text.ipynb"

    import sys

    src_dir = "src"

    path = f"{repo}/{src_dir}"
    if not path in sys.path:
        sys.path.insert(1, path)

## Download data and cache

If you want to generate the results from scratch, don't download the cache folder, or simply remove it.

In [17]:
import gdown
import tarfile
import os

if IN_COLAB:
    folders_to_download = [('https://drive.google.com/uc?id=1WairnDfu6HpYuTHxwUs1rdunL498UYqD', 'cache'),
                        ('https://drive.google.com/uc?id=14eXyre-s26vNXsh-cuzq9PvH9ZexuEIB', 'data')]

    for link, folder in folders_to_download:
        cache_folder = f'articles_materials/{folder}'

        if os.path.isdir(cache_folder):
            !rm -rf {cache_folder}

        file_path = f'articles_materials/{folder}.tar.gz'

        # Download the file
        gdown.download(link, file_path, quiet=False)

        # Path to the .tar.gz file

        # Open the tar.gz file
        with tarfile.open(file_path, 'r:gz') as tar:
            # Extract all the contents to a directory
            tar.extractall(path='articles_materials')

# Init Notebook

- Define necessary imports
- Initialise logging
- Initialize environment variables

In [None]:
import logging
import os

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm

# Load environment variables from .env file
load_dotenv()

# Set up logging.
# basicConfig does nothing when the root logger already has handlers, so the
# level is also set explicitly on the root logger.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

logging.info("Logging initiated")

# Read the OpenAI API key; fail with an actionable message when it is missing.
try:
    open_ai_key = os.environ["OPEN_AI_KEY"]
except KeyError as exc:
    raise KeyError(
        "Environment variable OPEN_AI_KEY is not set; define it in the environment or in a .env file"
    ) from exc

# Define paths
data_path = "articles_materials/data"
cache_path = "articles_materials/cache"

# Display architecture of selected open source models

In [None]:
import utils.embedding_models.providers.supported_models as supported_models

# Only show warnings and errors from the sentence_transformers logger.
logging.getLogger("sentence_transformers.SentenceTransformer").setLevel(logging.WARNING)

# Model whose architecture is displayed. To inspect another model, swap in one of:
#   supported_models.ST_POLISH_PARAPHRASE_FROM_MPNET
#   supported_models.ORB_ST_POLISH_KARTONBERTA_BASE_ALPHA_V1
#   supported_models.ORB_KARTONBERT_USE
selected_model_info = supported_models.ST_POLISH_PARAPHRASE_FROM_DISTILROBERTA
embedding_model = SentenceTransformer(selected_model_info.model_name)

# Total number of elements across all model parameters.
num_params = 0
for parameter in embedding_model.parameters():
    num_params += parameter.numel()

embedding_model.bfloat16()

print("Number of parameters: ", num_params, "\n")
print("Model architecture:\n" + str(embedding_model))

# Move to CPU to free up GPU memory (trailing `;` hides the cell output)
embedding_model.to("cpu");

# Initialize Open AI client

The OpenAI client will be used to augment the wiki data with generated questions.

In [None]:
from pathlib import Path
from utils.llm_clients.cached_client import CachedLLMClient
from utils.llm_clients.cost_monitoring import LLMClientWithCostMonitoring
from utils.llm_clients.providers.open_ai_client import OpenAIClient
# NOTE: this binds `supported_models` to the LLM-client module; cells that need
# the embedding-model module import it again under the same name.
from utils.llm_clients.providers import supported_models

# Folder where completion responses are cached.
completion_cache_dir = Path(f"{cache_path}/completion_cache")

# Build the client in layers: provider client -> cache -> cost monitoring.
provider_client = OpenAIClient(api_key=open_ai_key, model_info=supported_models.GPT_4O)
cached_client = CachedLLMClient(client=provider_client, path_to_cache=completion_cache_dir)
openai_client = LLMClientWithCostMonitoring(client=cached_client)

# Data curation

Process mediawiki xml files downloaded from wikipedia:
1) Extract text from xml
2) Clean text
3) Split text by sections

All intermediate data will be stored in the following subfolders:
- _0_raw files_ - contains original xml files
- _1_extracted_pages_ - contains yaml files with extracted text
- _2_processed_html_pages_ - contains yaml files with text after removing html tags
- _3_processed_markdown_pages_ - contains yaml files with text after removing markdown tags
- _4_split_sections_ - contains yaml files with text split by sections
- _5_remove_empty_articles_ - contains yaml files after removing empty articles

In [None]:
from pathlib import Path
import utils.wiki_parser.wiki_parser as wiki_parser
from pprint import pprint
from utils.storage import ArticleStorage
import os

storage = ArticleStorage()

raw_files_dir = f"{data_path}/0_raw files"

# os.listdir returns entries in arbitrary order; sort them so that repeated
# runs process the files in the same order.
data_files = sorted(os.listdir(raw_files_dir))

# The context manager closes the progress bar even if parsing fails midway.
with tqdm(total=len(data_files)) as pbar:
    for filename in data_files:
        pbar.set_description(f'Processing {filename}')
        # Only .xml files are parsed; every file still advances the bar.
        if filename.endswith(".xml"):
            raw_pages = wiki_parser.extract_articles_from_file(f"{raw_files_dir}/{filename}", output_folder=Path(data_path))
            storage.save_articles(raw_pages)
        pbar.update(1)

pages_df = storage.load_all()

# Data filtering

1) Load data from the article storage as a pandas DataFrame
2) Remove sections that don't contain text or sections with text just listing references
3) Build _Section With Context_ column by adding article title and section title to the section text

In [None]:
# Section titles that are dropped from the dataset.
excluded_section_titles = ['Linki zewnętrzne', 'Zobacz też', 'Bibliografia', 'Przypisy']

pages_df = storage.load_all()

# Take an explicit copy of the filtered rows: the column assignments below
# would otherwise operate on a slice and trigger SettingWithCopyWarning.
keep_mask = ~pages_df['Section Title'].isin(excluded_section_titles)
pages_df = pages_df[keep_mask].copy()

# Default context: article title, section title and section text.
pages_df['Section With Context'] = pages_df['Article Title'] + '\n' + pages_df['Section Title'] + '\n' + pages_df['Section Content']

# Sections titled 'Main' get the article title and text only (no section title line).
is_main_section = pages_df['Section Title'] == 'Main'
pages_df.loc[is_main_section, 'Section With Context'] = pages_df['Article Title'] + '\n' + pages_df['Section Content']

pages_df = pages_df.drop_duplicates()
pages_df = pages_df.reset_index(drop=True)

pages_df.head()

# Data Augmentation

Generate artificial questions for each section using Open AI client

In [None]:
from utils.question_generation import BASE_PROMT_PL, generate_question_for_text
from tqdm.auto import tqdm
from time import sleep

# Only show warnings and errors from the httpx logger during generation.
logging.getLogger("httpx").setLevel(logging.WARNING)

# One bar for progress over sections, two bars that track the running costs.
pbar = tqdm(total=len(pages_df))
questions_column = []

promt_tokens_bar = tqdm(desc="Promt cost ($): ")
completions_tokens_bar = tqdm(desc="Completions cost ($):")

for _, row in pages_df.iterrows():
    pbar.set_description(f"Generating questions for {row['Article Title']}/{row['Section Title']}")

    section_text = row['Section With Context']
    generated = generate_question_for_text(openai_client, section_text, BASE_PROMT_PL)
    questions_column.append(generated.questions)

    pbar.update(1)

    # Advance each cost bar by the difference between the client's running
    # total and what the bar already shows.
    promt_cost_so_far = openai_client.get_total_promt_cost()
    promt_tokens_bar.update(promt_cost_so_far - promt_tokens_bar.n)

    completion_cost_so_far = openai_client.get_total_completion_cost()
    completions_tokens_bar.update(completion_cost_so_far - completions_tokens_bar.n)

# One entry per row, in the same order as the rows were iterated.
pages_df['questions'] = questions_column

# Data Ingestion

## Ingestion of data into the database

In [None]:
from utils.embedding_models.providers import hugging_face
from utils.embedding_models.providers import open_ai
from utils.embedding_models.providers import supported_models
from utils.vectordb.vectordb import VectorDB

logging.getLogger("httpx").setLevel(logging.WARNING)

path_to_cache = Path(f"{cache_path}/embeddings_cache")

# Embedding models under comparison, keyed by the name of their vector index.
# NOTE(review): unlike the other entries, ORB_KARTONBERT_USE is created without
# `path_to_cache` - confirm this is intentional.
embedding_models = {
    "HF_SDADAS": hugging_face.init_model(
        model_info=supported_models.ST_POLISH_PARAPHRASE_FROM_DISTILROBERTA, path_to_cache=path_to_cache),
    "OPENAI_SMALL": open_ai.init_model(
        api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_3_SMALL, path_to_cache=path_to_cache),
    "OPENAI_LARGE": open_ai.init_model(
        api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_3_LARGE, path_to_cache=path_to_cache),
    "OPENAI__ADA": open_ai.init_model(
        api_key=open_ai_key, model_info=supported_models.TEXT_EMBEDDING_ADA_002, path_to_cache=path_to_cache),
    "HF_MPNET": hugging_face.init_model(
        model_info=supported_models.ST_POLISH_PARAPHRASE_FROM_MPNET, path_to_cache=path_to_cache),
    "HF_KARTONBERTA": hugging_face.init_model(
        model_info=supported_models.ORB_ST_POLISH_KARTONBERTA_BASE_ALPHA_V1, path_to_cache=path_to_cache),
    "ORB_KARTONBERT_USE": hugging_face.init_model(
        model_info=supported_models.ORB_KARTONBERT_USE),
}

# Create the vector database with one index per embedding model.
vector_db = VectorDB()
for name, embedding_model_for_index in embedding_models.items():
    vector_db.add_index(name, embedding_model_for_index)

# Insert every section text into every index, tracking the cost per index.
sections = pages_df['Section With Context'].values.tolist()
for index_name in vector_db.list_indices():
    model = embedding_models[index_name]
    cost_bar = tqdm(desc=f'{index_name}. Total cost ($)')
    for text in tqdm(sections, desc=f'{index_name}. Processed items'):
        vector_db.insert_text(text, index_name)
        # Advance the bar by the cost added since its last update.
        spent_so_far = model.get_total_cost()
        cost_bar.update(spent_so_far - cost_bar.n)

## Ingestion cost

In [None]:
from chart_utils import draw_bar_chart

# One record per embedding model with its accumulated cost.
embeddings_costs = [
    {"Model": model.model_info.model_name, "Cost ($)": model.get_total_cost()}
    for model in embedding_models.values()
]

# Most expensive first, indexed by model name, keeping only models with a non-zero cost.
embeddings_costs_df = (
    pd.DataFrame(embeddings_costs)
    .sort_values("Cost ($)", ascending=False)
    .set_index("Model")
)
has_cost = embeddings_costs_df["Cost ($)"] > 0
embeddings_costs_df = embeddings_costs_df[has_cost]

display(embeddings_costs_df)

draw_bar_chart(
    df=embeddings_costs_df,
    title='Total cost to generate embeddings ($)',
    bar_label_format="%.2f $",
    grey_colors=['#80C4E9'],
    highlight_colors=['#80C4E9'],
    numeric_labels_padding=-40,
    size=(8, 3),
)

## Ingestion time

In [None]:
from utils.embedding_models.providers.supported_models import ORB_KARTONBERT_USE
from chart_utils import draw_bar_chart

# One record per embedding model with its accumulated embedding time.
embeddings_time = [
    {"Model": model.model_info.model_name, "Total time": model.get_total_time()}
    for model in embedding_models.values()
]

# Slowest first, indexed by model name.
embeddings_time_df = (
    pd.DataFrame(embeddings_time)
    .sort_values(by="Total time", ascending=False)
    .set_index("Model")
)
display(embeddings_time_df)

# ORB_KARTONBERT_USE is the only bar passed to `to_highlight`.
draw_bar_chart(
    df=embeddings_time_df,
    title='Total time to generate embeddings (seconds)',
    bar_label_format="%d s",
    grey_colors=['#CDCDCD'],
    highlight_colors=['#80C4E9'],
    numeric_labels_padding=5,
    to_highlight=[ORB_KARTONBERT_USE.model_name],
)

# Model Evaluation

## Running the evaluation

In [None]:
from tqdm.autonotebook import tqdm

# Evaluate on a copy of the full dataset; slice it (e.g. [:800]) for a quicker run.
test_df = pages_df.copy()

# One record per (model, k, question) lookup.
evaluatiion_results = []

for index_name in vector_db.list_indices():
    model = embedding_models[index_name]
    # Cost accumulated before evaluation starts, so the bar shows only the
    # cost added by the evaluation queries.
    cost = model.get_total_cost()
    embedding_tokens_bar = tqdm(desc=f"{index_name}. Embedding cost ($): ")

    for k in [1, 5, 10]:
        # Stream the rows and pass `total` explicitly instead of materialising
        # every row into a list just to give tqdm a length.
        rows = tqdm(test_df.iterrows(), total=len(test_df), desc=f"{index_name}. k={k}. Wiki sections: ")
        for _, row in rows:
            section_text = row['Section With Context']
            for question in row['questions']:

                found_text = vector_db.find_text(text=question, top_k=k, index_name=index_name)

                embedding_tokens_bar.update(model.get_total_cost() - cost - embedding_tokens_bar.n)
                evaluatiion_results.append({
                    "Model Name": model.model_info.model_name,
                    "k": k,
                    "Section": section_text,
                    "Question": question,
                    "Found text": found_text,
                    # A hit means the section the question came from is among the results.
                    "Matched with answer": section_text in found_text
                })

# Pivot to one row per (section, question, model) with one 'k=<k>' column per
# value of k holding the summed matches.
evaluatiion_results_df = pd.DataFrame(evaluatiion_results)
evaluatiion_results_df = evaluatiion_results_df.pivot_table(index=['Section', 'Question', 'Model Name'], columns=['k'], values=['Matched with answer'], aggfunc='sum').add_prefix('k=')
evaluatiion_results_df = evaluatiion_results_df.droplevel(0, axis=1).reset_index()
evaluatiion_results_df.head()

## Average number of questions correctly answered

In [None]:
# Columns holding the match counts for each value of k.
k_columns = ['k=1', 'k=5', 'k=10']

# Matched questions per (section, model), then the mean over sections per model.
questions_per_section_df = (
    evaluatiion_results_df
    .groupby(['Section', 'Model Name'])[k_columns]
    .sum()
)
meand_qs_df = (
    questions_per_section_df
    .groupby('Model Name')
    .mean()
    .reset_index()
    .set_index('Model Name')
    .sort_values(by='k=1', ascending=True)
)

print('Mean number of questions matched with answer')
display(meand_qs_df.head())

# Use reader-friendly column names for the chart.
meand_qs_df.columns = ['Top 1 record', 'Top 5 records', 'Top 10 records']
draw_bar_chart(
    meand_qs_df,
    title="Average # of questions matched with the answer",
    to_highlight=[ORB_KARTONBERT_USE.model_name],
)

## Total number of questions correctly answered

In [None]:
# Total matched questions per model for each value of k, lowest k=1 score first.
match_columns = ['k=1', 'k=5', 'k=10']
total_qs_df = (
    evaluatiion_results_df
    .groupby(['Model Name'])[match_columns]
    .sum()
    .sort_values(by='k=1', ascending=True)
)

display(total_qs_df)

# Use reader-friendly column names for the chart.
total_qs_df.columns = ['Top 1 record', 'Top 5 records', 'Top 10 records']
draw_bar_chart(
    total_qs_df,
    title="Total # of questions matched with the answer",
    bar_label_format="%d",
    to_highlight=[ORB_KARTONBERT_USE.model_name],
)