# Init Notebook

In [None]:
# Import necessary libraries
from dotenv import load_dotenv
from openai import OpenAI
import os
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm


# Load environment variables from .env file
load_dotenv()

# Read the OpenAI API key. A missing key still raises KeyError (as before),
# but now with a message that says where the value is expected to come from.
try:
    open_ai_key = os.environ["OPEN_AI_KEY"]
except KeyError:
    raise KeyError(
        "OPEN_AI_KEY is not set; define it in the environment or in the .env file"
    ) from None

# Data curation
Parse the set of Wikipedia articles used for the assessment from a local XML export and save them to storage.

In [None]:
from embeddings_comparison.utils import wiki_parser
from pprint import pprint
from embeddings_comparison.utils.storage import ArticleStorage

# Wikipedia XML export that supplies the articles for this notebook.
wiki_dump_path = "data/Wikipedia-20241111162837.xml"

storage = ArticleStorage()

# Extract the pages from the export, then hand them to the article storage.
raw_pages = wiki_parser.extract_pages_from_file(wiki_dump_path)
storage.save_articles(raw_pages)

# Data filtering

Remove sections that don't contain text or are not relevant: "Linki zewnętrzne" (external links), "Zobacz też" (see also) and "Bibliografia" (bibliography).

In [None]:
# Section titles excluded from the data set (Polish headings for
# "External links", "See also" and "Bibliography").
IRRELEVANT_SECTION_TITLES = ['Linki zewnętrzne', 'Zobacz też', 'Bibliografia']

pages_df = storage.load_all()

# One boolean mask replaces three successive `!=` filters, so the frame is
# filtered in a single step. Rows with a missing title are kept, exactly as
# with the original `!=` comparisons.
# NOTE(review): relies on load_all() returning a pandas DataFrame — the
# boolean-mask indexing in the original implies this; confirm.
pages_df = pages_df[~pages_df['Section Title'].isin(IRRELEVANT_SECTION_TITLES)]
pages_df

# Data Ingestion

In [None]:
from embeddings_comparison.utils.embedding_models.hugging_face import HF_EMBEDDING_MODEL_NAME, HFEmbeddingModel
from embeddings_comparison.utils.embedding_models.open_ai import OPENAI_EMBEDDING_MODEL_NAME, OpenAIEmbeddingModel
from embeddings_comparison.utils.vectordb.vectordb import VectorDB, VectorIndex
from tqdm.autonotebook import tqdm

# Register one index per embedding model under comparison.
vector_db = VectorDB()
# The OpenAI-backed indices below are currently disabled; uncomment to include
# them in the run (they use `open_ai_key`).
# vector_db.add_index(f"OPENAI_SMALL", OpenAIEmbeddingModel(api_key=open_ai_key, model=OPENAI_EMBEDDING_MODEL_NAME.TEXT_EMBEDDING_3_SMALL))
# vector_db.add_index(f"OPENAI_LARGE", OpenAIEmbeddingModel(api_key=open_ai_key, model=OPENAI_EMBEDDING_MODEL_NAME.TEXT_EMBEDDING_3_LARGE))
# vector_db.add_index(f"OPENAI__ADA", OpenAIEmbeddingModel(api_key=open_ai_key, model=OPENAI_EMBEDDING_MODEL_NAME.TEXT_EMBEDDING_ADA_002))
# Plain string literal: the original f-string had no placeholders.
vector_db.add_index("HF_SDADAS", HFEmbeddingModel(model_name=HF_EMBEDDING_MODEL_NAME.ST_POLISH_PARAPHRASE_FROM_DISTILROBERTA))

# Build the list of section texts once instead of on every loop iteration.
# NOTE(review): the same list object is now passed to every index; this assumes
# insert_texts does not mutate its input — confirm.
section_texts = pages_df['Section Content'].values.tolist()

# Insert the same texts into every registered index.
for index_name in tqdm(vector_db.list_indices(), desc='Testing Embedding models'):
    vector_db.insert_texts(section_texts, index_name)