# Initialize environment

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load the .env file reported by find_dotenv() so that later cells can read
# their settings through os.environ.
load_dotenv(dotenv_path=find_dotenv())

True

# Loading data

In [2]:
import pickle
from pathlib import Path

# The raw articles are cached on disk; entrez.load() only runs when the cache
# file does not exist yet.
articles_cache = Path(".local/articles.raw.pickle")

if articles_cache.exists():
    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that was written by this notebook itself.
    with open(articles_cache, "rb") as f:
        articles = pickle.load(f)

else:
    from gondar.tools import EntrezAPIWrapper

    # Create the cache directory up front so the write below cannot fail with
    # FileNotFoundError after entrez.load() has already done its work.
    articles_cache.parent.mkdir(parents=True, exist_ok=True)

    entrez = EntrezAPIWrapper(retmax=300)
    articles = entrez.load("Chlamydomonas reinhardtii")
    with open(articles_cache, "wb") as f:
        pickle.dump(articles, f)

# The length of each article's "body" is counted as its number of sentences.
total_sentences = sum(len(article["body"]) for article in articles)

# Avoid ZeroDivisionError when no article was returned.
article_count = len(articles)
average_sentences = total_sentences / article_count if article_count else 0.0

print(f"Total sentences: {total_sentences}")
print(f"Average sentences of an article: {average_sentences}")

Total sentences: 58591
Average sentences of an article: 195.30333333333334


# Embedding Data

In [3]:
def bulk_embedding(client, articles, batch_size: int = 2000,
                   model: str = "text-embedding-ada-002", timeout: float = 60):
    """Embed every sentence of every article, sending the sentences in batches.

    Args:
        client: Object exposing ``client.embeddings.create(input=..., model=...,
            timeout=...)`` whose result has ``.usage`` and ``.data`` (each item
            of ``.data`` carrying an ``.embedding`` attribute).
        articles: Iterable of mappings; the ``"body"`` entry of each one is an
            iterable of sentences.
        batch_size: Maximum number of sentences sent per request.
        model: Embedding model name forwarded to the client.
        timeout: Per-request timeout forwarded to the client.

    Returns:
        A flat list with one embedding per sentence, in the order the sentences
        appear across ``articles``. Empty when there are no sentences.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Flatten in a single pass; sum(list_of_lists, []) re-copies the growing
    # list for every article and is quadratic.
    sentences = [sentence for article in articles for sentence in article["body"]]

    embedding = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        res = client.embeddings.create(input=batch, model=model, timeout=timeout)
        print(res.usage)
        # extend (not append): keep one row per sentence. Appending one list per
        # batch gives a ragged nested list whenever the last batch is shorter,
        # which cannot be turned into a regular array.
        embedding.extend(item.embedding for item in res.data)

    return embedding

In [4]:
import numpy as np

# Sentence embeddings are cached on disk; the embedding service is only called
# when the cache file does not exist yet.
embedding_cache = Path(".local/embedding.npy")

if embedding_cache.exists():
    embedding = np.load(embedding_cache)

else:
    from openai import AzureOpenAI

    # Create the cache directory before any request is issued, so np.save below
    # cannot fail with FileNotFoundError and throw away the computed embeddings.
    embedding_cache.parent.mkdir(parents=True, exist_ok=True)

    # Connection settings are read from the process environment.
    client = AzureOpenAI(
                azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
                azure_deployment=os.environ.get("AZURE_EMBEDDING_DEPLOYMENT"),
                api_version=os.environ.get("AZURE_API_VERSION"),
                api_key=os.environ.get("AZURE_OPENAI_KEY")
            )

    embedding = np.array(bulk_embedding(client, articles))

    np.save(embedding_cache, embedding)

# Generate Query Embedding