In [None]:
import pandas as pd
from openai import OpenAIError, AsyncOpenAI
from tenacity import (retry,
                      stop_after_attempt,
                      wait_exponential,
                      retry_if_exception_type,
                      before_sleep_log,
                      after_log)
from typing import List
from config import OPENAI_API_KEY
import logging

# Configure logging at INFO level so INFO records (for example the messages
# emitted by the retry decorator's after_log hook) show up in cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Async OpenAI client used for all embedding requests; the API key is imported
# from the local `config` module rather than hardcoded here.
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

@retry(
    retry=retry_if_exception_type(OpenAIError),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(5),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    after=after_log(logger, logging.INFO),
)
async def async_embed_text(
    text: str,
    model: str = 'text-embedding-3-large'
) -> List[float]:
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Calls that fail with ``OpenAIError`` are retried with exponential backoff
    (wait bounded between 1 and 10), stopping after 5 attempts. A WARNING is
    logged before each sleep and an INFO record after each attempt.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The embedding of the first item in the response data.
    """
    result = await async_openai_client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding

## Load texts

In [None]:
import os

root_dir = "book"   # top-level book directory


def load_book_data(root_dir: str) -> list:
    """Read every sub-chapter ``.txt`` file below ``root_dir`` into records.

    Expected layout (names are split on a double underscore)::

        <root_dir>/<chapter_number>__<chapter_title>/
            <subchapter_number>__<subchapter_title>__<page>.txt

    Args:
        root_dir: Top-level directory; files directly inside it are ignored.

    Returns:
        A list of dicts with the keys ``chapter_number`` (int),
        ``chapter_title``, ``subchapter_number`` (kept as a string),
        ``subchapter_title``, ``subchapter_page`` (int) and
        ``subchapter_text``. Empty if ``root_dir`` has no sub-directories.

    Raises:
        ValueError, IndexError: If a chapter directory or a ``.txt`` file does
            not follow the naming scheme above.
    """
    records = []
    for chapter, subdirs, files in os.walk(root_dir):
        if chapter == root_dir:
            continue

        # basename() honours the platform's path separator; splitting on a
        # literal backslash only works on Windows and breaks int() elsewhere.
        chapter_parts = os.path.basename(chapter).split('__')
        chapter_number = int(chapter_parts[0])
        chapter_title = chapter_parts[-1]

        for file in files:
            # Filter first: parsing a stray non-.txt file name (no '__', no
            # page number) would otherwise raise before it could be skipped.
            if not file.endswith(".txt"):
                continue

            name_parts = file.split('__')
            subchapter_number = name_parts[0]
            subchapter_title = name_parts[1]
            subchapter_page = int(name_parts[-1].split('.')[0])

            filepath = os.path.join(chapter, file)
            with open(filepath, "r", encoding="utf-8") as f:
                subchapter_text = f.read()

            records.append(
                {
                    "chapter_number": chapter_number,
                    "chapter_title": chapter_title,
                    "subchapter_number": subchapter_number,
                    "subchapter_title": subchapter_title,
                    "subchapter_page": subchapter_page,
                    "subchapter_text": subchapter_text
                }
            )
    return records


book_data = load_book_data(root_dir)


In [None]:
# Export segmented book to parquet: build a DataFrame from the records in
# `book_data` and write it without the DataFrame index (index=False).
df = pd.DataFrame(book_data)
df.to_parquet("book.parquet", index=False)

## Embeddings

In [None]:
# Keep only the rows whose chapter_number is 1.
df_partition = df.loc[df['chapter_number'] == 1]

In [None]:
# Plain Python list of the partition's sub-chapter texts, in row order.
text_list = df_partition['subchapter_text'].to_list()


In [None]:
import asyncio

tasks = [
    async_embed_text(text=text)
    for text in text_list
    ]

embeddings = await asyncio.gather(*tasks, return_exceptions=True)

In [None]:
# Add the results as an `embedding` column; assign() returns a new DataFrame
# and leaves df_partition unchanged. `embeddings` was produced from text_list,
# which was built from df_partition, so entries line up with rows by position.
df_partition_embeddings = df_partition.assign(embedding=embeddings)
# Persist the partition with its embeddings, without the DataFrame index.
df_partition_embeddings.to_parquet('book_partition_1.parquet', index=False)

## Retrieval

In [None]:
import numpy as np

def euclidean_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the Euclidean (L2) distance between two vectors of equal shape.

    Both inputs are converted to float NumPy arrays first, so any array-like
    (e.g. a list of numbers) is accepted.

    Raises:
        ValueError: If the two converted arrays do not have the same shape.
    """
    first, second = (np.asarray(vector, dtype=float) for vector in (vec1, vec2))

    if first.shape != second.shape:
        raise ValueError("Vectors must have the same shape.")

    difference = first - second
    return np.linalg.norm(difference)


# Reload the saved partition and show which Python type an `embedding` cell
# has after being read back from parquet.
df_rag = pd.read_parquet('book_partition_1.parquet')
type(df_rag['embedding'][0])

In [None]:
# Sanity check: distance between the embeddings stored at index labels 1 and 9.
distance = euclidean_distance(df_rag['embedding'][1], df_rag['embedding'][9])

In [48]:
# Example query text; it is embedded and compared against stored embeddings below.
input_text = "I want to learn about the characteristics of living things"

In [49]:
# Embed the query with the same function (and default model) used for the
# sub-chapter texts.
input_text_embedding = await async_embed_text(text=input_text)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [None]:
# Distance between the query embedding and the embedding stored at index label 9.
distance = euclidean_distance(input_text_embedding, df_rag['embedding'][9])

np.float64(1.2445130583268014)

In [None]:
# Distance from the query embedding to every stored sub-chapter embedding;
# smaller values mean the sub-chapter is closer to the query.
df_rag['distance'] = df_rag['embedding'].apply(
    lambda embedding: euclidean_distance(input_text_embedding, embedding)
)