In [None]:
import sys
sys.path.append('./src')
import pandas as pd
from app.llm import async_embed_text, async_response_openai, GenText
from app.ranker import retrieve_top_k, euclidean_distance
from app.prompts import Prompts
import logging

# Configure logging: create this notebook's logger and set up the root logger
# through basicConfig with the INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


## Load texts

In [None]:
import os

root_dir = "book"   # top-level book directory

# Walk the chapter directories under `root_dir` and collect one record per
# sub-chapter text file. The parsing below expects this layout:
#   <root_dir>/<chapter_number>__<chapter_title>/
#       <subchapter_number>__<subchapter_title>__<page>.txt
book_data = []
for chapter, subdirs, files in os.walk(root_dir):
    # os.walk yields entries in filesystem-dependent order; sorting in place
    # makes both the traversal and the resulting row order reproducible.
    subdirs.sort()
    if chapter == root_dir:
        continue

    # Use the platform-aware basename instead of splitting on a hard-coded
    # backslash, which only works when paths use Windows separators.
    chapter_dirname = os.path.basename(chapter)
    chapter_number = int(chapter_dirname.split('__')[0])
    chapter_title = chapter_dirname.split('__')[-1]

    for file in sorted(files):
        # Filter before parsing: a stray non-text file (e.g. an OS metadata
        # file) does not follow the naming scheme and would otherwise raise.
        if not file.endswith(".txt"):
            continue

        name_parts = file.split('__')
        subchapter_number = name_parts[0]
        subchapter_title = name_parts[1]
        # Last part looks like "<page>.txt"; keep only the text before the first dot.
        subchapter_page = int(name_parts[-1].split('.')[0])

        filepath = os.path.join(chapter, file)
        with open(filepath, "r", encoding="utf-8") as f:
            subchapter_text = f.read()

        book_data.append(
            {
                "chapter_number": chapter_number,
                "chapter_title": chapter_title,
                "subchapter_number": subchapter_number,
                "subchapter_title": subchapter_title,
                "subchapter_page": subchapter_page,
                "subchapter_text": subchapter_text
            }
        )


In [None]:
# Turn the collected sub-chapter records into a DataFrame and persist it as
# parquet (index not written).
df = pd.DataFrame(data=book_data)
df.to_parquet("book.parquet", index=False)

## Embeddings

In [None]:
# Select the rows to embed: here the whole book. This binds a second name to
# the same DataFrame object as `df`; it is not a copy.
df_partition = df

In [None]:
# Plain Python list of the sub-chapter texts, in row order.
text_list = df_partition["subchapter_text"].tolist()

In [None]:
import asyncio

tasks = [
    async_embed_text(text=text)
    for text in text_list
    ]

embeddings = await asyncio.gather(*tasks, return_exceptions=True)

In [None]:
import asyncio

async def embed_text_list_in_chunks(text_list, chunk_size=20):
    """Embed texts in consecutive batches, preserving input order.

    Each batch of up to ``chunk_size`` texts is passed to ``async_embed_text``
    and awaited together with ``asyncio.gather``; the next batch is only
    started once the current one has finished.

    Args:
        text_list: Sequence of texts to embed (must support ``len`` and slicing).
        chunk_size: Maximum number of texts per batch; must be at least 1.

    Returns:
        A list with one entry per input text, in input order. Because gather
        is called with ``return_exceptions=True``, the entry for a failed call
        is the exception instance rather than an embedding.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return [] for a
    # non-empty input, so reject invalid sizes up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    results = []
    for start in range(0, len(text_list), chunk_size):
        batch = text_list[start:start + chunk_size]
        batch_results = await asyncio.gather(
            *(async_embed_text(text=text) for text in batch),
            return_exceptions=True,
        )
        results.extend(batch_results)

    return results


# Usage: embed every sub-chapter text with the default chunk size. This rebinds
# `embeddings`, replacing the result of the un-chunked gather in the previous cell.
embeddings = await embed_text_list_in_chunks(text_list)

In [None]:
# The embedding cells gather with return_exceptions=True, so a failed call is
# stored in `embeddings` as an exception object instead of being raised.
# Stop here with a clear message rather than writing those objects to disk.
failed_rows = [
    position
    for position, item in enumerate(embeddings)
    if isinstance(item, BaseException)
]
if failed_rows:
    first_failed = failed_rows[0]
    raise RuntimeError(
        f"{len(failed_rows)} of {len(embeddings)} embedding calls failed "
        f"(first failing row: {first_failed}): {embeddings[first_failed]!r}"
    )

# Attach the embeddings as a new column (row order matches `text_list`) and save.
df_partition_embeddings = df_partition.assign(embedding=embeddings)
df_partition_embeddings.to_parquet('book_partition_2.parquet', index=False)

## Retrieval

In [None]:
# Load the embedded book used for retrieval.
# NOTE(review): no visible cell writes 'book_partition_full.parquet' (the
# embeddings section saves 'book_partition_2.parquet') — confirm where this
# file is produced.
df_rag = pd.read_parquet('book_partition_full.parquet')

In [None]:
# Retrieval settings and the inputs to grade.
retrieval_top_k = 1  # number of sub-chapters to retrieve

# Answer given by the student.
student_answer = "c. proximal and distal portions"

# Question text goes between the triple quotes (currently blank lines only).
question = """

"""

In [None]:
# Embed the question with the same helper used for the book texts; the result
# is passed to retrieve_top_k as the query embedding.
question_text_embedding = await async_embed_text(text=question)

In [None]:
# Rank the embedded sub-chapters against the question embedding and keep the
# `retrieval_top_k` best rows.
df_rag_ranked = retrieve_top_k(
    top_k=retrieval_top_k,
    query_embedding=question_text_embedding,
    df_rag=df_rag,
)

In [None]:
# Page numbers of the retrieved sub-chapters, and their texts joined with
# " \n" into a single context string (displayed as the cell output).
retrieved_pages = df_rag_ranked.subchapter_page.values
retrieved_text = " \n".join(df_rag_ranked.subchapter_text.values)
retrieved_text


## LLM feedback

In [None]:


system_prompt, user_prompt = await Prompts.feedback(
    question=question,
    student_answer=student_answer,
    retrieved_text=retrieved_text
    )

In [None]:
response = await async_response_openai(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    model = 'gpt-4o-mini',
    response_model=GenText,
    temperature=0.0001
)

In [None]:
# Display the `text` attribute of the response returned by async_response_openai.
response.text

In [None]:
# First retrieved page number, converted to a plain Python int.
int(retrieved_pages[0])

In [None]:
# Recompute the retrieved page numbers and the joined context text from the
# ranked frame (same statements as the earlier retrieval cell).
retrieved_pages = df_rag_ranked.subchapter_page.values
retrieved_text = " \n".join(df_rag_ranked.subchapter_text.values)
retrieved_text

In [None]:
# Print the page numbers of the retrieved sub-chapters.
print(retrieved_pages)

In [None]:
# Build one human-readable citation per retrieved sub-chapter.
# Every field is read from the row being iterated (not from variables left
# over from the book-loading loop), and the numbering comes from enumerate()
# so it is always 1..k regardless of the index labels of the ranked frame.
citations = []
for rank, (_, row) in enumerate(df_rag_ranked.iterrows(), start=1):
    citations.append(
        f"{rank}. Chapter: {row.chapter_number}. {row.chapter_title}"
        f" | Subchapter {row.subchapter_number}. {row.subchapter_title}"
        f" | page: {row.subchapter_page}"
    )
    print(row.chapter_number)
    print(row.chapter_title)
    print(row.subchapter_number)
    print(row.subchapter_title)
    print(row.subchapter_page)

## Tokenize

In [None]:
import tiktoken

In [None]:
# Tokenizer that tiktoken associates with the "gpt-4o" model name.
# NOTE(review): the feedback request uses 'gpt-4o-mini' — presumably the same
# encoding; confirm.
enc = tiktoken.encoding_for_model("gpt-4o")

In [None]:
# Sanity check: encode a short sample string and show the result.
enc.encode(text="This is it")

In [None]:
# Encode every sub-chapter text and store the result in a new `tokens` column.
df['tokens'] = df.subchapter_text.apply(lambda passage: enc.encode(text=passage))

In [None]:
# Encode a single sub-chapter (the row labelled 4) for inspection.
tokenized_text = enc.encode(text=df.loc[4, 'subchapter_text'])

In [None]:
# Number of tokens in the sample sub-chapter encoded in the previous cell.
len(tokenized_text)

In [None]:
# Token count per sub-chapter, stored in a new `tokens_len` column.
df['tokens_len'] = df['tokens'].apply(len)

In [None]:
# Show the sub-chapters ordered from most to fewest tokens.
df.sort_values(by='tokens_len', ascending=False)