In [4]:
import sys
sys.path.append('./src')
import pandas as pd
from app.llm import async_embed_text, async_response_openai, GenText
from app.ranker import retrieve_top_k
from app.prompts import Prompts
import logging

# Route INFO-and-above records through the root logger
# (basicConfig is a no-op if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Logger named after the current module.
logger = logging.getLogger(name=__name__)


## Load texts

In [5]:
import os

root_dir = "book"   # top-level book directory


def parse_chapter_dir(chapter_path):
    """Split a chapter directory path into ``(chapter_number, chapter_title)``.

    The directory name is expected to look like ``<number>__<title>``.
    ``os.path.basename`` is used instead of splitting on a hard-coded
    backslash, so the parse works with the separator ``os.walk`` produces
    on the host OS.

    Raises:
        ValueError: if the part before the first ``__`` is not an integer.
    """
    dir_name = os.path.basename(os.path.normpath(chapter_path))
    parts = dir_name.split('__')
    return int(parts[0]), parts[-1]


def parse_subchapter_file(file_name):
    """Split ``<number>__<title>__<page>.<ext>`` into ``(number, title, page)``.

    The subchapter number is kept as a string (e.g. ``"4.2d"``); the page is
    converted to ``int``.

    Raises:
        IndexError: if the name contains no ``__`` separator.
        ValueError: if the page component is not an integer.
    """
    parts = file_name.split('__')
    page = int(parts[-1].split('.')[0])
    return parts[0], parts[1], page


book_data = []
for chapter, subdirs, files in os.walk(root_dir):
    # Skip the root itself; only its subdirectories are parsed as chapters.
    if chapter == root_dir:
        continue
    chapter_number, chapter_title = parse_chapter_dir(chapter)

    for file in files:
        # Filter on the extension BEFORE parsing the name, so stray files
        # that do not follow the naming scheme cannot abort the whole load.
        if not file.endswith(".txt"):
            continue
        subchapter_number, subchapter_title, subchapter_page = parse_subchapter_file(file)

        filepath = os.path.join(chapter, file)
        with open(filepath, "r", encoding="utf-8") as f:
            subchapter_text = f.read()
        book_data.append(
            {
                "chapter_number": chapter_number,
                "chapter_title": chapter_title,
                "subchapter_number": subchapter_number,
                "subchapter_title": subchapter_title,
                "subchapter_page": subchapter_page,
                "subchapter_text": subchapter_text
            }
        )


In [6]:
# Build the segmented-book table and persist it as Parquet (row index not written).
df = pd.DataFrame(data=book_data)
df.to_parquet(path="book.parquet", index=False)

## Embeddings

In [None]:
# Keep only the rows whose chapter_number is 1.
is_first_chapter = df["chapter_number"] == 1
df_partition = df[is_first_chapter]

In [None]:
# Plain Python list of the subchapter texts, in partition row order.
text_list = df_partition["subchapter_text"].tolist()

In [None]:
import asyncio

tasks = [
    async_embed_text(text=text)
    for text in text_list
    ]

embeddings = await asyncio.gather(*tasks, return_exceptions=True)

In [None]:
# Work on a copy so df_partition itself is left without the new column,
# then save the partition together with its embeddings.
df_partition_embeddings = df_partition.copy()
df_partition_embeddings["embedding"] = embeddings
df_partition_embeddings.to_parquet(path="book_partition_1.parquet", index=False)

## Retrieval

In [None]:
# Reload the embedded partition from disk as the retrieval corpus.
df_rag = pd.read_parquet(path="book_partition_1.parquet")

In [None]:
# Multiple-choice question sent to the embedder and the LLM. It is spelled out
# line by line so the leading newline and the trailing space after "which"
# are visible and survive editors that strip trailing whitespace.
question = (
    "\n"
    "Cutting a midsagittal section through the body separates which \n"
    "parts of the body?\n"
    " a. anterior and posterior portions\n"
    " b. superior and inferior portions\n"
    " c. proximal and distal portions\n"
    " d. right and left halves\n"
)

# The answer the student selected for the question above.
student_answer = "c. proximal and distal portions"

# Value passed as top_k to retrieve_top_k.
retrieval_top_k = 1

In [None]:
# Embed the question text; the result is passed to retrieve_top_k as query_embedding.
question_text_embedding = await async_embed_text(text=question)

In [None]:
# Query the corpus with the question embedding.
# NOTE(review): presumably returns the top_k most similar rows of df_rag —
# confirm against app.ranker.retrieve_top_k.
df_rag_ranked = retrieve_top_k(
    df_rag=df_rag,
    query_embedding=question_text_embedding,
    top_k=retrieval_top_k,
)

In [None]:
# Page numbers of the retrieved rows (the column's underlying array).
retrieved_pages = df_rag_ranked["subchapter_page"].values
# Join the retrieved passages with " \n" into a single context string.
retrieved_passages = df_rag_ranked["subchapter_text"].values
retrieved_text = " \n".join(retrieved_passages)
retrieved_text


## LLM feedback

In [None]:


system_prompt, user_prompt = await Prompts.feedback(
    question=question,
    student_answer=student_answer,
    retrieved_text=retrieved_text
    )

In [None]:
response = await async_response_openai(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    model = 'gpt-4o-mini',
    response_model=GenText,
    temperature=0.0001
)

In [None]:
# Display the `text` attribute of the response
# (presumably a GenText instance, given response_model=GenText — confirm).
response.text

In [None]:
# Page number of the first retrieved row, converted with int().
int(retrieved_pages[0])

In [None]:
# NOTE(review): this cell recomputes the same retrieved_pages / retrieved_text
# values as the earlier retrieval cell; one of the two is redundant.
retrieved_pages = df_rag_ranked["subchapter_page"].values
# Join the retrieved passages with " \n" into a single context string.
retrieved_passages = df_rag_ranked["subchapter_text"].values
retrieved_text = " \n".join(retrieved_passages)
retrieved_text

In [None]:
# Print the page numbers of the retrieved rows.
print(retrieved_pages)

In [None]:
# Build one human-readable citation per retrieved row.
# Every field is read from `row`: the bare names chapter_number,
# chapter_title, subchapter_* are leftovers of the book-loading loop and
# describe the last file read, not the retrieved passage.
citations = []
# Number citations by their position in the ranked result (1-based); the
# DataFrame index label is not guaranteed to be a 0..k-1 rank.
for rank, (_, row) in enumerate(df_rag_ranked.iterrows(), start=1):
    citations.append(
        f"{rank}. Chapter: {row.chapter_number}. {row.chapter_title}"
        f" | Subchapter {row.subchapter_number}. {row.subchapter_title}"
        f" | page: {row.subchapter_page}"
    )
    print(row.chapter_number)
    print(row.chapter_title)
    print(row.subchapter_number)
    print(row.subchapter_title)
    print(row.subchapter_page)

## Tokenize

In [7]:
import tiktoken

In [2]:
# Get the tiktoken encoding associated with the "gpt-4o" model name.
enc = tiktoken.encoding_for_model("gpt-4o")

In [3]:
# Sanity check: encode a short string and inspect the resulting token ids.
enc.encode(text="This is it")

[2500, 382, 480]

In [16]:
# Token ids for every subchapter text, stored as a new `tokens` column on df.
df["tokens"] = df["subchapter_text"].map(lambda passage: enc.encode(text=passage))

In [14]:
# Tokenize a single sample: the subchapter text stored under index label 4.
sample_text = df.loc[4, "subchapter_text"]
tokenized_text = enc.encode(text=sample_text)

In [15]:
# Number of tokens in the sampled subchapter text.
len(tokenized_text)

556

In [18]:
# Token count per subchapter, derived from the `tokens` column.
df["tokens_len"] = df["tokens"].apply(len)

In [21]:
# Show the subchapters ordered from the longest to the shortest token count.
df.sort_values(by="tokens_len", ascending=False)

Unnamed: 0,chapter_number,chapter_title,subchapter_number,subchapter_title,subchapter_page,subchapter_text,tokens,tokens_len
63,4,Tissue Level of Organization,4.2d,Classification of Connective Tissue,96,4 Tissue Level of Organization\n4.2 Connective...,"[19, 195379, 16541, 328, 32130, 198, 19, 13, 1...",6465
120,7,Axial Skeleton,7.1c,Bones of the Cranium,182,7 Axial Skeleton\n7.1 Skull\n7.1c Bones of the...,"[22, 34568, 563, 136367, 198, 22, 13, 16, 1449...",4450
131,7,Axial Skeleton,7.4c,Vertebral Anatomy,202,7 Axial Skeleton\n7.4 Vertebral Column\n7.4c V...,"[22, 34568, 563, 136367, 198, 22, 13, 19, 1258...",3713
20,2,The Cell; Basic Unit of Structure and Function,2.3c,Transport Across the Plasma Membrane,32,2 The Cell: Basic Unit of Structure and Functi...,"[17, 623, 23050, 25, 21976, 14856, 328, 52074,...",3669
24,2,The Cell; Basic Unit of Structure and Function,2.4c,Organelles,37,2 The Cell: Basic Unit of Structure and Functi...,"[17, 623, 23050, 25, 21976, 14856, 328, 52074,...",3653
...,...,...,...,...,...,...,...,...
482,25,Respiratory System,25.4,Lungs,752,25 Respiratory System\n25.4 Lungs\n----------\n,"[1161, 146100, 7295, 1219, 198, 1161, 13, 19, ...",12
520,26,Digestive System,26.7,Stomach,783,26 Digestive System\n26.7 Stomach\n----------\n,"[2109, 88507, 585, 1219, 198, 2109, 13, 22, 90...",12
231,13,Surface Anatomy,13.2,Head Region,391,13 Surface Anatomy\n13.2 Head Region\n--------...,"[1311, 46665, 139777, 198, 1311, 13, 17, 12578...",11
232,13,Surface Anatomy,13.3,Neck Region,393,13 Surface Anatomy\n13.3 Neck Region\n--------...,"[1311, 46665, 139777, 198, 1311, 13, 18, 61480...",11
