### Import libraries

In [1]:
import base64
import os
import os.path

import pytesseract
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase.client import Client, create_client
from unstructured.partition.pdf import partition_pdf

  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

#### Partition PDF texts, tables, and images
Use __[Unstructured](https://docs.unstructured.io/open-source/core-functionality/partitioning)__ to partition the PDF into text, table, and image elements.

In [2]:
# Location of the Tesseract OCR executable. The TESSERACT_CMD environment
# variable overrides the default Windows install path, so the notebook is not
# tied to one machine's directory layout; when it is unset the previous
# hard-coded path is used unchanged.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# The PDF is looked up in the current working directory
input_path = os.getcwd()
output_path = os.path.join(os.getcwd(), "output")

# Partition the PDF into chunked elements (texts and tables) and extract images.
# NOTE(review): images are requested under `output_path` ("output"), but the
# image-encoding cell further down reads from a "figures" directory — confirm
# where this version of unstructured actually writes the extracted images.
raw_pdf_elements = partition_pdf(
    filename=os.path.join(input_path, "Algebra-and-Trigonometry-2e-WEB-769-1035.pdf"),
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=output_path
)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Fresh, independent containers for the three kinds of content taken from the PDF
text_elements, table_elements, image_elements = [], [], []

# Helper that turns an image file into a base64 string
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes base64-encoded.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's contents, decoded to a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Sort the partitioned chunks by the name of their class and keep only their
# text: anything whose type name contains 'CompositeElement' is treated as
# text; otherwise anything whose type name contains 'Table' is a table.
# Chunks matching neither are ignored.
composite_texts = []
table_texts = []
for chunk in raw_pdf_elements:
    type_label = str(type(chunk))
    if 'CompositeElement' in type_label:
        composite_texts.append(chunk.text)
    elif 'Table' in type_label:
        table_texts.append(chunk.text)

table_elements = table_texts
text_elements = composite_texts

# Report the table count first, then the text count (one number per line)
print(len(table_elements), len(text_elements), sep="\n")

59
192


In [4]:
# Spot-check: display the extracted text chunk at index 4
text_elements[4]

'y = Asin(Bx) and y =\n\nAcos (Bx)\n\nThe amplitude is which is the vertical height from the midline In addition, notice in the example that\n\n|A| = amplitude = > lmaximum — minimum|\n\nEXAMPLE 2\n\nIdentifying the Amplitude of a Sine or Cosine Function What is the amplitude of the sinusoidal function\n\nIs the function stretched or compressed vertically?\n\nSolution\n\nLet’s begin by comparing the function to the simplified form\n\nIn the given function, so the amplitude is The function is stretched.\n\nAccess for free at openstax.org\n\n8.1 • Graphs of the Sine and Cosine Functions\n\nAnalysis\n\nThe negative value of results in a reflection across the x-axis of the sine function, as shown in Figure 10.\n\nF(x)\n\nFigure 10\n\nTRY IT #2 What is the amplitude of the sinusoidal function Is the function stretched or compressed vertically?\n\nAnalyzing Graphs of Variations of y= sin xand y= cos x\n\nNow that we understand how and explore the variables relate to the general form equation

In [5]:
input_path = os.getcwd()
# NOTE(review): images are read from "figures" although the partition cell asks
# for them to be written under "output" — confirm which directory is correct.
output_path = os.path.join(os.getcwd(), "figures")

# Encode images to base64 form.
# The list is rebuilt from scratch so that re-running this cell does not append
# a second copy of every image. The directory listing is sorted because
# os.listdir() returns entries in arbitrary order; sorting keeps the indices of
# image_elements (and of any summaries derived from them) reproducible.
# Extensions are compared case-insensitively so files such as "x.PNG" are kept.
image_elements = [
    encode_image(os.path.join(output_path, image_file))
    for image_file in sorted(os.listdir(output_path))
    if image_file.lower().endswith(('.png', '.jpg', '.jpeg'))
]
print(len(image_elements))

387


#### Summarize text, tables, and images

In [6]:
# Load variables from a .env file (if any) into the environment
load_dotenv()

# Read the OpenAI API key from the environment.
# NOTE(review): the key is not passed to ChatOpenAI explicitly — presumably the
# client reads OPENAI_API_KEY from the environment itself; confirm.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Two chat models with different output limits: the mini model (up to 4000
# output tokens) and the full gpt-4o model (up to 1024 output tokens)
chain_gpt_4o_mini = ChatOpenAI(
    model="gpt-4o-mini-2024-07-18",
    max_tokens=4000,
)
chain_gpt_4o = ChatOpenAI(
    model="gpt-4o-2024-08-06",
    max_tokens=1024,
)

  chain_gpt_4o_mini = ChatOpenAI(model="gpt-4o-mini-2024-07-18", max_tokens=4000)


In [7]:
# Text summarization helper
def summarize_text(text_element):
    """Ask the gpt-4o-mini chat model to summarize one text chunk.

    Args:
        text_element: The text to summarize; it is interpolated into the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    request = HumanMessage(
        content=f"Summarize the following text:\n\n{text_element}\n\nSummary:"
    )
    reply = chain_gpt_4o_mini.invoke([request])
    return reply.content

# Table summarization helper
def summarize_table(table_element):
    """Ask the gpt-4o-mini chat model to summarize one table.

    Args:
        table_element: The table content to summarize; it is interpolated into
            the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    request = HumanMessage(
        content=f"Summarize the following table:\n\n{table_element}\n\nSummary:"
    )
    reply = chain_gpt_4o_mini.invoke([request])
    return reply.content

# Function for image summaries
def _image_mime_type(encoded_image):
    """Guess the MIME type of a base64-encoded image from its first characters."""
    # The 8-byte PNG file signature always base64-encodes to this prefix.
    if encoded_image.startswith("iVBORw0KGg"):
        return "image/png"
    # Anything else keeps the previous behaviour of being labelled as JPEG.
    return "image/jpeg"


def summarize_image(encoded_image):
    """Ask the gpt-4o chat model to describe one base64-encoded image.

    Args:
        encoded_image: Base64 string of the image bytes (as produced by
            ``encode_image``).

    Returns:
        The ``content`` attribute of the model's reply.
    """
    # Label the data URL with the image's real type; the encoding cell accepts
    # .png files as well as .jpg/.jpeg, so a fixed "image/jpeg" was wrong for PNGs.
    mime_type = _image_mime_type(encoded_image)
    prompt = [
        # The instruction is a system prompt, not something the assistant said,
        # so it is sent as a SystemMessage rather than an AIMessage.
        SystemMessage(content="You are a bot that is good at analyzing images."),
        HumanMessage(content=[
            {"type": "text", "text": "Describe the contents of this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded_image}"
                },
            },
        ])
    ]
    response = chain_gpt_4o.invoke(prompt)
    return response.content

In [None]:
# # Processing table elements with progress feedback (cell disabled; no sleep is performed)
# table_summaries = []
# for i, te in enumerate(table_elements):
#     summary = summarize_table(te)
#     table_summaries.append(summary)
#     print(f"{i + 1}th element of tables processed.")

In [18]:
# Summarize every text chunk, printing progress after each model call.
# (No sleep/throttling is performed between calls.)
text_summaries = []
total_texts = len(text_elements)
for position, text_chunk in enumerate(text_elements, start=1):
    text_summaries.append(summarize_text(text_chunk))
    # "n/total" avoids the incorrect ordinals ("1th", "2th", "21th") that a
    # fixed "th" suffix produced.
    print(f"Processed text element {position}/{total_texts}.")

1th element of texts processed.
2th element of texts processed.
3th element of texts processed.
4th element of texts processed.
5th element of texts processed.
6th element of texts processed.
7th element of texts processed.
8th element of texts processed.
9th element of texts processed.
10th element of texts processed.
11th element of texts processed.
12th element of texts processed.
13th element of texts processed.
14th element of texts processed.
15th element of texts processed.
16th element of texts processed.
17th element of texts processed.
18th element of texts processed.
19th element of texts processed.
20th element of texts processed.
21th element of texts processed.
22th element of texts processed.
23th element of texts processed.
24th element of texts processed.
25th element of texts processed.
26th element of texts processed.
27th element of texts processed.
28th element of texts processed.
29th element of texts processed.
30th element of texts processed.
31th element of tex

In [19]:
# Spot-check: display the generated summary at index 21
text_summaries[21]

"The text provides a guide on how to graph one period of periodic functions, particularly focusing on variations of the secant and cosecant functions. It outlines a step-by-step approach that includes expressing the function in a specific form, identifying the stretching/compressing factor, determining the period and phase shift, and then graphing the function while adjusting for these factors. It emphasizes the importance of vertical shifts and stretches/compressions on the range and domain of the function, particularly noting how asymptotes relate to the graph's characteristics. Examples illustrate the process, including identifying asymptotes and plotting key points to complete the graph."

In [20]:
# Summarize every encoded image, printing progress after each model call.
# (No sleep/throttling is performed between calls.)
image_summaries = []
total_images = len(image_elements)
for position, encoded_image in enumerate(image_elements, start=1):
    image_summaries.append(summarize_image(encoded_image))
    # "n/total" avoids the incorrect ordinals ("1th", "2th", "21th") that a
    # fixed "th" suffix produced.
    print(f"Processed image element {position}/{total_images}.")

1th element of images processed.
2th element of images processed.
3th element of images processed.
4th element of images processed.
5th element of images processed.
6th element of images processed.
7th element of images processed.
8th element of images processed.
9th element of images processed.
10th element of images processed.
11th element of images processed.
12th element of images processed.
13th element of images processed.
14th element of images processed.
15th element of images processed.
16th element of images processed.
17th element of images processed.
18th element of images processed.
19th element of images processed.
20th element of images processed.
21th element of images processed.
22th element of images processed.
23th element of images processed.
24th element of images processed.
25th element of images processed.
26th element of images processed.
27th element of images processed.
28th element of images processed.
29th element of images processed.
30th element of images 

In [21]:
# Spot-check: display the generated image description at index 10
image_summaries[10]

'The image shows the Giza Pyramids located in Egypt. There are three large pyramids visible, with a few smaller structures in the foreground. The desert landscape surrounds the pyramids, and a cityscape can be seen in the background under a hazy sky. The scene is characterized by its sandy terrain and the iconic ancient pyramids.'

#### Split text into smaller chunks

In [27]:
# Combine all text and image summaries into a single string.
# Summaries are joined with a blank line: plain concatenation glued the end of
# one summary to the start of the next (e.g. "...for clarity.Table 3 ..."),
# and the blank line gives the splitter a paragraph boundary to cut on.
# Only text and image summaries are included here; no table summaries are used.
full_text = "\n\n".join([*text_summaries, *image_summaries])

# Split the combined text into overlapping chunks; sizes are measured in
# characters because the length function is len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_text(text=full_text)

### Embedding

Documentation:
* HuggingFace Embedding model: https://python.langchain.com/v0.2/docs/integrations/platforms/huggingface/
* Store embedding result to vector database: https://python.langchain.com/v0.2/docs/integrations/vectorstores/supabase/

In [28]:
# Settings for the embedding model: the BAAI/bge-m3 checkpoint, on the CPU,
# with 'binary' passed as the encode precision
model_name = "BAAI/bge-m3"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(precision='binary')

# Build the embedding object used by the vector store
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [29]:
# Load variables from a .env file (if any) into the environment
load_dotenv()

# Supabase connection settings, read from the environment
PUBLIC_SUPABASE_URL = os.environ.get('NEXT_PUBLIC_SUPABASE_URL')
PUBLIC_SUPABASE_ANON_KEY = os.environ.get('NEXT_PUBLIC_SUPABASE_ANON_KEY')

# Create the Supabase client from the URL and anon key
supabase: Client = create_client(
    PUBLIC_SUPABASE_URL,
    PUBLIC_SUPABASE_ANON_KEY,
)

In [30]:
# First-time ingestion (disabled): embed `chunks` and write them to the database.
# vector_store = SupabaseVectorStore.from_texts(
#     chunks,
#     embedding=hf,
#     client=supabase,
#     table_name="documents_bge_binary",
#     query_name="match_documents_bge_binary",
#     chunk_size=1024
# )

# The documents and their embeddings are already stored, so attach to the
# existing table instead of re-ingesting.
# NOTE(review): this connects to "documents_bge" / "match_documents_bge", while
# the disabled ingestion above targets the "*_bge_binary" names and the
# embedding model is configured with precision 'binary' — confirm that the
# table queried here matches the embeddings produced by `hf`.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=hf,
    table_name="documents_bge",
    query_name="match_documents_bge",
)

#### Test the embedding result

In [31]:
# Test query, written in Indonesian ("applications of trigonometry in everyday life")
query = "aplikasi trigonometri di kehidupan sehari-hari"
# Retrieve the stored chunks most similar to the query
matched_docs = vector_store.similarity_search(query)

In [32]:
# Show the text of the result at index 3 (the fourth match returned)
print(matched_docs[3].page_content)

how they can represent an ellipse through counterclockwise mapping of values. It emphasizes the efficiency of using a graphing calculator set to parametric mode, which helps visualize the graph more easily than manual calculations. The text also includes an example of graphing parametric equations alongside their rectangular form for comparison, suggesting the use of a table of values for clarity.Table 3 discusses various further applications of trigonometry, highlighting its relevance in different fields and practical scenarios. It emphasizes how trigonometric principles can be utilized beyond basic calculations, potentially aiding in areas such as physics, engineering, and architecture. The summary underscores the versatility of trigonometry in solving real-world problems.The text discusses the process of translating parametric equations into rectangular form and highlights how both representations can be graphed together to show they produce the same result. It provides a step-by-st