In [25]:
import os
import openai
import sycamore
from sycamore.transforms.partition import ArynPartitioner
from dotenv import load_dotenv

def process_pdf_return_tuple(file_name: str) -> tuple[list[float], str, str, str]:
    """
    Reads a PDF file via Sycamore, partitions its content, summarizes the text
    with an OpenAI model, and generates embeddings. Returns a tuple:
        ([embedding_vector], file_name, summary_text, entire_document_content).

    Args:
        file_name (str): The name (or path) of the PDF file to process.

    Returns:
        tuple: A tuple with the following elements:
            0) [embedding_vector] (list[float]): The embedding vector of the summary
            1) file_name (str): The original file name/path
            2) summary_text (str): The summarized text content
            3) entire_document_content (str): The full text content extracted from the PDF
    """
    # Load environment variables (e.g. ARYN_API_KEY, OpenAI keys, etc.)
    load_dotenv()
    
    # Initialize Sycamore context (only needs to be done once per session)
    context = sycamore.init()

    # 1) Read the PDF and partition
    docset = context.read.binary(
        paths=[file_name], 
        binary_format="pdf"
    ).partition(
        partitioner=ArynPartitioner(
            extract_table_structure=True,
            aryn_api_key=os.getenv("ARYN_API_KEY"),
            use_ocr=True,
            ocr_images=True
        )
    )

    # 2) Execute the partitioning
    docset.execute()

    # 3) Collect all documents
    documents = docset.take_all()

    # 4) Create a single string of the entire document content
    elements = documents[0].elements if documents else []
    lines = []
    for elem in elements:
        text_content = elem.get("text_representation", "")
        if text_content:
            # Optional formatting. You can adjust as needed.
            lines.append(f"[{elem['type']}]: {text_content}")
    entire_document_content = "\n".join(lines)

    # 5) Use an OpenAI model to summarize the document
    #    (Adjust the model, prompt, and parameters according to your usage.)
    prompt = (
        "Summarize the following lecture notes into one paragraph in detail; "
        "including the important key terms, key words, and key points:\n\n"
        f"{entire_document_content}"
    )

    try:
        response = openai.chat.completion.create(
            model="o1",  # or "o1-mini", or any other model you have access to
            messages=[{"role": "user", "content": prompt}],
            max_tokens=10000  # Adjust if needed
        )
        summary_text = response.choices[0].message.content.strip()
    except Exception as e:
        print("Error during summarization:", e)
        summary_text = ""

    # 6) Generate embeddings for the summarized text
    #    (You may need to adapt the model name for your embedding usage.)
    try:
        embedding_response = openai.embeddings.create(
            model="text-embedding-3-large",  # Example embedding model
            input=summary_text,
            encoding_format="float",         # Example parameter
        )
        embedding_vector = embedding_response.data[0].embedding
    except Exception as e:
        print("Error generating embedding:", e)
        embedding_vector = []

    # Return the tuple in the desired format
    return (embedding_vector, file_name, summary_text, entire_document_content)




In [26]:
file_names = ["data/Fluid Dynamics.pdf", "data/Lab Notes.pdf", "data/Light.pdf", "data/Oscillations.pdf", "data/Circuit.pdf"]
result = [process_pdf_return_tuple(x) for x in file_names]

2025-01-12 05:23:15,535	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-11_19-27-00_140008_13532\logs\ray-data
2025-01-12 05:23:15,536	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]
- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

Running Dataset. Active & requested resources: 2/14 CPU, 265.7MB/2.5GB object store: : 0.00 row [00:06, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 9.7MB object store: : 0.00 row [00:01, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 9.7MB object store: : 1.00 row [00:01, 1.06s/ row]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 9.7MB object store: : 1.00 row [00:01, 1.06s/ row]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap): Tasks: 4; Actors: 1; Queued blocks: 16; Resources: 1.0 CPU, 256.0MB object store; [locality off]: : 0.00 row [00:01, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap): Tasks: 4; Actors: 1; Queued blocks: 16; Resources: 1.0 CPU, 256.0MB object store; [locality off]: : 0.00 row [00:01, ? ro

Error during summarization: 'Chat' object has no attribute 'completion'


2025-01-12 05:24:04,219	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-11_19-27-00_140008_13532\logs\ray-data
2025-01-12 05:24:04,220	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]
Running 0: 0.00 row [00:00, ? row/s]
- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

Running Dataset. Active & requested resources: 1/14 CPU, 0.0B/2.5GB object store: : 0.00 row [00:05, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 596.0KB object store: : 0.00 row [00:01, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 596.0KB object store: : 1.00 row [00:01, 1.04s/ row]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 596.0KB obje

Error during summarization: 'Chat' object has no attribute 'completion'


2025-01-12 05:24:23,369	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-11_19-27-00_140008_13532\logs\ray-data
2025-01-12 05:24:23,370	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]
Running 0: 0.00 row [00:00, ? row/s]
- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

Running Dataset. Active & requested resources: 1/14 CPU, 0.0B/2.5GB object store: : 0.00 row [00:05, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 6.4MB object store: : 0.00 row [00:01, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 6.4MB object store: : 1.00 row [00:01, 1.08s/ row]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 6.4MB object sto

Error during summarization: 'Chat' object has no attribute 'completion'


2025-01-12 05:24:57,015	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-11_19-27-00_140008_13532\logs\ray-data
2025-01-12 05:24:57,017	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]
Running 0: 0.00 row [00:00, ? row/s]
- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

Running Dataset. Active & requested resources: 1/14 CPU, 0.0B/2.5GB object store: : 0.00 row [00:05, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object store: : 0.00 row [00:01, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object store: : 1.00 row [00:01, 1.06s/ row]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object sto

Error during summarization: 'Chat' object has no attribute 'completion'


2025-01-12 05:25:28,704	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-11_19-27-00_140008_13532\logs\ray-data
2025-01-12 05:25:28,705	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]
Running 0: 0.00 row [00:00, ? row/s]
- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

Running Dataset. Active & requested resources: 2/14 CPU, 261.0MB/2.5GB object store: : 0.00 row [00:06, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object store: : 0.00 row [00:01, ? row/s]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object store: : 1.00 row [00:01, 1.08s/ row]
- ReadBinary->SplitBlocks(28): Tasks: 1; Queued blocks: 0; Resources: 1.0 CPU, 5.0MB object 

Error during summarization: 'Chat' object has no attribute 'completion'


In [27]:
print(result)

[([-0.0006984524, -0.019356806, -0.032198977, 0.012672074, -0.039972313, -0.00816881, 0.010129155, 0.08035286, 0.0011332576, 0.015631726, -0.029154275, 0.028592963, -0.028252771, 0.014925832, -0.0019645968, -0.0017487888, -0.030821206, 0.0074203922, -0.03160364, 0.02056448, 0.020003166, -0.01465368, -0.06973893, 0.03435918, 0.013097311, -0.04136709, -0.0136926435, 0.0067272554, -0.017774923, 0.0016573627, 0.028422868, 0.03160364, -0.001843404, 0.02934138, -0.017111553, 0.0064551034, -0.010639439, -0.037182756, -0.005515329, 0.015104432, -0.012034218, -0.0023792032, -0.020292329, 0.021704117, -0.0015903878, 0.02456171, 0.020122234, -0.0012810277, 0.0076372633, 0.00875989, -0.020938689, -0.034240115, -0.008640824, -0.0015308546, -0.021074764, -0.022894781, 0.014245452, -0.056131337, -0.0038016222, -0.024459654, 0.009516813, -0.00084569084, -0.013454511, 0.011991695, 0.010809534, 0.00041061986, 0.0026088313, -0.017672867, 0.018999606, 0.05456646, 0.009695413, 0.011209258, 0.011030658, 0.0

In [30]:
import json

with open('result.json', 'w') as f:
    json.dump(result, f, indent=4)