In [9]:
# import os
# import openai
# import sycamore
# from sycamore.transforms.partition import ArynPartitioner
# from dotenv import load_dotenv
# from openai import OpenAI

# # Load environment variables (e.g. ARYN_API_KEY, OpenAI keys, etc.)
# load_dotenv()

# client = OpenAI()

# def process_pdf_return_tuple(file_name: str) -> tuple[list[float], str, str, str]:
#     """
#     Reads a PDF file via Sycamore, partitions its content, summarizes the text
#     with an OpenAI model, and generates embeddings. Returns a tuple:
#         ([embedding_vector], file_name, summary_text, entire_document_content).

#     Args:
#         file_name (str): The name (or path) of the PDF file to process.

#     Returns:
#         tuple: A tuple with the following elements:
#             0) [embedding_vector] (list[float]): The embedding vector of the summary
#             1) file_name (str): The original file name/path
#             2) summary_text (str): The summarized text content
#             3) entire_document_content (str): The full text content extracted from the PDF
#     """
    
    
#     # Initialize Sycamore context (only needs to be done once per session)
#     context = sycamore.init()

#     # 1) Read the PDF and partition
#     docset = context.read.binary(
#         paths=[file_name], 
#         binary_format="pdf"
#     ).partition(
#         partitioner=ArynPartitioner(
#             extract_table_structure=True,
#             aryn_api_key=os.getenv("ARYN_API_KEY"),
#             use_ocr=True,
#             ocr_images=True
#         )
#     )

#     # 2) Execute the partitioning
#     docset.execute()

#     # 3) Collect all documents
#     documents = docset.take_all()

#     if not documents:
#         print(f"No documents found in {file_name}.")
#         return ([], file_name, "", "")

#     # 4) Create a single string of the entire document content
#     elements = documents[0].elements
#     lines = []
#     for elem in elements:
#         text_content = elem.get("text_representation", "")
#         if text_content:
#             # Optional formatting. You can adjust as needed.
#             lines.append(f"[{elem['type']}]: {text_content}")
#     entire_document_content = "\n".join(lines)

#     if not entire_document_content.strip():
#         print(f"No text content extracted from {file_name}.")
#         return ([], file_name, "", entire_document_content)

#     # 5) Use an OpenAI model to summarize the document
#     #    (Adjust the model, prompt, and parameters according to your usage.)
#     prompt = (
#         "Summarize the following lecture notes into one detailed paragraph, "
#         "including the important key terms, keywords, and key points:\n\n"
#         f"{entire_document_content}"
#     )

#     try:
#         response = client.chat.completion.create(
#             model="o1",  # Use a valid OpenAI model
#             messages=[{"role": "user", "content": prompt}],
#             max_tokens=500  # Adjust to a reasonable limit
#         )
#         summary_text = response.choices[0].message['content'].strip()
#         print(summary_text)
#         if not summary_text:
#             print("Received empty summary from OpenAI.")
#     except Exception as e:
#         print("Error during summarization:", e)
#         summary_text = ""

#     # 6) Generate embeddings for the summarized text
#     #    (You may need to adapt the model name for your embedding usage.)
#     if summary_text:
#         try:
#             embedding_response = openai.Embedding.create(
#                 model="text-embedding-3-large",  # Use a valid embedding model
#                 input=summary_text
#             )
#             embedding_vector = embedding_response['data'][0]['embedding']
#         except Exception as e:
#             print("Error generating embedding:", e)
#             embedding_vector = []
#     else:
#         print("No summary available to generate embeddings.")
#         embedding_vector = []

#     # Return the tuple in the desired format
#     return (embedding_vector, file_name, summary_text, entire_document_content)


In [10]:
import os
import openai
import sycamore
from sycamore.transforms.partition import ArynPartitioner
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

client = OpenAI()
# Optionally set your API key directly, or rely on env var OPENAI_API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

def process_pdf_return_tuple(file_name: str) -> tuple[list[float], str, str, str]:

    context = sycamore.init()

    docset = context.read.binary(
        paths=[file_name],
        binary_format="pdf"
    ).partition(
        partitioner=ArynPartitioner(
            extract_table_structure=True,
            aryn_api_key=os.getenv("ARYN_API_KEY"),
            use_ocr=True,
            ocr_images=True
        )
    )

    docset.execute()

    documents = docset.take_all()
    if not documents:
        print(f"No documents found in {file_name}.")
        return ([], file_name, "", "")

    elements = documents[0].elements
    lines = []
    for elem in elements:
        text_content = elem.get("text_representation", "")
        if text_content:
            lines.append(f"[{elem['type']}]: {text_content}")

    entire_document_content = "\n".join(lines)
    if not entire_document_content.strip():
        print(f"No text content extracted from {file_name}.")
        return ([], file_name, "", entire_document_content)

    prompt = (
        "Summarize the following lecture notes into one detailed paragraph, "
        "including the important key terms, keywords, and key points:\n\n"
        f"{entire_document_content}"
    )

    try:
        response = client.chat.completions.create(
            model="o1",  # must be valid
            messages=[{"role": "user", "content": prompt}],
            max_completion_tokens=10000
        )
        summary_text = response.choices[0].message.content.strip()
        print("Summary:", summary_text)
        if not summary_text:
            print("Received empty summary from OpenAI.")
    except Exception as e:
        print("Error during summarization:", e)
        # summary_text = ""

    if summary_text:
        try:
            embedding_response = client.embeddings.create(
                model="text-embedding-3-large",  # recommended embedding model
                input=summary_text
            )
            embedding_vector = embedding_response.data[0].embedding
        except Exception as e:
            print("Error generating embedding:", e)
            embedding_vector = []
    else:
        print("No summary available to generate embeddings.")
        embedding_vector = []

    return (embedding_vector, file_name, summary_text, entire_document_content)


In [11]:
file_names = ["data/Fluid Dynamics.pdf", "data/Lab Notes.pdf", "data/Light.pdf", "data/Oscillations.pdf", "data/Circuit.pdf"]
result = [process_pdf_return_tuple(x) for x in file_names]

2025-01-12 08:22:16,231	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:22:16,232	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

2025-01-12 08:22:41,447	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:22:41,448	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

Summary: These notes begin by examining fluid dynamics, emphasizing mass conservation (min = mout), the continuity of flow (A·v = constant), and Bernoulli’s equation, which relates pressure, velocity, and height; viscosity and Poiseuille’s law underscore how fluid resistance depends strongly on radius (r⁴). Next, temperature is described as the average kinetic energy of particles, with the 0th Law of Thermodynamics establishing thermal equilibrium: heat will flow from higher to lower temperature until equalized. Thermal expansion arises from increased atomic motion, and phase changes (solid, liquid, gas, plasma) involve latent heat where temperature plateaus. Heat transfer occurs via conduction (dependent on material properties like thermal conductivity k), convection (fluid flow driven by density differences), and radiation (electromagnetic emission, exemplified by blackbody radiation and the Stefan-Boltzmann law). The notes then move into the ideal gas law (PV = nRT) and kinetic theo

2025-01-12 08:23:21,698	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:23:21,699	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

2025-01-12 08:23:30,942	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:23:30,943	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

Summary: These notes emphasize the importance of recording exact measurements and organizing the data into clear tables for lab reports, focusing on wave phenomena and verifying the speed of sound (approximately 343 m/s). Wave length is measured peak to peak (λ = 1), and the relationship c = a·f (or c = 2f in certain setups) demonstrates how frequency and wave speed are related. The number of harmonics does not change the fundamental relationship, and analyses of forces (e.g., Fnet = F + Fay) show that net force in the x-direction is zero for a steady wave on a string. In measuring the speed of sound, a microphone, meter stick, PASCO interface, pipes or tubes, and a vibration generator are used; by changing the sample rate (to R kHz) and adjusting tube length, the frequency decreases as the tube is lengthened. Collecting at least five different lengths allows calculation of the slope (m), confirming m ≈ c and thereby verifying wave speed predictions. These procedures illustrate the nee

2025-01-12 08:23:53,166	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:23:53,168	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

2025-01-12 08:24:17,036	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:24:17,037	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

Summary: Light travels in straight lines through various media at speeds slower than its maximum value c = 3×10^8 m/s in a vacuum, giving rise to an index of refraction n = c/v. Reflection follows the law that the angle of incidence equals the angle of reflection, while refraction obeys Snell’s law, n₁ sinθᵢ = n₂ sinθᵣ, explaining how light bends and can undergo total internal reflection at a critical angle when it moves from higher to lower indices. In geometric optics, mirrors and thin lenses form images described by the mirror and lens equations; real images form where light rays converge, and virtual images appear where they only seem to originate. Wave properties such as diffraction and interference (constructive and destructive) emerge from Huygen’s Principle, with multi-slit setups producing sharper interference peaks and single-slit diffraction patterns depending on slit width. Polarization highlights the transverse nature of light, as aligned polarizing filters block or transm

2025-01-12 08:24:53,078	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:24:53,079	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

2025-01-12 08:25:09,864	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:25:09,865	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

Summary: In these notes, we explore fundamental concepts of oscillations and waves, beginning with simple harmonic motion (SHM) where a mass on a spring undergoes periodic motion characterized by its period (the time to complete one cycle) and amplitude (the maximum displacement), governed by the equation x(t)=A sin(ωt+φ) with angular frequency ω=√(k/m). When damping is introduced (e.g., via a frictional or resistive force b), the motion satisfies mu''+bu'+ku=0, leading to under-damped, critically damped, or over-damped behaviors. Moving to waves, we distinguish transverse waves (such as a string moving up and down) and discuss key quantities like frequency, wavelength (λ), wave number k=2π/λ, and wave speed c=λ/T. Superposition of waves gives rise to phenomena such as interference and beats (when two slightly different frequencies combine). For waves on a string, tension and linear mass density determine wave speed. Pendulum motion under small angles approximates SHM as well. Energy i

2025-01-12 08:25:42,273	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:25:42,274	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

2025-01-12 08:26:00,330	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in C:\Users\aurel\AppData\Local\Temp\ray\session_2025-01-12_08-07-40_104493_21200\logs\ray-data
2025-01-12 08:26:00,331	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> ActorPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap)]


Running 0: 0.00 row [00:00, ? row/s]

- ReadBinary->SplitBlocks(28) 1: 0.00 row [00:00, ? row/s]

- Map(BinaryScan._to_document)->MapBatches(BaseMapTransformCallable___wrap) 2: 0.00 row [00:00, ? row/s]

Summary: Capacitance depends solely on the geometry of the capacitor (plate area A, plate separation d) and the dielectric constant K, rather than on the charge Q or voltage V (as given by C = Kε₀A/d). To determine a capacitor’s capacitance, one can assume it carries a charge Q, use Gauss’s law to find the electric field E between the plates, then integrate to find the potential difference and compute C = Q/ΔV. When a charged capacitor is isolated (disconnected from a battery), its charge remains fixed, but its voltage can change if a dielectric is inserted; conversely, when connected to a battery, the voltage stays the same, and the charge can vary. In circuits, capacitors and resistors are combined in series and parallel arrangements, with the total (equivalent) capacitance or resistance computed differently in each case (for instance, series capacitors share the same charge, whereas parallel capacitors share the same voltage). Kirchhoff’s rules underpin circuit analysis: the node (j

In [12]:
print(result)

[([-0.022719820961356163, -0.01479256246238947, -0.01700621284544468, 0.0004949862486682832, -0.0008492823108099401, 0.0017200654838234186, -0.022734778001904488, 0.015331017784774303, -0.05360621213912964, 0.02523261122405529, 0.027206948027014732, 0.039965346455574036, -0.012018022127449512, -0.01779893785715103, 0.00261375168338418, 0.03281585872173309, -0.0394568033516407, -0.010709277354180813, 0.008727462030947208, -0.0008605001494288445, 0.013349203392863274, -0.009206089191138744, 0.018756192177534103, 0.03577736020088196, -0.00682417256757617, 0.026847977191209793, 0.02303391881287098, -0.001276961644180119, 0.02316853404045105, 0.013880180194973946, 0.045529384166002274, 0.017978422343730927, -0.021448468789458275, -0.023377932608127594, -0.0033410401083528996, -0.018890805542469025, 0.025456968694925308, 0.008428320288658142, 0.012616305612027645, 0.031709033995866776, -0.013723130337893963, -0.012003065086901188, -0.03018340840935707, 0.015974173322319984, -0.01466542761772

In [13]:
import json

with open('result.json', 'w') as f:
    json.dump(result, f, indent=4)