In [69]:
import os
import pathlib
from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader

os.environ["OPENAI_API_KEY"] = ""

In [None]:
llm = OpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))

In [66]:
paths = list(pathlib.Path("raw_data").glob("**/*.pdf"))

In [68]:
for p in paths:
    path = str(p)
    summaryPath = path.replace(".pdf", ".txt")

    if os.path.exists(summaryPath):
        print(f"Skipping {path}")
        continue

    print(f"Summarizing {path}")

    docLoader = PyPDFLoader(path)
    docs = docLoader.load_and_split()

    prompt_template = """Write a summary of the following with bullet points:


    {text}


    SUMMARY:"""

    PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)
    res = chain.run(docs)

    print(f"Writing summary to {summaryPath}")
    with open(summaryPath, "w") as f:
        f.write(res)
    
    print(f"Done summarizing {path}")
    print()

Skipping raw_data/001 - Module 1/001 - Overview/001 - Preview/001.pdf
Skipping raw_data/001 - Module 1/001 - Overview/001 - Preview/002.pdf
Skipping raw_data/001 - Module 1/001 - Overview/002 - Standpoints/001.pdf
Skipping raw_data/001 - Module 1/001 - Overview/002 - Standpoints/002.pdf
Skipping raw_data/001 - Module 1/002 - Field Study/009 - Moving from Practice to Theory/001 - Unit 9_ Moving from Practice to Theory_ EDU HPL101_ How People Learn.pdf
Summarizing raw_data/001 - Module 1/002 - Field Study/002 - Choose Your Level of Practice(Module 1)/001.pdf
Writing summary to raw_data/001 - Module 1/002 - Field Study/002 - Choose Your Level of Practice(Module 1)/001.txt
Done summarizing raw_data/001 - Module 1/002 - Field Study/002 - Choose Your Level of Practice(Module 1)/001.pdf

Summarizing raw_data/001 - Module 1/002 - Field Study/003 - The Syrian Crisis/003 - 220522_ M1 The Crisis Info_PDF Description.pdf
Writing summary to raw_data/001 - Module 1/002 - Field Study/003 - The Syrian