In [2]:
#Summarization of multiple pdfs

from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain import OpenAI , PromptTemplate 
import glob 

In [3]:
from dotenv import load_dotenv , find_dotenv
import os
# Locate the .env file once and reuse the result; the original searched the
# directory tree twice and never used the first result.
dotenv_path = find_dotenv()

load_dotenv(dotenv_path)
api_key = os.getenv("OPENAI_API_KEY")

# An empty value (e.g. a bare `OPENAI_API_KEY=` line) is as unusable as a
# missing one, so reject both up front instead of failing later at call time.
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")


In [25]:
# Module-level LLM instance; summarize_pdfs_from_folder (defined below) reads
# this global name when it builds its summarize chain.
llm = OpenAI(temperature=0.2)
def summarize_pdfs_from_folder(pdfs_folder):
    """Summarize every ``*.pdf`` file located directly inside ``pdfs_folder``.

    Each PDF is loaded and split with ``PyPDFLoader`` and summarized with a
    "map_reduce" summarize chain built on the module-level ``llm``. Every
    summary is printed as it is produced and also collected for the caller.

    Args:
        pdfs_folder: Path of the folder to scan (not recursive).

    Returns:
        A list with one summary per PDF, ordered by sorted file path.
        An empty list when the folder contains no PDFs.
    """
    # Escape the folder so glob metacharacters in its name (e.g. "[") are
    # matched literally, and sort because glob's own order is arbitrary.
    pdf_files = sorted(glob.glob(glob.escape(pdfs_folder) + "/*.pdf"))
    if not pdf_files:
        return []

    # The chain does not depend on the file being summarized, so build it once
    # instead of once per PDF.
    chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)

    summaries = []
    for pdf_file in pdf_files:
        docs = PyPDFLoader(pdf_file).load_and_split()
        summary = chain.run(docs)
        print("Summary for:" , pdf_file)
        print(summary)
        print("\n")
        summaries.append(summary)

    return summaries

In [26]:

# Folder holding the PDFs to summarize; the call below prints each summary as
# it is produced, and the collected list is printed once at the end.
pdfs_folder_path = "C://Users//Admin//Documents//gen_ai_training//pdfs"
summaries = summarize_pdfs_from_folder(pdfs_folder_path)
print(summaries)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"Large language models (LLMs) have revolutionized the field of natural language
processing (NLP) by demonstrating unprecedented capabilities in understand-
ing and generating human-like text. These models, such as OpenAI’s GPT-3
and GPT-4, are built using deep learning techniques, specifically transformers,
which allow them to process and analyze vast amounts of text data. By training
on diverse datasets comprising billions of words, LLMs can generate coherent
and contextually relevant responses, making them useful for a wide range of ap-
plications, from chatbots and virtual assistants to content creation and language
translation.
One of the key strengths of large language models is their ability to perform
zero-shot and few-shot learning. This means that they can generalize from a
small number of example

In [27]:
# Save all the summaries into one text file, each followed by three newlines.
# The encoding is set explicitly: without it open() uses the platform default
# (often cp1252 on Windows), which can raise UnicodeEncodeError on characters
# the model may emit.
with open("summaries.txt", "w", encoding="utf-8") as f:
    for summary in summaries:
        f.write(summary + "\n" * 3)

### Querying PDF

In [28]:
from langchain.indexes import VectorstoreIndexCreator 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings # importing embedding models 

# Define the embedding model; it is passed to VectorstoreIndexCreator below.

embedding_model = OpenAIEmbeddings()

In [29]:
# Load every PDF in the folder once.
loader  = PyPDFDirectoryLoader("C://Users//Admin//Documents//gen_ai_training//pdfs")

docs = loader.load()

# Build the index from the documents already loaded above. Calling
# from_loaders([loader]) here would invoke loader.load() a second time and
# parse every PDF again while leaving `docs` unused.
index = VectorstoreIndexCreator(embedding=embedding_model).from_documents(docs)



In [30]:
# Question to ask against the indexed PDFs. The query text is sent to both the
# embedding model (for retrieval) and the LLM, so the "pf" typo is fixed.
llm = OpenAI(temperature= 0.2)
query = "what is the main goal of sports analytics"

response = index.query(query, llm = llm)
print(response)

 The main goal of sports analytics is to use data and advanced statistical techniques to extract valuable insights and improve performance in sports.
