## Summarizing PDFs with LangChain

In [None]:
# Dependencies used by this notebook. Uncomment and run once on a fresh
# environment. The `dotenv` module imported later is installed under the
# distribution name `python-dotenv`.
# !pip install langchain
# !pip install pypdf
# !pip install openai
# !pip install python-dotenv
# !pip install tiktoken
# !pip install gradio

In [None]:
import os
import openai
import sys
sys.path.append('../..')

!pip install python-dotenv

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
from openai import OpenAI

# Build the OpenAI client from the key held in the environment. Indexing
# os.environ directly means a missing OPENAI_API_KEY fails here with a KeyError.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

In [None]:
# Factory that builds the summarization chains used throughout the notebook.
from langchain.chains.summarize import load_summarize_chain

# PDF loader and chat-model wrapper.
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI

# Used to wrap long summaries to a fixed width before printing.
import textwrap

# Hide DeprecationWarning messages for the rest of the session
# (presumably the ones emitted by the legacy langchain import paths above - confirm).
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Chat model shared by the summarization cells below; temperature is pinned to 0.0.
llm = ChatOpenAI(temperature=0.0)

## URLs

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Load the blog post as LangChain documents.
loader = WebBaseLoader(
    "https://lilianweng.github.io/posts/2023-06-23-agent/"
)
docs = loader.load()

# "stuff" chain type: every loaded document is placed into a single prompt.
chain = load_summarize_chain(llm, chain_type="stuff")

# Last expression of the cell, so the summary string is displayed as its output.
chain.run(docs)


'The article discusses the concept of building autonomous agents powered by Large Language Models (LLMs). It covers components such as planning, memory, and tool use, along with case studies and proof-of-concept examples. Challenges include the finite context length, reliability of the natural language interface, and long-term planning difficulties. The article also provides references for further reading.'

## PDF

In [None]:
!curl -o paper1.pdf https://arxiv.org/pdf/2402.15061.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  722k  100  722k    0     0  2285k      0 --:--:-- --:--:-- --:--:-- 2287k


In [None]:
# Load the downloaded paper; load_and_split() loads and chunks it in one call.
pdf_path = "./paper1.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load_and_split()

# map_reduce chain type: the chunks are summarized first and the partial
# summaries are then combined into the final one.
summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
summary = summarize_chain.run(docs)

In [None]:
# Wrap every line of the summary on its own. With replace_whitespace=False,
# textwrap keeps embedded newlines but does not restart its width count at
# them, so wrapping a multi-line summary in one call breaks lines at odd places.
# A single-line summary is wrapped exactly as before.
wrapped_text = "\n".join(
    textwrap.fill(line, width=100,
                  break_long_words=False,
                  replace_whitespace=False)
    for line in summary.splitlines()
)
print(wrapped_text)

The paper discusses challenges faced by Large Language Models (LLMs) in domain-specific Machine
Translation (MT) and proposes a prompt-oriented fine-tuning method called LlamaIT to address these
challenges. The method involves constructing a task-specific mix-domain dataset, fine-tuning the LLM
with LoRA, and using zero-shot prompting with instructions to adapt MT tasks to the target domain at
inference time. Experimental results show that LlamaIT significantly enhances domain-specific MT
capabilities of LLMs while preserving their zero-shot MT capabilities. The study also compares
LlamaIT with in-context learning and fine-tuning methods used in current LLM-based MT systems,
highlighting the benefits of prompt-oriented fine-tuning for domain-specific translation tasks.


For summarization tasks, we need three main tools:


*   Document loaders
*   Text splitters
*   Chain types (Stuff, Map-Reduce, Refine, Map-Rerank)



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Reload the paper unsplit so the chunking below is fully under our control.
pdf_path = "./paper1.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# `separators` takes a list of candidate separators. A bare string is iterated
# character by character, which only happens to work for a one-character value.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n"],
    chunk_size=500,       # target chunk length, measured by length_function
    chunk_overlap=100,    # amount shared between neighbouring chunks
    length_function=len
)

# `docs` now holds the chunks; later cells summarize these chunks.
docs = text_splitter.split_documents(docs)

summarize_chain = load_summarize_chain(llm, chain_type = "map_reduce")

summarize_chain.run(docs)

## Personalise Your Summarization (Using a Prompt Template)

In [None]:
# Import from langchain.prompts; importing PromptTemplate from the langchain
# root module is a deprecated path.
from langchain.prompts import PromptTemplate

In [None]:
# Prompt used for both steps of the map_reduce chain. It is built from adjacent
# string literals so that no stray quote characters, "+" signs or indentation
# whitespace end up in the text sent to the model.
custom_prompt = (
    "Summarize precisely the following  paper into a set of 5 bullets points:\n\n"
    "paper : {text}"
)

prompt = PromptTemplate(template = custom_prompt, input_variables = ["text"])

# The same template is applied to each chunk (map_prompt) and to the combined
# partial summaries (combine_prompt).
summarize_chain = load_summarize_chain(llm, chain_type = "map_reduce",
                                       map_prompt = prompt, combine_prompt = prompt)
summary = summarize_chain.run(docs)

In [None]:
# Wrap every line of the summary on its own. The bullet-point summary contains
# newlines; with replace_whitespace=False, textwrap keeps them but does not
# restart its width count, so wrapping the whole text in one call breaks the
# bullets at odd places.
wrapped_text = "\n".join(
    textwrap.fill(line, width=100,
                  break_long_words=False,
                  replace_whitespace=False)
    for line in summary.splitlines()
)
print(wrapped_text)

- Large language models have potential in domain-specific machine translation but face challenges
such as sensitivity to input examples and over-generation.
- Fine-tuning LLMs on specific domains
may improve domain-specific MT performance.
- Research is needed to fully utilize LLMs in domain-
specific machine translation and address challenges.
- ChatGPT 3.5 performs best among models tested
in domain-specific MT tasks.
- LlamaIT method combines fine-tuning and in-context learning for
accurate translation of domain-specific terms.


## Bonus: Summarizer App via Gradio

In [None]:
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI


# Define the summarization function for URLs
def summarize_url(url):
    """Summarize the web page at ``url`` with a "stuff" summarization chain.

    Args:
        url: Address of the page to load. Surrounding whitespace (for example
            a trailing newline from the multi-line textbox) is ignored.

    Returns:
        The summary returned by the chain's ``run`` call.

    Raises:
        ValueError: If ``url`` is empty or contains only whitespace.
    """
    # Normalise the textbox value and fail early with a clear message instead
    # of handing a blank address to the loader.
    url = (url or "").strip()
    if not url:
        raise ValueError("Please provide a URL to summarize.")

    loader = WebBaseLoader(url)
    docs = loader.load()
    llm = ChatOpenAI(temperature=0)
    summarize_chain = load_summarize_chain(llm, chain_type="stuff")
    summary = summarize_chain.run(docs)

    return summary


# Gradio interface

# One textbox in, one textbox out, wired to summarize_url.
url_input = gr.Textbox(label="URL Text to summarize", lines=2)
summary_output = gr.Textbox(label="Summary", lines=3)

interface_gradio = gr.Interface(
    fn=summarize_url,
    inputs=[url_input],
    outputs=[summary_output],
    title="LLM Summarizer",
    description="Paste  your URL to get a summary.",
)

# share=True makes the app reachable through a public URL, so anyone holding
# the link can trigger summaries with this notebook's API key.
interface_gradio.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0e073c3d2b80fb768a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI


# # Define the summarization function for PDFs
# def summarize_pdf(pdf_file):

#     loader = PyPDFLoader(pdf_file)
#     docs = loader.load()


#     llm = ChatOpenAI(temperature = 0.0)


#     text_splitter = RecursiveCharacterTextSplitter(
#         separators="\n",
#         chunk_size=500,
#         chunk_overlap=150,
#         length_function=len
#     )
#     docs = text_splitter.split_documents(docs)


#     summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
#     summary = summarize_chain.run(docs)

#     return summary

# Define the summarization function for URLs
def summarize_url(url):
    """Summarize the web page at ``url`` using the gpt-3.5-turbo-1106 model.

    Args:
        url: Address of the page to load. Surrounding whitespace (for example
            a trailing newline from the multi-line textbox) is ignored.

    Returns:
        The summary returned by the chain's ``run`` call.

    Raises:
        ValueError: If ``url`` is empty or contains only whitespace.
    """
    # Normalise the textbox value and fail early with a clear message instead
    # of handing a blank address to the loader.
    url = (url or "").strip()
    if not url:
        raise ValueError("Please provide a URL to summarize.")

    loader = WebBaseLoader(url)
    docs = loader.load()

    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106")

    # "stuff" chain type: every loaded document is placed into a single prompt.
    summarize_chain = load_summarize_chain(llm, chain_type="stuff")
    summary = summarize_chain.run(docs)

    return summary

# # Define the main function that the Gradio interface will use
# def summarize(input_type, input_data):
#     if input_type == "PDF":
#         return summarize_pdf(input_data)
#     elif input_type == "URL":
#         return summarize_url(input_data)
#     else:
#         return "Unsupported input type."

# Gradio interface
# demo = gr.Interface(
#     fn=summarize,
#     inputs=[
#         gr.Radio(["PDF", "URL"], label="Select Input Type", type="index"),
#         gr.Data(file=True, type="file", label="Upload PDF or Enter URL")
#         gr.Textbox(lines=5, placeholder="Enter PDF path, URL, or Notion link here...")
#     ],
#     outputs='text',
#     title="LLM Summarizer",
#     description="Upload a PDF or provide a URL to get a summary."
# )


# The wrapped function loads a web page, so the input is labelled as a URL
# (in line with the description) rather than as free text to summarize.
interface_gradio = gr.Interface(
    fn=summarize_url,
    inputs=[gr.Textbox(label="URL to summarize", lines=6)],
    outputs=[gr.Textbox(label="Summary", lines=3)],
    title="LLM Summarizer",
    description="Paste  your URL to get a summary.",
)

# share=True makes the app reachable through a public URL; debug=True surfaces
# errors in the notebook output.
interface_gradio.launch(share=True, debug=True)
