# Install

In [None]:
%pip install azure-identity
%pip install langchain===0.0.200
%pip install openai, tiktoken
%pip install pypdf
%pip install python-dotenv

# Import and Setup

Set up LangChain for summarization.

In [None]:

import os

import openai
from dotenv import load_dotenv
from langchain import LLMChain, OpenAI, PromptTemplate
from langchain.chains.mapreduce import MapReduceChain
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables from a .env file using load_dotenv():
load_dotenv()

# Fail fast with an actionable message instead of carrying a missing key
# forward into the SDK configuration and the LLM constructor.
azure_openai_api_key: str = os.environ.get('AZURE_OPENAI_API_KEY', '')
if not azure_openai_api_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Module-level configuration of the openai SDK for the Azure endpoint.
openai.api_type = "azure"
openai.api_base = "https://verx-corp-ai.openai.azure.com/"
openai.api_version = "2023-03-15-preview"
openai.api_key = azure_openai_api_key

# Azure deployment name and the underlying model it serves.
deployment_id: str = "VERX-CORP-DAVINCI"
model: str = "text-davinci-003"

# Deterministic output (temperature=0) for summarization.
# NOTE(review): AzureOpenAI is imported above but the generic OpenAI wrapper is
# used here with the deployment passed as `deployment_id` — confirm this is
# intended rather than AzureOpenAI(deployment_name=...).
llm = OpenAI(
    temperature=0,
    model=model,
    openai_api_key=azure_openai_api_key,
    deployment_id=deployment_id,
)



# Initialize Text Splitter

In [None]:
# Shared splitter for the loaders below: chunks are limited to 1000 with an
# overlap of 200, both measured with `len` (characters, not tokens).
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)

# Getting chunky with it (txt)
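Use this cell to load and split a plain-text (.txt) document. It assigns `docs`, which the PDF cell below also assigns, so run only the cell that matches your input.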

In [None]:
# Read the whole text file, then split it into chunk documents with the
# shared text_splitter.
# The encoding is given explicitly so decoding does not depend on the
# platform's default (assumes the file is UTF-8 — confirm for other inputs).
with open('state_of_union.txt', encoding='utf-8') as f:
    state_of_union = f.read()
docs = text_splitter.create_documents([state_of_union])

# Getting chunky with it (pdf)
Use this cell to load and split a PDF document. It assigns `docs`, replacing the result of the txt cell above if both are run.

In [None]:
from langchain.document_loaders import PyPDFLoader

# Point the loader at the source PDF (path is relative to the notebook).
loader = PyPDFLoader(
    "../data/Emerging_Tech_Generative AI Code Assistants Are Becoming Essential to Developer Experience_790320_ndx.pdf"
)

# Load the PDF and split it into documents in one step.
docs = loader.load_and_split()

# Echo the resulting documents as the cell's output.
docs

# Summarizing w/Map Reduce

Create the prompts. The map prompt is applied to every chunk; the combine prompt is used to merge all of the chunk summaries into a single summary.

In [None]:
from langchain.chains.summarize import load_summarize_chain
import textwrap

# Customize the prompts. Both templates expose a single `{text}` placeholder.

# Map prompt: asks for a summary of one section of the document; the text of
# the prompt itself caps each section summary at 650 characters.
map_prompt_template =  """You are a research analyst. I will provide you with a section of a document and you will create a summary from it. You will preserve as many details as possible. You will maintain context across the summary. Your section will be combined with the other sections to create summary of the entire document.

Your summary must be no longer than 650 characters long.

Input: {text} """

# Combine prompt: asks for the section summaries to be merged into one output;
# the text of the prompt itself caps the result at 4,000 characters.
combine_prompt_template = """You are a copy editor. Combine the below summaries. The combined output must be less than 4,000 characters long. You must keep the content and context preserved. 

Input: {text} """

Load the prompts and run the summarization. The call below passes `verbose=True` to `load_summarize_chain` so that intermediate detail is shown; remove that argument for quieter output.

In [None]:

# Wrap the raw template strings; each one takes a single "text" variable.
map_prompt = PromptTemplate(
    input_variables=["text"],
    template=map_prompt_template,
)
combine_prompt = PromptTemplate(
    input_variables=["text"],
    template=combine_prompt_template,
)

# Build the map-reduce summarization chain with the custom prompts.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=combine_prompt,
    verbose=True,
)

# Run the chain over the loaded documents and print the summary wrapped to
# 80 columns.
output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=80)
print(wrapped_text)