In [66]:
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate
import glob
import os
from IPython.display import HTML

In [69]:
from dotenv import load_dotenv


# Load variables from a local .env file (if one exists) BEFORE reading them.
# load_dotenv() does not override variables that are already set in the
# process environment, so real environment variables still take precedence.
load_dotenv()

# Azure OpenAI settings used by the rest of the notebook. Reading them with
# os.environ[...] makes a missing setting fail immediately with a KeyError
# naming the variable, instead of relying on names left over in the kernel.
OPENAI_API_TYPE = os.environ["OPENAI_API_TYPE"]
OPENAI_API_VERSION = os.environ["OPENAI_API_VERSION"]
OPENAI_API_BASE = os.environ["OPENAI_API_BASE"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
OPENAI_DEPLOYMENT_NAME = os.environ["OPENAI_DEPLOYMENT_NAME"]

# The endpoint is exposed under a second variable name, mirroring OPENAI_API_BASE.
os.environ["AZURE_OPENAI_ENDPOINT"] = OPENAI_API_BASE

False

In [71]:
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage

In [73]:
# Test the model connection.
# Build the Azure chat client used throughout the notebook; only the API
# version and the deployment name are passed explicitly here.
# NOTE(review): the remaining connection settings presumably come from the
# OPENAI_* / AZURE_* environment variables set in the config cell — confirm.
model = AzureChatOpenAI(
    openai_api_version=OPENAI_API_VERSION,
    deployment_name=OPENAI_DEPLOYMENT_NAME,
)
  

In [74]:
# Smoke test: send one human message to the model.
message = HumanMessage(
    content="Translate this sentence from English to French. I love programming."
)
# The model is called with a list of messages; the bare call is the last
# expression of the cell, so its return value is displayed as the cell output.
model([message])



AIMessage(content="J'aime la programmation.")

In [76]:
def color_chunks(text: str, chunk_size: int, overlap_size: int) -> str:
    """Render ``text`` as HTML with fixed-size, overlapping chunks highlighted.

    The text is cut into windows of ``chunk_size`` characters that advance by
    ``chunk_size - overlap_size``. Every window is emitted in full: the part it
    shares with the previous window is wrapped in a grey ``<mark>``, the rest in
    a colour taken cyclically from a fixed palette. Overlapping text therefore
    appears twice in the output (once per chunk that contains it).

    Args:
        text: The text to visualise. It is HTML-escaped before being embedded.
        chunk_size: Window length in characters; must be positive.
        overlap_size: Characters shared by consecutive windows; must satisfy
            ``0 <= overlap_size < chunk_size``.

    Returns:
        An HTML string made of ``<mark>`` elements ("" for empty ``text``).

    Raises:
        ValueError: If the sizes would make the window fail to advance.
    """
    import html  # local import keeps this cell self-contained

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    overlap_color = "#808080"
    # Palette cycled through for the non-overlapping part of each chunk.
    chunk_colors = ["#a8d08d", "#c6dbef", "#e6550d", "#fd8d3c", "#fdae6b", "#fdd0a2"]

    def mark(color: str, fragment: str) -> str:
        # Escape the fragment so characters such as "<" or "&" cannot break the markup.
        return f'<mark style="background-color: {color};">{html.escape(fragment)}</mark>'

    step = chunk_size - overlap_size
    pieces = []
    for index, start in enumerate(range(0, len(text), step)):
        chunk = text[start:start + chunk_size]
        # The first chunk has no predecessor, hence no overlapping prefix.
        lead = overlap_size if start > 0 else 0
        if lead:
            pieces.append(mark(overlap_color, chunk[:lead]))
        pieces.append(mark(chunk_colors[index % len(chunk_colors)], chunk[lead:]))
        # Stop once a window reaches the end of the text; a further window
        # would consist only of already-shown overlap plus an empty chunk.
        if start + chunk_size >= len(text):
            break

    return "".join(pieces)

In [77]:
def color_chunks_specific(docs):
    """Render already-split chunks as HTML, one highlighted ``<mark>`` per chunk.

    Args:
        docs: Iterable of chunks. An item exposing a ``page_content`` attribute
            (as the loader's documents do) is rendered through that attribute;
            any other item is rendered through ``str()``.

    Returns:
        The concatenated HTML string; colours cycle through a fixed palette so
        neighbouring chunks are visually distinguishable.
    """
    import html  # local import keeps this cell self-contained

    chunk_colors = ["#a8d08d", "#c6dbef", "#e6550d", "#fd8d3c", "#fdae6b", "#fdd0a2"]

    pieces = []
    for index, chunk in enumerate(docs):
        # Show the chunk's text rather than the object's repr (which would
        # also include metadata), and escape it so it cannot break the markup.
        body = html.escape(str(getattr(chunk, "page_content", chunk)))
        color = chunk_colors[index % len(chunk_colors)]
        pieces.append(f'<mark style="background-color: {color};">{body}</mark>')

    return "".join(pieces)



In [79]:
# Chunk each PDF with PyPDFLoader.load_and_split() and summarize it with a "stuff" chain

def summarize_pdfs_from_folder(pdfs_folder):
    """Summarize every ``*.pdf`` directly inside ``pdfs_folder`` with a stuff chain.

    For each PDF the function splits it with ``PyPDFLoader.load_and_split()``,
    prints the chunk count, displays the chunks colour-coded as HTML, runs the
    summarization chain over the chunks and prints the resulting summary.

    Relies on ``OPENAI_API_KEY``, ``OPENAI_DEPLOYMENT_NAME`` and ``MODEL_NAME``
    being defined in the notebook namespace, and on the notebook-provided
    ``display`` function.

    Args:
        pdfs_folder: Path of the folder to scan (not recursive).

    Returns:
        A list with one summary per PDF, in sorted file-name order; an empty
        list when the folder contains no PDF.
    """
    # Sorted so the processing order (and the order of the returned summaries)
    # is reproducible across runs and machines.
    pdf_files = sorted(glob.glob(os.path.join(pdfs_folder, "*.pdf")))
    if not pdf_files:
        return []

    # The client and chain do not depend on the file, so build them once
    # instead of once per PDF.
    chain = load_summarize_chain(
        AzureChatOpenAI(
            openai_api_key=OPENAI_API_KEY,
            deployment_name=OPENAI_DEPLOYMENT_NAME,
            model_name=MODEL_NAME,
        ),
        chain_type="stuff",
    )

    summaries = []
    for pdf_file in pdf_files:
        loader = PyPDFLoader(pdf_file)
        docs = loader.load_and_split()

        # Print the number of chunks
        print(f"Number of Chunks for {pdf_file}: {len(docs)}")

        # Colorize and show only the chunks obtained from load_and_split()
        colored_text = color_chunks_specific(docs)
        display(HTML("Colored Chunks:<br>" + colored_text))

        # One coloured span is produced per chunk, so this equals len(docs).
        num_colored_chunks = len(docs)
        print(f"Number of Colored Chunks for {pdf_file}: {num_colored_chunks}")

        summary = chain.run(docs)
        print("Summary for: ", pdf_file)
        print(summary)
        print("\n")
        summaries.append(summary)

    return summaries

In [82]:
#Sample function 

def pdfLoader(pdfs_folder):
    """Load and split every ``*.pdf`` directly inside ``pdfs_folder``.

    Args:
        pdfs_folder: Path of the folder to scan (not recursive).

    Returns:
        A single list with the chunks of all PDFs, files taken in sorted
        order. The list is empty when the folder contains no PDF, so callers
        can always iterate over the result.
    """
    docs = []
    for pdf_file in sorted(glob.glob(os.path.join(pdfs_folder, "*.pdf"))):
        # Accumulate instead of returning inside the loop, which would drop
        # every PDF after the first one.
        docs.extend(PyPDFLoader(pdf_file).load_and_split())
    return docs

In [96]:
# Sample code: gather the sample document's text so its tokens can be counted.
docs = pdfLoader("/workspaces/info-assist/docs/sample_doc")
# Concatenate the content of every chunk, then strip tab characters.
text = "".join(doc.page_content for doc in docs).replace('\t', '')

In [84]:
# Count the tokens of the concatenated document text via the chat model's helper.
num_tokens=model.get_num_tokens(text)
# Bare expression so the notebook displays the count as the cell output.
num_tokens

16594

In [87]:
summaries = summarize_pdfs_from_folder("/workspaces/info-assist/docs/sample_doc")
# This runs the stuff-chain summarization on the sample folder, and it works.

# Observed for the sample PDF: 16 pages, 29 chunks, 29 colored chunks.

Number of Chunks for /workspaces/info-assist/docs/sample_doc/China, climate change and the energy transition.pdf: 29


Number of Colored Chunks for /workspaces/info-assist/docs/sample_doc/China, climate change and the energy transition.pdf: 29
Summary for:  /workspaces/info-assist/docs/sample_doc/China, climate change and the energy transition.pdf
This report examines China's significant shift towards renewable energy amidst global efforts to transition from fossil fuels to low-carbon energy sources due to climate change and energy security concerns. China's actions present a mix of impressive strides and daunting challenges in the energy transformation landscape. As the largest coal producer, consumer, and CO2 emitter, China's management of its energy transition will have substantial domestic and global impacts on social, economic, and environmental fronts.

China has committed to peaking carbon emissions by 2030 and achieving carbon neutrality by 2060. Its energy policy has shifted to include renewables such as wind and solar, alongside traditional coal, oil, and natural gas. Technology innovation is

In [None]:
#Map Reduce Chain

In [95]:
# Start of the map-reduce experiments: a different chunking method (RecursiveCharacterTextSplitter) and several prompt variations
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [99]:
# Splitter configured with chunk_size=3000 and chunk_overlap=300; it may only
# split on blank lines or single newlines.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=300,
    separators=["\n\n", "\n"],
)

# Re-chunk the concatenated document text; this rebinds `docs`.
docs = text_splitter.create_documents([text])

In [101]:
# Number of chunks produced by the text splitter.
num_docs = len(docs)

# Token count of the first chunk only, as a quick size check.
num_tokens_first_doc = model.get_num_tokens(docs[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 28 documents and the first one has 544 tokens


In [102]:
# Map-reduce summarization chain built without custom prompts.
# NOTE: a later cell rebinds `summary_chain` to a variant with custom prompts.
summary_chain = load_summarize_chain(llm=model, chain_type='map_reduce',
#                                      verbose=True # Set verbose=True if you want to see the prompts being used
                                    )

In [None]:
#Bullet Summary

In [104]:
# Prompt used as the map prompt of the bullet-summary chain; the {text}
# placeholder is the template's single input variable.
map_prompt = """
Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [105]:
# Prompt used as the combine prompt of the bullet-summary chain; it asks for
# the final answer as bullet points. {text} is the single input variable.
combine_prompt = """
Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{text}```
BULLET POINT SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [107]:
# Map-reduce chain for the bullet-point summary, wired with the custom map and
# combine prompt templates. This rebinds `summary_chain`.
summary_chain = load_summarize_chain(llm=model,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
#                                      verbose=True
                                    )

In [109]:
# Run the bullet-summary chain over the current `docs` and print the result.
output = summary_chain.run(docs)
print (output)

- Xu Yi-chong discussed China's role in the energy transition, emphasizing the move from fossil fuels to low-carbon energy due to environmental and security reasons.
- China faces challenges as the largest coal producer and consumer, top CO2 emitter, and with the highest population energy demands.
- China aims to peak emissions by 2030 and achieve carbon neutrality by 2060, necessitating wide-ranging economic reforms.
- The main CO2 sources are electricity/heat production, industry, and transport, mostly from coal combustion.
- Energy-intensive materials production, like steel and cement using coal, leads to high emissions intensity in China.
- The electricity industry in China is working to peak emissions by 2028 and integrate more low-carbon sources to satisfy increasing demand.
- China's 2050 energy strategy includes a diverse mix with reduced electricity sector emissions.
- Global fossil fuel demand is expected to peak in the mid-2020s, but China may see short-term rises in coal us

In [None]:
#Theme Summary

In [114]:
# Map prompt of the theme-summary chain: asks for the main themes of each
# chunk. {text} is the template's single input variable.
map_prompt1 = """
The following is a set of documents:

"{text}"

Based on this list of docs, please identify the main themes 

Helpful Answer:
"""
# Build the template from map_prompt1 (not from the bullet-summary
# `map_prompt`), otherwise the theme prompt above is never used.
map_prompt_template1 = PromptTemplate(template=map_prompt1, input_variables=["text"])




In [118]:
# Combine prompt of the theme-summary chain: distills the per-chunk results
# into one consolidated list of themes. The placeholder must be {text} so it
# matches the declared input variable and the variable name the summarize
# chain fills in its combine step.
combine_prompt1 = """
The following is set of summaries:
{text}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:
"""
# Build the template from combine_prompt1 (not from the bullet-summary
# `combine_prompt`), otherwise the theme prompt above is never used.
combine_prompt_template1 = PromptTemplate(template=combine_prompt1, input_variables=["text"])

In [119]:
# Map-reduce chain for the theme summary, wired with the theme-specific map
# and combine prompt templates.
summary_chain1 = load_summarize_chain(llm=model,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template1,
                                     combine_prompt=combine_prompt_template1,
#                                      verbose=True
                                    )

In [120]:
# Run the theme-summary chain over the current `docs` and print the result.
output1 = summary_chain1.run(docs)
print (output1)

- Energy Transition: China is shifting from fossil fuels to low-carbon energy sources to address climate change and ensure energy security.
- Climate Change: China aims to mitigate climate-related disasters through its energy transition.
- Geostrategic Change: Renewable energy is reshaping international relations and power dynamics.
- Technological Innovation: Technology plays a crucial role in China's energy transformation and strategic positioning.
- China's Challenges: The country must balance energy security, economic growth, and environmental sustainability.
- Renewable Energy Policies: China is integrating renewables into its energy strategy, alongside other nations.
- Future of Conventional Energy: Oil and natural gas will continue to play a role in China's energy mix.
- Electricity and Tech Advancements: China is making progress in its electricity sector and energy technologies.
- Socioeconomic Changes: Energy transformation requires fundamental changes in the economy and socie