In [74]:
import langchain.prompts as prompts
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts.chat import HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import io
import requests
from PyPDF2 import PdfReader

In [75]:
os.environ['OPENAI_API_KEY'] = ''
os.environ['PINECONE_API_KEY'] = ''
os.environ['ENVIROMENT_KEY'] = ''

### Set the Vectorstore

In [80]:
def initialise_pinecone():
    PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
    ENVIROMENT_KEY = os.environ['ENVIROMENT_KEY']
    pinecone.init(api_key=PINECONE_API_KEY, environment=ENVIROMENT_KEY)

def create_pinecone_index(pinecone_index_name):
    initialise_pinecone()
    pinecone.create_index(name = pinecone_index_name, dimension = 1536, metric = 'cosine')

def describe_index(pinecone_index_name):
    initialise_pinecone()
    index = pinecone.Index(pinecone_index_name)
    return index.describe_index_stats()

def set_docsearch(pinecone_index_name, namespace, data):
    embeddings = OpenAIEmbeddings(openai_api_key = os.environ['OPENAI_API_KEY'])
    initialise_pinecone()
    if pinecone_index_name in pinecone.list_indexes():
        list_of_namespaces = list(describe_index(pinecone_index_name)['namespaces'].keys())
        if namespace in list_of_namespaces:
            return Pinecone.from_existing_index(index_name=pinecone_index_name, embedding=embeddings, namespace=namespace)
        else:
            index = pinecone.Index(pinecone_index_name)
            p = Pinecone(index = index, embedding_function = embeddings, text_key = 'text')
            return p.add_texts(texts=data, embeddings = embeddings, namespace=namespace, index_name=pinecone_index_name)
            
    else:
        create_pinecone_index(pinecone_index_name)
        return Pinecone.from_texts(data, embedding=embeddings, index_name=pinecone_index_name, namespace = namespace)

### Import pdf and split 

In [46]:
def text_splitter(chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap=chunk_overlap,
        separators=['\n\n', '\n', ' ', '']
    )
    return text_splitter

def parse_online_pdf(url, chunk_size=400, chunk_overlap=20):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Windows; Windows x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36'}
    response = requests.get(url=url, headers=headers, timeout=120)
    on_fly_mem_obj = io.BytesIO(response.content)
    pdfReader = PdfReader(on_fly_mem_obj)
    full_text = ""
    for i, page in enumerate(pdfReader.pages):
        full_text += page.extract_text()
    chuncks = text_splitter(chunk_size, chunk_overlap).split_text(full_text)
    return chuncks 

### Prompt engeneering and chain

In [47]:
qa_prompt = prompts.PromptTemplate(
    input_variables=["question", "context_str", "length"],
    template="""Write an answer ({length}) 
    for the question below solely based on the provided context. 
    If the context provides insufficient information,
    reply 'I cannot answer'. 
    For each sentence in your answer, indicate which sources most support it
    via valid citation markers at the end of sentences, like (Example2012).
    Answer in an unbiased and scholarly tone. Make clear what is your opinion.
    Use Markdown for formatting code or text, and try to use direct quotes to support arguments.\n\n
    {context_str}\n
    Question: {question}\n
    Answer: """,
)

def make_chain(prompt, llm):
    if type(llm) == ChatOpenAI:
        system_message_prompt = SystemMessage(
            content="""You are a scholarly researcher that answers in an unbiased, scholarly tone.
            You sometimes refuse to answer if there is insufficient information.""",
        )
        human_message_prompt = HumanMessagePromptTemplate(prompt=prompt)
        prompt = ChatPromptTemplate.from_messages(
            [system_message_prompt, human_message_prompt]
        )
    return LLMChain(prompt=prompt, llm=llm)

### Initialise the LLM model and query

In [48]:
query = 'Summary'

In [49]:
llm = ChatOpenAI(temperature=0.1, model="gpt-3.5-turbo")
qa_chain = make_chain(prompt=qa_prompt, llm=llm)

In [68]:
url = 'https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1150988/JROC_report_recommendations_and_actions_paper_April_2023.pdf'
pinecone_index_name = 'booktoavatar'
namespace = 'test_1'

In [81]:
data = parse_online_pdf(url)
doc_search = set_docsearch(pinecone_index_name, namespace, data)
docs = doc_search.similarity_search(query, namespace=namespace)
# parse the output
output = qa_chain.run(question=query, context_str=docs, length=50)

TypeError: 'OpenAIEmbeddings' object is not callable

In [55]:
chars=100
for i in range(0, len(output), chars):
    print(output[i:i+chars])

The provided context includes information on the DIMPACT initiative, which aims to help media compan
ies reduce the carbon footprint of their digital value chain by providing a tool that models the env
ironmental impacts of digital products and services. The initiative involves the DIMPACT Expert Advi
sory Panel in developing the methodology, and provides a summary of data input requirements and part
icipant results from modeling runs. The carbon intensity of electricity for end-user devices varies 
by module, and the tool allows users to calculate this within the publishing module by listing the p
roportions of their views per country. There is also information on the standby power and carbon int
ensity of various end-user devices such as smartphones, laptops, and desktops. (DIMPACT Methodology,
 October 2022; Carbon Trust White Paper, BBC White Paper)
