In [1]:
!pip install -qU \
    langchain \
    click \
    openai  \
    datasets \
    tiktoken \
    pinecone-client==2.2.1

%pip install --upgrade typing_extensions
!pip install --upgrade fastapi python-multipart uvicorn

%pip install kaleido
%pip install --upgrade gradio



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Report the version of the interpreter that is actually running this kernel.
# (`!python --version` would report whichever `python` is first on the shell
# PATH, which is not necessarily the kernel's interpreter.)
import platform

print("Python", platform.python_version())


Python 3.10.11


In [3]:
import os
from langchain.chat_models import ChatOpenAI
#from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.schema.output_parser import StrOutputParser

import json
import pinecone
import gradio as gr
import time


# Credentials and Pinecone environment come from the process environment;
# each value is None when the corresponding variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')      # OpenAI API key
pinecone_api_key = os.environ.get('PINECONE_API_KEY')  # Pinecone API key
pinecone_env = os.environ.get('PINECONE_ENV')          # Pinecone environment name


  from tqdm.autonotebook import tqdm


In [4]:
# Sample enquiry kept for manual testing only. The spelling mistakes are part
# of the sample text and are reproduced as-is.
sample_query = (
    "We're finally planning that big family trip to East Asia. "
    "I'm just wondering about a few things, Visas etc? "
    "do we need shots or evidence we got the covid booster?? "
    "anything you think we need to know culturally speaking? "
    "I also dont know if there are limits min or max on cash? "
    "Do they ahve contactless or do i need to change before i go"
)


# JSON Question Extraction & Step Back Prompt
#
# One-shot prompt that asks the model to return a JSON object whose values are
# the explicit and implicit questions found in the user's text, each followed by
# a sentence describing the information needed to answer it.
# - `{user_query}` is the only template variable; the literal braces of the
#   example JSON are doubled (`{{` / `}}`) so the template leaves them alone.
# - The user text in the example is deliberately informal and keeps its typos.
# - Example "3" refers to working in the USA (it previously said "UK", which
#   contradicted the California scenario the example is answering).
combo_question_extract_and_step_back_prompt = PromptTemplate.from_template("""You are a helpful assistant who analyses text enquiries from users and extracts all of the questions contained in that text.  You work for the UK FCDO Government Department. You extract all explicit questions but also all implicit questions.  You return using JSON. Each item is one of the questions you identified from the user's original text.  You notice implied questions as well as explicit questions.  You also write a sentence for each question describing the type of information required to answer the question. If the user asks about a continent, or a group of countries (e.g. Europe or South America) then you list countries explicitly in your question (Germany / France etc or Brazil / Argentina etc - except you don't do "etc" you list up to 5 valid countries)
For example:
User: "I'm been offered a job in California, but i'm a bit confused by all the paperwork – what do I need exactly? my husband needs to came with me as we can't afford two places ? Are our estas fine for this? We can't leave our cat behind and need advice there too. Should I be listed for the cat or my husband?"
System:
{{
  "1": "What are the entry requirements for a British Citizen to enter the USA for work. To answer the question about entry requirements for a British citizen to work in the USA, information about current US visa policies and work permit requirements for UK nationals is needed",
  "2": "What are the routes of legal admission to the USA that are available to a spouse of a British citizen who is working in the USA.  Information about US immigration laws and visa options for spouses of British citizens working in the USA is required to address the routes of legal admission. ",
  "3": "What are the limitations of the ESTA system for british citizens who wish to work in the USA. To address the limitations of the ESTA system for British citizens working in the USA, details about the ESTA's applicability and restrictions for UK nationals in work contexts are needed.",
  "4": "What are the limitations of the ESTA system for British Citizens wishing to stay in the USA with their spouse for a period of time. Understanding the ESTA system's limitations for British citizens staying in the USA with their spouse requires knowledge of ESTA's terms, especially regarding duration of stay and accompanying family members.",
  "5": "what are the rules and regulations around bringing a pet cat from Britain to the USA. Information about US regulations on importing pets, specifically cats, from Britain is necessary to explain the rules and regulations for bringing a pet cat to the USA",
  "6": "Do the rules and regulations around bringing pets to the USA differ based on the type of visa an individual is in possession of? To answer if pet import rules to the USA vary based on visa type, detailed knowledge of US customs and immigration policies concerning pets in relation to different visa categories is needed."
}}

User: {user_query}
System:""")

# Chat model used for question extraction: the GPT-4 Turbo preview model with
# the JSON response format requested through model_kwargs, and temperature 0.
json_llm = ChatOpenAI(
    model='gpt-4-1106-preview',
    temperature=0,
    openai_api_key=openai_api_key,
    model_kwargs={"response_format": {"type": "json_object"}},
)



In [5]:
def convert_docs_to_json(docs):
    """
    Converts a list of question/answer entries into a JSON-formatted string.

    Each entry is expected to be a dictionary with:
      - 'GPT_GENERATED_question' (optional): the question text; when the key is
        missing, 'No question available' is used instead.
      - 'answer': an object whose 'page_content' and 'metadata' attributes are
        copied into the output (a missing attribute is emitted as null).

    Entries that are not dictionary-like, or whose 'answer' is missing or
    falsy, are reported with print() and skipped, so one bad entry does not
    discard the others. Values the json module cannot encode natively are
    written as their str() representation.

    Args:
    docs (list): A list of entries as described above.

    Returns:
    str: A JSON array (indent=4) with one object per usable entry; "[]" when
    there are none; "" when docs is not iterable or serialisation fails.
    """

    try:
        entries = list(docs)
    except TypeError as e:
        # e.g. docs is None: keep the "report and return an empty string" contract.
        print(f"An error occurred: {e}")
        return ""

    json_docs = []
    for doc in entries:
        try:
            # Fall back to a placeholder when the question key is missing.
            question = doc.get('GPT_GENERATED_question', 'No question available')
            answer = doc.get('answer')
        except AttributeError:
            print(f"Skipping malformed document (expected a dict): {doc!r}")
            continue

        if not answer:
            print(f"Missing 'answer' in document: {doc}")
            continue

        json_docs.append({
            'GPT_GENERATED_question': question,
            'answer': {
                'page_content': getattr(answer, 'page_content', None),
                'metadata': getattr(answer, 'metadata', None)
            }
        })

    try:
        # default=str keeps one non-JSON-native value (e.g. a date inside the
        # metadata) from failing the whole serialisation.
        return json.dumps(json_docs, indent=4, default=str)
    except (TypeError, ValueError) as e:
        print(f"An error occurred: {e}")
        return ""


In [6]:
# Connect to Pinecone and expose the existing index as a LangChain vector store.

pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)
# NOTE(review): presumably kept as an early credentials check — confirm.
pinecone.whoami()

index_name = "fcdo-travel"
embeddings = OpenAIEmbeddings()

# The vector store is expected to exist already, so stop here with a clear
# error instead of continuing and failing later on the first index call.
if index_name not in pinecone.list_indexes():
    raise RuntimeError(f"error index not found: '{index_name}'")

# connect to index
pinecone_index = pinecone.Index(index_name)
# view index stats (printed explicitly: only a cell's last expression is displayed)
print(pinecone_index.describe_index_stats())

# now connect the langchain way so we can use their syntax
# note: text_key must be passed because langchain defaults to expecting the text
# under "text", whereas this index stores it under "content_to_embed"
docsearch = Pinecone.from_existing_index(index_name, embeddings, text_key="content_to_embed")


In [7]:
#now let's put it all into a function

def retrieve_snippets(query, k=1):
    """
    Turns a user enquiry into stepped-back questions and retrieves snippets for each.

    The enquiry is run through combo_question_extract_and_step_back_prompt and
    json_llm; the reply is parsed as a JSON object whose values are the
    questions (explicit or implicit) found in the enquiry, each including its
    step-back reasoning. Every question is then used for a similarity search on
    docsearch, and each hit is paired with the question that produced it.

    Args:
    query (str): The user's original free-text enquiry.
    k (int): Number of documents to retrieve per question. Defaults to 1.

    Returns:
    str: The JSON string produced by convert_docs_to_json for the collected
    question/answer pairs.

    Raises:
    json.JSONDecodeError: If the model's reply cannot be parsed as JSON.
    """
    # A single combined extraction + step-back call. An earlier two-stage
    # version with chain-of-thought reasoning was much slower; this call was
    # still observed to take an unpredictable 16-30 seconds.
    chain = combo_question_extract_and_step_back_prompt | json_llm | StrOutputParser()
    raw_reply = chain.invoke({"user_query": query})
    extracted_questions = json.loads(raw_reply)

    # Plain similarity search for now; MMR (docsearch.as_retriever(search_type="mmr"))
    # is a candidate alternative that has not been evaluated yet.
    retrieved_snippets = []
    for question in extracted_questions.values():
        for doc in docsearch.similarity_search(question, k=k):
            retrieved_snippets.append({"GPT_GENERATED_question": question, "answer": doc})

    # Turn the document results into JSON
    return convert_docs_to_json(retrieved_snippets)


In [8]:
#now let's serve this up via a gradio
def smart_help(message, history):
    """Chat callback: reply to `message` with the snippets retrieved for it.

    `history` is part of the callback signature handed to gr.ChatInterface
    but is not used here.
    """
    return retrieve_snippets(message)


# Build the chat UI around the callback and start serving it.
gr.ChatInterface(smart_help).launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [10]:

# Smoke test: run the whole pipeline on one enquiry, then show the returned
# value and its type.
snippets = retrieve_snippets(
    "Hi I’m goign to hurghada on holiday but i’m not sure it’s safe to do so or whether we are allowed to visit"
)
print(snippets)
print(type(snippets))

[
    {
        "GPT_GENERATED_question": "Is it currently safe for British citizens to travel to Hurghada, Egypt? To answer the question about the safety of travel to Hurghada for British citizens, information about the current travel advisories, political climate, and security situation in Egypt is needed.",
        "answer": {
            "page_content": "Country:'Egypt'; Section:'Safety and security'; Content:You should also read FCDO's overall travel advice and regional risks advice. Terrorism There is a high threat of terrorist attack globally affecting UK interests and British nationals, including from groups and individuals who view the UK and British nationals as targets. You should remain vigilant at all times. UK Counter Terrorism Policing has information and advice on staying safe abroad and what to do in the event of a terrorist attack. Find out more about the global threat from terrorism. Terrorism in Egypt Terrorists are very likely to try to carry out attacks in Egypt. 