<a href="https://colab.research.google.com/github/EmicoBinsfinder/EPOCodeFestProject/blob/main/DrillDownV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#@title Configure OpenAI API key

# access your OpenAI API key

# installing llmx first isn't necessary but avoids a confusing error when installing openai
!pip install -q llmx
!pip install -q openai
from openai import OpenAI
import google.generativeai as genai
from google.colab import userdata


# Name of the Colab secret that holds the OpenAI API key.
openai_api_secret_name = 'Test'
## @param {type: "string"}

# Build the OpenAI client from the Colab secret. Each failure mode prints a hint
# for the user and then re-raises the original exception unchanged.
try:
  OPENAI_API_KEY = userdata.get(openai_api_secret_name)
  OpenAIclient = OpenAI(
    api_key=OPENAI_API_KEY
  )
except userdata.SecretNotFoundError:
  print(f'''Secret not found\n\nThis expects you to create a secret named {openai_api_secret_name} in Colab\n\nVisit https://platform.openai.com/api-keys to create an API key\n\nStore that in the secrets section on the left side of the notebook (key icon)\n\nName the secret {openai_api_secret_name}''')
  raise
except userdata.NotebookAccessError:
  # The key is used for OpenAI, so the message must not refer to another provider.
  print(f'''You need to grant this notebook access to the {openai_api_secret_name} secret in order for the notebook to access OpenAI on your behalf.''')
  raise
except Exception:
  # unknown error
  print(f"There was an unknown error. Ensure you have a secret {openai_api_secret_name} stored in Colab and it's a valid key from https://platform.openai.com/api-keys")
  raise

### System Setup

In [6]:
!pip install gradio
!pip install elasticsearch
!pip install langchain



In [7]:
########## IMPORTING REQUIRED PYTHON PACKAGES ##########
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import math
import time
import csv
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string
import gradio
import os
import pprint
from elasticsearch import Elasticsearch
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ElasticsearchChatMessageHistory
from uuid import uuid4
import os, sys
import json, csv

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Elasticsearch connection settings.
# The password for the Elasticsearch user is taken from the
# ELASTICSEARCH_PASSWORD environment variable; if that is unset or empty the
# user is prompted for it, so no credential is stored in the notebook itself.
import getpass

ELASTIC_PASSWORD = os.environ.get('ELASTICSEARCH_PASSWORD') or getpass.getpass(
    'Elasticsearch password: '
)
# Export the value so any later code reading the environment variable still works.
os.environ['ELASTICSEARCH_PASSWORD'] = ELASTIC_PASSWORD
pwd = ELASTIC_PASSWORD

# Endpoint URL of the Elasticsearch cluster (a plain URL, despite the variable name).
CLOUD_ID = "http://AnkarDev-Elasticsearch-1891076460.eu-west-2.elb.amazonaws.com:9200"

# Create the client instance
client = Elasticsearch(
    CLOUD_ID,
    basic_auth=("eogbomo", ELASTIC_PASSWORD),
    verify_certs=False
)

### Gradio App

In [9]:
def loadprevresponses(QueryID):
  """Load the inputs previously saved for ``QueryID``.

  The history lives in ``{QueryID}_responses.json`` in the current working
  directory and is expected to be a JSON object.

  Args:
    QueryID: Identifier used to build the history file name.

  Returns:
    A ``(data, datastr)`` tuple: ``data`` is the dict read from the file and
    ``datastr`` is its values joined with ``', '``. When the file does not
    exist it is created holding an empty object and ``({}, '')`` is returned.
    When the file cannot be parsed as a JSON object, ``'Error loading
    responses'`` is printed and ``({}, '')`` is returned.
  """
  path = f'{QueryID}_responses.json'
  try:
    with open(path, 'r') as file:
      data = json.load(file)
    if not isinstance(data, dict):
      raise ValueError('history file does not contain a JSON object')
    # str() keeps the join from failing if a non-string value was stored.
    datastr = ', '.join(str(value) for value in data.values())
  except FileNotFoundError:
    # First use of this QueryID: start an empty history file.
    data, datastr = {}, ''
    with open(path, 'w') as file:
      json.dump(data, file)
  except ValueError:
    # json.JSONDecodeError is a ValueError subclass; fall back to an empty
    # history and always define both return values.
    print('Error loading responses')
    data, datastr = {}, ''
  return data, datastr

def saveresponse(input, QueryID):
  """Append ``input`` to the saved history for ``QueryID``.

  The existing history is read with ``loadprevresponses``; the new entry is
  stored under the key ``Input<N>`` where N is one more than the number of
  entries already present, and the whole history is written back to
  ``{QueryID}_responses.json``.

  Args:
    input: The text to record.
    QueryID: Identifier used to build the history file name.
  """
  history, _ = loadprevresponses(QueryID)

  history[f'Input{len(history)+1}'] = input
  # 'w' truncates the file first; writing through 'r+' would leave stale
  # trailing bytes whenever the new content is shorter than the old one.
  with open(f'{QueryID}_responses.json', 'w') as file:
    json.dump(history, file)


In [10]:
def DrillDown(input, QueryID):
  """Ask the chat model for an Elasticsearch query matching ``input``.

  The inputs previously saved for ``QueryID`` are appended to ``input`` (so the
  model sees the earlier drill-down steps), and ``input`` itself is then saved
  to that history. The combined text is inserted into a prompt containing
  instructions and two worked examples, and sent to the OpenAI chat
  completions API.

  Args:
    input: The natural language query for this drill-down step.
    QueryID: Identifier of the history that this step belongs to.

  Returns:
    The text content of the first choice in the model's reply. It is returned
    as-is; this function does not check that it is a valid query.
  """
  # Read the history before saving so that it does not already contain `input`.
  _, historystr = loadprevresponses(QueryID)
  saveresponse(input, QueryID)

  # Separate the current query from the history; plain concatenation would
  # fuse the last word of the query with the first word of the history.
  if historystr:
    input = f'{input}, {historystr}'

  prompt = """You are an expert in translating natural language queries about patents into ElasticSearch Queries.
    Given a user input, create an Elasticsearch query enabling the user to return as many relevant patents as possible when querying in Elastic

    input: {input}

    """.format(input=input)

  additional_prompt="""
    Instructions:
    1. Generate Elasticsearch queries based on the provided natural language queries.
    2. Only use fields present in the mapping. If the user is asking about a field that is not in the mapping ignore it.
    3. Ensure that the generated queries follow Elasticsearch's query DSL syntax and structure.
    4. You can correct or reformulate the user's query if it has errors.
    5. Return all fields in your response when applicable.
    6. Make sure that the query only performs full text search when applicable i.e. don't use keyword search
    7. When returning the json portion of the answer, compress the json output removing spaces. Remove any mention of json in the output or triple backtick sand make sure that it's valid.
    8. Ensure that as many aspects of the user input are captired as possible

    Examples of expected behavior:
    Natural Language Query: "What is the title of the most recent Apple patent"
    Expected Elasticsearch Query:
    {
      "size": 1,
      "sort": [
        {
          "publicationDate": {
            "order": "desc"
          }
        }
      ],
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "applicants": "apple"
              }
            }
          ]
        }
      }
    }

    Natural Language Query: "What are the most recent methods to deal with cell group failure?"
    Expected Elasticsearch Query:
    {
      "query": {
        "bool": {
          "must": [
            {
              "bool": {
                "should": [
                  {
                    "match": {
                      "patentTitle": "cell group failure"
                    }
                  },
                  {
                    "match": {
                      "patentAbstract": "cell group failure"
                    }
                  },
                  {
                    "match": {
                      "claims.claimText": "cell group failure"
                    }
                  },
                  {
                    "match": {
                      "patentDescription": "cell group failure"
                    }
                  }
                ]
              }
            }
          ],
          "filter": [
            {
              "range": {
                "publicationDate": {
                  "gte": "now-5y/d"
                }
              }
            }
          ]
        }
      },
      "_source": ["*"]
    }"""
  # The examples are appended after .format() has run, so their literal braces
  # are never interpreted as format fields.
  prompt += additional_prompt

  # NOTE(review): the framing sentence describes a summarisation bot although
  # the prompt asks for query generation; left unchanged, confirm intent.
  completion = OpenAIclient.chat.completions.create(
    model="gpt-4-0125-preview",
    messages=[
      {"role": "user", "content": f'Your function is that of a bot optimised for summarising patent text. Answer the following query as accurately as possible based on your function {prompt}'}
    ]
  )
  response = completion.choices[0].message.content

  return response

In [11]:
# inputs = gradio.Textbox(lines=7, label="Generate Queries for use with Elastic Search, allowing for search refinement")
# outputs = gradio.Textbox(label="Reply")

# gradio.Interface(fn=DrillDown, inputs=inputs, outputs=outputs, title="Patent DrillDown Prototype",
#              theme="compact").launch(share=True, debug=True)

### Load in and prepare test dataset

In [13]:
# Read the test queries from the CSV and keep the 'Query' column as a list.
Queries = pd.read_csv('/content/Valeo_Queries_Testing.csv')
Queries_List = Queries['Query'].tolist()

# Tag every query with a tracking ID so it can be identified later on.
# Each entry has the form '<query text>////Query_<position>'.
ID_Queries_List = [
    query_text + f'////Query_{position}'
    for position, query_text in enumerate(Queries_List)
]

### Define function to simulate drill down workflow

In [14]:
def DrillDownQueryGenerator(Initial_Query):
  """Ask the chat model for follow-up queries to ``Initial_Query``.

  The initial query is embedded in an instruction prompt that requests up to
  ten follow-up queries formatted as a Python list, and the prompt is sent to
  the OpenAI chat completions API.

  Args:
    Initial_Query: The starting query text to generate follow-ups for.

  Returns:
    The text content of the first choice in the model's reply, unparsed.
  """
  instructions = f"""
You are an expert in mimicking the behaviour of a patent professional using an AI powered patent search tool.
Based on the provided initial query delimited by triple backticks marks ```Initial User Query```,
provided a list of up to 10 sequential queries that could both narrow and broaden the scope of the search.
Return the list of queries in the format of a Python list.

When creating the list of queries, strictly follow the list of numbered instructions below:

1. When creating queries that vary the scope of the initial search
2. Ensure that all generated follow up queries are relevant to the initial provided query.
3. When creating a narrowing query, define the scope of the query such that it focusses on a more specific, relevant technology or technical area (example: A car panel made from metal > a car panel made of steel).
4. When creating a broadening query, define the scope of the query such that it focusses on a less specific, relevant technology or technical area (example: patents about batteries cooled using a heat sink > patents about battery cooling).
5. When receiving initial queries that include dates, make sure to have some follow up queries with different dates, but not all (example, Patent with priority date before 2012 > Now look at with priority date before 2009)
6. When receiving initial queries with several aspects, start by generating queries that are likely to broaden the scope of the search.
7. When receiving initial queries that are quite broad and ambiguous, start by generating queries that are likely to narrow the scope of the search.
8. Return only the list of queries in list form, do not return anything outside of the list with limits []
9. Make sure to use the word 'narrow' or 'broaden' in the query depending on if it's meant to broaden or narrow the scope of the search
10. Mix up the frequency and order of broadening and narrowing queries
11. Once you have created the list, double check it to see if the you are returning solely a list, changing the output if it is not just a list.
12. Ensure that you are not returning the initial query

```{Initial_Query}```

Example input: Patents Owned by Valeo

Example output:
[
"Now broaden the search to include patents owned by other major automotive suppliers",
"Now focus specifically on patents owned by Valeo related to automotive lighting systems",
"Now narrow the search to patents owned by Valeo related to autonomous driving technology",
"Now include patents owned by Valeo and its subsidiaries",
"Now broaden the search to include patents owned by Valeo related to climate control systems",
"Now narrow the search to patents owned by Valeo filed within the last 5 years",
"Now include patents owned by other companies in the automotive industry",
"Now narrow the search to patents owned by Valeo related to vehicle safety systems",
"Now broaden the search to include patents owned by Valeo related to automotive electronics",
"Now exclude patents owned by Valeo's competitors"
]

"""

  # Single user message: a fixed framing sentence followed by the instructions.
  user_message = {
    "role": "user",
    "content": f'Your function is that of a bot optimised for summarising patent text. Answer the following query as accurately as possible based on your function {instructions}',
  }

  chat_result = OpenAIclient.chat.completions.create(
    model="gpt-4-0125-preview",
    messages=[user_message],
  )

  return chat_result.choices[0].message.content

### Iterate through queries in the Valeo CSV

In [27]:
import ast
import re


def _parse_query_list(text):
  """Return the follow-up queries found in the first ``[...]`` span of ``text``.

  The span is first read as a Python list literal. If that fails, each line
  between the brackets is treated as one query, with surrounding quotes and
  trailing commas stripped. An empty list is returned when no span is found.
  """
  match = re.search(r'\[([\s\S]*?)\]', text)
  if match is None:
    return []

  try:
    parsed = ast.literal_eval(match.group(0))
  except (ValueError, SyntaxError):
    parsed = None

  if isinstance(parsed, (list, tuple)):
    candidates = (str(item).strip() for item in parsed)
  else:
    candidates = (
      line.strip().rstrip(',').strip().strip('"\'')
      for line in match.group(1).splitlines()
    )
  return [candidate for candidate in candidates if candidate]


# Collected across all initial queries so that results from earlier queries are
# not discarded when the loop moves on to the next one.
Numresponses = []
ESQueries = []
ValidQueries = []
Responses = []

for InitialQuery in ID_Queries_List[2:3]:
  # Split the '<query text>////<QueryID>' entry back into its two parts, so the
  # tracking ID is not sent to the model as part of the query text.
  QueryText, separator, QueryID = InitialQuery.rpartition('////')
  if not separator:
    QueryText = InitialQuery

  # Simulate list of drill down queries
  QueryListString = DrillDownQueryGenerator(QueryText)

  QueryList = _parse_query_list(QueryListString)
  if not QueryList:
    print("No list found in the given text.")
    continue

  for SubQuery in QueryList:

    # Generate Elastic Search query for each stage of drill down workflow
    ESQuery = DrillDown(SubQuery, QueryID)

    try:
      resp = client.search(index="patents", body=ESQuery)
      NumResponse = len(resp['hits']['hits'])
      ValidESQuery = 'Yes'

    except Exception as E:
      # Best effort: a generated query that fails is recorded as invalid with
      # no response, rather than stopping the whole run.
      print(E)
      resp = None
      NumResponse = 0
      ValidESQuery = 'No'

    print(NumResponse)
    print(ValidESQuery)
    print(ESQuery)
    print(resp)

    Responses.append(resp)
    Numresponses.append(NumResponse)
    ValidQueries.append(ValidESQuery)
    ESQueries.append(ESQuery)

KeyboardInterrupt: 