<a href="https://colab.research.google.com/github/EmicoBinsfinder/EPOCodeFestProject/blob/main/DrillDownV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@title Configure OpenAI API key

# access your OpenAI API key

# installing llmx first isn't necessary but avoids a confusing error when installing openai
!pip install -q llmx
!pip install -q openai
from openai import OpenAI
import google.generativeai as genai
from google.colab import userdata


# Name of the Colab secret that stores the OpenAI API key.
# The @param annotation must sit on the assignment line for Colab to render it as a form field.
openai_api_secret_name = 'Test'  #@param {type:"string"}

# Read the key from Colab's secret store and build the OpenAI client used by the chatbot cell.
# Each handler prints a hint and then re-raises, so the cell still fails visibly.
try:
  OPENAI_API_KEY = userdata.get(openai_api_secret_name)
  client = OpenAI(
    api_key=OPENAI_API_KEY
  )
except userdata.SecretNotFoundError:
  print(f'''Secret not found\n\nThis expects you to create a secret named {openai_api_secret_name} in Colab\n\nVisit https://platform.openai.com/api-keys to create an API key\n\nStore that in the secrets section on the left side of the notebook (key icon)\n\nName the secret {openai_api_secret_name}''')
  raise
except userdata.NotebookAccessError:
  # The message previously referred to Gemini; this notebook talks to OpenAI.
  print(f'''You need to grant this notebook access to the {openai_api_secret_name} secret in order for the notebook to access OpenAI on your behalf.''')
  raise
except Exception:
  # unknown error
  print(f"There was an unknown error. Ensure you have a secret {openai_api_secret_name} stored in Colab and it's a valid key from https://platform.openai.com/api-keys")
  raise

## System Setup

In [None]:
# Clone the project repository; later cells read the embedding model and the
# class-embedding CSV from /content/EPOCodeFestProject.
!git clone https://github.com/EmicoBinsfinder/EPOCodeFestProject.git
# Install Gradio, which the final cell uses to build and launch the UI.
!pip install gradio

Cloning into 'EPOCodeFestProject'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 77 (delta 11), reused 0 (delta 0), pack-reused 54[K
Receiving objects: 100% (77/77), 138.42 MiB | 17.98 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Updating files: 100% (48/48), done.
Collecting gradio
  Downloading gradio-4.21.0-py3-none-any.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setu

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no path under /content/drive is referenced in the visible cells —
# confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

# Path to the embedding model directory inside the cloned repository;
# passed to sentence_embedder() by classifier().
Model_Path = '/content/EPOCodeFestProject/TextSimilarityModel'

Mounted at /content/drive


In [None]:
########## IMPORTING REQUIRED PYTHON PACKAGES ##########
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import math
import time
import csv
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string
import gradio
import os
import pprint
from elasticsearch import Elasticsearch
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ElasticsearchChatMessageHistory
from uuid import uuid4

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Defining embedding generation

### Format embeddings

In [None]:
def mean_pooling(model_output, attention_mask):
    """
    Average the token embeddings of each input, ignoring masked-out (padding) positions.

    Implemented purely in PyTorch: the model is run with ``return_tensors='pt'``, so the
    inputs are torch tensors and no TensorFlow round-trip is needed.

    :param model_output: model output whose first element holds the per-token embeddings
        (presumably shaped (batch, seq_len, hidden) — confirm against the model in use)
    :param attention_mask: torch tensor with one mask value per token (non-zero = keep)
    :return: torch tensor holding one mask-weighted mean embedding per input row
    """
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp the token count so a fully masked row divides by 1e-9 instead of 0.
    token_counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    return summed / token_counts

### Function to embed the input text for similarity searching

In [None]:
### Sentence Embedder
# Tokenizer/model pairs already loaded, keyed by model path, so repeated calls
# (e.g. one per Gradio request) do not reload the weights from disk.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_path):
    """Return the (tokenizer, model) pair for ``model_path``, loading it on first use only."""
    if model_path not in _EMBEDDING_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_path)  # HuggingFace tokenizer for the embedder
        model = AutoModel.from_pretrained(model_path, from_tf=True)  # weights are stored as a TF checkpoint
        _EMBEDDING_MODEL_CACHE[model_path] = (tokenizer, model)
    return _EMBEDDING_MODEL_CACHE[model_path]


def sentence_embedder(sentences, model_path):
    """
    Calling the sentence similarity model to generate embeddings on input text.

    :param sentences: input text to embed, passed straight to the tokenizer
        (the notebook passes a single string)
    :param model_path: path to the text similarity model
    :return: the mean-pooled embedding produced by mean_pooling(); the original author
        documents this as a (1, 384) embedding for a single input string
    """
    tokenizer, model = _load_embedding_model(model_path)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings without tracking gradients (inference only)
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Collapse the per-token embeddings into one vector per input
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embeddings

### Load Saved Embeddings

In [None]:
# Pre-computed class embeddings shipped with the cloned repository.
# broad_scope_class_predictor() reads the columns by position:
# 0 = class name, 1 = description, 2 = embedding stored as a string.
class_embeddings = pd.read_csv('/content/EPOCodeFestProject/MainClassEmbeddings.csv')

In [None]:
### Sentence Embedding Preparation Function
def convert_saved_embeddings(embedding_string):
    """
    Turn a pre-computed embedding that was saved as text back into a tensor.

    The saved text is the printed form of a tensor, so the wrapper syntax
    (brackets, parentheses, the word 'tensor' and spaces) is stripped and the
    remaining comma-separated numbers are parsed as floats.

    :param embedding_string: one embedding in string format
    :return: torch tensor with a leading batch axis of size 1, i.e. shape (1, n_values)
    """
    stripped = embedding_string
    # Removal order matches the original sequence of replace() calls.
    for fragment in ('(', ')', '[', ']', 'tensor', ' '):
        stripped = stripped.replace(fragment, '')
    values = np.array([float(piece) for piece in stripped.split(',')])
    batched = np.expand_dims(values, axis=0)
    return torch.from_numpy(batched)

### Clean User Input

In [None]:
# Module-level stopword list consulted by clean_data() when filtering tokens.
# Note: clean_data() compares tokens against this list before lower-casing them,
# so the match is case-sensitive.
all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
all_stopwords.extend(extra_stopwords)

# Characters stripped from the text after stopword removal.
_REMOVE_CHARS = ('[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$')


def _clean_text(text):
    """Apply the shared cleaning pipeline to one string and return the cleaned string."""
    stopword_set = set(all_stopwords)  # set membership instead of scanning the list per token
    tokens = word_tokenize(text)  # split into tokens so stopwords can be filtered out
    # NOTE(review): the comparison happens before lower-casing, so capitalised
    # stopwords such as "The" are kept. Preserved as-is to keep the cleaned text unchanged.
    kept = [token for token in tokens if token not in stopword_set]
    joined = ' '.join(kept)
    for char in _REMOVE_CHARS:
        joined = joined.replace(char, '')
    words = [word.lower() for word in joined.split(' ')]
    words = list(dict.fromkeys(words))  # remove duplicate words, keeping first occurrences in order
    # drop every word that contains at least one numeric character
    words = [word for word in words if not any(ch.isnumeric() for ch in word)]
    return ' '.join(words)


def clean_data(input, type='Dataframe',
               output_path='E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv'):
    """
    Clean free text by removing stopwords, punctuation, duplicate words and words containing digits.

    :param input: a DataFrame when ``type == 'Dataframe'``, or a string when ``type == 'String'``.
        For a DataFrame, rows are addressed with ``.loc[i]`` for i in 0..len-1; in each row the
        last non-NaN cell is used as the class and all earlier non-NaN cells (expected to be
        strings) are joined into the description.
    :param type: 'Dataframe' or 'String', selecting how ``input`` is interpreted
    :param output_path: where the cleaned DataFrame is written as CSV in 'Dataframe' mode.
        Defaults to the original hard-coded location; pass another path, or None to skip writing.
    :return: in 'Dataframe' mode a DataFrame with columns 'Class' and 'Description',
        de-duplicated on 'Description'; in 'String' mode the cleaned string
    :raises ValueError: if ``type`` is neither 'Dataframe' nor 'String'
    """
    if type == 'Dataframe':
        records = []
        for i in range(len(input)):
            row_values = input.loc[i, :].values.flatten().tolist()
            present = [x for x in row_values if str(x) != 'nan']
            if not present:
                continue
            description = ' '.join(x.strip() for x in present[:-1])
            records.append([present[-1], _clean_text(description)])
        # Build the frame once instead of appending row by row.
        cleaneddf = pd.DataFrame(records, columns=['Class', 'Description'])
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        if output_path is not None:
            cleaneddf.to_csv(output_path, index=False)
        return cleaneddf

    elif type == 'String':
        return _clean_text(input)

    raise ValueError(f"Unsupported type {type!r}; expected 'Dataframe' or 'String'")

### Function for CPC Class Prediction

In [None]:
def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=10, Sensitivity='Medium'):
    """
    Rank the stored classes by cosine similarity to the abstract embedding.

    :param class_embeddings: DataFrame read by column position: 0 = class name,
        1 = description, 2 = embedding saved as a string
    :param abstract_embedding: embedding of the input text; must expose ``.numpy()``
    :param N: number of highest-scoring classes to return
    :param Sensitivity: accepted for interface compatibility; not used by this function
    :return: tuple of (DataFrame with columns 'Class', 'Links', 'Description' for the
        top N classes, string joining the descriptions of the top 5 of those classes)
    """
    # Loop-invariant work: convert the query once and build the similarity module once.
    query = torch.from_numpy(abstract_embedding.numpy())
    cos = torch.nn.CosineSimilarity(dim=1)

    records = []
    for i in range(len(class_embeddings)):
        class_name = class_embeddings.iloc[i, 0]
        description = class_embeddings.iloc[i, 1]
        embedding = convert_saved_embeddings(class_embeddings.iloc[i, 2])
        if query.dtype != embedding.dtype:
            # Align precision with the stored embeddings so the comparison never mixes dtypes.
            query = query.to(embedding.dtype)
        score = cos(query, embedding).numpy().tolist()
        records.append([class_name, score[0], description])

    # Build the frame once instead of appending row by row.
    predictions = pd.DataFrame(records, columns=['Class Name', 'Score', 'Description'])
    # Keep 'Score' numeric even when there are no rows, so nlargest() still works.
    predictions['Score'] = predictions['Score'].astype(float)

    HighestSimilarityDF = predictions.nlargest(N, ['Score'])
    HighestSimilarity = HighestSimilarityDF['Class Name'].tolist()
    Description = HighestSimilarityDF['Description'].tolist()

    Links = [f'https://www.patbase.com/classSnapshot/public/?class={x}&system=CPC' for x in HighestSimilarity]

    HighestSimilarity = pd.DataFrame({'Class':HighestSimilarity, 'Links':Links, 'Description':Description})
    # Only the five best descriptions are passed on as context text.
    Description = ' '.join(Description[:5])

    return HighestSimilarity, Description

def classifier(userin):
    """
    Predict the classes most similar to the user's text.

    Cleans the text in 'String' mode, embeds it with the model at Model_Path and
    ranks it against the module-level class_embeddings.

    :param userin: raw text entered by the user
    :return: tuple of (top-10 predictions DataFrame, joined description string)
    """
    top_n = 10
    query_embedding = sentence_embedder(clean_data(userin, type='String'), Model_Path)
    top_classes, joined_descriptions = broad_scope_class_predictor(
        class_embeddings, query_embedding, top_n, Sensitivity='High'
    )
    return top_classes, joined_descriptions


### Prompt Creation

### Gradio App

In [None]:
def chatbot(input):
  """
  Generate tags for a patent claim set and list the classes used as context.

  Predicts the most similar classes for the text, inserts their descriptions and the
  claims into a prompt, sends it to the OpenAI chat completions API through the
  module-level ``client`` and appends markdown-style links to the predicted classes.

  :param input: patent claim text entered by the user
  :return: the model's reply followed by a separator line and the list of class links,
      or a short message asking for input when the text is empty or whitespace
  """
  # Avoid an embedding pass and an API call when there is nothing to analyse.
  if not input or not input.strip():
    return 'Please enter the patent claims to analyse.'

  predictions, CPC_Descriptions = classifier(input)

  class_links = [
    "[{}]({})".format(class_name, link)
    for class_name, link in zip(predictions['Class'], predictions['Links'])
  ]
  links = '\n'.join(class_links)

  # The format line now directly follows the sentence that introduces it.
  Prompt = f'''Based on the description below and the patent claim set,
generate a comprehensive list of relevant tags/keywords related to the claims where the
tags fall into one of the following categories:

1. Product: What is the general product area
2. Function: What is the function of the invention
3. Component: What components does the invention comprise of
4. Invention type: Is it for example a method claim, apparatus claim, method-of-use

For each tag, also return which of the 4 categories it belongs to in the following format:

{{Tag: Category}}

Attempt to extract as many tags/keywords from the claims as possible.

Description:
{CPC_Descriptions}

Claims:
{input}
'''

  completion = client.chat.completions.create(
    model="gpt-4-0125-preview",
    messages=[
      {"role": "user", "content": f'Your function is that of a bot optimised for summarising patent text. Answer the following query as accurately as possible based on your function {Prompt}'}
    ]
  )

  # The message content can be None; fall back to an empty string so the joins below cannot fail.
  response = completion.choices[0].message.content or ''

  response = '\n'.join([response, f"{'#'*120} \n"])
  response = '\n'.join([response, f"Classes USED TO GENERATE RESPONSE:\n {links}"])

  return response

# Multi-line text box where the user pastes the patent claims.
inputs = gradio.Textbox(lines=7, label="Generate tags and CPC classifications based on patent claims")
# Plain text box showing chatbot()'s reply. The class links are emitted in
# markdown link syntax; NOTE(review): confirm whether a Textbox renders them as links.
outputs = gradio.Textbox(label="Reply")

# Wire chatbot() to the UI and launch it. share=True requests a public URL;
# debug=True keeps the cell running so errors and logs stay visible.
# NOTE(review): theme="compact" may not resolve in the installed Gradio version — confirm.
gradio.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="Patent Tagging Prototype",
             theme="compact").launch(share=True, debug=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.

Sorry, we can't find the page you are looking for.


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://7d9fc91aee348244e8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
