<a href="https://colab.research.google.com/github/EmicoBinsfinder/EPOCodeFestProject/blob/main/TaggingTool_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Configure OpenAI API key

# access your OpenAI API key

# installing llmx first isn't necessary but avoids a confusing error when installing openai
!pip install -q llmx
!pip install -q openai
from openai import OpenAI
import google.generativeai as genai
from google.colab import userdata


openai_api_secret_name = 'Test'
## @param {type: "string"}

try:
  OPENAI_API_KEY=userdata.get(openai_api_secret_name)
  client = OpenAI(
    api_key=OPENAI_API_KEY
  )
except userdata.SecretNotFoundError as e:
   print(f'''Secret not found\n\nThis expects you to create a secret named {openai_api_secret_name} in Colab\n\nVisit https://platform.openai.com/api-keys to create an API key\n\nStore that in the secrets section on the left side of the notebook (key icon)\n\nName the secret {openai_api_secret_name}''')
   raise e
except userdata.NotebookAccessError as e:
  print(f'''You need to grant this notebook access to the {openai_api_secret_name} secret in order for the notebook to access Gemini on your behalf.''')
  raise e
except Exception as e:
  # unknown error
  print(f"There was an unknown error. Ensure you have a secret {openai_api_secret_name} stored in Colab and it's a valid key from https://platform.openai.com/api-keys")
  raise e

## System Setup

In [None]:
!git clone https://github.com/EmicoBinsfinder/EPOCodeFestProject.git
!pip install gradio

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#Path to embedding Model
Model_Path = '/content/EPOCodeFestProject/TextSimilarityModel'

In [None]:
########## IMPORTING REQUIRED PYTHON PACKAGES ##########
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import math
import time
import csv
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import string
import gradio

## Defining embedding generation

### Format embeddings

In [6]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)

### Function to embed the input text for similarity searching

In [7]:
### Sentence Embedder
def sentence_embedder(sentences, model_path):
  """
  Calling the sentence similarity model to generate embeddings on input text.
  :param sentences: takes input text in the form of a string
  :param model_path: path to the text similarity model
  :return returns a (1, 384) embedding of the input text
  """
  tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
  model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
  # Compute token embeddings
  with torch.no_grad():
    model_output = model(**encoded_input)
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
  return sentence_embeddings

### Load Saved Embeddings

In [None]:
class_embeddings = pd.read_csv('/content/EPOCodeFestProject/MainClassEmbeddings.csv')

In [9]:
### Sentence Embedding Preparation Function
def convert_saved_embeddings(embedding_string):
    """
    Preparing pre-computed embeddings for use for comparison with new abstract embeddings .
    Pre-computed embeddings are saved as tensors in string format so need to be converted back to numpy arrays in order to calculate cosine similarity.
    :param embedding_string:
    :return: Should be a single tensor with dims (,384) in string formate
    """
    embedding = embedding_string.replace('(', '')
    embedding = embedding.replace(')', '')
    embedding = embedding.replace('[', '')
    embedding = embedding.replace(']', '')
    embedding = embedding.replace('tensor', '')
    embedding = embedding.replace(' ', '')
    embedding = embedding.split(',')
    embedding = [float(x) for x in embedding]
    embedding = np.array(embedding)
    embedding = np.expand_dims(embedding, axis=0)
    embedding = torch.from_numpy(embedding)
    return embedding

### Clean User Input

In [16]:
all_stopwords = stopwords.words('english') # Making sure to only use English stopwords
extra_stopwords = ['ii', 'iii'] # Can add extra stopwords to be removed from dataset/input abstracts
all_stopwords.extend(extra_stopwords)

def clean_data(input, type='Dataframe'):
    if type == 'Dataframe':
        cleaneddf = pd.DataFrame(columns=['Class', 'Description'])
        for i in range(0, len(input)):
            row_list = input.loc[i, :].values.flatten().tolist()
            noNaN_row = [x for x in row_list if str(x) != 'nan']
            listrow = []
            if len(noNaN_row) > 0:
                row = noNaN_row[:-1]
                row = [x.strip() for x in row]
                row = (" ").join(row)
                text_tokens = word_tokenize(row)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
                Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
                row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
                removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
                for char in removechars:
                    row = list(map(lambda x: x.replace(char, ''), row))

                row = ''.join(row)
                wnum = row.split(' ')
                wnum = [x.lower() for x in wnum]
                #remove duplicate words
                wnum = list(dict.fromkeys(wnum))
                #removing numbers
                wonum = []
                for x in wnum:
                    xv = list(x)
                    xv = [i.isnumeric() for i in xv]
                    if True in xv:
                        continue
                    else:
                        wonum.append(x)
                row = ' '.join(wonum)
                l = [noNaN_row[-1], row]
                cleaneddf.loc[len(cleaneddf)] = l
        cleaneddf = cleaneddf.drop_duplicates(subset=['Description'])
        cleaneddf.to_csv('E:/Users/eeo21/Startup/CPC_Classifications_List/additionalcleanedclasses.csv', index=False)
        return cleaneddf

    elif type == 'String':
        text_tokens = word_tokenize(input)  # splits abstracts into individual tokens to allow removal of stopwords by list comprehension
        Stopword_Filtered_List = [word for word in text_tokens if not word in all_stopwords]  # removes stopwords
        row = (" ").join(Stopword_Filtered_List)  # returns abstract to string form
        removechars = ['[', ']', '{', '}', ';', '(', ')', ',', '.', ':', '/', '-', '#', '?', '@', '£', '$']
        for char in removechars:
            row = list(map(lambda x: x.replace(char, ''), row))
        row = ''.join(row)
        wnum = row.split(' ')
        wnum = [x.lower() for x in wnum]
        # remove duplicate words
        wnum = list(dict.fromkeys(wnum))
        # removing numbers
        wonum = []
        for x in wnum:
            xv = list(x)
            xv = [i.isnumeric() for i in xv]
            if True in xv:
                continue
            else:
                wonum.append(x)
        row = ' '.join(wonum)
        return row

### Function for CPC Class Prediction

In [45]:
def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=10, Sensitivity='Medium'):
    predictions = pd.DataFrame(columns=['Class Name', 'Score', 'Description'])
    for i in range(len(class_embeddings)):
        class_name = class_embeddings.iloc[i, 0]
        embedding = class_embeddings.iloc[i, 2]
        description = class_embeddings.iloc[i, 1]
        embedding = convert_saved_embeddings(embedding)
        abstract_embedding = abstract_embedding.numpy()
        abstract_embedding = torch.from_numpy(abstract_embedding)
        cos = torch.nn.CosineSimilarity(dim=1)
        score = cos(abstract_embedding, embedding).numpy().tolist()
        result = [class_name, score[0], description]
        predictions.loc[len(predictions)] = result

    HighestSimilarityDF = predictions.nlargest(N, ['Score'])
    HighestSimilarity = HighestSimilarityDF['Class Name'].tolist()
    Description = HighestSimilarityDF['Description'].tolist()
    HighestSimilarityClass = [x for x in HighestSimilarity]

    Links = [f'https://www.patbase.com/classSnapshot/public/?class={x}&system=CPC' for x in HighestSimilarityClass]

    HighestSimilarity = pd.DataFrame({'Class':HighestSimilarity, 'Links':Links, 'Description':Description})
    Description = ' '.join(Description[:5])

    return HighestSimilarity, Description

def classifier(userin):
    cleaned_input = clean_data(userin, type='String')
    input_embedding = sentence_embedder(cleaned_input, Model_Path)

    Number = 10
    broad_scope_predictions, descriptions = broad_scope_class_predictor(class_embeddings, input_embedding, Number, Sensitivity='High')

    return broad_scope_predictions, descriptions


#Input Claim Set/Description/Abstract Here!

In [53]:
Claims = 'Protection device (3; 103) of an optical sensor (13) for a motor vehicle (100), characterized in that the protection device (3; 103) comprises: - a housing (4) mounted to rotate about an axis of rotation (A1), the housing (4) having a housing (19) configured to receive the optical sensor (13) so that the optical axis (15 ) of the optical sensor (13) is coincident with the axis of rotation (Al), - a transparent optical element (9) integral in rotation with the housing (4) configured to be disposed at the front of the housing (4) facing a road scene whose optical sensor (13) is configured to participate in the taking of views, and centered with respect to the optical sensor (13), and - An actuator (5) coupled to the housing (4) for rotating the housing (4), so as to allow cleaning of said optical element (9) by centrifugal effect. 2. Protection device (3; 103) according to claim 1, wherein the housing (19) for the optical sensor (13) is defined by a wall (21) of the housing (4). 3. Protective device (3; 103) according to the preceding claim, wherein the wall (21) is centered around the axis of rotation (Al) of the housing (4). 4. Protective device (3; 103) according to any one of the preceding claims, wherein the housing (4) has at least one through hole (210). 5. Protective device (103) according to any one of the preceding claims, in which the actuator (5) is arranged at the rear of the housing (4).'

predictions, CPC_Descriptions = classifier(Claims)
predictions


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


       Class Name     Score                                        Description
2982     B60Q1/00  0.503652  vehicles in general arrangement of signalling ...
2994     B60R1/00  0.483059  vehicles in general  vehicle fittings or parts...
3234     B62J6/00  0.455897  land vehicles for travelling otherwise than on...
3005    B60R21/00  0.447691  vehicles in general  vehicle fittings or parts...
2987    B60Q11/00  0.446090  vehicles in general arrangement of signalling ...
2919     B60J3/00  0.441313  vehicles in general windows  windscreens nonfi...
2993  B60Q2900/00  0.440610  vehicles in general arrangement of signalling ...
7050     F21S6/00  0.427310  lighting nonportable devices  systems thereof ...
2991  B60Q2500/00  0.416965  vehicles in general arrangement of signalling ...
7070    F21V15/00  0.416304  lighting functional features or details of dev...


Unnamed: 0,Class,Links,Description
0,B60Q1/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general arrangement of signalling ...
1,B60R1/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general vehicle fittings or parts...
2,B62J6/00,https://www.patbase.com/classSnapshot/public/?...,land vehicles for travelling otherwise than on...
3,B60R21/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general vehicle fittings or parts...
4,B60Q11/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general arrangement of signalling ...
5,B60J3/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general windows windscreens nonfi...
6,B60Q2900/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general arrangement of signalling ...
7,F21S6/00,https://www.patbase.com/classSnapshot/public/?...,lighting nonportable devices systems thereof ...
8,B60Q2500/00,https://www.patbase.com/classSnapshot/public/?...,vehicles in general arrangement of signalling ...
9,F21V15/00,https://www.patbase.com/classSnapshot/public/?...,lighting functional features or details of dev...


###Prompt Creation

In [55]:
Prompt = f'''Based on the description below and the patent claim set, generate a list relevant tags related to the claims where the
tags fall into one of the following categories:

1. Product: What is the general product area
2. Function: What is the function of the invention
3. Component: What components do the invention comprise
4. Invention type: Is it for example a method claim, apparatus claim, method-of-use

For each tag, also return which of the 4 categories it belongs to in the following format:

Attempt to extract as many unique components from the claims as possible.

{{Tag: Category}}

Description:
{CPC_Descriptions}

Claims:
{Claims}
'''

completion = client.chat.completions.create(
model="gpt-4-0125-preview",
messages=[
{"role": "user", "content": '%s -- Please answer as concisely as you can, avoiding any extra conversation or text' % Prompt}
]
)

response = completion.choices[0].message.content
response

'{Motor Vehicle: Product}\n{Optical Sensor Protection: Function}\n{Housing: Component}\n{Transparent Optical Element: Component}\n{Actuator: Component}\n{Rotation Axis: Component}\n{Centrifugal Cleaning: Function}\n{Through Hole: Component}\n{Apparatus Claim: Invention Type}'

TODO: Integrate into

In [66]:
def chatbot(input):
  predictions, CPC_Descriptions = classifier(input)

  class_links = []
  for i in range(len(predictions)):
    class_links.append("[{}]({})".format(predictions['Class'][i], predictions['Links'][i]))

  links = '\n'.join(class_links)

  completion = client.chat.completions.create(
  model="gpt-4-0125-preview",
  messages=[
  {"role": "user", "content": '%s -- Please answer as concisely as you can, avoiding any extra conversation or text' % Prompt}
  ]
  )

  response = completion.choices[0].message.content

  response = '\n'.join([response, f"{'#'*120} \n"])
  response = '\n'.join([response, f"Classes USED TO GENERATE RESPONSE:\n {links}"])

  return response

inputs = gradio.Textbox(lines=7, label="Generate tags and CPC classifications based on patent claims")
outputs = gradio.Textbox(label="Reply")

gradio.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="Patent Tagging Prototype",
             theme="compact").launch(share=True, debug=True)


Sorry, we can't find the page you are looking for.


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://837ed8b96de1dd6bf1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


       Class Name     Score                                        Description
8365    G09F17/00  0.240451  education  cryptography display advertising se...
8364    G09F15/00  0.231229  education  cryptography display advertising se...
8362    G09F11/00  0.220925  education  cryptography display advertising se...
8366    G09F19/00  0.206503  education  cryptography display advertising se...
9059  H04H2201/00  0.204572  electric communication technique broadcast  mu...
8358     G09F3/00  0.202155  education  cryptography display advertising se...
8363    G09F13/00  0.191516  education  cryptography display advertising se...
8352     G09C1/00  0.190778  education  cryptography display advertising se...
8367    G09F21/00  0.190301  education  cryptography display advertising se...
8360     G09F7/00  0.190058  education  cryptography display advertising se...


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


       Class Name     Score                                        Description
6849    F16H15/00  0.629792  engineering elements and units  general measur...
6873    F16H63/00  0.610352  engineering elements and units  general measur...
6847     F16H9/00  0.608114  engineering elements and units  general measur...
6845     F16H3/00  0.606114  engineering elements and units  general measur...
6848    F16H13/00  0.604977  engineering elements and units  general measur...
6855    F16H29/00  0.601729  engineering elements and units  general measur...
6850    F16H19/00  0.600272  engineering elements and units  general measur...
6871    F16H59/00  0.600117  engineering elements and units  general measur...
6888  F16H2716/00  0.597021  engineering elements and units  general measur...
6872    F16H61/00  0.595447  engineering elements and units  general measur...
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://837ed8b96de1dd6bf1.gradio.live


