In [1]:
import networkx as nx
import pickle

with open("../resources/LegalBases/graph.pkl", "rb") as f:
    g = pickle.load(f)

In [2]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

texts = []
metadatas = []
ids = []
for id, doc in g.nodes(data=True):
    if doc["data"].page_content:
        ids.append(id)
        metadatas.append({"label": doc["label"]})
        texts.append(doc["data"].page_content)

In [3]:
from src.system.rag import Retriever
articles = {}
articles["texts"] = texts
articles["ids"] = ids
articles["metadatas"] = metadatas
config_path = "../config/retriever_config.yaml"
local_embeddings = OllamaEmbeddings(model="nomic-embed-text")
retriever = Retriever(config_path, articles, local_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


🔁 Loading existing Chroma vectorstore...


In [4]:
from src.system.rag import RAG
from langchain_ollama import ChatOllama
llm = ChatOllama(model="mistral")

rag_config_path = "../config/config.yaml"
rag = RAG(rag_config_path, llm, retriever, g, None)

In [5]:
for c in rag.generate_question("1. farejiou"):
    print(c, end='', flush=True)

1. Company C files a European patent application for an invention which has been displayed at an officially recognized international exhibition. The certificate required according to Rule 25 EPC is obtained three months after filing the application and filed within the four-month period together with the authenticated identification of the invention. Will the certificate be accepted by the EPO?

   A. Yes, if the certificate was obtained at the exhibition and filed within the specified four-month period.
   B. No, if the certificate was not issued at the exhibition.

In [7]:
question = """1. Company C files a European patent application for an invention which has been displayed at an officially recognized international exhibition. The certificate required according to Rule 25 EPC is obtained three months after filing the application and filed within the four-month period together with the authenticated identification of the invention. Will the certificate be accepted by the EPO?

   A. Yes, if the certificate was obtained at the exhibition and filed within the specified four-month period.
   B. No, if the certificate was not issued at the exhibition."""
for c in rag.run_flux(question, rerank=False):
    print(c, end='', flush=True)

 The question asks about a European patent application filed by Company C for an invention that has been displayed at an officially recognized international exhibition. The certificate required according to Rule 25 EPC is obtained three months after filing the application and filed within the four-month period together with the authenticated identification of the invention.

Let's break down the information given:
1. Company C files a European patent application for an invention displayed at an officially recognized international exhibition.
2. The certificate required according to Rule 25 EPC is obtained three months after filing the application and filed within the four-month period together with the authenticated identification of the invention.

From this information, it's clear that the certificate was obtained three months after filing the application, which seems to comply with the rule that it should be obtained as soon as possible but not later than four months from the date o

In [17]:
query = "What does it take for an idea to be eligible for patenting?"
rag.run(query)

According to the European Patent Office (EPO) guidelines and the European Patent Convention (EPC), for an idea to be eligible for patenting, it must meet the following criteria:

1. It must be an "invention" belonging to any field of technology.
2. The invention must be "susceptible of industrial application".
3. The invention must be "new".
4. The invention must involve an "inventive step".

Additionally, the EPO guidelines emphasize that a technical character is an implicit requisite for the presence of an "invention" within the meaning of Art. 52(1). This means that the idea must relate to a technical field, be concerned with a technical problem, and have technical features in terms of which the matter for which protection is sought can be defined in the claim.

The two-hurdle approach is also relevant here: the first hurdle assesses whether the claimed subject-matter as a whole falls under the exclusions from patentability under Art. 52(2), while the second hurdle assesses inventiv

'According to the European Patent Office (EPO) guidelines and the European Patent Convention (EPC), for an idea to be eligible for patenting, it must meet the following criteria:\n\n1. It must be an "invention" belonging to any field of technology.\n2. The invention must be "susceptible of industrial application".\n3. The invention must be "new".\n4. The invention must involve an "inventive step".\n\nAdditionally, the EPO guidelines emphasize that a technical character is an implicit requisite for the presence of an "invention" within the meaning of Art. 52(1). This means that the idea must relate to a technical field, be concerned with a technical problem, and have technical features in terms of which the matter for which protection is sought can be defined in the claim.\n\nThe two-hurdle approach is also relevant here: the first hurdle assesses whether the claimed subject-matter as a whole falls under the exclusions from patentability under Art. 52(2), while the second hurdle assesse

In [21]:
from pathlib import Path
import os
ROOT_DIR = Path(os.getcwd())
print(ROOT_DIR)
ROOT_DIR = Path(__file__).resolve().parent.parent

/home/timothee/Documents/data_challenges/PatentAssist/notebooks


NameError: name '__file__' is not defined

In [1]:
s = [1, 4, 6]
sorted(s, reverse=True)

[6, 4, 1]

In [49]:
from FlagEmbedding import FlagReranker
reranker = FlagReranker('BAAI/bge-large-en', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

score = reranker.compute_score(['query', 'passage'])
print(score) # -5.65234375

# You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
score = reranker.compute_score(['query', 'passage'], normalize=True)
print(score) # 0.003497010252573502

scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores) # [-8.1875, 5.26171875]

# You can map the scores into 0-1 by set "normalize=True", which will apply sigmoid function to the score
scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']], normalize=True)
print(scores) # [0.00027803096387751553, 0.9948403768236574]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-large-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[0.059539794921875]
[0.5148805530426933]
[0.0077667236328125, -0.007549285888671875]
[0.5019416711477563, 0.4981126874912553]


In [24]:
from src.utils.preprocessing import Dataset,  EQE_Dataset_Explaination
from src.utils.evaluation import EvaluationFramework
from src.utils.metrics_retriever.nDCG import NDCG_articles, NDCG_rules, NDCG
from src.utils.metrics_retriever.Precision_K import Precision_K_articles, Precision_K_rules, Precision_K
from src.utils.metrics_retriever.Recall_K import Recall_K_articles, Recall_K_rules, Recall_K
from src.system.rag import Retriever
from src.system.rag_adapter import Retriever_Adapter
from src.utils.tools import extract_articles, extract_rules, clean_article, clean_rule
from langchain_ollama import OllamaEmbeddings
import pickle
import os

Folder_Path = ["../resources/EQE_PaperD/", "../resources/EQE_PreEx/"]
files = []
# files.extend([Folder_Path[0] + file for file in os.listdir(Folder_Path[0]) if file.endswith("_documentLess.json")])

files = [Folder_Path[0] + file for file in os.listdir(Folder_Path[0]) if file.endswith("_documentLess.json")]

files.extend(["../resources/EQE_PreEx/EQE_2021_PreEx_final_documentLess.json", "../resources/EQE_PreEx/EQE_2022_PreEx_final_documentLess.json"])
dataset = EQE_Dataset_Explaination(files)
test_data = dataset.get_dataset()


# initializing metrics
metrics = [
    Precision_K(4),
    Recall_K(4),
    NDCG(),

    # Precision_K_articles(4),
    # Recall_K_articles(4),
    # NDCG_articles(),
    #
    # Precision_K_rules(4),
    # Recall_K_rules(4),
    # NDCG_rules(),
]

with open("../resources/LegalBases/graph.pkl", "rb") as f:
        g = pickle.load(f)

print('loading files')
texts = []
metadatas = []
ids = []
for id, doc in g.nodes(data=True):
    if doc["data"].page_content and doc["label"] != "Guideline":
        ids.append(id)
        metadatas.append({"label": doc["label"]})
        texts.append(doc["data"].page_content)
articles = {}
articles["texts"] = texts
articles["ids"] = ids
articles["metadatas"] = metadatas
config_path = "../config/retriever_config.yaml"
print("Loading embeddings")
local_embeddings = OllamaEmbeddings(model="nomic-embed-text")
#local_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
print("Loading retriever")
retriever = Retriever(config_path, articles, local_embeddings)
retriever.purge_store()

system = Retriever_Adapter(retriever, rerank=True)


examples = []
for _, sample in test_data.iterrows():
    ground_truth = sample["Y"]
    docs = retriever.retrieve_documents(sample["X"], rerank=False)
    gt_articles = extract_articles(ground_truth)
    gt_rules = extract_rules(ground_truth)

    gt = [clean_rule(rule) for rule in gt_rules]
    gt.extend([clean_article(article) for article in gt_articles])
    examples.append(([doc for doc in docs], gt))

loading files
Loading embeddings
Loading retriever
🔁 Loading existing Chroma vectorstore...
🆕 Building new Chroma vectorstore...


In [50]:
s = 11
examples[s]

([Document(id='Article 100', metadata={'label': 'Article'}, page_content='Opposition may only be filed on the grounds that:\n(a) the subject-matter of the European patent is not patentable under Articles Article 52 to Article 57;\n(b) the European patent does not disclose the invention in a manner sufficiently clear and complete for it to be carried out by a person skilled in the art; \n(c) the subject-matter of the European patent extends beyond the content of the application as filed, or, if the patent was granted on a divisional application or on a new application filed under Article 61, beyond the content of the earlier application as filed.\n\n\n105See decisions/opinions of the Enlarged Board of Appeal G 3/89, G 10/91, G 11/91, G 1/95, G 2/95, G 7/95, G 1/99, G 3/04 (Annex I).'),
  Document(id='Article 69', metadata={'label': 'Article'}, page_content='(1) The extent of the protection conferred by a European patent or a European patent application shall be determined by the claims.

In [51]:
scores = reranker.compute_score([[test_data.iloc[s-1]["X"], doc.page_content] for doc in examples[s][0]])

In [52]:
[doc.id for doc, _ in sorted(zip(examples[s][0], scores), key=lambda x: x[1], reverse=True)]

['Article 75',
 'Article 61',
 'Article 63',
 'Article 67',
 'Rule 48',
 'Article 78',
 'Article 123',
 'Rule 31',
 'Rule 81',
 'Article 54',
 'Rule 96',
 'Article 94',
 'Article 100',
 'Article 138',
 'Article 83',
 'Rule 73',
 'Article 53',
 'Article 69',
 'Rule 137',
 'Article 55']