# Preprocessing
## Imports

In [1]:
import networkx as nx
from langchain_core.documents import Document
import pandas as pd
import json

## Functions

In [2]:
def load_data(path: str) -> list:
    with open(path, 'r') as js:
        data = json.load(js)
    return data

In [3]:
def create_doc(row: pd.Series) -> Document:
    doc = Document(page_content=row["text"], id=row["title"], metadata=row.drop(labels=["text", "title", "references"]).to_dict())
    return doc

In [12]:
def clean_list(lst):
    if isinstance(lst, list):
        return [s.replace('\xa0', ' ') if isinstance(s, str) else s for s in lst]
    return lst

## Main

### EPC

In [13]:
path = "../../resources/extracted_data/EPC_data.json"
articles = load_data(path)
articles_df = pd.DataFrame(articles)
articles_df = articles_df.replace(u"\xa0", ' ', regex=True)
articles_df = articles_df.map(lambda x: x.strip() if isinstance(x, str) else x)
articles_df['references'] = articles_df['references'].apply(clean_list)
articles_df

Unnamed: 0,book,part,chapter,title,text,references,url
0,EPC,Part I - General and institutional provisions,Chapter I - General provisions,Article 1 - European law for the grant of patents,"A system of law, common to the Contracting Sta...",,https://www.epo.org/en/legal/epc/2020/a1.html
1,EPC,Part I - General and institutional provisions,Chapter I - General provisions,Article 2 - European patent,(1) Patents granted under this Convention shal...,"[Article 63 Term of the European patent, Artic...",https://www.epo.org/en/legal/epc/2020/a2.html
2,EPC,Part I - General and institutional provisions,Chapter I - General provisions,Article 3 - Territorial effect,The grant of a European patent may be requeste...,"[Article 79 Designation of Contracting States,...",https://www.epo.org/en/legal/epc/2020/a3.html
3,EPC,Part I - General and institutional provisions,Chapter I - General provisions,Article 4 - European Patent Organisation,"(1) A European Patent Organisation, hereinafte...",[Article 4a Conference of ministers of the Con...,https://www.epo.org/en/legal/epc/2020/a4.html
4,EPC,Part I - General and institutional provisions,Chapter I - General provisions,Article 4a - Conference of ministers of the Co...,A conference of ministers of the Contracting S...,[Article 4 European Patent Organisation],https://www.epo.org/en/legal/epc/2020/a4a.html
...,...,...,...,...,...,...,...
163,EPC,Part XII - Final provisions,Art. 167 - (deleted),Article 174 - Denunciation,Any Contracting State may at any time denounce...,"[Article 175 Preservation of acquired rights, ...",https://www.epo.org/en/legal/epc/2020/a174.html
164,EPC,Part XII - Final provisions,Art. 167 - (deleted),Article 175 - Preservation of acquired rights,(1) In the event of a State ceasing to be part...,,https://www.epo.org/en/legal/epc/2020/a175.html
165,EPC,Part XII - Final provisions,Art. 167 - (deleted),Article 176 - Financial rights and obligations...,(1) Any State which has ceased to be a party t...,,https://www.epo.org/en/legal/epc/2020/a176.html
166,EPC,Part XII - Final provisions,Art. 167 - (deleted),Article 177 - Languages of the Convention,"(1) This Convention, drawn up in a single orig...",,https://www.epo.org/en/legal/epc/2020/a177.html


In [14]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

local_embeddings = OllamaEmbeddings(model="all-minilm")

vectorstore = Chroma.from_texts(articles_df["text"].to_list(), embedding=local_embeddings)

""" config_path = "../../config/retriever_config.yaml"
retriever = Retriever(config_path, articles_df) """

' config_path = "../../config/retriever_config.yaml"\nretriever = Retriever(config_path, articles_df) '

In [15]:
query = "How is decided whether an idea is novel or not?"
docs = vectorstore.similarity_search(query)

In [16]:
docs

[Document(id='af79404e-38c6-49c4-a85f-36f0ce4f2256', metadata={}, page_content='An invention shall be considered as involving an inventive step if, having regard to the state of the art, it is not obvious to a person skilled in the art. If the state of the art also includes documents within the meaning of Article 54, paragraph 3, these documents shall not be considered in deciding whether there has been an inventive step.\n\n\n46See decisions/opinions of the Enlarged Board of Appeal G 2/98, G 3/98, G 2/99, G 1/03, G 2/03, G 1/16 (Annex I).'),
 Document(id='7e329766-d354-4154-8dfe-f5e5e085e133', metadata={}, page_content='An invention shall be considered as involving an inventive step if, having regard to the state of the art, it is not obvious to a person skilled in the art. If the state of the art also includes documents within the meaning of Article 54, paragraph 3, these documents shall not be considered in deciding whether there has been an inventive step.\n\n\n46See decisions/opin

In [17]:
g = nx.MultiDiGraph()

In [None]:
docs = []
for _, row in articles_df.iterrows():
    doc = create_doc(row)
    g.add_node(doc.id, data=doc)

for _, row in articles_df.iterrows():
    node1_id = row["title"]
    references = row["references"]
    for ref in references:
        print(node1_id, " -> ", ref)
        g.add_edge(node1_id, ref)

Article 2 - European patent  ->  Article 63 Term of the European patent
Article 2 - European patent  ->  Article 65 Translation of the European patent
Article 2 - European patent  ->  Article 68 Effect of revocation or limitation of the European patent
Article 2 - European patent  ->  Article 69 Extent of protection
Article 2 - European patent  ->  Article 70 Authentic text of a European patent application or European patent
Article 2 - European patent  ->  Article 99 Opposition
Article 2 - European patent  ->  Article 105c Publication of the amended specification of the European patent
Article 2 - European patent  ->  Article 142 Unitary patents
Article 2 - European patent  ->  Rule 75 Surrender or lapse of the patent
Article 2 - European patent  ->  Rule 85 Transfer of the European patent
Article 2 - European patent  ->  Rule 89 Intervention of the assumed infringer
Article 2 - European patent  ->  Rule 90 Subject of proceedings
Article 2 - European patent  ->  Rule 96 Content and fo

In [19]:
g.edges("Article 63")

OutMultiEdgeDataView([('Article 2 - European patent', 'Article 63 Term of the European patent'), ('Article 2 - European patent', 'Article 65 Translation of the European patent'), ('Article 2 - European patent', 'Article 68 Effect of revocation or limitation of the European patent'), ('Article 2 - European patent', 'Article 69 Extent of protection'), ('Article 2 - European patent', 'Article 70 Authentic text of a European patent application or European patent'), ('Article 2 - European patent', 'Article 99 Opposition'), ('Article 2 - European patent', 'Article 105c Publication of the amended specification of the European patent'), ('Article 2 - European patent', 'Article 142 Unitary patents'), ('Article 2 - European patent', 'Rule 75 Surrender or lapse of the patent'), ('Article 2 - European patent', 'Rule 85 Transfer of the European patent'), ('Article 2 - European patent', 'Rule 89 Intervention of the assumed infringer'), ('Article 2 - European patent', 'Rule 90 Subject of proceedings'