In [2]:
import requests
import pandas as pd
from tqdm import tqdm
import os
import zipfile
from lxml import etree


# Bounds of the date range sent as fromDate / toDate when requesting the file listing
# (given here as 'YYYY-MM' strings).
start_date = '2023-01'
end_date = '2023-12'
# Directory the downloaded .zip archives are written to.
download_path = "patents_data"
# Directory the archives are extracted into.
unzip_path = "unzipped_patents_data"

# COLLECTION


In [43]:
url = "https://developer.uspto.gov/products/bdss/get/ajax"
# The request carries a single `data` query parameter holding a JSON object
# with the product name ("APPXML") and the requested date range.
params = {
    "data": f'{{"name":"APPXML","fromDate":"{start_date}","toDate":"{end_date}"}}'
}
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, params=params, timeout=60)
if response.status_code != 200:
    # Stop here: without a successful listing there is nothing valid to parse
    # below, and response.json() would only fail with a less helpful error.
    raise RuntimeError(f"Error: {response.status_code}")
print("Urls collected for the period from {} to {} successfully".format(start_date, end_date))

data = response.json()
# One row per listed file; fileSize is divided by 1,000,000 for the "Mb" column.
rows = [
    [f["fileFromTime"], f["fileName"], f["fileSize"] / 1000000, f["fileDownloadUrl"]]
    for f in data["productFiles"]
]

df = pd.DataFrame(rows, columns=["Date","File Name", "File Size (Mb)", "Download URL"])

# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
total_size_gb = round(df["File Size (Mb)"].sum() / 1000, 2)
print(f"Total file size: {total_size_gb} GB between {start_date} and {end_date}")


# Create both working directories. exist_ok=True makes the cell safe to re-run
# and avoids the gap between a separate os.path.exists() check and the creation.
os.makedirs(download_path, exist_ok=True)
os.makedirs(unzip_path, exist_ok=True)



# Only the first two listed files are fetched (df[:2]); widen the slice to download more.
files_to_download = df[:2]
for index, row in tqdm(files_to_download.iterrows(), total=files_to_download.shape[0], desc="Downloading files"):
    file_url = row["Download URL"]
    # Local name: "<Date>_<file name up to its first dot>.zip"
    file_name = row["Date"] + "_" + row["File Name"].split(".")[0]
    zip_file_name = f"{file_name}.zip"
    zip_file_path = os.path.join(download_path, zip_file_name)

    # Stream the body to disk in chunks; the context manager releases the
    # connection once the download finishes or fails.
    with requests.get(file_url, stream=True, timeout=60) as response:
        # Fail loudly on an HTTP error instead of saving an error page as a .zip.
        response.raise_for_status()
        with open(zip_file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    print(f"Downloaded and saved {file_name} as {zip_file_name}")
# Walk the download folder and extract every .zip archive into unzip_path.
for file_name in tqdm(os.listdir(download_path), desc="Unzipping files"):
    if not file_name.endswith(".zip"):
        # Anything that is not an archive is left alone.
        continue
    zip_file_path = os.path.join(download_path, file_name)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_path)
    print(f"Unzipped {file_name} to {unzip_path}")

# SECTION IDENTIFICATION

In [63]:
import re

# NOTE: this cell relies on `xml_parts` and `parser`, which are created by a
# different cell of this notebook; that cell has to be run first.

# The patterns never change, so the list is built once instead of once per XML part.
patterns = [
    r'(?i)(example|experiment|methodology|testing).*',
    r'(?i)(experimental).*',
    r'(?i)(example[s]?|experiment[s]?|methodology|testing|experimental)[\s\:\-]*[\r\n]*(.*?)(?=\n\n|$)',
    r'(?i)(example[s]?|experiment[s]?|methodology|testing|experimental)[\s\:\-]*[\r\n]*([^\r\n]*)',
]

for i,xml_part in enumerate(xml_parts[1:]):
    root = etree.fromstring(xml_part.encode(), parser)
    xmlfile = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()

    experiment_sections = []
    for pattern in patterns:
        # Run the regex itself. Looking up the pattern's source text with
        # str.find() would only hit if the document contained the regex verbatim.
        match = re.search(pattern, xmlfile, re.DOTALL)
        if match is not None:
            start_index = match.start()
            print(f"Found '{pattern}' at index {start_index} at part {i}")
            # TODO: collect the hit, e.g. experiment_sections.append(start_index)


In [3]:
# Read the whole bulk file into memory.
# Forward slashes are accepted on every OS, whereas a hard-coded "\\" separator
# only resolves on Windows.
with open("unzipped_patents_data/ipa230105.xml", "r", encoding="utf-8") as file:
    content = file.read()
# Cut the text at each XML declaration; element 0 is whatever precedes the
# first declaration, the remaining elements each follow one declaration.
xml_parts = content.split('<?xml version="1.0" encoding="UTF-8"?>')

# Terms looked up (case-insensitively, as plain substrings) by
# extract_experiments_from_xml, which reads this list as a global.
experiment_keywords = [
    "experiment", "test", "trial", "study", "methodology", "analysis", "investigation", 
    "evaluation", "testing", "approach", "experimentation", "research", "simulation", 
    "assessment", "protocol", "setup", "fieldwork", "observation", "survey", 
    "exploration", "examination", "implementation", "clinical trial", "experiment design", 
    "control study", "laboratory work", "data collection", "benchmarking"
]    
# Space-padded variants; in this notebook they are only referenced by a
# commented-out check in the scan loop.
experiment_keys = [" experiment ", " test ", " trial "]

# Collects [index, doc-number list] for every part whose text mentions "experiment".
experiments  = []
# recover=True lets lxml keep parsing through malformed markup instead of raising.
parser = etree.XMLParser(recover=True)
# NOTE: enumerate() starts at 0 while the slice starts at xml_parts[1],
# so index i refers to xml_parts[i + 1].
for i,xml_part in enumerate(xml_parts[1:]):
    root = etree.fromstring(xml_part.encode(), parser)  # Parse the XML part
    # Serialise the parsed tree back to text so it can be searched as a string.
    xmlfile = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()
    #found = any(keyword.lower() in xmlfile.lower() for keyword in experiment_keys)


    # Plain substring test, so it also hits longer words such as "experimental".
    if "experiment" in xmlfile.lower():
        # xpath() with text() returns a list of strings, so doc_num is a list.
        doc_num = root.xpath("//publication-reference//document-id//doc-number/text()")
        experiments.append([i, doc_num])
        print(f"XML part {i} contains experiment")



XML part 15 contains experiment
XML part 21 contains experiment
XML part 31 contains experiment
XML part 32 contains experiment
XML part 33 contains experiment
XML part 36 contains experiment
XML part 37 contains experiment
XML part 38 contains experiment
XML part 39 contains experiment
XML part 40 contains experiment
XML part 41 contains experiment
XML part 42 contains experiment
XML part 43 contains experiment
XML part 44 contains experiment
XML part 45 contains experiment
XML part 46 contains experiment
XML part 53 contains experiment
XML part 54 contains experiment
XML part 65 contains experiment
XML part 67 contains experiment
XML part 72 contains experiment
XML part 73 contains experiment
XML part 75 contains experiment
XML part 76 contains experiment
XML part 78 contains experiment
XML part 79 contains experiment
XML part 80 contains experiment
XML part 83 contains experiment
XML part 84 contains experiment
XML part 87 contains experiment
XML part 90 contains experiment
XML part

KeyboardInterrupt: 

In [30]:

# Spot-check: parse one part and read its publication doc-number.
root = etree.fromstring(xml_parts[25].encode(), parser)  # Parse the XML part
root.xpath("//publication-reference//document-id//doc-number/text()")

['20230000025']

In [35]:
# NOTE(review): extract_experiments is not defined in the visible notebook cells —
# presumably it comes from `utils`; confirm.
# This rebinds `experiments`, which the scan cell used for its result list.
experiments = extract_experiments(xml_parts[23])
experiments

No 'Examples' heading found.

Looking for 'EXAMPLES' paragraphs.


'FIG. 2 shows a system 200 consisting of a series of four vertically arranged grow trays 201, 202, 203, 204. It will be understood that the number of trays is more or less arbitrary, and other embodiments may include more or fewer trays. The system also includes a reservoir, pump and conduit (not shown in this figure), and it will be understood that any known reservoir, pump and conduit combination may be used provided that the combination can convey liquid stored in the reservoir to a nozzle 205 or other suitable outlet located above the uppermost grow tray 201. In some embodiments, the reservoir, pump and conduit may be substantially similar to those in existing systems (with any direct connections between the conduit and the trays removed), reducing the cost of converting an existing growing system.\nEach grow tray is provided with a valve assembly 206, comprising a filling catchment 207 and a drainage outlet 208. The drainage outlet is sized to fit into an opening in the grow tray 

In [None]:
import pandas as pd

# Run the extractor over every part and keep the ones that yield something.
# NOTE(review): extract_experiments is not defined in this cell — presumably it
# comes from `utils`; `xml_parts` and `parser` must already exist in the kernel.
patent_experiments = []
for i,xml_part in enumerate(xml_parts[1:]):
    experiments = extract_experiments(xml_part)
    if experiments:
        root = etree.fromstring(xml_part.encode(), parser)  # Parse the XML part
        doc_num = root.xpath("//publication-reference//document-id//doc-number/text()")
        # xpath() returns a list. Store the bare number (None when absent) so the
        # CSV column holds 20230000001 rather than the list repr "['20230000001']".
        patent_experiments.append([i, experiments, doc_num[0] if doc_num else None])
        #print(f"\nXML part {i} contains experiments:\n{experiments}")

df = pd.DataFrame(patent_experiments, columns=["XML index", "Experiments", "Doc Number"])
df.to_csv("patent_experiments.csv", index=False)

In [2]:
import pandas as pd
# Reload the CSV that the extraction cell writes with df.to_csv(...).
df = pd.read_csv("patent_experiments.csv")

In [53]:
from utils import *
# Read the whole bulk file into memory.
# Forward slashes are accepted on every OS, whereas a hard-coded "\\" separator
# only resolves on Windows.
with open("unzipped_patents_data/ipa230105.xml", "r", encoding="utf-8") as file:
    content = file.read()
# Cut the text at each XML declaration; element 0 is whatever precedes the
# first declaration, the remaining elements each follow one declaration.
xml_parts = content.split('<?xml version="1.0" encoding="UTF-8"?>')

In [51]:
# Show the first piece that follows an XML declaration
# (xml_parts[0] is the text before the first declaration).
xml_parts[1]

'\n<!DOCTYPE us-patent-application SYSTEM "us-patent-application-v46-2022-02-17.dtd" [ ]>\n<us-patent-application lang="EN" dtd-version="v4.6 2022-02-17" file="US20230000001A1-20230105.XML" status="PRODUCTION" id="us-patent-application" country="US" date-produced="20221220" date-publ="20230105">\n<us-bibliographic-data-application lang="EN" country="US">\n<publication-reference>\n<document-id>\n<country>US</country>\n<doc-number>20230000001</doc-number>\n<kind>A1</kind>\n<date>20230105</date>\n</document-id>\n</publication-reference>\n<application-reference appl-type="utility">\n<document-id>\n<country>US</country>\n<doc-number>17364781</doc-number>\n<date>20210630</date>\n</document-id>\n</application-reference>\n<us-application-series-code>17</us-application-series-code>\n<classifications-ipcr>\n<classification-ipcr>\n<ipc-version-indicator><date>20060101</date></ipc-version-indicator>\n<classification-level>A</classification-level>\n<section>A</section>\n<class>01</class>\n<subclass

['20230000001']

In [60]:
# Look at a single entry of the "Experiments" column.
df["Experiments"][8]

'Hereinafter, the present application is described in more detail with reference to Examples but is not limited to the Examples.'

In [27]:

# NOTE(review): extract_all_examples is not defined in the visible notebook cells —
# presumably it comes from `utils`; confirm.
extract_all_examples(df["Experiments"][6])

{}

In [None]:
<heading id="h-0009" level="2">1. Growth Conditions and Light Treatment
<p id="p-0130" num="0000">2. Comparison of Active Ingredient
<heading id="h-0010" level="2">3. Comparison of Damage to 
<heading id="h-0011" level="2">4. Comparison of Damage to Plants and Active 
<p id="p-0149" num="0000">5. Whether the Active Ingred
<heading id="h-0012" level="2">6. Whether UVB Irradiation Under the Light

In [58]:
# NOTE(review): check_tense_nltk is not defined in the visible notebook cells —
# presumably it comes from `utils`; confirm.
# Gotcha: each R"" below closes one string literal and opens the next; Python joins
# adjacent literals, so the text actually passed contains "R" with no quote characters.
check_tense_nltk("Furthermore, the compounds of this aspect of the present invention can also be produced, for example, by reacting compound [A'], for example a compound of the formula ##STR643## or a salt or ester thereof with a compound of the formula ##STR644## wherein R"" is as defined above or a functional derivative thereof, subjecting the resulting compound of the formula ##STR645## wherein R"" is as defined above or a salt or ester thereof to dehydration condensation with a compound of the formula")

'present'

In [14]:
def extract_experiments_from_xml(xml_content, keywords=None):
    """Cut keyword-anchored snippets out of one XML document.

    The document is parsed with a recovering parser and serialised back to
    text. For each keyword, the first case-insensitive occurrence marks the
    start of a snippet, which runs up to the next ``<heading`` tag after it
    (or to the end of the text when there is none). Matching is on plain
    substrings, so snippets for different keywords may overlap.

    Args:
        xml_content: XML text of a single document.
        keywords: Iterable of keyword strings. When omitted, the module-level
            ``experiment_keywords`` list is used, as before.

    Returns:
        A list of snippet strings, at most one per keyword, in keyword order.
        Empty when nothing matches or when the input yields no root element.
    """
    import re  # local import so the function does not depend on another cell's import

    if keywords is None:
        keywords = experiment_keywords

    parser = etree.XMLParser(recover=True)
    root = etree.fromstring(xml_content.encode(), parser)
    if root is None:
        # The recovering parser may produce no root at all for unusable input.
        return []
    xml_str = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()

    experiment_sections = []
    for keyword in keywords:
        # Search the original text case-insensitively. An index taken from a
        # lowercased copy can point at the wrong place, because str.lower()
        # may change the length of the string for some characters.
        match = re.search(re.escape(keyword), xml_str, re.IGNORECASE)
        if match is None:
            continue
        start_index = match.start()
        end_index = xml_str.find('<heading', start_index + 1)
        if end_index == -1:
            end_index = len(xml_str)
        experiment_sections.append(xml_str[start_index:end_index])
    return experiment_sections


In [19]:
def create_experiment_dataset(xml_parts):
    """Build ``[patent number, snippet]`` rows for a list of XML parts.

    The first element of ``xml_parts`` is skipped. Every remaining part is
    passed to ``extract_experiments_from_xml``; for parts that yield snippets,
    the publication doc-number is read from the XML and one row is emitted per
    snippet.

    Args:
        xml_parts: List of XML document strings; element 0 is ignored.

    Returns:
        A list of two-element lists ``[doc_number, snippet]``. ``doc_number``
        is ``None`` when the document carries no publication doc-number.
    """
    dataset = []
    parser = etree.XMLParser(recover=True)
    for xml_part in xml_parts[1:]:
        experiments = extract_experiments_from_xml(xml_part)
        if not experiments:
            continue
        root = etree.fromstring(xml_part.encode(), parser)
        doc_num = root.xpath("//publication-reference//document-id//doc-number/text()")
        # xpath() returns a list; guard the lookup so a document without a
        # doc-number does not abort the whole run with an IndexError.
        patent_number = doc_num[0] if doc_num else None
        dataset.extend([patent_number, exp] for exp in experiments)
    return dataset


In [21]:
from lxml import etree
# Read the whole bulk file into memory.
with open("unzipped_patents_data/ipa230105.xml", "r", encoding="utf-8") as file:
    content = file.read()
# Cut the text at each XML declaration; element 0 is whatever precedes the
# first declaration, the remaining elements each follow one declaration.
xml_parts = content.split('<?xml version="1.0" encoding="UTF-8"?>')




In [35]:
import re
def find_matches(xml_parts):
    """Print the "Example <n>" mentions found in each XML part.

    Every part is parsed with a recovering parser, serialised back to text and
    searched case-insensitively for "example" followed by a Roman numeral
    (I to X) or by a decimal number. Parts that yield no root element are
    skipped. Each part is parsed once and then tested against every pattern,
    so output is grouped by part.

    Args:
        xml_parts: List of XML document strings; all elements are scanned.

    Returns:
        None. Hits are reported with print().
    """
    patterns = [
        # Roman numerals I..X. Written so that VI, VII and VIII match and
        # non-numerals such as VV do not.
        r"\bexample\s+(?:IX|IV|V?I{1,3}|V|X)\b",
        r"\bexample\s+\d+\b",
    ]
    parser = etree.XMLParser(recover=True)
    for i, xml_part in enumerate(xml_parts):
        root = etree.fromstring(xml_part.encode(), parser)
        if root is None:
            # The recovering parser may produce no root for unusable input.
            continue
        xmlfile = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()
        for pattern in patterns:
            matches = re.findall(pattern, xmlfile, re.IGNORECASE)
            if matches:
                print(f"Found '{pattern}' at part {i}")
                print(matches)
                print("\n")


In [36]:
# Try the matcher on a small sample: xml_parts[1] through xml_parts[4].
find_matches(xml_parts[1:5])

Found '\bexample\s+\d+\b' at part 3
['Example 1', 'Example 2', 'Example 1', 'Example 3', 'Example 1', 'Example 4', 'Example 1', 'Example 5', 'Example 4', 'Example 6', 'Example 1', 'Example 7', 'Example 1', 'Example 8', 'Example 9', 'Example 8', 'Example 10', 'Example 8', 'Example 11', 'Example 8', 'Example 12', 'Example 11', 'Example 13', 'Example 8', 'Example 14', 'Example 8', 'Example 15', 'Example 16', 'Example 15', 'Example 17', 'Example 15', 'Example 18', 'Example 15', 'Example 19', 'Example 18', 'Example 20', 'Example 15', 'Example 21', 'Example 15']




In [23]:
import pandas as pd
# Keyword list that extract_experiments_from_xml reads as a global.
experiment_keywords = [
    "experiment", "test", "trial", "study", "methodology", "analysis", "investigation", 
    "evaluation", "testing", "approach", "experimentation", "research", "simulation", 
    "assessment", "protocol", "setup", "fieldwork", "observation", "survey", 
    "exploration", "examination", "implementation", "clinical trial", "experiment design", 
    "control study", "laboratory work", "data collection", "benchmarking"
]

# create_experiment_dataset skips the first element of the list it receives,
# so this call covers xml_parts[1] through xml_parts[9].
dataset = create_experiment_dataset(xml_parts[:10])
df = pd.DataFrame(dataset, columns=["Patent Number", "Experiment Text"])
# df.to_csv("patent_experiments_gpttest.csv", index=False)
# print("Dataset created and saved to patent_experiments.csv")

In [28]:
import pprint as pp
# Pretty-print the first extracted snippet; pprint wraps the long string over several lines.
pp.pprint(df["Experiment Text"][0])

('approach each other, and it becomes easier for the interlocking swing part '
 '<b>35</b> to be operated so as to swing rearward. That is to say, in '
 'response to the tillage depth adjustment cam <b>34</b> being swung rearward, '
 'it becomes easier for the lift arms <b>16</b> to be operated so as to swing '
 'upward.</p>\n'
 '<p id="p-0070" num="0069">A torsion spring <b>40</b> is wound around a '
 'swinging portion of the interlocking swing part <b>35</b>. The torsion '
 'spring <b>40</b> is locked to a downward portion of the swing base end '
 'portion of the interlocking swing part <b>35</b> and a downward portion of '
 'the free end portion of the bracket <b>24</b>B. The interlocking swing part '
 '<b>35</b> is biased by the torsion spring <b>40</b> so as to swing '
 'forward.</p>\n'
 '<p id="p-0071" num="0070">In response to the receiving member <b>30</b>B '
 'swinging forward by receiving a pressing force from the work apparatus '
 '<b>15</b>, the interlocking swing arm <b>31