**To Do:**
1. Clean the collected URLs to include only the latest version of each file (removes duplicates).

In [1]:
import os
import pprint
import zipfile

import pandas as pd
import requests
from lxml import etree
from tqdm import tqdm


# Date range (written here as YYYY-MM) sent as fromDate/toDate when listing files.
start_date = '2023-01'
end_date = '2023-12'
# Folder that downloaded .zip archives are written to.
download_path = "patents_data"
# Folder that the archives are extracted into.
unzip_path = "unzipped_patents_data"

In [43]:
# Ask the USPTO bulk-data listing endpoint for the "APPXML" product files
# available between start_date and end_date.
url = "https://developer.uspto.gov/products/bdss/get/ajax"
params = {
    "data": f'{{"name":"APPXML","fromDate":"{start_date}","toDate":"{end_date}"}}'
}
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, params=params, timeout=60)
if response.status_code != 200:
    # Stop here: decoding an error page as JSON would only fail later with a
    # less helpful message.
    raise RuntimeError(f"Error: {response.status_code}")
print("Urls collected for the period from {} to {} successfully".format(start_date, end_date))

data = response.json()
# One row per listed file; the reported size is divided by 1,000,000 for the
# "File Size (Mb)" column.
rows = [
    [entry["fileFromTime"], entry["fileName"], entry["fileSize"] / 1000000, entry["fileDownloadUrl"]]
    for entry in data["productFiles"]
]

df = pd.DataFrame(rows, columns=["Date", "File Name", "File Size (Mb)", "Download URL"])

# Computed outside the f-string: nesting double quotes inside a double-quoted
# f-string is a syntax error on Python versions before 3.12.
total_size_gb = round(df["File Size (Mb)"].sum() / 1000, 2)
print(f"Total file size: {total_size_gb} GB between {start_date} and {end_date}")
df

In [None]:


# Make sure both output folders exist; exist_ok=True makes this safe to re-run
# and avoids the separate "exists?" check.
os.makedirs(download_path, exist_ok=True)
os.makedirs(unzip_path, exist_ok=True)

# Only the first two listed files are downloaded (df[:2]).
files_to_download = df[:2]

for index, row in tqdm(files_to_download.iterrows(), total=files_to_download.shape[0], desc="Downloading files"):
    file_url = row["Download URL"]
    # Archive name is "<Date>_<file name up to its first dot>.zip".
    file_name = row["Date"] + "_" + row["File Name"].split(".")[0]
    zip_file_name = f"{file_name}.zip"
    zip_file_path = os.path.join(download_path, zip_file_name)

    # Stream the body to disk in chunks. The context manager releases the
    # connection, and raise_for_status() prevents an HTTP error page from
    # being saved as if it were a .zip archive.
    with requests.get(file_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(zip_file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    print(f"Downloaded and saved {file_name} as {zip_file_name}")

Downloading files:  50%|█████     | 1/2 [00:45<00:45, 45.58s/it]

Downloaded and saved 2023-01-05_ipa230105 as 2023-01-05_ipa230105.zip


Downloading files: 100%|██████████| 2/2 [01:06<00:00, 33.34s/it]

Downloaded and saved 2023-01-12_ipa230112_r1 as 2023-01-12_ipa230112_r1.zip





In [76]:
# Extract every .zip archive found in download_path into unzip_path.
for file_name in tqdm(os.listdir(download_path), desc="Unzipping files"):
    if not file_name.endswith(".zip"):
        continue  # leave non-archive entries alone
    zip_file_path = os.path.join(download_path, file_name)
    with zipfile.ZipFile(zip_file_path) as archive:
        archive.extractall(unzip_path)
    print(f"Unzipped {file_name} to {unzip_path}")

Unzipping files:  50%|█████     | 1/2 [00:12<00:12, 12.24s/it]

Unzipped 2023-01-05_ipa230105.zip to unzipped_patents_data


Unzipping files: 100%|██████████| 2/2 [00:17<00:00,  8.84s/it]

Unzipped 2023-01-12_ipa230112_r1.zip to unzipped_patents_data





In [2]:
# Walk the extraction folder and parse every .xml file found, reporting the
# tag of the root element that lxml returns for each one.
for dir_path, _subdirs, file_names in os.walk(unzip_path):
    for xml_name in (name for name in file_names if name.endswith(".xml")):
        xml_file_path = os.path.join(dir_path, xml_name)
        try:
            # recover=True asks lxml to keep going past malformed markup
            # instead of aborting on the first error.
            parser = etree.XMLParser(recover=True)
            root_element = etree.parse(xml_file_path, parser).getroot()
            print(f"Parsed XML file: {xml_file_path}")
            print(f"Root tag: {root_element.tag}")
        except etree.XMLSyntaxError as e:
            print(f"Failed to parse {xml_file_path}: {e}")

Parsed XML file: unzipped_patents_data\ipa230105.xml
Root tag: us-patent-application
Parsed XML file: unzipped_patents_data\ipa230112.xml
Root tag: us-patent-application


In [17]:
# Broad vocabulary of experiment-related terms. The search below does not use
# it; only experiment_keys is consulted.
experiment_keywords = [
    "experiment", "test", "trial", "study", "methodology", "analysis", "investigation", 
    "evaluation", "testing", "approach", "experimentation", "research", "simulation", 
    "assessment", "protocol", "setup", "fieldwork", "observation", "survey", 
    "exploration", "examination", "implementation", "clinical trial", "experiment design", 
    "control study", "laboratory work", "data collection", "benchmarking"
]
# Padded with spaces so that a key does not match inside a longer word
# (e.g. " test " does not match "testing").
experiment_keys = [" experiment ", " test ", " trial "]

# NOTE: `xmlfile` is not defined in this cell; it must already hold the
# document text produced by one of the parsing cells.
xmlfile_lower = xmlfile.lower()  # lower-case once instead of once per key
for key in experiment_keys:
    # find() gives the first occurrence, or -1 when the key is absent.
    start_index = xmlfile_lower.find(key.lower())
    if start_index == -1:
        continue  # nothing to show for this key

    # Show up to 100 characters on each side of the match. The start is
    # clamped at 0 because a negative slice start would wrap around to the
    # end of the text.
    print(xmlfile[max(0, start_index - 100):start_index + 100])
    print(f"Found '{key}' at index {start_index}")






In [43]:
# Display the [part index, doc-number list] pairs collected by the keyword scan.
experiments

[[15, ['20230000016']],
 [21, ['20230000022']],
 [31, ['20230000032']],
 [32, ['20230000033']],
 [33, ['20230000034']],
 [36, ['20230000036']],
 [37, ['20230000037']],
 [38, ['20230000038']],
 [39, ['20230000039']],
 [40, ['20230000040']],
 [41, ['20230000041']],
 [42, ['20230000042']],
 [43, ['20230000043']],
 [44, ['20230000044']],
 [45, ['20230000045']],
 [46, ['20230000046']],
 [53, ['20230000053']],
 [54, ['20230000054']],
 [65, ['20230000064']],
 [67, ['20230000065']],
 [72, ['20230000070']],
 [73, ['20230000071']],
 [75, ['20230000072']],
 [76, ['20230000073']],
 [78, ['20230000075']],
 [79, ['20230000076']],
 [80, ['20230000077']],
 [83, ['20230000080']],
 [84, ['20230000081']],
 [87, ['20230000084']],
 [90, ['20230000087']],
 [91, ['20230000088']],
 [92, ['20230000089']],
 [100, ['20230000097']],
 [101, ['20230000098']],
 [104, ['20230000101']],
 [108, ['20230000105']],
 [112, ['20230000109']],
 [113, ['20230000110']],
 [118, ['20230000115']],
 [122, ['20230000119']],
 [126, [

In [36]:
# Parse one split document (xml_parts[3]) and serialize it back to text so it
# can be searched. NOTE: xml_parts, parser and experiment_keys are defined in
# other cells and must already exist.
root = etree.fromstring(xml_parts[3].encode(), parser)
xmlfile = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()

# Match case-insensitively throughout: the text is lower-cased once and both
# the presence check and the position lookup use that same copy.
xmlfile_lower = xmlfile.lower()
for key in experiment_keys:
    # find() gives the first occurrence, or -1 when the key is absent.
    start_index = xmlfile_lower.find(key.lower())
    if start_index == -1:
        continue  # key does not occur in this document

    # Show up to 200 characters on each side of the match. The start is
    # clamped at 0 because a negative slice start would wrap around to the
    # end of the text.
    print(xmlfile[max(0, start_index - 200):start_index + 200])
    print(f"Found '{key}' at index {start_index}")


typically recommend 10-20 sample points for every forty acres of field and they recommend creating a reference map to record the location and quantities of field samples in order to properly interpret test results. Something that used to be done manually can now be done by the GURU [[<b>202</b> with better tracking, sampling and mapping. Testing is also performed on-site using the onboard software
Found ' test ' at index 150307


In [42]:
# Load one extracted bulk file and split it on the XML declaration, which is
# assumed to introduce each individual document inside the file. Doing this
# here lets the cell run on a fresh kernel instead of relying on a leftover
# `xml_parts` variable.
with open(os.path.join(unzip_path, "ipa230105.xml"), "r", encoding="utf-8") as file:
    content = file.read()
xml_parts = content.split('<?xml version="1.0" encoding="UTF-8"?>')

# Broad vocabulary of experiment-related terms (not used by the scan below).
experiment_keywords = [
    "experiment", "test", "trial", "study", "methodology", "analysis", "investigation", 
    "evaluation", "testing", "approach", "experimentation", "research", "simulation", 
    "assessment", "protocol", "setup", "fieldwork", "observation", "survey", 
    "exploration", "examination", "implementation", "clinical trial", "experiment design", 
    "control study", "laboratory work", "data collection", "benchmarking"
]
# Space-padded variants (not used by the scan below either).
experiment_keys = [" experiment ", " test ", " trial "]

# Collected results: one [part index, list of doc-number strings] per match.
experiments = []
parser = etree.XMLParser(recover=True)
# xml_parts[0] is whatever precedes the first declaration, so it is skipped.
# Consequently `i` counts within xml_parts[1:], i.e. it refers to
# xml_parts[i + 1].
for i, xml_part in enumerate(xml_parts[1:]):
    root = etree.fromstring(xml_part.encode(), parser)  # Parse the XML part
    if root is None:
        # With recover=True lxml may hand back no root element for a part it
        # cannot parse; skip it rather than crash in tostring().
        continue
    xmlfile = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()

    # Case-insensitive substring test over the whole serialized document.
    if "experiment" in xmlfile.lower():
        doc_num = root.xpath("//publication-reference//document-id//doc-number/text()")
        experiments.append([i, doc_num])
        print(f"XML part {i} contains experiment")



XML part 15 contains experiment
XML part 21 contains experiment
XML part 31 contains experiment
XML part 32 contains experiment
XML part 33 contains experiment
XML part 36 contains experiment
XML part 37 contains experiment
XML part 38 contains experiment
XML part 39 contains experiment
XML part 40 contains experiment
XML part 41 contains experiment
XML part 42 contains experiment
XML part 43 contains experiment
XML part 44 contains experiment
XML part 45 contains experiment
XML part 46 contains experiment
XML part 53 contains experiment
XML part 54 contains experiment
XML part 65 contains experiment
XML part 67 contains experiment
XML part 72 contains experiment
XML part 73 contains experiment
XML part 75 contains experiment
XML part 76 contains experiment
XML part 78 contains experiment
XML part 79 contains experiment
XML part 80 contains experiment
XML part 83 contains experiment
XML part 84 contains experiment
XML part 87 contains experiment
XML part 90 contains experiment
XML part

KeyboardInterrupt: 

In [4]:
# Pretty-print the serialized document text currently held in `xmlfile`
# (assigned by the parsing cells). `pprint` is imported in the notebook's
# import cell so that all dependencies are declared in one place.
pprint.pprint(xmlfile)

('<us-patent-application lang="EN" dtd-version="v4.6 2022-02-17" '
 'file="US20230000004A1-20230105.XML" status="PRODUCTION" '
 'id="us-patent-application" country="US" date-produced="20221220" '
 'date-publ="20230105">\n'
 '<us-bibliographic-data-application lang="EN" country="US">\n'
 '<publication-reference>\n'
 '<document-id>\n'
 '<country>US</country>\n'
 '<doc-number>20230000004</doc-number>\n'
 '<kind>A1</kind>\n'
 '<date>20230105</date>\n'
 '</document-id>\n'
 '</publication-reference>\n'
 '<application-reference appl-type="utility">\n'
 '<document-id>\n'
 '<country>US</country>\n'
 '<doc-number>17364347</doc-number>\n'
 '<date>20210630</date>\n'
 '</document-id>\n'
 '</application-reference>\n'
 '<us-application-series-code>17</us-application-series-code>\n'
 '<classifications-ipcr>\n'
 '<classification-ipcr>\n'
 '<ipc-version-indicator><date>20060101</date></ipc-version-indicator>\n'
 '<classification-level>A</classification-level>\n'
 '<section>A</section>\n'
 '<class>01</cl