In [2]:
import os
import pandas as pd

import json
import time
import logging
from serpapi import GoogleSearch
from dotenv import load_dotenv
from tqdm.auto import tqdm

In [3]:
# Load the variables declared in the .env file
load_dotenv()

# SerpAPI credential taken from the environment; None when it is not set
SERPAPI_KEY = os.environ.get("SERPAPI_KEY")


In [6]:
# Raw USPTO search-results export; later cells filter this frame
df = pd.read_csv(filepath_or_buffer="data/USPTO/SearchResults20250307122838.csv")

In [8]:
# Bare expression: renders the loaded frame as this cell's output
df

Unnamed: 0,+,Document ID,Source,Date Published,Family ID,Pages,Title,CPCI,CPCA,Inventor,Assignee,Application Number,Filing Date,XREF,Domestic Priority,Foreign Priority,IPC,Applicant Name,Patent Number,Relevancy
0,,US 20230355757 A1,US-PGPUB,2023-11-09,69147429,20,FORMULATIONS OF ANTI-PD1 ANTIBODIES,A61K9/0019;A61K47/22;C07K16/2818;A61K39/39591;...,C07K2317/24;C07K2317/94,SIGL; Rainer et al.,,17/786423,2020-12-18,,,2019-12-20,A61K39/395;C07K16/28,FORMYCON AG,20230355757,15.462541
1,,US 20210008135 A1,US-PGPUB,2021-01-14,62244544,73,BIOMARKERS FOR CANCER THERAPEUTICS,A61K35/763;G01N33/5743;A61P35/04;A61P35/00;C07...,G01N2800/52;A61K39/39541;A61K39/39541;A61K2300/00,Gansert; Jennifer Lorraine et al.,,16/499095,2018-04-27,,2017-04-28,,A61K35/763;A61P35/00,"MERCK SHARP & DOHME CORP.,AMGEN INC.",20210008135,15.459262
2,,US 20190160148 A1,US-PGPUB,2019-05-30,59021571,17,COMBINATION OF PEMBROLIZUMAB AND ABEMACICLIB F...,A61K38/1774;A61P35/00;A61K39/3955;C07K16/2827;...,C07K2317/76,BECKMANN; Richard Paul et al.,Eli Lilly and Company,16/301835,2017-05-19,,2016-05-23;2016-11-03,,A61K38/17;A61K31/506,"Eli Lilly and Company,Merck Sharp & Dohme Corp...",20190160148,15.449961
3,,US 20240287185 A1,US-PGPUB,2024-08-29,82361217,66,FORMULATIONS OF ANTI-PD1 ANTIBODIES,A61K47/22;A61K47/12;A61K39/39591;A61K47/183;A6...,C07K2317/76;C07K2317/24;C07K2317/94;A61K2039/505,Sigl; Rainer et al.,Formycon AG,18/573798,2022-06-22,,,2021-06-23;2021-10-05;2022-04-29,C07K16/28;A61K39/395,Formycon AG,20240287185,15.443056
4,,US 20210347889 A1,US-PGPUB,2021-11-11,70611084,78,DOSING REGIMEN OF ANTI-LAG3 ANTIBODY AND COMBI...,C07K16/2818;C07K16/2827;C07K16/2803;A61P35/00,C07K2317/56;A61K2039/507;A61K2039/505;A61K2039...,Abraham; Anson Kunjachan et al.,Merck Sharp & Dohme Corp.,17/289810,2019-11-04,,2018-11-05,,C07K16/28;A61P35/00,Merck Sharp & Dohme Corp.,20210347889,15.412694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,US 20230391769 A1,US-PGPUB,2023-12-07,77726431,189,SUBSTITUTED HETEROCYCLIC COMPOUNDS AND THERAPE...,C07D471/04;C07D401/06;A61P35/00;C07D401/14;C07...,,Schulze; Volker et al.,,18/018216,2021-07-27,,2021-04-28;2020-07-29,,C07D471/04;C07D401/06;C07F7/08;C07D401/14;C07D...,"Bayer Aktiengesellschaft,The Broad Institute, ...",20230391769,3.083757
9996,,US 20240301064 A1,US-PGPUB,2024-09-12,83049602,267,"PD-1 ANTIBODIES, POLYPEPTIDES AND USES THEREOF",C07K16/2803;A61P37/04;C07K16/2818;A61P37/06,C07K2317/55;C07K2317/56;C07K2317/522;C07K2317/...,VINEY; Joanne L. et al.,"PANDION OPERATIONS, INC.",18/547183,2022-02-23,,2021-02-23,,C07K16/28;A61P37/04;A61K39/00,"PANDION OPERATIONS, INC.",20240301064,3.083757
9997,4.0,US 20180200366 A1,US-PGPUB,2018-07-19,62019612,146,MULTIMERIC IL-15-BASED MOLECULES,A61K39/3955;C07K14/5443;C07K16/2803;A61K38/005...,C07K2317/622;C07K2319/035;C07K2319/30;A61K38/1...,Wong; Hing C. et al.,,15/789985,2017-10-21,,2017-06-01;2016-10-21,,A61K39/395;A61P35/00,"Altor Bioscience Corporation,NantCell, Inc.",20180200366,3.081094
9998,3.0,US 20200123252 A1,US-PGPUB,2020-04-23,66752156,119,BISPECIFIC ANTIBODIES AGAINST CEACAM5 AND CD47,C07K16/2803;C07K16/3007;A61P35/00;C07K16/2809;...,C07K2317/732;A61K2039/505;C07K2317/76;C07K2317...,BUATOIS; Vanessa et al.,LamKap Bio beta Ltd.,16/428539,2019-05-31,,,2018-06-03;2018-06-03;2018-06-03;2018-06-03;20...,C07K16/28;C12N15/85,LamKap Bio beta Ltd.,20200123252,3.081094


In [10]:
def filter_and_clean_data(df, company="Merck Sharp & Dohme Corp.", top_n=66, case=True):
    """
    Select the most relevant patents that mention one company.

    Keeps rows where either 'Applicant Name' or 'Assignee' contains
    `company` as a literal substring, sorts them by 'Relevancy' descending,
    keeps the first `top_n` rows, and removes every space from
    'Document ID'.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame that must include at least:
        - 'Applicant Name'
        - 'Assignee'
        - 'Relevancy'
        - 'Document ID'
    company : str, optional
        Text to look for in the two name columns. It is matched literally,
        not as a regular expression. Default is 'Merck Sharp & Dohme Corp.'.
    top_n : int, optional
        Maximum number of rows to keep after sorting. Default is 66.
    case : bool, optional
        If True (default) the match is case-sensitive. Pass False to also
        catch differently-cased spellings such as
        'MERCK SHARP & DOHME CORP.'.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the filtered, sorted, and cleaned rows. The
        input frame is not modified and the original index labels are kept.
    """

    def mentions_company(column):
        # regex=False is essential: the default treats the pattern as a
        # regular expression, where the '.' in 'Corp.' matches ANY character.
        # na=False makes missing names count as "no match".
        return df[column].str.contains(company, case=case, na=False, regex=False)

    is_match = mentions_company("Applicant Name") | mentions_company("Assignee")

    # Highest relevancy first, then cut to the requested size. The copy lets
    # the column assignment below work on an independent frame rather than a
    # slice of `df`.
    sample_df = (
        df[is_match]
        .sort_values(by="Relevancy", ascending=False)
        .head(top_n)
        .copy()
    )

    # Remove every space inside the ID (not just leading/trailing ones),
    # e.g. 'US 20190160148 A1' -> 'US20190160148A1'.
    sample_df["Document ID"] = (
        sample_df["Document ID"].str.replace(" ", "", regex=False)
    )

    return sample_df



In [12]:
# Reduce the full export to the top Merck-related rows with space-free IDs
result_df = filter_and_clean_data(df=df)

In [14]:
# Bare expression: renders the filtered frame as this cell's output
result_df

Unnamed: 0,+,Document ID,Source,Date Published,Family ID,Pages,Title,CPCI,CPCA,Inventor,Assignee,Application Number,Filing Date,XREF,Domestic Priority,Foreign Priority,IPC,Applicant Name,Patent Number,Relevancy
2,,US20190160148A1,US-PGPUB,2019-05-30,59021571,17,COMBINATION OF PEMBROLIZUMAB AND ABEMACICLIB F...,A61K38/1774;A61P35/00;A61K39/3955;C07K16/2827;...,C07K2317/76,BECKMANN; Richard Paul et al.,Eli Lilly and Company,16/301835,2017-05-19,,2016-05-23;2016-11-03,,A61K38/17;A61K31/506,"Eli Lilly and Company,Merck Sharp & Dohme Corp...",20190160148,15.449961
4,,US20210347889A1,US-PGPUB,2021-11-11,70611084,78,DOSING REGIMEN OF ANTI-LAG3 ANTIBODY AND COMBI...,C07K16/2818;C07K16/2827;C07K16/2803;A61P35/00,C07K2317/56;A61K2039/507;A61K2039/505;A61K2039...,Abraham; Anson Kunjachan et al.,Merck Sharp & Dohme Corp.,17/289810,2019-11-04,,2018-11-05,,C07K16/28;A61P35/00,Merck Sharp & Dohme Corp.,20210347889,15.412694
7,2.0,US20180333503A1,US-PGPUB,2018-11-22,64269828,43,ANTI-FOLR1 IMMUNOCONJUGATES AND ANTI-PD-1 ANTI...,A61K47/6849;A61K39/3955;C07K16/2827;A61K31/57;...,A61K2300/00;A61K2039/505;A61K2039/507;A61K2039...,RUIZ SOTO; Rodrigo R.,"ImmunoGen, Inc.,Merck Sharp & Dohme Corp.",15/979989,2018-05-15,,2017-05-16;2017-09-19;2018-03-23,,A61K47/68;A61K9/00,"ImmunoGen, Inc.,Merck Sharp & Dohme Corp.",20180333503,15.400988
16,,US20210403557A1,US-PGPUB,2021-12-30,70611082,59,DOSING REGIMEN OF ANTI-TIGIT ANTIBODY FOR TREA...,C07K16/2803;A61P35/00,A61K2039/54;A61K2039/545;A61K2039/55;Y02A50/30...,Cai; Mingmei et al.,Merck Sharp & Dohme Corp.,17/288641,2019-11-04,,2018-11-05,,C07K16/28;A61P35/00,"CAI; Mingmei,CHARTASH; Elliot K.,HEALY; Jane A...",20210403557,15.328548
21,1.0,US20170089914A1,US-PGPUB,2017-03-30,58408842,47,ANTI-PEMBROLIZUMAB ANTIBODIES,G01N33/57434;G01N33/57407;G01N33/6854;C07K16/4258,C07K2317/24;C07K2317/92;G01N2800/52,Loo; LiNa et al.,Merck Sharp & Dohme Corp.,15/274330,2016-09-23,,2015-09-25,,G01N33/68;C07K16/42,Merck Sharp & Dohme Corp.,20170089914,15.303405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4262,1.0,US20200361931A1,US-PGPUB,2020-11-19,66438946,31,PRMT5 INHIBITORS,A61P35/00;C07D471/04,,Machacek; Michelle R. et al.,Merck Sharp & Dohme Corp.,16/762201,2018-11-05,,2017-11-08,,C07D471/04,"MACHACEK; Michelle R.,WITTER; David J.,REUTERS...",20200361931,8.285498
5004,3.0,US20190241655A1,US-PGPUB,2019-08-08,62109704,63,ILT3 LIGAND,A61P35/00;G01N33/542;C07K16/2803;C07K16/38,C07K2317/76;A61K2039/505;C07K2317/32,Cua; Daniel et al.,Merck Sharp & Dohme Corp.,16/342582,2017-11-06,,2016-11-10,,C07K16/28;G01N33/542,"CUA; Daniel,CHERWINSKI; Holly,WANG; Adele,CHEN...",20190241655,7.531259
5006,,US20220267369A1,US-PGPUB,2022-08-25,69180557,48,METHODS OF SEPARATING HOST CELL LIPASES FROM A...,C07K16/065;B01D15/327;B01D15/362;B01D15/363;B0...,,Chmielowski; Rebecca A. et al.,Merck Sharp & Dohme Corp.,17/261365,2019-07-24,,2018-07-25,,C07K1/22;B01D15/16,Merck Sharp & Dohme Corp.,20220267369,7.531259
5037,1.0,US20200048258A1,US-PGPUB,2020-02-13,61905936,78,KDM5 INHIBITORS,C07D519/00;C07D487/04;A61P35/00,,MANSOOR; UMAR FARUK et al.,Merck Sharp & Dohme Corp.,16/341313,2017-10-06,,2016-10-12,,C07D487/04,"MANSOOR; UMAR Faruk,Fischer; Christian,Silipha...",20200048258,7.507926


In [16]:
# Error-only logger for the patent fetch; records are written to 'error_log.txt'.
logger = logging.getLogger("patent_fetch_logger")
logger.setLevel(logging.ERROR)

# getLogger returns the same object every time this cell is re-run, so detach
# AND close whatever handlers a previous run attached. Merely clearing the
# list would leave the earlier file handle open. Only this logger's own
# handlers are inspected (hasHandlers() would also consult ancestor loggers).
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()

# Append errors to the log file; the timestamp lets entries from separate
# runs be told apart, since the file is never truncated.
err_handler = logging.FileHandler("error_log.txt", mode="a", encoding="utf-8")
err_handler.setLevel(logging.ERROR)
err_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(err_handler)

In [18]:
def fetch_patent_details(patent_id, api_key):
    """
    Look up a single patent on Google Patents through SerpAPI.

    Parameters
    ----------
    patent_id : str
        Bare patent number such as "US11734097B1". It is sent to the API
        wrapped as "patent/<patent_id>/en".
    api_key : str
        SerpAPI API key.

    Returns
    -------
    dict
        The SerpAPI response, as returned by ``GoogleSearch.get_dict()``.
    """
    # Request for the patent-details engine, keyed by the wrapped identifier
    query = dict(
        engine="google_patents_details",
        patent_id="patent/{}/en".format(patent_id),
        api_key=api_key,
    )

    # Run the search and hand back the parsed response unchanged
    return GoogleSearch(query).get_dict()



In [20]:
def fetch_patents_in_bulk(
    patent_ids,
    output_file="patent_data.jsonl",
    max_retries=3,
    sleep_seconds=1
):
    """
    Fetch data for a list of patent IDs and append the results as JSON Lines.

    Each ID is looked up with `fetch_patent_details`, retried on failure,
    and written to `output_file` as one ``{"patent_id": ..., "data": ...}``
    record per line. Failures are logged through the module-level `logger`
    and never abort the run. Progress is shown with tqdm and a summary is
    printed at the end.

    Parameters
    ----------
    patent_ids : list of str
        A list of pure patent numbers, e.g. ["US11734097B1", "US7654321B2"].
    output_file : str, optional
        The file path for JSON Lines output. Default is "patent_data.jsonl".
        The file is opened in append mode, so existing records are kept and
        re-running with the same IDs adds duplicate lines. Missing parent
        directories are created.
    max_retries : int, optional
        How many attempts to make per patent. Default is 3.
    sleep_seconds : int, optional
        Delay (in seconds) between attempts. Default is 1.

    Returns
    -------
    None
        The results are written to a JSONL file, one record per line.

    Raises
    ------
    ValueError
        If the module-level SERPAPI_KEY is empty or unset.
    """

    if not SERPAPI_KEY:
        raise ValueError("No SERPAPI_KEY found. Please set it in .env or environment.")

    num_patents = len(patent_ids)
    start_time = time.time()
    total_success = 0
    total_fail = 0
    cumulative_time = 0.0

    # Make sure the destination folder exists; otherwise open() fails before
    # a single patent has been fetched.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Open the file in append mode. If you want to overwrite, use "w" instead of "a".
    with open(output_file, "a", encoding="utf-8") as f_out:
        # Use tqdm to track progress; `done` counts patents processed so far
        for done, patent_id in enumerate(
            tqdm(patent_ids, desc="Fetching Patent Data", total=num_patents),
            start=1,
        ):
            # Track the time for each iteration
            iteration_start = time.time()

            # Attempt up to max_retries
            success = False
            for attempt in range(max_retries):
                try:
                    response = fetch_patent_details(patent_id, SERPAPI_KEY)

                    # SerpAPI can report a failure as an "error" field in the
                    # JSON body instead of raising. Treat that as a failed
                    # attempt so it is logged and retried rather than being
                    # stored as if it were patent data.
                    if isinstance(response, dict) and response.get("error"):
                        raise RuntimeError(f"SerpAPI error: {response['error']}")

                    # Write the result to JSON Lines
                    record = {
                        "patent_id": patent_id,
                        "data": response
                    }
                    f_out.write(json.dumps(record) + "\n")

                    success = True
                    total_success += 1
                    break  # Break out of the retry loop if success

                except Exception as e:
                    # Log the error but do NOT interrupt the whole process
                    logger.error(
                        f"Error fetching data for {patent_id} (attempt {attempt+1}/{max_retries}): {e}"
                    )
                    # Pause only when another attempt will follow
                    if attempt + 1 < max_retries:
                        time.sleep(sleep_seconds)

            if not success:
                total_fail += 1

            iteration_time = time.time() - iteration_start
            cumulative_time += iteration_time

            # Report per-patent and running-average timing without breaking the bar
            avg_time = cumulative_time / done
            tqdm.write(f"Patent {patent_id} processed in {iteration_time:.2f}s; average: {avg_time:.2f}s")

    total_time = time.time() - start_time
    avg_time_overall = cumulative_time / num_patents if num_patents else 0.0

    print(f"\nAll done! Total: {num_patents} patents. Success: {total_success}, Fail: {total_fail}.")
    print(f"Total time: {total_time:.2f}s; average time per patent: {avg_time_overall:.2f}s.")

In [24]:
# Cleaned document IDs as a plain Python list, in result_df row order
patent_list = result_df.loc[:, "Document ID"].to_list()

In [26]:
# Fetch details for every ID in patent_list. The output file is opened in
# append mode, so running this cell again issues the requests again and
# adds the same records to the file a second time.
fetch_patents_in_bulk(
    patent_ids=patent_list,
    output_file="data/SerpAPI/patent_data.jsonl",
    max_retries=3,
    sleep_seconds=2
)

Fetching Patent Data:   0%|          | 0/66 [00:00<?, ?it/s]

Patent US20190160148A1 processed in 0.19s; average: 0.19s
Patent US20210347889A1 processed in 0.07s; average: 0.13s
Patent US20180333503A1 processed in 0.07s; average: 0.11s
Patent US20210403557A1 processed in 0.08s; average: 0.10s
Patent US20170089914A1 processed in 0.06s; average: 0.09s
Patent US20220380469A1 processed in 0.07s; average: 0.09s
Patent US20200147213A1 processed in 0.24s; average: 0.11s
Patent US20180327848A1 processed in 0.08s; average: 0.11s
Patent US20220409724A1 processed in 0.06s; average: 0.10s
Patent US20180148790A1 processed in 0.10s; average: 0.10s
Patent US20180237524A1 processed in 0.07s; average: 0.10s
Patent US20210317215A1 processed in 0.07s; average: 0.10s
Patent US20180044418A1 processed in 0.15s; average: 0.10s
Patent US20200115451A1 processed in 0.06s; average: 0.10s
Patent US20210317214A1 processed in 0.17s; average: 0.10s
Patent US20220112564A1 processed in 0.07s; average: 0.10s
Patent US20210380694A1 processed in 0.15s; average: 0.10s
Patent US20210