In [None]:
%%capture
!pip install keybert

In [None]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

import random
from keybert import KeyBERT

In [None]:
# Build the search phrase: wrap it in URL-encoded double quotes (%22) and encode the spaces.
start = 0
query_term = 'redox flow battery'
query_term = '%22' + query_term + '%22'
# str.replace returns a new string; the result must be assigned for the encoding to take effect.
query_term = query_term.replace(' ', '%20')

# Fetch the first page once: it supplies both the total hit count and the first batch of results,
# so the paging loop below does not need to request start=0 a second time.
bulk_data_api_url = f"https://developer.uspto.gov/ibd-api/v1/application/publications?searchText={query_term}&start={start}&largeTextSearchFlag=Y"
bulk_search = requests.get(bulk_data_api_url).json()

result_count = bulk_search['recordTotalQuantity']
l = []
while start < result_count:
  bulk_search_results = bulk_search['results']

  # Flatten each search hit into one record (one future DataFrame row).
  for result in bulk_search_results:
    d = {}
    d['App_Number'] = result['patentApplicationNumber']
    d['Base_Pub_Number'] = result['publicationDocumentIdentifier']
    d['Base_Abstract'] = result['abstractText'][0]

    # claimText may not be subscriptable (e.g. None) or may be empty; keep the raw value in that
    # case so the notnull() filter below can discard missing claims.
    try:
      d['Base_Claims'] = result['claimText'][0]
    except (TypeError, IndexError):
      d['Base_Claims'] = result['claimText']

    l.append(d)

  # NOTE(review): the step of 100 assumes the API returns 100 records per page when no `rows`
  # parameter is sent — confirm against the API documentation.
  start += 100
  if start < result_count:
    bulk_data_api_url = f"https://developer.uspto.gov/ibd-api/v1/application/publications?searchText={query_term}&start={start}&largeTextSearchFlag=Y"
    bulk_search = requests.get(bulk_data_api_url).json()

# One row per publication; rows without claims are dropped.
df = pd.DataFrame(l)
df = df[df['Base_Claims'].notnull()].reset_index(drop=True)
df.head()

In [None]:
# Text-to-text generation pipeline; used further down to pull a publication number out of
# office-action rejection text.
# NOTE(review): `pipeline` is not imported in any visible cell, so this line raises NameError on a
# fresh kernel. It is presumably `transformers.pipeline` — confirm and add the import to the
# imports cell.
text2text_generator = pipeline("text2text-generation")

COMMON_COUNTRY_CODES = ['US', 'JP', 'EP', 'WO', 'CN']
COMMON_KIND_CODES = ['A', 'A1', 'A2', 'B1', 'B2']
def clean_pub_num(n, debug=False):
  """Normalise free text into a compact publication-number string.

  '/', ',' and '-' are deleted, the text is split on single spaces, and only these tokens are
  kept and concatenated in order: known country codes, known kind codes, purely numeric tokens,
  and numeric tokens that end in a known kind code.

  Args:
    n: text to clean.
    debug: when True, print the rejected candidate before returning NaN.

  Returns:
    The concatenated string when its length is between 7 and 15 characters inclusive,
    otherwise ``np.nan``.
  """
  without_separators = n.translate(str.maketrans('', '', '/,-'))

  def _is_number_part(token):
    # Stand-alone country code, kind code, or digit run.
    if token.isnumeric() or token in COMMON_COUNTRY_CODES or token in COMMON_KIND_CODES:
      return True
    # Digit run with a one- or two-character kind code attached; the slice checks
    # short-circuit first, so empty tokens are never indexed.
    has_short_kind = token[:-1].isnumeric() and token[-1] in COMMON_KIND_CODES
    has_long_kind = token[:-2].isnumeric() and token[-2:] in COMMON_KIND_CODES
    return has_short_kind or has_long_kind

  candidate = ''.join(token for token in without_separators.split(' ') if _is_number_part(token))

  if 7 <= len(candidate) <= 15:
    return candidate

  if debug:
    print(candidate)
  return np.nan

def clean_whitespace(s):
  """Remove newlines from ``s`` and shrink double spaces.

  Three substitutions run in order, one pass each: two spaces followed by a newline are removed,
  remaining newlines are removed, and each pair of spaces becomes a single space.
  """
  substitutions = (
    ('  \n', ''),
    ('\n', ''),
    ('  ', ' '),
  )
  for old, new in substitutions:
    s = s.replace(old, new)
  return s

def is_valid_char(c):
  """Return True when the code point of ``c`` lies in the range 32..126 inclusive."""
  return 32 <= ord(c) <= 126

def scrape_google_patents(pub_num, debug=False):
  """Fetch the claims text for ``pub_num`` from Google Patents.

  The number is tried as given and then with each entry of COMMON_COUNTRY_CODES prepended.
  Candidates that answer with HTTP 404 are skipped; the first other response is parsed and ends
  the search, whether or not it contains a claims section.

  Args:
    pub_num: publication number string, with or without a country-code prefix.
    debug: when True, print ``pub_num`` if no claims could be retrieved.

  Returns:
    ``(resolved_number, claims_text)`` on success, where ``resolved_number`` is the candidate
    that was found and ``claims_text`` is the cleaned claims with everything up to the first ')'
    removed. ``(np.nan, np.nan)`` otherwise.
  """
  for country_code in [''] + COMMON_COUNTRY_CODES:
    temp_num = country_code + pub_num
    url = f"https://patents.google.com/patent/{temp_num}/en"
    page = requests.get(url)

    if page.status_code == 404:
      continue

    parser = BeautifulSoup(page.content, "html.parser")
    claims_section = parser.find('section', itemprop='claims')
    if claims_section is None:
      # The page has no claims section: give up on this number. An explicit check replaces the
      # former bare `except:`, which also hid unrelated errors and KeyboardInterrupt.
      break

    # Keep only characters accepted by is_valid_char, then tidy whitespace.
    claims = clean_whitespace(''.join(filter(is_valid_char, claims_section.text)))
    return temp_num, claims[claims.find(')')+1:].strip()

  if debug:
    print(pub_num)
  return np.nan, np.nan

In [None]:
oa_api_url = "https://developer.uspto.gov/ds-api/oa_actions/v1/records"
headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}

# For every base application, look up its office actions, extract the publication number cited in
# each section-102 rejection text, scrape that publication's claims and store them on the row.
for i, r in df.iterrows():
  # The first two characters of App_Number are dropped and the rest is sent as a quoted value.
  app_num = f"{r['App_Number'][2:]}"
  app_num = '"' + app_num + '"'
  data = {'criteria': f'patentApplicationNumber:{app_num}', 'start': '0', 'rows': '100'}
  oa_search = requests.post(oa_api_url, headers=headers, data=data).json()
  oa_response = oa_search['response']

  if oa_response['numFound'] > 0:
    j = 1          # suffix of the next Rej_* column pair to fill on this row
    rej_nums = []  # cleaned numbers already stored for this row, to avoid duplicates
    for oa in oa_response['docs']:
      # Skip documents where the field is missing, None or empty (an empty list used to raise
      # IndexError on the [0] below).
      rej_text = oa.get('sections.section102RejectionText')
      if not rej_text:
        continue

      # Only the first 250 characters of the first entry are given to the model.
      rej_text = rej_text[0][:250]
      # NOTE(review): the prompt contains the typo "anticpated"; it is left untouched here because
      # changing the prompt can change the model output.
      extract_num = text2text_generator(f"question: What is the anticpated patent's publication number with country and kind code? context: {rej_text}")[0]['generated_text']
      rej_num = clean_pub_num(extract_num, debug=False)
      if rej_num is not np.nan and rej_num not in rej_nums:
        scrape_res = scrape_google_patents(rej_num, debug=True)
        if scrape_res[0] is not np.nan:
          df.loc[i, f'Rej_Pub_Number_{j}'], df.loc[i, f'Rej_Claims_{j}'] = scrape_res
          j += 1
          rej_nums.append(rej_num)

In [None]:
# Number of comparison patents (rejection references plus keyword hits) wanted per row.
PATENT_COUNT_PER_ROW = 25

kw_model = KeyBERT()

# Columns holding scraped rejection claims. Matching on the prefix accepts suffixes of any length;
# the former `col[:-1] == 'Rej_Claims_'` test silently missed 'Rej_Claims_10' and above.
REJ_CLAIMS_PREFIX = 'Rej_Claims_'
rej_cols = [
  col for col in df.columns
  if col.startswith(REJ_CLAIMS_PREFIX) and len(col) > len(REJ_CLAIMS_PREFIX)
]

# Column pairs for the non-rejection patents: Pub_Number_1, Claims_1, ..., Pub_Number_25, Claims_25.
non_rej_cols = []
for i in range(1, PATENT_COUNT_PER_ROW+1):
  non_rej_cols.extend([f'Pub_Number_{i}', f'Claims_{i}'])

In [None]:
def get_tot_result_count(term):
  """Return the total number of records the USPTO publications search reports for ``term``.

  The term is lower-cased and sent as a quoted phrase; a single row is requested because only the
  ``recordTotalQuantity`` field of the response is read.

  Args:
    term: search phrase.

  Returns:
    The value of ``recordTotalQuantity`` from the JSON response.
  """
  query_term = '%22' + term.lower() + '%22'
  # str.replace returns a new string; the result must be assigned for the encoding to take effect.
  query_term = query_term.replace(' ', '%20')

  bulk_data_api_url = f"https://developer.uspto.gov/ibd-api/v1/application/publications?searchText={query_term}&rows=1&largeTextSearchFlag=Y"
  keyword_search = requests.get(bulk_data_api_url).json()
  tot_result_count = keyword_search['recordTotalQuantity']

  return tot_result_count

def allocate_term_results(top_terms, tgt_patent_count, debug=False):
  """Split ``tgt_patent_count`` result slots across ``top_terms``.

  Every term starts with an even share and the first term also takes the remainder. A term is
  kept only while its total hit count is strictly greater than twice its share. The share of a
  term that fails this test is moved onto a "master" term, searched for from the front of
  ``top_terms``, and the failing term is removed from the result.

  Args:
    top_terms: sequence of search terms, indexable by position.
    tgt_patent_count: total number of results to distribute.
    debug: when True, print the allocation state at every step.

  Returns:
    dict mapping term -> [total_result_count, allocated_result_count]. An empty dict is returned
    when there are no terms, when there is nothing to allocate, or when no term has enough hits
    to absorb a reallocated share.
  """
  # Nothing to distribute: avoids ZeroDivisionError on an empty term list and meaningless
  # zero/negative allocations (plus the API calls needed to compute them).
  if len(top_terms) == 0 or tgt_patent_count <= 0:
    return dict()

  even_num_results = tgt_patent_count // len(top_terms)
  term_result_alloc_dict = {term: [get_tot_result_count(term), even_num_results] for term in top_terms}
  term_result_alloc_dict[top_terms[0]][1] += tgt_patent_count % len(top_terms)

  master_key = 0     # index in top_terms of the term currently absorbing reallocated shares
  invalid_keys = []  # indices of terms whose share was moved away
  # The dict values are lists mutated in place, so later iterations see updated allocations.
  for i, (tot_result_count, num_results) in enumerate(term_result_alloc_dict.values()):
    if debug:
      print(tot_result_count, num_results)
      print(term_result_alloc_dict)

    if tot_result_count <= 2*num_results:
      temp_num_results = term_result_alloc_dict[top_terms[master_key]][1] + num_results
      temp_tot_result = term_result_alloc_dict[top_terms[master_key]][0]
      # Advance the master until a term can hold its own share plus the one being moved.
      while temp_tot_result <= 2*temp_num_results:
        master_key += 1
        if master_key == len(top_terms):
          # Return empty dict when not enough results found
          return dict()
        temp_num_results = term_result_alloc_dict[top_terms[master_key]][1] + num_results
        temp_tot_result = term_result_alloc_dict[top_terms[master_key]][0]

      term_result_alloc_dict[top_terms[master_key]][1] = temp_num_results
      invalid_keys.append(i)

  for key in invalid_keys:
    del term_result_alloc_dict[top_terms[key]]

  return term_result_alloc_dict

In [None]:
# For every base patent, fill the remaining slots (PATENT_COUNT_PER_ROW minus the rejection
# references already on the row) with patents found by searching its top abstract keywords.
for i, r in df.iterrows():
  top_terms = [term for term, score in kw_model.extract_keywords(r['Base_Abstract'], keyphrase_ngram_range=(1, 2))]
  rej_count = len(rej_cols) - r[rej_cols].isna().sum()
  tgt_patent_count = PATENT_COUNT_PER_ROW - rej_count

  # With no keywords there is nothing to allocate (and the allocation would divide by zero).
  term_result_alloc_dict = allocate_term_results(top_terms, tgt_patent_count) if top_terms else {}

  l = []  # flat list: [pub_num_1, claims_1, pub_num_2, claims_2, ...]
  for term, (tot_result_count, num_results) in term_result_alloc_dict.items():
    # A term with no allocation would never reach the `j == num_results` break below and would
    # add every hit it returns, so skip it.
    if num_results <= 0:
      continue

    term = '%22' + term.lower() + '%22'
    # str.replace returns a new string; the result must be assigned for the encoding to take effect.
    term = term.replace(' ', '%20')

    # Random offset into the hit list, leaving 2*num_results hits after it as head-room.
    start = random.randint(0, (tot_result_count - 2*num_results - 1))
    bulk_data_api_url = f"https://developer.uspto.gov/ibd-api/v1/application/publications?searchText={term}&start={start}&largeTextSearchFlag=Y"
    keyword_search = requests.get(bulk_data_api_url).json()

    keyword_search_results = keyword_search['results']
    j = 0
    for result in keyword_search_results:
      # Skip hits with a missing identifier or missing/empty claims.
      try:
        pub_num = result['publicationDocumentIdentifier']
        claims = result['claimText'][0]
      except (KeyError, TypeError, IndexError):
        continue

      # Skip the base patent itself and anything already collected for this row.
      if pub_num == r['Base_Pub_Number'] or pub_num in l:
        continue

      l.extend([pub_num, claims])
      j += 1
      if j == num_results:
        break

    if j != num_results:
      print(f"Retrieval error at row {i}")

  # Write the row only when every slot was filled; pad with NaN up to the fixed column count.
  # This check runs once per row, after all terms, so no term can append past the padding.
  if l and len(l) // 2 == tgt_patent_count:
    l.extend([np.nan] * ((PATENT_COUNT_PER_ROW * 2) - len(l)))
    df.loc[i, non_rej_cols] = l

  print(i)

In [None]:
# Remove leftover columns if they are present. None of them is created by the cells of this
# notebook, so without errors='ignore' this raises KeyError on a fresh Restart & Run All.
df = df.drop(columns=['Unnamed: 0', 'Rej_Pub_Number', 'Rej_Claims'], errors='ignore')

In [None]:
# Persist the assembled dataset as CSV in the working directory. The index is written too
# (pandas default), which is what shows up as an 'Unnamed: 0' column when the file is read back.
output_csv_path = 'dataset.csv'
df.to_csv(output_csv_path)