In [None]:
from transformers import pipeline
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import pandas as pd


In [None]:
# Mount Google Drive so that the /content/drive/MyDrive/... paths used by the
# cells below are available inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Title and Abstract Processing

In [None]:
# Embedding model: a "feature-extraction" pipeline over the
# anferico/bert-for-patents checkpoint. The encoding cells call it on a list of
# texts and average its output over axis 0 to get one vector per text.
# NOTE(review): device = 0 presumably selects the first GPU, so this cell
# needs a GPU runtime — confirm.
feature_extracter = pipeline("feature-extraction", model="anferico/bert-for-patents", device = 0)

# Summarisation model; the full-text encoding cell passes it to summarize() to
# shorten the description texts before they are embedded.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device = 0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/327 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/329k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# Directory with the title+abstract texts (read_in_texts looks for "<id>.txt" files in it)
dir_titleabstract = "/content/drive/MyDrive/titleabstracttexts1"
# Directory with the full description texts (same "<id>.txt" layout)
dir_fulltext = "/content/drive/MyDrive/descriptiontexts1"
# Load all unique patents: one id per line, kept as strings in file order.
# The loader functions below iterate over this module-level list.
with open("/content/drive/MyDrive/app_ref_doc_numbers1.txt", "r") as f:
  unique_patents = f.read().splitlines()
# Prints the complete id list (can be long for large id files).
print(unique_patents)

In [None]:
def read_in_texts(directory, ids):
  """Read one text file per patent id and return the texts in the order of ``ids``.

  Each id is converted with ``int()`` and mapped to ``<directory>/<int(id)>.txt``,
  so an id such as "007" resolves to "7.txt".

  Args:
    directory: Folder that contains the "<id>.txt" files.
    ids: Iterable of ids (strings or ints) that ``int()`` can convert.

  Returns:
    A list of strings, one per id, with every newline character removed.

  Raises:
    ValueError: If an id cannot be converted with ``int()``.
    OSError: If a file cannot be opened (e.g. FileNotFoundError).
  """
  texts = []
  # `patent_id` instead of `id` so the builtin id() is not shadowed.
  for patent_id in ids:
    path = os.path.join(directory, f"{int(patent_id)}.txt")
    # Explicit encoding so the decoded text does not depend on the locale.
    with open(path, "r", encoding="utf-8") as reader:
      # NOTE(review): newlines are deleted, not replaced by a space, so words
      # separated only by a line break are joined — confirm this is intended.
      texts.append(reader.read().replace("\n", ""))
  return texts

In [None]:
def titleabstractloader(batch_size):
  """Yield ``(ids, texts)`` batches of title+abstract texts for all ids in ``unique_patents``.

  The module-level list ``unique_patents`` is walked in order in consecutive
  slices of ``batch_size`` ids; the texts are read from ``dir_titleabstract``
  with ``read_in_texts``. The last batch may be shorter. No empty batch is
  produced (the previous ``round(n / batch_size + 0.5)`` batch count rounded
  x.5 to the nearest even number and so emitted an extra empty batch for
  e.g. 10 or 30 ids with a batch size of 10).

  Args:
    batch_size: Positive number of ids per batch.

  Yields:
    Tuples ``(ids, texts)``: a list of ids and the list of their texts.

  Raises:
    ValueError: On first iteration, if ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")
  for start in range(0, len(unique_patents), batch_size):
    ids = list(unique_patents[start:start + batch_size])
    texts = read_in_texts(dir_titleabstract, ids)
    yield ids, texts

In [None]:
def fulltextloader(batch_size):
  """Yield ``(ids, texts)`` batches of full description texts for all ids in ``unique_patents``.

  The module-level list ``unique_patents`` is walked in order in consecutive
  slices of ``batch_size`` ids; the texts are read from ``dir_fulltext`` with
  ``read_in_texts``. The last batch may be shorter. No empty batch is produced
  (the previous ``round(n / batch_size + 0.5)`` batch count rounded x.5 to the
  nearest even number and so emitted an extra empty batch for e.g. 10 or 30
  ids with a batch size of 10).

  Args:
    batch_size: Positive number of ids per batch.

  Yields:
    Tuples ``(ids, texts)``: a list of ids and the list of their texts.

  Raises:
    ValueError: On first iteration, if ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")
  for start in range(0, len(unique_patents), batch_size):
    ids = list(unique_patents[start:start + batch_size])
    texts = read_in_texts(dir_fulltext, ids)
    yield ids, texts

In [None]:
# Create the output folders for the per-batch id files and embedding files.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError on
# every run after the first.
os.makedirs("/content/drive/MyDrive/batched_indices", exist_ok=True)
os.makedirs("/content/drive/MyDrive/batched_titleabstracts", exist_ok=True)

In [None]:
# Encode the title+abstract texts batch by batch and save each batch's ids and
# embeddings.
# NOTE: truncate_to_512_tokens is defined further down in this notebook, so its
# cell has to be run before this one.
# Outputs go to the folders that the folder-creation cell above makes and that
# the "Reconstruct ..." cells below read from (this loop previously wrote to
# "batched_indices1" / "batched_titleabstracts1", which nothing creates or reads).
titleabstract_indices_dir = "/content/drive/MyDrive/batched_indices"
titleabstract_embeddings_dir = "/content/drive/MyDrive/batched_titleabstracts"
os.makedirs(titleabstract_indices_dir, exist_ok=True)
os.makedirs(titleabstract_embeddings_dir, exist_ok=True)

for batch_ind, textinfo in enumerate(titleabstractloader(10)):
  id_set, texts = textinfo
  if not id_set:
    # Skip an empty batch: there is nothing to encode or save.
    continue
  print(id_set)
  texts = [truncate_to_512_tokens(text) for text in texts]

  # Extract features for each text
  features = feature_extracter(texts)
  # Drop the leading length-1 axis of each result (squeeze raises if it is not 1).
  # Assumes each result is (1, tokens, hidden) — TODO confirm.
  squeezed_features = [np.squeeze(np.array(feats), axis=0) for feats in features]

  # Aggregate token embeddings into one vector per text (mean over axis 0)
  aggregated_features = [np.mean(np.array(feats), axis=0) for feats in squeezed_features]

  # Save indices, one id per line
  with open(os.path.join(titleabstract_indices_dir, f"batch_{batch_ind}.txt"), "w") as f:
    f.write("\n".join(id_set))
  print(f"Batch {batch_ind} indices saved!")

  # Save embedded title+abstracts
  directory_embedding = os.path.join(titleabstract_embeddings_dir, f"batch_{batch_ind}.npy")
  np.save(directory_embedding, aggregated_features)
  print(f"Batch {batch_ind} embeddings saved!")




In [None]:
# Reconstruct array of all embeddings
directory_base = "/content/drive/MyDrive/batched_titleabstracts"
# os.listdir returns names in arbitrary order. Load the "batch_<n>.npy" files in
# ascending <n> so the row order is deterministic; the id list rebuilt from the
# index files has to follow the same order for rows and ids to line up.
batch_files = sorted(
    (name for name in os.listdir(directory_base) if name.startswith("batch_") and name.endswith(".npy")),
    key=lambda name: int(name[len("batch_"):-len(".npy")]),
)
all_embeddings = []
for batch in batch_files:
  batch_filename = os.path.join(directory_base, batch)
  batch_embeddings = np.load(batch_filename)
  all_embeddings.append(batch_embeddings)
# Stack the per-batch arrays row-wise into one matrix
combined_embeddings = np.vstack(all_embeddings)
print(combined_embeddings)

In [None]:
# Reconstruct list of all indices
directory_base = "/content/drive/MyDrive/batched_indices"
# os.listdir returns names in arbitrary order. Read the "batch_<n>.txt" files in
# ascending <n> so the id order is deterministic; the embedding matrix has to be
# rebuilt in the same batch order for ids and rows to line up.
batch_files = sorted(
    (name for name in os.listdir(directory_base) if name.startswith("batch_") and name.endswith(".txt")),
    key=lambda name: int(name[len("batch_"):-len(".txt")]),
)
all_indices = []
for batch in batch_files:
  batch_filename = os.path.join(directory_base,batch)
  with open(batch_filename, "r") as index_reader:
    # One id per line
    all_indices.extend(index_reader.read().splitlines())
print(all_indices)

In [None]:
# Load the four saved .npy arrays from the analysis folder into
# indices, fulltext, fulltext_trunc and titleabstracts (in that order).
# The first file name is spelled "indicies.npy"; it is the on-disk name this
# cell reads and is kept verbatim.
# NOTE(review): presumably the ids plus the three embedding matrices — confirm.
directory_base_analysis = "/content/drive/MyDrive/all"
indices, fulltext, fulltext_trunc, titleabstracts = (
  np.load(os.path.join(directory_base_analysis, array_file))
  for array_file in ("indicies.npy", "fulltext.npy", "fulltext_trunc.npy", "titleabstracts.npy")
)

In [None]:
def top_k_similar_patent_indices(patent_index, all_patent_indices, all_patent_embeddings, k):
  """Return the ids of the ``k`` patents most cosine-similar to ``patent_index``.

  Args:
    patent_index: Id of the query patent; must occur in ``all_patent_indices``.
    all_patent_indices: Sequence of ids; element ``i`` labels row ``i`` of
      ``all_patent_embeddings``.
    all_patent_embeddings: 2-D array-like with one embedding per row.
    k: Number of neighbours wanted. If fewer than ``k`` other patents exist,
      all of them are returned.

  Returns:
    A list of ids ordered from most to least similar. The query patent itself
    is never included.

  Raises:
    ValueError: If ``patent_index`` is not in ``all_patent_indices``
      (previously a bare ``except`` hid this and the function later failed
      with an unrelated UnboundLocalError).
  """
  try:
    # list(...) so that NumPy arrays of ids work as well as plain lists.
    ind = list(all_patent_indices).index(patent_index)
  except ValueError:
    raise ValueError(f"patent embedding does not exist for index {patent_index!r}") from None

  embeddings = np.asarray(all_patent_embeddings)
  query = embeddings[ind]

  query_normalized = query / np.linalg.norm(query)
  embeddings_normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

  # Compute cosine similarities (a fresh array, so the input is not modified)
  similarities = np.dot(embeddings_normalized, query_normalized)

  # Exclude the query explicitly instead of assuming it is ranked first:
  # with duplicate embeddings a twin could take the first place.
  similarities[ind] = -np.inf

  # Never ask for more neighbours than there are other patents; this also
  # keeps the argpartition pivot inside the array bounds.
  n_neighbours = min(k, len(similarities) - 1)
  if n_neighbours <= 0:
    return []

  # Use argpartition to find the top candidates without a full sort
  # (negate for descending order), then sort only those candidates.
  top_k_indices = np.argpartition(-similarities, n_neighbours - 1)[:n_neighbours]
  top_k_indices = top_k_indices[np.argsort(-similarities[top_k_indices])]
  return [all_patent_indices[i] for i in top_k_indices]



In [None]:
def top_all_indices(patent_index, all_patent_indices, all_patent_embeddings):
    """Return every patent id ordered by descending cosine similarity to ``patent_index``.

    Args:
        patent_index: Id of the query patent; must occur in ``all_patent_indices``.
        all_patent_indices: 1-D array-like of ids; element ``i`` labels row ``i``
            of ``all_patent_embeddings``.
        all_patent_embeddings: 2-D array-like with one embedding per row.

    Returns:
        A 1-D NumPy array containing all ids (the query included), most similar
        first, with the dtype of ``all_patent_indices`` preserved.

    Raises:
        ValueError: If ``patent_index`` is not in ``all_patent_indices``.
    """
    patent_ids = np.asarray(all_patent_indices)
    embeddings = np.asarray(all_patent_embeddings)

    matches = np.where(patent_ids == patent_index)[0]
    if matches.size == 0:
        raise ValueError(f"patent embedding does not exist for index {patent_index!r}")
    # Use a single row as the query; if the id occurs more than once the
    # first occurrence is taken.
    query = embeddings[matches[0]]

    # Normalize query and candidates for cosine similarity
    query_norm = query / np.linalg.norm(query)
    embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Compute cosine similarity
    similarities = np.dot(embeddings_norm, query_norm)

    # Sort on the numeric similarities and use that order to index the ids.
    # Stacking similarities and ids into one array (as before) coerced both
    # columns to one dtype: string ids made the sort lexicographic, numeric
    # ids were returned as floats.
    order = np.argsort(similarities)[::-1]
    return patent_ids[order]

In [None]:
def ind_to_text(index, titleabstract = True):
  """Return the raw contents of the title+abstract file "<index>.txt".

  Args:
    index: Patent id; inserted into the file name as-is.
    titleabstract: Accepted but not used — the title+abstract folder is read
      whatever value is passed.

  Returns:
    The file contents as a single string.
  """
  text_path = os.path.join("/content/drive/MyDrive/titleabstracttexts1", f"{index}.txt")
  with open(text_path) as handle:
    contents = handle.read()
  return contents

In [None]:
# Demo: print the title+abstract of one query patent, followed by the
# title+abstract of its k nearest neighbours in the title+abstract embeddings.
k = 5
print("PATENT TO MATCH:")
print("\n")
print(ind_to_text('18363819'),f"\n\nTOP {k} MOST SIMILAR")
print("\n")
# all_indices and combined_embeddings come from the two "Reconstruct ..." cells above.
for ind in top_k_similar_patent_indices('18363819', all_indices, combined_embeddings, k):
  print(ind_to_text(ind))

PATENT TO MATCH:


METHOD OF MANUFACTURING SEMICONDUCTOR STRUCTURE, SEMICONDUCTOR STRUCTURE, AND MEMORY 
The present disclosure provides a method of manufacturing a semiconductor structure, a semiconductor structure, and a memory. The semiconductor structure includes a base. The base includes columnar basal bodies and an isolation layer filled around the columnar basal bodies. Word line trenches are provided in the base and extend along a direction parallel to a surface of the base. First trench portions are formed at parts of the word line trenches intersecting with the columnar basal bodies, and a first word line conductive layer, a second word line conductive layer, and an insulating layer are sequentially arranged in the first trench portions from bottom to top. Second trench portions are formed at parts of the word line trenches intersecting with the isolation layer, and the second word line conductive layer and the insulating layer are sequentially arranged in the second trench p

# Detailed text processing

In [None]:
# Tokenizer for the same anferico/bert-for-patents checkpoint that the
# feature-extraction pipeline uses; truncate_to_512_tokens() relies on this
# module-level object.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("anferico/bert-for-patents")

In [None]:
def truncate_to_512_tokens(text, max_tokens=450):
    """Cut ``text`` to at most ``max_tokens`` tokens of the module-level ``tokenizer``.

    Despite the function name, the default cut-off is 450 tokens, not 512.
    NOTE(review): the margin presumably leaves room for special tokens and for
    a different token count when the returned string is tokenized again —
    confirm before raising it.

    Args:
        text: Input string.
        max_tokens: Maximum number of tokens to keep (default 450, the value
            that was previously hard-coded).

    Returns:
        The string rebuilt from the first ``max_tokens`` tokens with
        ``tokenizer.convert_tokens_to_string``.
    """
    # Tokenize the input
    tokens = tokenizer.tokenize(text)

    # Keep only the leading max_tokens tokens
    truncated_tokens = tokens[:max_tokens]

    # Convert back to string
    truncated_text = tokenizer.convert_tokens_to_string(truncated_tokens)
    return truncated_text

In [None]:
def summarize(text, summary_model, chunk_size = 1000, summary_length = 100, max_chunks = 5):
  """Summarize a long text chunk by chunk and join the chunk summaries.

  The text is split on whitespace into chunks of ``chunk_size`` words. Only the
  first ``max_chunks`` chunks are considered. Chunks of 20 words or fewer are
  skipped, chunks of 21-99 words get a short summary (15-20 tokens), and
  longer chunks get a summary of ``summary_length // 2`` to ``summary_length``
  tokens.

  Args:
    text: Text to summarize.
    summary_model: Callable used for summarization. It is called as
      ``summary_model(chunk, max_length=..., min_length=..., truncation=True)``
      and must return a list whose first item has a 'summary_text' key.
      (Previously this argument was ignored and the global ``summarizer`` was
      always used.)
    chunk_size: Number of whitespace-separated words per chunk.
    summary_length: ``max_length`` passed to the model for regular chunks.
    max_chunks: Number of leading chunks to summarize (default 5, the value
      that was previously hard-coded).

  Returns:
    The chunk summaries joined by single spaces ('' if nothing was summarized).
  """
  # Chunk sizes are counted in whitespace-separated words, not model tokens.
  words = text.split()

  # Split the text into chunks of chunk_size words
  chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

  # Summarize each of the leading chunks
  summaries = []
  for chunk in chunks[:max_chunks]:
    n_words = len(chunk.split())
    if n_words <= 20:
      # Too short to be worth summarizing
      continue
    if n_words < 100:
      summary = summary_model(chunk, max_length=20, min_length=15, truncation=True)
    else:
      summary = summary_model(chunk, max_length=summary_length, min_length=summary_length//2, truncation=True)
    summaries.append(summary[0]['summary_text'])  # Extract summary text

  # Concatenate the summaries
  return ' '.join(summaries)


In [None]:
# Encode the description texts batch by batch and save them:
# summarize each text, cut the summary with truncate_to_512_tokens, embed it.
# Outputs go to the folders the full-text "Reconstruct ..." cells below read
# from (this loop previously wrote to "batched_indices_fulltext1" /
# "batched_titleabstracts_fulltext1", which no cell creates or reads).
fulltext_indices_dir = "/content/drive/MyDrive/batched_indices_fulltext"
fulltext_embeddings_dir = "/content/drive/MyDrive/batched_titleabstracts_fulltext"
os.makedirs(fulltext_indices_dir, exist_ok=True)
os.makedirs(fulltext_embeddings_dir, exist_ok=True)

for batch_ind, textinfo in enumerate(fulltextloader(10)):
  print(f"Processing batch {batch_ind}")
  id_set, texts = textinfo
  if not id_set:
    # Skip an empty batch: there is nothing to encode or save.
    continue

  new_texts = [summarize(text, summarizer) for text in texts]
  new_texts_shortened = [truncate_to_512_tokens(text) for text in new_texts]
  print(f"batch {batch_ind} has been shortened")

  # Extract features for each text
  features = feature_extracter(new_texts_shortened)
  # Drop the leading length-1 axis of each result (squeeze raises if it is not 1).
  # Assumes each result is (1, tokens, hidden) — TODO confirm.
  squeezed_features = [np.squeeze(np.array(feats), axis=0) for feats in features]

  # Aggregate token embeddings into one vector per text (mean over axis 0)
  aggregated_features = [np.mean(np.array(feats), axis=0) for feats in squeezed_features]

  # Save indices, one id per line
  with open(os.path.join(fulltext_indices_dir, f"batch_{batch_ind}.txt"), "w") as f:
    f.write("\n".join(id_set))
  print(f"Batch {batch_ind} fulltext indices saved!")

  # Save embedded description summaries
  directory_embedding = os.path.join(fulltext_embeddings_dir, f"batch_{batch_ind}.npy")
  np.save(directory_embedding, aggregated_features)
  print(f"Batch {batch_ind} fulltext embeddings saved!")

In [None]:
# Reconstruct array of all full-text embeddings
directory_base = "/content/drive/MyDrive/batched_titleabstracts_fulltext"
# os.listdir returns names in arbitrary order. Load the "batch_<n>.npy" files in
# ascending <n> so the row order is deterministic; the id list rebuilt from the
# index files has to follow the same order for rows and ids to line up.
batch_files = sorted(
    (name for name in os.listdir(directory_base) if name.startswith("batch_") and name.endswith(".npy")),
    key=lambda name: int(name[len("batch_"):-len(".npy")]),
)
all_embeddings_fulltext = []
for batch in batch_files:
  batch_filename = os.path.join(directory_base, batch)
  batch_embeddings = np.load(batch_filename)
  all_embeddings_fulltext.append(batch_embeddings)
# Stack the per-batch arrays row-wise into one matrix
combined_embeddings_fulltext = np.vstack(all_embeddings_fulltext)
print(combined_embeddings_fulltext.shape)


(40, 1024)


In [None]:
# Reconstruct list of all full-text indices
directory_base = "/content/drive/MyDrive/batched_indices_fulltext"
# os.listdir returns names in arbitrary order. Read the "batch_<n>.txt" files in
# ascending <n> so the id order is deterministic; the embedding matrix has to be
# rebuilt in the same batch order for ids and rows to line up.
batch_files = sorted(
    (name for name in os.listdir(directory_base) if name.startswith("batch_") and name.endswith(".txt")),
    key=lambda name: int(name[len("batch_"):-len(".txt")]),
)
all_indices_fulltext = []
for batch in batch_files:
  batch_filename = os.path.join(directory_base,batch)
  with open(batch_filename, "r") as index_reader:
    # One id per line
    all_indices_fulltext.extend(index_reader.read().splitlines())
print(all_indices_fulltext)

In [None]:
# Print the title+abstract of the k nearest neighbours of the query patent.
# NOTE(review): this cell sits in the full-text section but passes all_indices /
# combined_embeddings (the title+abstract arrays), not all_indices_fulltext /
# combined_embeddings_fulltext — confirm which pair is intended.
k = 5
for ind in top_k_similar_patent_indices('18363819', all_indices, combined_embeddings, k):
  print(ind_to_text(ind))

BATTERY, POWER CONSUMING APPARATUS, AND METHOD AND APPARATUS FOR MANUFACTURING BATTERY 
Provided are a battery, comprising: a battery cell group comprising N battery cell rows arranged in a first direction, battery cells in each battery cell row being arranged in a second direction, the first direction being perpendicular to the second direction; a signal transmission assembly provided on the first face of the battery cell group and comprising a busbar component and an insulation layer, the busbar component being configured to be electrically connected to the battery cells at holes of the insulation layer; and a cooling system provided between two adjacent battery cell rows in the N battery cell rows, with a blocking member being provided at an opening, facing the first face, of a gap between the two adjacent battery cell rows to block the opening to prevent condensed liquid generated by the cooling system from reaching the busbar component.

BATTERY UNIT, BATTERY, AND ELECTRIC APPARAT

In [None]:
def combine_titleabstract_fulltext(abstracttitles, fulltexts, ATtoFtRatio = 1):
  """Build joint embeddings from title+abstract and full-text embeddings.

  Every row of both inputs is scaled to unit L2 norm, the title+abstract rows
  are then multiplied by ``ATtoFtRatio``, and the two matrices are joined
  column-wise.

  Args:
    abstracttitles: 2-D array with one title+abstract embedding per row.
    fulltexts: 2-D array with one full-text embedding per row (same row count).
    ATtoFtRatio: Weight applied to the normalized title+abstract part.

  Returns:
    2-D array whose rows are ``[weighted title+abstract | full text]``.
  """
  def _unit_rows(matrix):
    # Divide each row by its own L2 norm
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / row_norms

  unit_abstracts = _unit_rows(abstracttitles)
  unit_fulltexts = _unit_rows(fulltexts)
  halves = (unit_abstracts * ATtoFtRatio, unit_fulltexts)
  return np.concatenate(halves, axis=1)

In [None]:
# Nearest neighbours of the query patent in the combined embedding space
# (normalized title+abstract vectors concatenated with normalized full-text vectors).
# NOTE(review): rows are labelled with all_indices, which assumes that
# combined_embeddings_fulltext has the same row order as combined_embeddings — confirm.
k = 5
ATFtcombined = combine_titleabstract_fulltext(combined_embeddings, combined_embeddings_fulltext)
for ind in top_k_similar_patent_indices('18363819', all_indices, ATFtcombined, k):
  print(ind_to_text(ind))

BATTERY, POWER CONSUMING APPARATUS, AND METHOD AND APPARATUS FOR MANUFACTURING BATTERY 
Provided are a battery, comprising: a battery cell group comprising N battery cell rows arranged in a first direction, battery cells in each battery cell row being arranged in a second direction, the first direction being perpendicular to the second direction; a signal transmission assembly provided on the first face of the battery cell group and comprising a busbar component and an insulation layer, the busbar component being configured to be electrically connected to the battery cells at holes of the insulation layer; and a cooling system provided between two adjacent battery cell rows in the N battery cell rows, with a blocking member being provided at an opening, facing the first face, of a gap between the two adjacent battery cell rows to block the opening to prevent condensed liquid generated by the cooling system from reaching the busbar component.

COATING APPARATUS AND COATING SYSTEM 
Provi

In [None]:
# Create the output folders for the truncated full-text run.
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError on
# every run after the first. The paths contain no placeholders, so plain
# strings replace the f-strings.
os.makedirs("/content/drive/MyDrive/batched_indices_fulltext1_trunc", exist_ok=True)
os.makedirs("/content/drive/MyDrive/batched_titleabstracts_fulltext1_trunc", exist_ok=True)

In [None]:
# Encode the description texts batch by batch and save them.
# Unlike the summarizing run, the raw description is only cut with
# truncate_to_512_tokens before it is embedded (no summarization step).
for batch_ind, textinfo in enumerate(fulltextloader(10)):
  print(f"Processing batch {batch_ind}")
  id_set, texts = textinfo
  if not id_set:
    # Skip an empty batch: there is nothing to encode or save.
    continue

  new_texts = texts
  new_texts_shortened = [truncate_to_512_tokens(text) for text in new_texts]
  print(f"batch {batch_ind} has been shortened")

  # Extract features for each text
  features = feature_extracter(new_texts_shortened)
  # Drop the leading length-1 axis of each result (squeeze raises if it is not 1).
  # Assumes each result is (1, tokens, hidden) — TODO confirm.
  squeezed_features = [np.squeeze(np.array(feats), axis=0) for feats in features]

  # Aggregate token embeddings into one vector per text (mean over axis 0)
  aggregated_features = [np.mean(np.array(feats), axis=0) for feats in squeezed_features]

  # Save indices, one id per line
  with open(f"/content/drive/MyDrive/batched_indices_fulltext1_trunc/batch_{batch_ind}.txt", "w") as f:
    f.write("\n".join(id_set))
  print(f"Batch {batch_ind} fulltext indices saved!")

  # Save embedded truncated descriptions
  directory_embedding = os.path.join("/content/drive/MyDrive/batched_titleabstracts_fulltext1_trunc", f"batch_{batch_ind}.npy")
  np.save(directory_embedding, aggregated_features)
  print(f"Batch {batch_ind} fulltext embeddings saved!")