# Install

In [None]:
!pip install -q transformers

## Imports

In [None]:
import json
import zipfile
from pprint import pprint
import os

## Unzipping the zipped file

In [None]:
# Unpack every member of the uploaded assignment archive into the 'patents' folder
with zipfile.ZipFile(file='/content/patent_jsons_ML Assignment.zip', mode='r') as archive:
  archive.extractall(path='patents')

## Importing Sample Data

In [None]:
# Load one sample patent to inspect its structure.
# The context manager closes the file handle as soon as parsing is done, and the
# explicit UTF-8 encoding matches the way read_patent_jsons opens the same files
# (the abstracts contain Japanese text).
with open('/content/patents/patent_jsons/JP-H09311786-A.json', 'r', encoding='utf-8') as f:
    # json.load returns the JSON object as a dictionary
    data = json.load(f)

# Top-level fields available for a patent
data.keys()

dict_keys(['patent_number', 'publication_id', 'family_id', 'publication_date', 'titles', 'abstracts', 'claims', 'descriptions', 'inventors', 'assignees', 'ipc_classes', 'locarno_classes', 'ipcr_classes', 'national_classes', 'ecla_classes', 'cpc_classes', 'f_term_classes', 'legal_status', 'priority_date', 'application_date', 'family_members'])

In [None]:
# Show the sample patent's abstracts: a list of entries, each with a 'lang' code
# and the raw 'paragraph_markup' text.
data['abstracts']

[{'lang': 'JA',
  'paragraph_markup': '<abstract lang="JA" load-source="patent-office" mxw-id="PA58764623"><p>(57)【要約】\n【課題】  スーパスカラ方式におけるデータ・ハザード<br/>のよるパイプライン・ストールを削減し、処理速度の向<br/>上を実現することにある。\n【解決手段】  隣接する２つの２オペランド命令が、１<br/>つの３オペランド命令と同等であることを検出する回路<br/>と、そうであれば２つの命令を１つの３オペランド命令<br/>に統合して後続の実行ステージに送出する回路を命令デ<br/>コーダに設ける。また隣接する２つの命令がデータフロ<br/>ーの関係にあるが１つの３オペランド命令には統合でき<br/>ないことを検出すると、先行命令のソースデータを後続<br/>命令のための演算器に送る回路を設ける。\n【効果】  隣接命令間のデータフローにより従来であれ<br/>ば２クロックの時間を要していた２つの命令処理を１ク<br/>ロックで実行できる。したがって、全体としての実行ク<br/>ロック数を削減できる。\n</p></abstract>'},
 {'lang': 'EN',
  'paragraph_markup': '<abstract lang="EN" load-source="docdb" mxw-id="PA114921630" source="PAJ"><p>PROBLEM TO BE SOLVED: To reduce a pipeline stall due to a data hazard of a superscalar system and to improve the processing speed by changing an instruction in 1st instruction format stored in an instruction memory into an instruction in 2nd instruction format. SOLUTION: The instruction is taken in a 1st stage from the instruction m

In [None]:
# Pick the English abstract by its language tag rather than by list position,
# so the lookup does not depend on the order of the entries in 'abstracts'.
# Raises StopIteration if the patent has no abstract tagged 'EN'.
abstract_data = next(
    abstract['paragraph_markup']
    for abstract in data['abstracts']
    if abstract['lang'] == 'EN'
)

In [None]:
# Display the raw markup of the selected abstract
abstract_data

'<abstract lang="EN" load-source="docdb" mxw-id="PA114921630" source="PAJ"><p>PROBLEM TO BE SOLVED: To reduce a pipeline stall due to a data hazard of a superscalar system and to improve the processing speed by changing an instruction in 1st instruction format stored in an instruction memory into an instruction in 2nd instruction format. SOLUTION: The instruction is taken in a 1st stage from the instruction memory and the instruction taken in the 1st stage 101 is decoded in a 2nd stage 103. The decoded instruction is executed in a 3rd stage and when the execution result is written in a register in a 4th stage 107, the instruction in the 1st instruction format stored in the instruction memory is changed into the instruction in the 2nd instruction format and executed. Consequently, the pipeline stall due to the data hazard of the superscalar system can be reduced and the processing speed is improved.</p></abstract>'

# Checking whether each patent has an English-language abstract

In [None]:
def read_patent_jsons(directory):
    """Load every ``.json`` file found directly inside ``directory``.

    Files are read in sorted filename order so that the position of each
    patent in the returned list is the same on every run (``os.listdir``
    on its own returns entries in arbitrary order).

    Args:
        directory: Path of the folder that holds the patent JSON files.

    Returns:
        list: One parsed JSON object per ``.json`` file, in sorted filename order.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    patents = []

    # Sorted iteration keeps the result reproducible across runs and file systems
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory, filename)

        # Read the content of each JSON file; UTF-8 is required for non-ASCII text
        with open(file_path, "r", encoding="utf-8") as file:
            patents.append(json.load(file))

    print(f"total patent json files: {len(patents)}")

    return patents

# Load every extracted patent JSON file into memory (one parsed object per file)
patents = read_patent_jsons("/content/patents/patent_jsons")

total patent json files: 500


In [None]:
# Count the patents that carry at least one abstract tagged as English.
# `.get` is used so that a patent without an "abstracts" field, or an abstract
# entry without a 'lang' field, is simply not counted instead of raising.
# `any` stops at the first English abstract, so each patent is counted at most once.
english_abstract_patent_counts = sum(
    any(abstract.get('lang') == 'EN' for abstract in (patent.get("abstracts") or []))
    for patent in patents
)

english_abstract_patent_counts

500

In [None]:
# This confirms that every patent JSON file contains an English abstract

# Sample semantic search using sentence-transformers

In [None]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = [
    """<abstract lang="EN" load-source="docdb" mxw-id="PA114921630" source="PAJ"><p>PROBLEM TO BE SOLVED: To reduce a pipeline stall due """,
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences.
# The abstract markup contains double quotes, so it is wrapped in triple quotes.
# The literal must be closed right after the markup: otherwise the following
# queries are swallowed into it and the list ends up holding a single string.
queries = [
    """<abstract lang="EN" load-source="docdb" mxw-id="PA114921630" source="PAJ"><p>PROBLEM TO BE SOLVED: To reduce a pipeline stall due to a data hazard of a superscalar system and to improve the processing speed by changing an instruction in 1st instruction format stored in an instruction memory into an instruction in 2nd instruction format. SOLUTION: The instruction is taken in a 1st stage from the instruction memory and the instruction taken in the 1st stage 101 is decoded in a 2nd stage 103. The decoded instruction is executed in a 3rd stage and when the execution result is written in a register in a 4th stage 107, the instruction in the 1st instruction format stored in the instruction memory is changed into the instruction in the 2nd instruction format and executed. Consequently, the pipeline stall due to the data hazard of the superscalar system can be reduced and the processing speed is improved.</p></abstract>""",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field.",
]


# Find the closest sentences of the corpus for each query sentence based on cosine
# similarity: at most 5, or the whole corpus when it holds fewer than 5 entries
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the top_k highest scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of results actually shown instead of a fixed "5"
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # torch.topk returns (values, indices); indices are tensors, so convert them
    # to plain ints before indexing the Python list
    for score, idx in zip(top_results.values, top_results.indices):
        print(corpus[int(idx)][:200], "(Score: {:.4f})".format(score))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()






Query: <abstract lang="EN" load-source="docdb" mxw-id="PA114921630" source="PAJ"><p>PROBLEM TO BE SOLVED: To reduce a pipeline stall due to a data hazard of a superscalar system and to improve the processing speed by changing an instruction in 1st instruction format stored in an instruction memory into an instruction in 2nd instruction format. SOLUTION: The instruction is taken in a 1st stage from the instruction memory and the instruction taken in the 1st stage 101 is decoded in a 2nd stage 103. The decoded instruction is executed in a 3rd stage and when the execution result is written in a register in a 4th stage 107, the instruction in the 1st instruction format stored in the instruction memory is changed into the instruction in the 2nd instruction format and executed. Consequently, the pipeline stall due to the data hazard of the superscalar system can be reduced and the processing speed is improved.</p></abstract>",
    "Someone in a gorilla costume is playing a set of drums."

In [None]:
# Print the installed sentence-transformers package metadata (version, location, dependencies)
!pip show sentence_transformers

Name: sentence-transformers
Version: 2.5.1
Summary: Multilingual text embeddings
Home-page: https://www.SBERT.net
Author: Nils Reimers
Author-email: info@nils-reimers.de
License: Apache License 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, Pillow, scikit-learn, scipy, torch, tqdm, transformers
Required-by: 


In [None]:
# Print the installed torch package metadata (version, location, dependencies)
!pip show torch

Name: torch
Version: 2.1.0+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, triton, typing-extensions
Required-by: fastai, sentence-transformers, torchaudio, torchdata, torchtext, torchvision


In [None]:
# NOTE(review): torch is already imported and used by the semantic-search cell earlier
# in this notebook, so this install runs too late to matter; consider moving it to the
# install cell at the top or removing it.
!pip install torch

