# Patent technology domain analysis using NLP
This program automates the patent analysis process for large batches (10,000+) of patents by allowing users to extract data from Google Patents. By extracting insights such as the Title, Abstract, and frequently occurring keywords in the patent text, users can quickly deduce the technological domain of each patent.

The program significantly reduces the time spent on manual patent page reviews, especially when dealing with large patent portfolios. This time-saving effect becomes even more pronounced as the size of the patent batch increases.

# Importing the required modules

In [1]:
# Import required libraries and download NLTK data
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
import pandas as pd
from google.colab import files

# Ensure NLTK data is downloaded before any tokenizing or stop-word lookup.
# 'punkt' and 'stopwords' are fetched with raise_on_error=True.
# NOTE(review): 'punkt_tab' is fetched without raise_on_error, so a failed
# download there is not raised here — confirm whether that is deliberate.
# The value of the last call is what the notebook shows as the cell result.
nltk.download('punkt_tab')
nltk.download('punkt', raise_on_error=True)
nltk.download('stopwords', raise_on_error=True)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Fetch Patent page content
This function fetches the HTML content of a patent page from Google Patents.

In [2]:
def fetch_patent_page(url, timeout=30):
    """Download a patent page and return its HTML as text.

    Args:
        url: Full URL of the patent page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang an entire batch run.

    Returns:
        The response body decoded as text.

    Raises:
        Exception: If the server answers with any status other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {response.status_code}")
    return response.text

# Extract Claim 1 and text from Patent page
This function extracts the first claim, the abstract, and the body text (abstract and description) used for keyword extraction from the patent page.

In [3]:
def extract_claim_abstract_and_text(html_content):
    """Pull the first claim, the abstract, and the keyword text out of a page.

    Args:
        html_content: HTML of a patent page as a string.

    Returns:
        A ``(first_claim, abstract, text)`` tuple of strings. Any part whose
        element is not found in the page comes back as an empty string.
        ``text`` is the concatenation of the ``div.abstract`` and
        ``section.description`` contents, each followed by a single space.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Claim 1 is the div carrying id="CLM-00001"; join the text of the div
    # and all of its children with single spaces.
    claim_node = soup.find('div', id='CLM-00001')
    first_claim = ' '.join(claim_node.stripped_strings) if claim_node else ''

    # Abstract: prefer the itemprop="content" div inside the abstract
    # section, falling back to an <abstract> element.
    abstract = ''
    abstract_section = soup.find('section', itemprop='abstract')
    if abstract_section:
        content_node = (
            abstract_section.find('div', itemprop='content')
            or abstract_section.find('abstract')
        )
        if content_node:
            abstract = content_node.get_text(separator=' ', strip=True)

    # Keyword text: abstract div first, then the description section.
    body_nodes = (
        soup.find('div', class_='abstract'),
        soup.find('section', class_='description'),
    )
    text = ''.join(
        node.get_text(separator=' ', strip=True) + ' '
        for node in body_nodes
        if node
    )

    return first_claim, abstract, text

# Cleaning and tokenizing the text

In [4]:
# Function to clean and tokenize the text
def clean_and_tokenize_text(text):
    """Lower-case the text, keep letters only, and drop English stop words.

    Args:
        text: Raw text to normalise.

    Returns:
        A list of lower-case word tokens with NLTK's English stop words
        removed.
    """
    # Every non-letter character becomes a space before tokenizing.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = word_tokenize(letters_only.lower())
    excluded = set(stopwords.words('english'))
    return list(filter(lambda token: token not in excluded, tokens))

# Extracting Bigrams (2-Word Keywords) and Trigrams (3-word keywords) from the patent text
This function extracts bigrams and trigrams from the patent text. Bigrams and trigrams are used instead of single-word keywords because single words often produce junk values: a word may occur very frequently without giving any insight into the patent's technology domain. For example, the word "user" may appear many times in a patent's text, but on its own it is meaningless.


In [5]:
def extract_bigrams_and_trigrams(words, num_keywords=10):
    """Return the most frequent 2-word and 3-word phrases in a token stream.

    Args:
        words: Iterable of word tokens, in document order. It is copied into
            a list first, so single-pass iterables (generators) work too;
            otherwise the bigram pass would exhaust the input and leave the
            trigram pass with nothing.
        num_keywords: Maximum number of phrases to return for each size.

    Returns:
        A ``(bigram_keywords, trigram_keywords)`` tuple. Each element is a
        list of space-joined phrases ordered from most to least frequent;
        phrases with equal counts keep the order in which they first appear.
        Inputs too short to form a phrase yield empty lists.
    """
    tokens = list(words)

    # Sliding windows of width 2 and 3; zip stops at the shortest slice, so
    # no padded or partial n-grams are produced.
    bigram_freq = Counter(zip(tokens, tokens[1:]))
    trigram_freq = Counter(zip(tokens, tokens[1:], tokens[2:]))

    # Keep only the most commonly occurring phrases of each size.
    bigram_keywords = [
        ' '.join(bigram) for bigram, _ in bigram_freq.most_common(num_keywords)
    ]
    trigram_keywords = [
        ' '.join(trigram) for trigram, _ in trigram_freq.most_common(num_keywords)
    ]

    return bigram_keywords, trigram_keywords

# Process Patents from an Excel File

In [6]:
def _normalize_patent_number(value):
    """Return a spreadsheet cell value as a trimmed patent-number string."""
    # Numeric cells can be read back as floats (e.g. 1234567.0); drop the
    # spurious ".0" so it does not end up in the URL.
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def process_patents_from_excel(file_path, patent_column, output_file):
    """Enrich an Excel sheet of patent numbers with claim, abstract, keywords.

    For every row with a patent number, the matching Google Patents page is
    fetched and its first claim, abstract, and most frequent bigrams and
    trigrams are written into new columns. A patent that fails to process is
    reported on stdout and its new columns are left empty; the batch goes on.

    Args:
        file_path: Path of the Excel file to read.
        patent_column: Name of the column that holds the patent numbers.
        output_file: Path of the Excel file to write the results to.
    """
    # Read the Excel file
    df = pd.read_excel(file_path)

    # Add new columns for claim 1, abstract, bigrams, and trigrams
    for column in ('Claim 1', 'Abstract', 'Bigrams', 'Trigrams'):
        df[column] = ''

    # Iterate through the patents
    for index, row in df.iterrows():
        raw_number = row[patent_column]
        if pd.isna(raw_number):
            continue

        # Cells may hold numbers rather than strings, so convert before
        # stripping; calling .strip() on a number would abort the whole run.
        patent_number = _normalize_patent_number(raw_number)
        if not patent_number:
            continue

        # Construct the patent URL
        url = f"https://patents.google.com/patent/{patent_number}"

        try:
            # Fetch and process the patent page
            html_content = fetch_patent_page(url)
            claim_1, abstract, text = extract_claim_abstract_and_text(html_content)
            words = clean_and_tokenize_text(text)

            # Extract bigrams and trigrams
            bigrams, trigrams = extract_bigrams_and_trigrams(words)

            # Update the DataFrame with claim 1, abstract, bigrams, and trigrams
            df.at[index, 'Claim 1'] = claim_1
            df.at[index, 'Abstract'] = abstract
            df.at[index, 'Bigrams'] = ', '.join(bigrams)
            df.at[index, 'Trigrams'] = ', '.join(trigrams)
        except Exception as e:
            # Per-patent boundary: report the failure and keep going so one
            # bad page does not discard the rest of the batch.
            print(f"Error processing patent {patent_number}: {e}")
            continue

    # Save the updated DataFrame back to Excel
    df.to_excel(output_file, index=False)
    print(f"Processing complete. Results saved to {output_file}")

# Uploading Excel file and calling the process Patents function



In [7]:
# Ask for the input workbook through the Colab upload widget
uploaded = files.upload()
input_excel = list(uploaded)[0]  # Name of the first uploaded file

# Results workbook, and the column that holds the patent numbers
output_excel = 'processed_patents.xlsx'
patent_column_name = 'Patent Number'

# Run the batch over the uploaded workbook
process_patents_from_excel(
    file_path=input_excel,
    patent_column=patent_column_name,
    output_file=output_excel,
)

# Send the results workbook back to the browser
files.download(output_excel)

Saving patents.xlsx to patents.xlsx
Processing complete. Results saved to processed_patents.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>