In [1]:
import os
import pandas as pd
import numpy as np

In [None]:
import csv

def save_dict_to_csv(data, output_file):
    """
    Save a dictionary to a CSV file.
    
    Args:
        data (dict): The dictionary to save (keys as rows, values as columns).
        output_file (str): Path to the output CSV file.
    """
    try:
        with open(output_file, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            # Write header
            writer.writerow(["CPC", "Title"])
            # Write data
            for cpc, title in data.items():
                writer.writerow([cpc, title])
        print(f"Dictionary saved to {output_file}")
    except Exception as e:
        print(f"Error saving dictionary to CSV: {e}")

In [2]:
def normalize(embedding):
    
    return embedding / np.linalg.norm(embedding)

## IPC

### Main dataset (patent->IPC)

In [2]:


# Directory containing the files
directory_path = 'patent_cpc_data'

# Initialize a list to store data from all files
all_data = []

# Loop through each file in the directory
for filename in os.listdir(directory_path):
    # Construct the full file path
    file_path = os.path.join(directory_path, filename)
    
    # Check if it's a .txt file
    if filename.endswith('.txt'):
        print(f"Processing file: {filename}")
        
        # Read the file and extract Patent ID and CPC
        with open(file_path, 'r') as file:
            for line in file:
                if len(line) > 22:
                    try:
                        # Extract Patent ID (11 digits after the 10th position)
                        patent_id = line[10:21]
                        
                        # Extract Main CPC (1 characters after the Patent ID)
                        main_cpc = line[21]

                        # Extract Big CPC (2 characters after the Patent ID)
                        big_cpc = line[21:24]

                        # Extract Medium CPC (4 characters after the Patent ID)
                        medium_cpc = line[21:25]
                        
                        # Extract Refined CPC (remainder of the CPC code until a space)
                        refined_cpc = line[25:].split()[0]
                        
                        # Append to the data list
                        all_data.append({
                            'Patent ID': patent_id,
                            'Main CPC': main_cpc,
                            'Big CPC': big_cpc,
                            'Medium CPC': medium_cpc,
                            'Refined CPC': refined_cpc
                        })
                    except IndexError:
                        print(f"Skipping invalid line in {filename}: {line.strip()}")

# Convert the list of dictionaries to a dataframe
df = pd.DataFrame(all_data)

# Display the combined dataframe
print(df)

# Save the dataframe to a CSV file (optional)
output_file = 'combined_patent_cpc_data.csv'
df.to_csv(output_file, index=False)
print(f"Combined data saved to {output_file}")


Processing file: US_PGPub_CPC_MCF_20130050000.txt
Processing file: US_PGPub_CPC_MCF_20120150000.txt
Processing file: US_PGPub_CPC_MCF_20120100000.txt
Processing file: US_PGPub_CPC_MCF_20140000001.txt
Processing file: US_PGPub_CPC_MCF_20140350000.txt
Processing file: US_PGPub_CPC_MCF_20140100000.txt
Processing file: US_PGPub_CPC_MCF_20130000001.txt
Processing file: US_PGPub_CPC_MCF_20140150000.txt
Processing file: US_PGPub_CPC_MCF_20130300000.txt
Processing file: US_PGPub_CPC_MCF_20140200000.txt
Processing file: US_PGPub_CPC_MCF_20120300000.txt
Processing file: US_PGPub_CPC_MCF_20140250000.txt
Processing file: US_PGPub_CPC_MCF_20130150000.txt
Processing file: US_PGPub_CPC_MCF_20130250000.txt
Processing file: US_PGPub_CPC_MCF_20130100000.txt
Processing file: US_PGPub_CPC_MCF_20140300000.txt
Processing file: US_PGPub_CPC_MCF_20120050000.txt
Processing file: US_PGPub_CPC_MCF_20130200000.txt
Processing file: US_PGPub_CPC_MCF_20120250000.txt
Processing file: US_PGPub_CPC_MCF_20140050000.txt


In [24]:
df[('Patent ID')].nunique()

1057711

In [23]:
import xml.etree.ElementTree as ET

# Path to the .xsd file
xsd_file = './venv/CPCDefinitionsSchema10.xsd'

# Parse the .xsd file
try:
    tree = ET.parse(xsd_file)
    root = tree.getroot()

    namespace = {'xs': 'http://www.w3.org/2001/XMLSchema'}

    # Find all elements with name 'definitions'
    definitions = root.findall('.//xs:element[@name="definition-item"]', namespace)

    # Extract details of each 'definitions' element
    for definition in definitions:
        print(f"Definition Element: {definition.attrib}")
    def print_tree(element, level=0):
        indent = "  " * level
        print(f"{indent}<{element.tag} {dict(element.attrib)}>")
        for child in element:
            print_tree(child, level + 1)
        print(f"{indent}</{element.tag}>")

    # Print the root and its children
    print_tree(root)
except FileNotFoundError:
    print(f"The file {xsd_file} was not found.")
except ET.ParseError as e:
    print(f"Error parsing the XSD file: {e}")

Definition Element: {'name': 'definition-item', 'type': 'definition-item-type'}
<{http://www.w3.org/2001/XMLSchema}schema {}>
  <{http://www.w3.org/2001/XMLSchema}annotation {}>
    <{http://www.w3.org/2001/XMLSchema}documentation {}>
    </{http://www.w3.org/2001/XMLSchema}documentation>
  </{http://www.w3.org/2001/XMLSchema}annotation>
  <{http://www.w3.org/2001/XMLSchema}element {'name': 'definitions', 'type': 'definitions-type'}>
    <{http://www.w3.org/2001/XMLSchema}annotation {}>
      <{http://www.w3.org/2001/XMLSchema}documentation {}>
      </{http://www.w3.org/2001/XMLSchema}documentation>
    </{http://www.w3.org/2001/XMLSchema}annotation>
  </{http://www.w3.org/2001/XMLSchema}element>
  <{http://www.w3.org/2001/XMLSchema}element {'name': 'abbreviations', 'type': 'section-body-type'}>
    <{http://www.w3.org/2001/XMLSchema}annotation {}>
      <{http://www.w3.org/2001/XMLSchema}documentation {}>
      </{http://www.w3.org/2001/XMLSchema}documentation>
    </{http://www.w3.o

### IPC dataset

In [36]:
import os
import xml.etree.ElementTree as ET

def extract_cpc_and_title_from_xml(file_path):
    """
    Extract CPC and Title from a given XML file.
    
    Args:
        file_path (str): Path to the XML file.
    
    Returns:
        list: A list of tuples containing CPC and Title pairs.
    """
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        results = []
        #for item in root.findall(".//definition-item"):
        cpc = root.find(".//classification-symbol")
        title = root.find(".//definition-title")
        if cpc is not None and title is not None:
            results.append((cpc.text.strip(), title.text.strip()))
        return results
    except ET.ParseError:
        print(f"Error parsing {file_path}. Skipping...")
        return []

def process_directory(directory_path):
    """
    Process all XML files in a directory and extract CPC and Title.
    
    Args:
        directory_path (str): Path to the directory containing XML files.
    
    Returns:
        dict: A dictionary with CPC codes as keys and Titles as values.
    """
    combined_results = {}
    
    for file_name in os.listdir(directory_path):
        if file_name.endswith(".xml"):
            file_path = os.path.join(directory_path, file_name)
            file_results = extract_cpc_and_title_from_xml(file_path)
            for cpc, title in file_results:
                # Combine results, overwrite if duplicate CPC is found
                combined_results[cpc] = title
    
    return combined_results

# Directory containing XML files
directory_path = "venv/FullCPCDefinitionXML202501"

combined_results = process_directory(directory_path)

# Print the combined results
for cpc, title in combined_results.items():
    print(f"CPC: {cpc}, Title: {title}")


CPC: G06E, Title: OPTICAL COMPUTING DEVICES; {COMPUTING DEVICES USING OTHER RADIATIONS WITH SIMILAR PROPERTIES} (optical logic elements per se
CPC: A41D, Title: OUTERWEAR; PROTECTIVE GARMENTS; ACCESSORIES
CPC: B64G, Title: COSMONAUTICS; VEHICLES OR EQUIPMENT THEREFOR
CPC: B65C, Title: LABELLING OR TAGGING MACHINES, APPARATUS, OR PROCESSES  (nailing or stapling in general
CPC: A61F, Title: FILTERS IMPLANTABLE INTO BLOOD VESSELS; PROSTHESES; DEVICES PROVIDING PATENCY TO, OR PREVENTING COLLAPSING OF, TUBULAR STRUCTURES OF THE BODY, e.g. STENTS; ORTHOPAEDIC, NURSING OR CONTRACEPTIVE DEVICES; FOMENTATION; TREATMENT OR PROTECTION OF EYES OR EARS; BANDAGES, DRESSINGS OR ABSORBENT PADS; FIRST-AID KITS  (dental prosthetics
CPC: F16M, Title: FRAMES, CASINGS OR BEDS OF ENGINES, MACHINES OR APPARATUS, NOT SPECIFIC TO ENGINES, MACHINES OR APPARATUS PROVIDED FOR ELSEWHERE; STANDS; SUPPORTS
CPC: A61Q, Title: SPECIFIC USE OF COSMETICS OR SIMILAR TOILETRY PREPARATIONS
CPC: D03D, Title: WOVEN FABRICS; M

In [19]:
import csv

def save_dict_to_csv(data, output_file):
    """
    Save a dictionary to a CSV file.
    
    Args:
        data (dict): The dictionary to save (keys as rows, values as columns).
        output_file (str): Path to the output CSV file.
    """
    try:
        with open(output_file, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            # Write header
            writer.writerow(["CPC", "Title"])
            # Write data
            for cpc, title in data.items():
                writer.writerow([cpc, title])
        print(f"Dictionary saved to {output_file}")
    except Exception as e:
        print(f"Error saving dictionary to CSV: {e}")




In [38]:
# Save the combined results to a CSV file
output_csv_path = "cpc_definitions.csv"
save_dict_to_csv(combined_results, output_csv_path)

Dictionary saved to cpc_definitions.csv


### IPC Big

In [6]:
import os
import re
import pandas as pd

# Directory containing .txt files
directory = "IPC definitions"

# Regex pattern for 3-digit codes
pattern = r"^([A-Z]\d{2})\t\t(.+)"

# List to store extracted data
data = []

# Iterate through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        filepath = os.path.join(directory, filename)
        
        # Open and read the file
        with open(filepath, "r", encoding="utf-8") as file:
            for line in file:
                # Match the pattern and extract the code and description
                match = re.match(pattern, line)
                if match:
                    code, description = match.groups()
                    data.append({ "Code": code, "Description": description})

# Create a DataFrame from the data
big_ipc = pd.DataFrame(data)

# Display the DataFrame
print(big_ipc)

    Code                                        Description
0    C01                                INORGANIC CHEMISTRY
1    C02  TREATMENT OF WATER, WASTE WATER, SEWAGE, OR SL...
2    C03                        GLASS; MINERAL OR SLAG WOOL
3    C04  CEMENTS; CONCRETE; ARTIFICIAL STONE; CERAMICS;...
4    C05                   FERTILISERS; MANUFACTURE THEREOF
..   ...                                                ...
132  B67  OPENING, CLOSING {OR CLEANING} BOTTLES, JARS O...
133  B68                               SADDLERY; UPHOLSTERY
134  B81                         MICROSTRUCTURAL TECHNOLOGY
135  B82                                     NANOTECHNOLOGY
136  B99  SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN T...

[137 rows x 2 columns]


In [11]:
results=[]
for i, title in big_ipc.iterrows():
    results.append([title[0],model.encode(title[1])])
big_features_df=pd.DataFrame(results,columns=['CPC','Title'])    
output_file = 'big_ipc_embeddings.pkl'
big_features_df.to_pickle(output_file)

  results.append([title[0],model.encode(title[1])])


### IPC MAIN

In [9]:

data=[['A', 'HUMAN NECESSITIES'],['B', 'PERFORMING OPERATIONS; TRANSPORTING'],['C','CHEMISTRY; METALLURGY'],['D', 'TEXTILES; PAPER'],[ 'E', 'FIXED CONSTRUCTIONS'],[ 'F','MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING'],['G','PHYSICS'],[ 'H', 'ELECTRICITY'],['Y','GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPMENTS; GENERAL TAGGING OF CROSS-SECTIONAL TECHNOLOGIES SPANNING OVER SEVERAL SECTIONS OF THE IPC; TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS']]
main_ipc=pd.DataFrame(data,columns=['CPC','Title'] )

In [8]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:

results=[]
for i, title in main_ipc.iterrows():
    results.append([title[0],model.encode(title[1])])
main_features_df=pd.DataFrame(results,columns=['CPC','Title'])    
output_file = 'main_ipc_embeddings.pkl'
main_features_df.to_pickle(output_file)


  results.append([title[0],model.encode(title[1])])


## Claims

### Data cleaning

In [3]:
csv_file_path = "filtered_pgpub_claims_2012_2014.csv"  # Replace with your CSV file path
 # Replace with your desired Parquet file path

# Read the CSV file
df_claims = pd.read_csv(csv_file_path)


In [3]:
df_claims

Unnamed: 0,pub_no,appl_id,claim_no,claim_txt,dependencies,ind_flg
0,20120003120,13159492,1,A method for recovering heat in a device for t...,,1
1,20120003120,13159492,2,"A method according to claim 1, wherein the pre...",1,0
2,20120003120,13159492,3,A device for the sterilization of biological m...,,1
3,20120003150,13231050,15,"The method of claim 1, further comprising admi...",1,0
4,20120003150,13231050,17,"The method of claim 1, wherein said ribonuclea...",1,0
...,...,...,...,...,...,...
19388245,20140380539,13986990,1,1. A new and distinct variety of Phalaenopsis ...,,1
19388246,20140380540,13986991,1,1. A new and distinct variety of Phalaenopsis ...,,1
19388247,20140380541,13986997,1,1. A new and distinct cultivar of Agapanthus p...,,1
19388248,20140380542,13987008,1,1. A new and distinct cultivar of Campanula pl...,,1


In [4]:
df_claims = df_claims.drop(columns=['appl_id','claim_no','dependencies','ind_flg'])

In [5]:
df_claims['claim_txt'] = df_claims['claim_txt'].fillna('').astype(str)

In [6]:
import numpy as np

In [19]:
# Split dataset into 1000 chunks
chunks = np.array_split(df_claims, 1000)

# Function to aggregate claims for a chunk
def process_chunk(chunk):
    return chunk.groupby('pub_no', as_index=False).agg({'claim_txt': ' '.join})

# Process each chunk and collect results
results = [process_chunk(chunk) for chunk in chunks[:100]]
final_result = pd.concat(results).reset_index(drop=True)

print(final_result)

  return bound(*args, **kwds)


            pub_no                                          claim_txt
0      20120003120  A method for recovering heat in a device for t...
1      20120003150  The method of claim 1, further comprising admi...
2      20120003151  (Canceled). (Canceled). (Canceled). The method...
3      20120003153  (Canceled). The indolyl-oxadiazolyl-diazabicyc...
4      20120003156  A method for treating neoplasia in a subject, ...
...            ...                                                ...
89296  20120069587  1. A light clip comprising: a base; a magnet a...
89297  20120069588  1. A light interfacing board, comprising: a st...
89298  20120069589  1. An LED landing light arrangement for an air...
89299  20120069590  1. A direction indicator comprising: a directi...
89300  20120069591  1. An exterior mirror vision system for a vehi...

[89301 rows x 2 columns]


In [20]:
chunks[1]

Unnamed: 0,pub_no,claim_txt
19389,20120015887,A pharmaceutical composition for enhancement o...
19390,20120015887,"The 4-copy branched peptide of claim 1, wherei..."
19391,20120015887,"The 4-copy branched peptide of claim 1, wherei..."
19392,20120015887,A 4-copy branched peptide represented by a for...
19393,20120015889,The method of claim 18 wherein said primary tu...
...,...,...
38773,20120034231,The antibody or binding protein according to c...
38774,20120034231,A process for the production of the antibody o...
38775,20120034231,"The antibody of claim 4, wherein the mutated o..."
38776,20120034231,(Canceled).


In [8]:
print(final_result.head(1)['claim_txt'].values[0])

A method for recovering heat in a device for the sterilization of biological material, comprising transferring heat from a sterilized effluent stream to a stream in a heat recovery circuit transferring heat from the stream in the heat recovery circuit to a stream of biologically contaminated feed while maintaining the pressure (p12) in the sterilized effluent stream higher than the pressure (p10) in the heat recovery circuit, which is maintained higher than the pressure (p11) in the stream of biologically contaminated feed. A method according to claim 1, wherein the pressure p12 in the sterilized effluent stream is over 6 bar, the pressure p10 in the heat recovery circuit is at minimum 1 bar and at maximum 3 bar, and the pressure p11 in the stream of biologically contaminated feed is 0.5 bar or less. A device for the sterilization of biological material, comprising: a feed line for contaminated material; a unit for heat treatment of said material; an effluent line for sterilized materi

In [21]:
final=pd.DataFrame()
for i in range(10):
    results_ = [process_chunk(chunk) for chunk in chunks[i*100:100*i+100]]
    final_result = pd.concat(results)
    final = pd.concat([final_result,final]).reset_index(drop=True)
print(final)

             pub_no                                          claim_txt
0       20120003120  A method for recovering heat in a device for t...
1       20120003150  The method of claim 1, further comprising admi...
2       20120003151  (Canceled). (Canceled). (Canceled). The method...
3       20120003153  (Canceled). The indolyl-oxadiazolyl-diazabicyc...
4       20120003156  A method for treating neoplasia in a subject, ...
...             ...                                                ...
893005  20120069587  1. A light clip comprising: a base; a magnet a...
893006  20120069588  1. A light interfacing board, comprising: a st...
893007  20120069589  1. An LED landing light arrangement for an air...
893008  20120069590  1. A direction indicator comprising: a directi...
893009  20120069591  1. An exterior mirror vision system for a vehi...

[893010 rows x 2 columns]


array([[20120003120,
        'A method for recovering heat in a device for the sterilization of biological material, comprising transferring heat from a sterilized effluent stream to a stream in a heat recovery circuit transferring heat from the stream in the heat recovery circuit to a stream of biologically contaminated feed while maintaining the pressure (p12) in the sterilized effluent stream higher than the pressure (p10) in the heat recovery circuit, which is maintained higher than the pressure (p11) in the stream of biologically contaminated feed. A method according to claim 1, wherein the pressure p12 in the sterilized effluent stream is over 6 bar, the pressure p10 in the heat recovery circuit is at minimum 1 bar and at maximum 3 bar, and the pressure p11 in the stream of biologically contaminated feed is 0.5 bar or less. A device for the sterilization of biological material, comprising: a feed line for contaminated material; a unit for heat treatment of said material; an efflu

In [14]:
final3=final.drop_duplicates(subset='pub_no', keep='first').reset_index(drop=True)
print(final3)

            pub_no                                          claim_txt
0      20120003120  A method for recovering heat in a device for t...
1      20120003150  The method of claim 1, further comprising admi...
2      20120003151  (Canceled). (Canceled). (Canceled). The method...
3      20120003153  (Canceled). The indolyl-oxadiazolyl-diazabicyc...
4      20120003156  A method for treating neoplasia in a subject, ...
...            ...                                                ...
89199  20120069587  1. A light clip comprising: a base; a magnet a...
89200  20120069588  1. A light interfacing board, comprising: a st...
89201  20120069589  1. An LED landing light arrangement for an air...
89202  20120069590  1. A direction indicator comprising: a directi...
89203  20120069591  1. An exterior mirror vision system for a vehi...

[89204 rows x 2 columns]


In [3]:
final3['pub_no'].nunique()

89204

In [16]:
output_file = 'merged_claims_dataset.csv'
final3.to_csv(output_file, index=False)

### Embedding Generation

In [5]:
file = 'merged_claims_dataset.csv'
final3 = pd.read_csv(file)

In [6]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
chunks = np.array_split(final3, 500)

# Function to aggregate claims for a chunk
def process_chunk_text(chunk):
    return  {chunk[0]: model.encode(chunk[1]) }

final_result=dict()

for i in range(len(chunks)):
    results=dict()
    for chunk in chunks[i].values:
        results.update(process_chunk_text(chunk)) 
    final_result.update(results)
    #df_embedings = pd.concat([final_result,df_embedings]).reset_index(drop=True)
print(len(final_result))
output_file = 'claims_embeddings.csv'
save_dict_to_csv(final_result, output_file)


  return bound(*args, **kwds)


89204
Dictionary saved to claims_embeddings.csv


384

## Graph creation


### Adjancy Matrix

In [72]:

csv_file_path = 'combined_patent_cpc_data.csv'  # Replace with your CSV file path
 # Replace with your desired Parquet file path

# Read the CSV file
df = pd.read_csv(csv_file_path)

In [73]:
df=df[df['Patent ID']<=20120069591].reset_index(drop=True).head(70000)

In [61]:
df[df['Patent ID']<=20120069591]['Patent ID'].reset_index(drop=True).unique()

True

In [64]:
# Read the CSV file
cpc_embeddings_df = pd.read_pickle('cpc_embeddings.pkl')

In [65]:
# Read the CSV file
patent_embeddings_df = pd.read_pickle('patent_embeddings.pkl')
patent_embeddings_df

Unnamed: 0,CPC,Title
0,20120003120,"[-0.11524424302180801, 0.035369316806693044, -..."
1,20120003150,"[-0.10961097201373773, 0.08915278811117368, -0..."
2,20120003151,"[-0.047954025970529966, 0.014629226891009645, ..."
3,20120003153,"[-0.06551284180965877, 0.005760260720849254, -..."
4,20120003156,"[0.032399873895817576, 0.03216320989584813, -0..."
...,...,...
89199,20120069587,"[-0.03020640550456989, -0.017039942102577955, ..."
89200,20120069588,"[-0.00904109424740548, -0.11033405409037364, -..."
89201,20120069589,"[0.027305297402422164, -0.049771271204415046, ..."
89202,20120069590,"[-0.030531046299497298, 0.036647538899396585, ..."


In [78]:
df

Unnamed: 0,Patent ID,Main CPC,Big CPC,Medium CPC,Refined CPC
0,20120050000,G,G05,G05B,23/0289
1,20120050000,G,G05,G05B,9/02
2,20120050001,G,G07,G07C,9/00944
3,20120050001,H,H04,H04L,12/10
4,20120050001,H,H04,H04L,12/40045
...,...,...,...,...,...
69995,20120060117,G,G06,G06F,3/0482
69996,20120060117,G,G06,G06F,16/248
69997,20120060118,A,A63,A63B,24/0062
69998,20120060118,A,A43,A43B,3/34


In [23]:
import numpy as np

# Normalize a single embedding
def normalize(embedding):
    
    return embedding / np.linalg.norm(embedding)

# Apply normalization
def process_array_string(array_string):
    # Add commas between numbers
    if array_string[1]==' ':
        string_list = list(array_string)
        # Modify the character at index 2
        string_list[1] = ''

        # Convert back to a string
        array_string= ''.join(string_list)
       
        

    cleaned_string =array_string.strip().replace("   ", " ").replace("  ", ",").replace(" ", ", ")

    # Step 2: Convert string to a NumPy array
    # Use `eval` cautiously if you trust the data source
    try:
        array = np.array(eval(cleaned_string))
        return array
    except:
        array = np.array(cleaned_string)
        return array
        
    

#cpc_embeddings['Title'] = cpc_embeddings['Title'].apply(process_array_string)
#cpc_embeddings['Title'] = cpc_embeddings['Title'].apply(normalize)
patent_embeddings['Title'] = patent_embeddings['Title'].apply(process_array_string)
patent_embeddings['Title'] = patent_embeddings['Title'].apply(normalize)
cpc_embeddings.to_pickle('cpc_embeddings.pkl')
patent_embeddings.to_pickle('patent_embeddings.pkl')

NameError: name 'patent_embeddings' is not defined

In [79]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# Example DataFrame


# Step 1: Get unique nodes for each category
patent_nodes = df["Patent ID"].unique()
medium_ipc_nodes = df["Medium CPC"].unique()
big_ipc_nodes = df["Big CPC"].unique()
main_ipc_nodes = df["Main CPC"].unique()

# Step 2: Create index mappings
patent_idx = {node: i for i, node in enumerate(patent_nodes)}
medium_ipc_idx = {node: i for i, node in enumerate(medium_ipc_nodes)}
big_ipc_idx = {node: i for i, node in enumerate(big_ipc_nodes)}
main_ipc_idx = {node: i for i, node in enumerate(main_ipc_nodes)}

In [80]:
len(patent_nodes)

10105

In [81]:
# Step 3: Build adjacency matrices
# Patent ↔ Medium IPC
patent_medium_edges = [(patent_idx[row["Patent ID"]], medium_ipc_idx[row["Medium CPC"]])
                       for _, row in df.iterrows()]
# Add reverse edges for undirected graph



rows, cols = zip(*set(patent_medium_edges))
patent_medium_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(patent_nodes), len(medium_ipc_nodes))
)
# Medium IPC ↔ Big IPC
medium_big_edges = [(medium_ipc_idx[row["Medium CPC"]], big_ipc_idx[row["Big CPC"]])
                    for _, row in df.iterrows()]
rows, cols = zip(*set(medium_big_edges))
medium_big_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(medium_ipc_nodes), len(big_ipc_nodes))
)
# Big IPC ↔ Main IPC
big_main_edges = [(big_ipc_idx[row["Big CPC"]], main_ipc_idx[row["Main CPC"]])
                  for _, row in df.iterrows()]

rows, cols = zip(*set(big_main_edges))
big_main_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(big_ipc_nodes), len(main_ipc_nodes))
)

# Print adjacency matrices as

print("Patent ↔ Medium IPC Adjacency Matrix:\n", patent_medium_adj.toarray())
print("Medium IPC ↔ Big IPC Adjacency Matrix:\n", medium_big_adj.toarray())
print("Big IPC ↔ Main IPC Adjacency Matrix:\n", big_main_adj.toarray())

Patent ↔ Medium IPC Adjacency Matrix:
 [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Medium IPC ↔ Big IPC Adjacency Matrix:
 [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Big IPC ↔ Main IPC Adjacency Matrix:
 [[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [68]:
# Step 3: Build adjacency matrices
# Patent ↔ Medium IPC (separate matrices)
patent_medium_edges = [(patent_idx[row["Patent ID"]], medium_ipc_idx[row["Medium CPC"]])
                       for _, row in df.iterrows()]

# Create 'Patent -> Medium' adjacency matrix
rows, cols = zip(*set(patent_medium_edges))
patent_medium_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(patent_nodes), len(medium_ipc_nodes))
)

# Create 'Medium -> Patent' adjacency matrix (reverse edges)
medium_patent_edges = [(medium_ipc_idx[row["Medium CPC"]], patent_idx[row["Patent ID"]])
                       for _, row in df.iterrows()]
rows, cols = zip(*set(medium_patent_edges))
medium_patent_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(medium_ipc_nodes), len(patent_nodes))
)

# Medium IPC ↔ Big IPC (separate matrices)
medium_big_edges = [(medium_ipc_idx[row["Medium CPC"]], big_ipc_idx[row["Big CPC"]])
                    for _, row in df.iterrows()]

# Create 'Medium -> Big' adjacency matrix
rows, cols = zip(*set(medium_big_edges))
medium_big_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(medium_ipc_nodes), len(big_ipc_nodes))
)

# Create 'Big -> Medium' adjacency matrix (reverse edges)
big_medium_edges = [(big_ipc_idx[row["Big CPC"]], medium_ipc_idx[row["Medium CPC"]])
                    for _, row in df.iterrows()]
rows, cols = zip(*set(big_medium_edges))
big_medium_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(big_ipc_nodes), len(medium_ipc_nodes))
)

# Big IPC ↔ Main IPC (separate matrices)
big_main_edges = [(big_ipc_idx[row["Big CPC"]], main_ipc_idx[row["Main CPC"]])
                  for _, row in df.iterrows()]

# Create 'Big -> Main' adjacency matrix
rows, cols = zip(*set(big_main_edges))
big_main_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(big_ipc_nodes), len(main_ipc_nodes))
)

# Create 'Main -> Big' adjacency matrix (reverse edges)
main_big_edges = [(main_ipc_idx[row["Main CPC"]], big_ipc_idx[row["Big CPC"]])
                  for _, row in df.iterrows()]
rows, cols = zip(*set(main_big_edges))
main_big_adj = coo_matrix(
    (np.ones(len(rows)), 
     (rows, cols)),
    shape=(len(main_ipc_nodes), len(big_ipc_nodes))
)

# Print adjacency matrices as dense arrays for checking
print("Patent -> Medium IPC Adjacency Matrix:\n", patent_medium_adj.toarray())
print("Medium -> Patent IPC Adjacency Matrix:\n", medium_patent_adj.toarray())
print("Medium -> Big IPC Adjacency Matrix:\n", medium_big_adj.toarray())
print("Big -> Medium IPC Adjacency Matrix:\n", big_medium_adj.toarray())
print("Big -> Main IPC Adjacency Matrix:\n", big_main_adj.toarray())
print("Main -> Big IPC Adjacency Matrix:\n", main_big_adj.toarray())


KeyboardInterrupt: 

In [82]:
print(f"patent_medium_adj shape: {medium_patent_adj.shape}")
print(f"medium_big_adj shape: {medium_big_adj.shape}")
print(f"big_main_adj shape: {big_main_adj.shape}")

patent_medium_adj shape: (555, 9098)
medium_big_adj shape: (557, 128)
big_main_adj shape: (128, 9)


In [28]:
#from scipy.sparse import save_npz, load_npz
# Save the sparse matrix to a file
#save_npz("patent_medium_adj.npz", patent_medium_adj)

#patent_medium_adj = load_npz("patent_medium_adj.npz")

#big_main_adj = load_npz("big_main_adj.npz")


In [83]:
from scipy.sparse import coo_matrix, hstack, vstack

# Define zero matrices where connections don't exist
zero_patent_medium = coo_matrix((len(patent_nodes), len(medium_ipc_nodes)))  # (1057711, 668)
zero_medium_big = coo_matrix((len(medium_ipc_nodes), len(big_ipc_nodes)))    # (668, 32)
zero_big_main = coo_matrix((len(big_ipc_nodes), len(main_ipc_nodes)))        # (32, 9)
patent_self_connections = coo_matrix(
    (np.ones(len(patent_nodes)), 
     (range(len(patent_nodes)), range(len(patent_nodes)))),  # Diagonal elements
    shape=(len(patent_nodes), len(patent_nodes))
)
medium_self_connections = coo_matrix(
    (np.ones(len(medium_ipc_nodes)), 
     (range(len(medium_ipc_nodes)), range(len(medium_ipc_nodes)))),  # Diagonal elemmedium_ipc  shape=(len(medium_ipc_nodes), len(medium_ipc_nodes))
)
big_self_connections = coo_matrix(
    (np.ones(len(big_ipc_nodes)), 
     (range(len(big_ipc_nodes)), range(len(big_ipc_nodes)))),  # Diagonal elements
    shape=(len(big_ipc_nodes), len(big_ipc_nodes))
)
main_self_connections = coo_matrix(
    (np.ones(len(main_ipc_nodes)), 
     (range(len(main_ipc_nodes)), range(len(main_ipc_nodes)))),  # Diagonal elements
    shape=(len(main_ipc_nodes), len(main_ipc_nodes))
)
# Top row: patent-medium connections, and no direct connections to big or main IPCs
upper = hstack([
    patent_self_connections,  
    patent_medium_adj,                                   # Patent-medium connections
    coo_matrix((len(patent_nodes), len(big_ipc_nodes))),  # No direct patent-big connections
    coo_matrix((len(patent_nodes), len(main_ipc_nodes)))   # No direct patent-main connections
])
print(upper.shape)

# Second row: medium-patent transpose, medium-big connections, and no direct medium-main connections
middle1 = hstack([
    patent_medium_adj.T,  # Medium-patent connections
    medium_self_connections,  # No self-connections for medium IPCs
    medium_big_adj,  # Medium-big connections
    coo_matrix((len(medium_ipc_nodes), len(main_ipc_nodes)))  # No direct medium-main connections
])
print(middle1.shape)
# Third row: big-medium transpose, big-main connections, and no direct big-patent connections
middle2 = hstack([
    coo_matrix((len(patent_nodes), len(big_ipc_nodes))).T,  # No direct big-patent connections
    medium_big_adj.T,   # Big-medium connections
    big_self_connections,  # No self-connections for big IPCs
    big_main_adj  # Big-main connections
])
print(middle2.shape)
# Bottom row: main-big transpose, and no direct connections to patents or medium IPCs
lower = hstack([
    coo_matrix((len(patent_nodes), len(main_ipc_nodes))).T,  # No direct main-patent connections
    coo_matrix((len(medium_ipc_nodes), len(main_ipc_nodes))).T,  # No direct main-medium connections
    big_main_adj.T,   # Main-big connections
    main_self_connections  # No self-connections for main IPCs
])
print(lower.shape)
# Stack all rows to create the full adjacency matrix
combined_adj = vstack([upper, middle1, middle2, lower])


(10105, 10799)
(557, 10799)
(128, 10799)
(9, 10799)


In [84]:
# Check if the combined adjacency matrix is symmetric
if (combined_adj != combined_adj.T).nnz == 0:
    print("The adjacency matrix is symmetric.")
else:
    print("The adjacency matrix is not symmetric.")


The adjacency matrix is symmetric.


In [85]:
from scipy.sparse import save_npz, load_npz
save_npz("combined_adj_small.npz", combined_adj)

In [87]:
print(combined_adj.toarray())

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


### Feature Matrix

In [88]:
patent_features_df = pd.read_pickle('patent_embeddings.pkl')
patent_features_df["CPC"] = patent_features_df["CPC"].astype(str)
medium_features_df = pd.read_pickle('cpc_embeddings.pkl')
big_features_df=pd.read_pickle('big_ipc_embeddings.pkl')
main_features_df=pd.read_pickle('main_ipc_embeddings.pkl')

In [89]:
main_features_df

Unnamed: 0,CPC,Title
0,A,"[-0.029365331, 0.04460647, -0.015125234, 0.014..."
1,B,"[-0.024831055, 0.004053353, -0.022859566, 0.00..."
2,C,"[-0.10232349, 0.032099463, -0.048996966, 0.101..."
3,D,"[-0.1294748, 0.07721605, -0.017150132, 0.07094..."
4,E,"[-0.037969474, 0.036008626, 0.06348025, 0.0234..."
5,F,"[-0.05052111, 0.039543033, 0.04952723, 0.06663..."
6,G,"[-0.015305433, 0.020689487, 0.008086105, 0.113..."
7,H,"[-0.05202508, 0.11225732, 0.029993694, 0.08111..."
8,Y,"[-0.012505763, -0.098826125, -0.0462021, -0.02..."


In [90]:
# Step 2: Create index mappings
patent_idx = {str(node): i for i, node in enumerate(patent_nodes)}
medium_ipc_idx = {str(node): i for i, node in enumerate(medium_ipc_nodes)}
big_ipc_idx = {str(node): i for i, node in enumerate(big_ipc_nodes)}
main_ipc_idx = {str(node): i for i, node in enumerate(main_ipc_nodes)}

In [91]:
len(patent_idx)

10105

In [92]:
patent_features_df.dtypes

CPC      object
Title    object
dtype: object

In [93]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, save_npz
import torch



def align_features(df, node_mapping, feature_dim):
    aligned_features = np.zeros((len(node_mapping), feature_dim))  # Initialize a zero matrix
    for node, idx in node_mapping.items():
        if node in df.index:  # Ensure the node exists in the DataFrame
            feature_vector = df.loc[node, "Title"]  # Get the feature vector
            aligned_features[idx, :] = np.array(feature_vector)  # Assign the feature vector
        else:
            aligned_features[idx, :] = np.zeros(feature_dim)  # Assign a zero vector for missing nodes
    return torch.tensor(aligned_features, dtype=torch.float32)



# Convert the features DataFrames to numpy arrays and set the index to the respective node IDs
feature_dim = 384  # Example feature dimension (adjust as needed)

# For patents
patent_features_df.set_index("CPC", inplace=True)
aligned_patent_features = align_features(patent_features_df, patent_idx, feature_dim)

# For medium IPCs
medium_features_df.set_index("CPC", inplace=True)
aligned_medium_features = align_features(medium_features_df, medium_ipc_idx, feature_dim)

# For big IPCs
big_features_df.set_index("CPC", inplace=True)
aligned_big_features = align_features(big_features_df, big_ipc_idx, feature_dim)

# For main IPCs
main_features_df.set_index("CPC", inplace=True)
aligned_main_features = align_features(main_features_df, main_ipc_idx, feature_dim)

# Step 3: Stack the features
combined_features = np.vstack([
    aligned_patent_features,
    aligned_medium_features,
    aligned_big_features,
    aligned_main_features
])

# Step 4: Save the combined feature matrix
save_npz("combined_features_matrix.npz", coo_matrix(combined_features))

# Optional: Verify the shape and order
print(f"Combined feature matrix shape: {combined_features.shape}")
print(f"Number of patent features: {aligned_patent_features.shape[0]}")
print(f"Number of medium IPC features: {aligned_medium_features.shape[0]}")
print(f"Number of big IPC features: {aligned_big_features.shape[0]}")
print(f"Number of main IPC features: {aligned_main_features.shape[0]}")


Combined feature matrix shape: (10799, 384)
Number of patent features: 10105
Number of medium IPC features: 557
Number of big IPC features: 128
Number of main IPC features: 9


In [95]:
'20130050000' in patent_features_df.index



False

In [96]:
aligned_patent_features

tensor([[-0.0367, -0.0003, -0.0465,  ...,  0.0678, -0.0619, -0.0571],
        [-0.0490, -0.0121, -0.0688,  ...,  0.1723,  0.0379, -0.0342],
        [-0.0546, -0.0845, -0.1177,  ...,  0.0683,  0.0276, -0.0078],
        ...,
        [-0.0820, -0.0027, -0.0289,  ...,  0.1307,  0.0727, -0.0230],
        [-0.0833, -0.0214,  0.0456,  ...,  0.0633, -0.0748,  0.1133],
        [-0.0227,  0.0270, -0.0251,  ...,  0.1149, -0.0076,  0.0462]])

## Graph Training

In [None]:
import networkx as nx

# Create a graph
G = nx.Graph()

# Add nodes with embeddings as attributes
for node_id, embedding in node_embeddings.items():
    G.add_node(node_id, embedding=embedding)

# Add edges with attributes from DataFrame
for _, row in df.iterrows():
    G.add_edge(row["Source"], row["Target"], weight=row["Weight"])