In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the patent identifiers and the three pre-computed embedding matrices.
# NOTE(review): the arrays are presumably row-aligned (row k of every matrix
# belongs to indices[k]) - confirm against the embedding export step.
indices, titleabstracts, fulltext, fulltext_trunc = map(
    np.load,
    (
        "embedded/all/indices.npy",
        "embedded/all/titleabstracts.npy",
        "embedded/all/fulltext.npy",
        "embedded/all/fulltext_trunc.npy",
    ),
)

In [3]:
# Read both classification tables and store the application number as text so
# it can be compared with the string identifiers used for the embeddings.
top_level = pd.read_csv("top_level_classifications.csv").assign(
    app_ref_doc_number=lambda frame: frame["app_ref_doc_number"].astype(str)
)
second_level = pd.read_csv("second_level_classifications.csv").assign(
    app_ref_doc_number=lambda frame: frame["app_ref_doc_number"].astype(str)
)

In [4]:
def top_all_indices(patent_index, all_patent_indices, all_patent_embeddings):
    """Rank every patent by cosine similarity to one query patent.

    Parameters
    ----------
    patent_index
        Identifier of the query patent; matched with ``==`` against the
        entries of ``all_patent_indices``.
    all_patent_indices : array-like, length N
        Identifier belonging to each row of ``all_patent_embeddings``.
    all_patent_embeddings : array-like, shape (N, d)
        One embedding vector per patent.

    Returns
    -------
    numpy.ndarray
        The entries of ``all_patent_indices`` reordered from most to least
        similar to the query patent. The query patent itself is part of the
        ranking.

    Raises
    ------
    ValueError
        If ``patent_index`` does not occur in ``all_patent_indices``.
    """
    all_patent_indices = np.asarray(all_patent_indices)
    all_patent_embeddings = np.asarray(all_patent_embeddings)

    matches = np.where(all_patent_indices == patent_index)[0]
    if matches.size == 0:
        raise ValueError(f"patent index {patent_index!r} not found in all_patent_indices")
    # Use the first match only, so the query is always a single 1-D vector even
    # when an identifier is duplicated.
    patent_embedding = all_patent_embeddings[matches[0]]

    # Normalise both sides so that the dot product is the cosine similarity.
    patent_embedding_norm = patent_embedding / np.linalg.norm(patent_embedding)
    all_patent_embeddings_norm = all_patent_embeddings / np.linalg.norm(
        all_patent_embeddings, axis=1, keepdims=True
    )
    similarities = all_patent_embeddings_norm @ patent_embedding_norm

    # Sort on the numeric similarities themselves. Stacking them into one array
    # together with string identifiers would cast the floats to text and make
    # the sort lexicographic (e.g. '9e-06' > '0.5', '-1.0' > '-0.1').
    order = np.argsort(similarities)[::-1]
    return all_patent_indices[order]

In [13]:
# Spot check: rank all patents against one sample patent using the
# title+abstract embeddings and show the resulting order.
ranked_for_sample = top_all_indices("17482972", indices, titleabstracts)
print(ranked_for_sample)

['17482972' '17557370' '17369347' ... '17803623' '17803631' '17803624']


In [5]:
def in_top_percentage(similarity, ind, percentage = 0.2):
    """Tell whether ``ind`` is ranked within the leading share of ``similarity``.

    ``similarity`` is a ranked sequence of identifiers (expected to be a NumPy
    array so that ``==`` compares element-wise). The position of the first
    occurrence of ``ind`` is compared against ``len(similarity) * percentage``;
    the comparison is inclusive, so a position exactly equal to that threshold
    still counts as inside.

    Raises ``IndexError`` when ``ind`` does not occur in ``similarity``.
    """
    positions = np.where(similarity == ind)[0]
    threshold = len(similarity) * percentage
    inside = positions <= threshold
    # Only the first occurrence decides the answer.
    return inside[0]

In [10]:
def other_index_gen(each_index, classifications=None):
    """Yield the identifiers of all patents sharing a category with one patent.

    Parameters
    ----------
    each_index
        Identifier of the patent to look up in the ``app_ref_doc_number``
        column.
    classifications : pandas.DataFrame, optional
        Classification table to search. It needs an ``app_ref_doc_number``
        column; every column whose value equals 1 in the patent's row is
        treated as a category the patent belongs to. Defaults to the
        notebook-level ``top_level`` table.

    Yields
    ------
    Identifiers (``app_ref_doc_number`` values) of every row that has a 1 in at
    least one of the patent's category columns. The patent itself is included.
    If ``each_index`` is not present in the table, a single ``None`` is yielded
    as a "not found" sentinel.
    """
    table = top_level if classifications is None else classifications

    matches = np.where(table['app_ref_doc_number'] == each_index)[0]
    if len(matches) == 0:
        # Keep the sentinel that existing callers test for, but without a
        # blanket ``except`` that would also trap GeneratorExit or hide real
        # errors.
        yield None
        return

    index_row = table.iloc[matches[0]]
    # Column titles whose value is 1 for this patent.
    # NOTE(review): any column equal to 1 qualifies, so a non-indicator numeric
    # column (e.g. a saved row counter) would be picked up as well - confirm
    # the table only holds indicator columns besides the identifier.
    columns = index_row[index_row == 1].index.tolist()
    # All rows that have a 1 in at least one of those columns.
    rows = table[table[columns].eq(1).any(axis = 1)]
    yield from rows["app_ref_doc_number"].tolist()
    

In [None]:
def accuracies(indices, embeddings, cutoff=15000):
    """Score how highly same-category patents rank in each patent's neighbours.

    For every patent in ``indices`` all patents are ranked by similarity
    (``top_all_indices``) and the ranks of the patents that share a category
    with it (``other_index_gen``) are collected. The patent's accuracy is the
    fraction of those ranks that are smaller than ``cutoff``.

    Parameters
    ----------
    indices
        Patent identifiers, one per row of ``embeddings``.
    embeddings
        Embedding matrix handed to ``top_all_indices``.
    cutoff : int, optional
        Rank below which a same-category patent counts as a hit. The default
        of 15000 is the threshold this notebook has used so far.
        NOTE(review): the original comment calls this "the first half" -
        confirm it matches half the corpus size.

    Returns
    -------
    list of float
        One accuracy per patent that has at least one same-category patent in
        the ranking. Patents without any (for example those missing from the
        classification table) are skipped, so the list can be shorter than
        ``indices``.
    """
    accuracy = []
    for i, each_index in enumerate(indices):
        if i % 100 == 0:
            print(i)
        # top_all_indices sorts all indices in decreasing order of similarity
        similar = top_all_indices(each_index, indices, embeddings).tolist()

        # Rank lookup table; setdefault keeps the first occurrence, which is
        # what list.index would return, without a linear scan per lookup.
        rank_of = {}
        for rank, candidate in enumerate(similar):
            rank_of.setdefault(candidate, rank)

        locations = []
        # other_index_gen yields all indices in the same category as each_index
        for other_index_in_cat in other_index_gen(each_index):
            if other_index_in_cat is None:
                # Sentinel: the patent has no classification row.
                break
            rank = rank_of.get(other_index_in_cat)
            if rank is not None:
                locations.append(rank)

        if not locations:
            # Nothing to score; skipping avoids a division by zero.
            continue

        # locations holds the rankings of the patents in the same category;
        # the share ranked above the cutoff is the accuracy.
        hits = sum(1 for rank in locations if rank < cutoff)
        accuracy.append(hits / len(locations))
    return accuracy


In [None]:
# Mean same-category retrieval accuracy for the title+abstract embeddings.
title_abstract_accuracies = accuracies(indices, titleabstracts)
title_abstract_mean = sum(title_abstract_accuracies) / len(title_abstract_accuracies)
print("Title Abstract Accuracy: ", title_abstract_mean)

In [None]:
# Mean same-category retrieval accuracy for the full-text embeddings.
fulltext_accuracies = accuracies(indices, fulltext)
fulltext_mean = sum(fulltext_accuracies) / len(fulltext_accuracies)
print("Fulltext Accuracy: ", fulltext_mean)

In [None]:
# Mean same-category retrieval accuracy for the truncated full-text
# embeddings. This cell must score `fulltext_trunc` (not `fulltext`) and uses
# its own label so the result can be told apart from the untruncated run.
fulltext_accuracies_trunc = accuracies(indices, fulltext_trunc)
print("Fulltext Truncated Accuracy: ", sum(fulltext_accuracies_trunc) / len(fulltext_accuracies_trunc))

In [None]:
# Concatenate the title+abstract and full-text vectors of each patent into one
# longer embedding and score that combined representation.
all_embeddings = np.hstack((titleabstracts, fulltext))
all_embeddings_accuracies = accuracies(indices, all_embeddings)
all_embeddings_mean = sum(all_embeddings_accuracies) / len(all_embeddings_accuracies)
print("All Embeddings Accuracy: ", all_embeddings_mean)

In [15]:
# Load the patent metadata table and report its (rows, columns) shape.
df = pd.read_csv("allpatents.csv")
print(df.shape)
# Bare last expression so the notebook renders the first rows as a table.
df.head()

(616840, 12)


Unnamed: 0,title,classifications,classification_versions,abstract_text,assignees_orgnames,assignees_cities,assignees_countries,inventors_last_names,inventors_first_names,inventors_cities,inventors_countries,week
0,ODORANT COMPOUND,"['C11B 9/003', 'C07C 49/647', 'C07C 2601/08']","['20130101', '20130101', '20170501']",\nThe present invention relates to the field o...,[],[],[],"['CHAPUIS', 'COULOMB', 'ROBVIEUX']","['Christian', 'Julien', 'Fabrice']","['Satigny', 'Satigny', 'Satigny']","['CH', 'CH', 'CH']",ipab20230720_wk29
1,SYNCHRONIZATION OF RF PULSING SCHEMES AND OF S...,"['H01J 37/32174', 'H01J 37/32146']","['20130101', '20130101']",\nSystems and methods for synchronization of r...,[],[],[],"['Drewery', 'Wu', 'Paterson', 'Albarede']","['John Stephen', 'Ying', 'Alexander Miller', '...","['San Jose', 'Livermore', 'San Jose', 'Fremont']","['US', 'US', 'US', 'US']",ipab20230720_wk29
2,FUNCTION EXECUTION METHOD AND APPARATUS,['G06F 9/451'],['20180201'],\nA function execution method and electronic d...,"['VIVO MOBILE COMMUNICATION CO., LTD.']",['Dongguan'],['CN'],['HUANG'],['Li'],['Dongguan'],['CN'],ipab20230720_wk29
3,IMAGE FORMING APPARATUS,"['G03G 15/0105', 'G03G 21/1619', 'G03G 21/1652...","['20130101', '20130101', '20130101', '20130101...",\nAn image forming apparatus that is capable o...,[],[],[],"['Yoshida', 'Fukushima']","['Nobuyoshi', 'Naoki']","['Shizuoka', 'Shizuoka']","['JP', 'JP']",ipab20230720_wk29
4,DISPLAY DEVICE AND METHOD OF FABRICATING THE SAME,"['H04R 5/02', 'H05K 5/03', 'H04R 2499/15']","['20130101', '20130101', '20130101']",\nA display device includes a display panel wh...,[],[],[],"['AHN', 'YEON', 'LEE']","['Yi Joon', 'Eun Kyung', 'Jae Been']","['Seoul', 'Suwon-si', 'Seoul']","['KR', 'KR', 'KR']",ipab20230720_wk29
