In [1]:
import pickle
import random
import urllib
import time
import json
import pandas as pd
import os
from tqdm import tqdm

# Goal: Analyze MA change by authors

First test: sample a set of patents, extract their authors, then collect all patents (and therefore compounds) made by those authors.

## Find Authors & Assignees

From JSON files, find all authors & assignees

In [78]:
def _section_strings(data, heading):
    """Return the "String" entries of the first section titled `heading`, or [] if there is none."""
    for section in data["Record"]["Section"]:
        if section.get("TOCHeading") == heading:
            markup = section["Information"][0]["Value"]["StringWithMarkup"]
            return [entry["String"] for entry in markup]
    # Section not present in this record: report "no names" instead of raising.
    return []


def get_authors_and_assignees(data):
    """ Use the Pubchem-downloaded patent JSON files to get all authors & assignees

    Only the first "Inventor" / "Assignee" section, and the first "Information"
    entry inside it, is read.

    Args:
        data (json dict): one parsed patent JSON record; must contain
            data["Record"]["Section"], a list of section dicts

    Returns:
        tuple(list, list): (authors, assignees). Each list holds the "String"
            values found under the matching section and is empty when the
            record has no such section.

    Raises:
        KeyError, IndexError: if "Record"/"Section" is missing or a matching
            section does not have the expected nested layout.
    """
    authors = _section_strings(data, "Inventor")
    assignees = _section_strings(data, "Assignee")

    return authors, assignees


In [82]:
## Get info from all patents in the Data/Patents directory
fp = "Data/Patents/"
# This notebook also writes non-JSON artifacts into this directory (authors.p,
# assignees.p, patent_author_records.csv) and it holds the Patent_Author_Records/
# subdirectory. Keep only regular files that are not pickle/CSV outputs so that
# re-running this cell does not fail inside json.load.
files = [
    name for name in sorted(os.listdir(fp))
    if os.path.isfile(os.path.join(fp, name)) and not name.endswith((".p", ".csv"))
]
authors = []
assignees = []

for file in tqdm(files):

    # The context manager closes each handle right away instead of leaking it.
    with open(os.path.join(fp, file)) as patent_file:
        data = json.load(patent_file)

    new_authors, new_assignees = get_authors_and_assignees(data)

    authors.extend(new_authors)
    assignees.extend(new_assignees)


100%|██████████| 74/74 [00:00<00:00, 1113.23it/s]


In [None]:
#Remove duplicates (if present) and save
# sorted() gives the saved lists a reproducible order; the iteration order of a
# set of strings can differ from one interpreter run to the next.
authors = sorted(set(authors))
assignees = sorted(set(assignees))

# Context managers make sure both pickle files are flushed and closed.
with open("Data/Patents/authors.p", "wb") as authors_file:
    pickle.dump(authors, file=authors_file)
with open("Data/Patents/assignees.p", "wb") as assignees_file:
    pickle.dump(assignees, file=assignees_file)

## Find patent ids associated with authors

The patents associated with each author have already been scraped (see *patent_scraping.py*).
The next step is to find all patents (and eventually compounds) associated with each author.

In [5]:
## Getting patent IDs from all JSON author files
def _as_list(value):
    """Return `value` unchanged if it is a list, otherwise wrap it in a one-element list."""
    return value if isinstance(value, list) else [value]


full_data = []  # not used in this cell; kept in case other cells reference it - TODO confirm

fp = "Data/Patents/Patent_Author_Records/"
files = os.listdir(fp)

patent_data = []
skipped_files = []  # names of files whose content could not be parsed as JSON

for f in tqdm(files):
    try:
        # The context manager closes each handle instead of leaking one per file.
        with open(fp + f) as record_file:
            data = json.load(record_file)
    except ValueError:
        # Best effort: an unparseable file contributes no rows, but is remembered
        # so the loss is visible after the loop.
        skipped_files.append(f)
        data = []

    for patent in data:
        #Get all authors (in list form); a patent without inventors still yields one row
        #with an empty author name
        if "inventors" in patent:
            patent_authors = _as_list(patent["inventors"])
        else:
            patent_authors = [""]

        #Get assignees which are not part of author list (meant to find only businesses/universities/other)
        if "assignees" in patent:
            author_names = set(patent_authors)
            # dict.fromkeys de-duplicates while keeping the original order, so the
            # resulting list is the same on every run.
            patent_assignees = [
                name for name in dict.fromkeys(_as_list(patent["assignees"]))
                if name not in author_names
            ]
        else:
            # Empty list (not "") so the "assignees" column always holds lists.
            patent_assignees = []

        classification = patent.get("classification", "")

        # One row per (patent, author) pair.
        for author in patent_authors:
            patent_data.append({"ID": patent["publicationnumber"], "author": author, "assignees": patent_assignees,
                "classification": classification})

if skipped_files:
    print("Skipped {} file(s) that could not be parsed as JSON".format(len(skipped_files)))

df = pd.DataFrame(patent_data)
print(df)


100%|██████████| 21947/21947 [14:30<00:00, 25.22it/s]  


                       ID                         author  \
0           US-8653233-B2  HOLLINGSWORTH MICHAEL ANTHONY   
1           US-8653233-B2                  KOHLGRAF KARL   
2           US-8653233-B2                    CAFFREY TOM   
3        US-2011312290-A1                 BEELER MICHAEL   
4        US-2011312290-A1   CANNON RICHARD HOLLINGSWORTH   
...                   ...                            ...   
7056907     US-8663196-B2          ZYZELEWSKI MARK EDWIN   
7056908     US-9610166-B2              GUNTHER STEPHEN B   
7056909     US-9610166-B2              O'FARRELL DESMOND   
7056910     US-9610166-B2          ZYZELEWSKI MARK EDWIN   
7056911     US-9610166-B2         RODENHOUSE ANDREW JOHN   

                                       assignees  \
0                     [UNIV NEBRASKA MEDICAL CT]   
1                     [UNIV NEBRASKA MEDICAL CT]   
2                     [UNIV NEBRASKA MEDICAL CT]   
3                         [COMTECH EF DATA CORP]   
4                  

In [6]:
# Persist the per-author patent table; with pandas' defaults the row index is
# written as the first CSV column.
records_csv_path = "Data/Patents/patent_author_records.csv"
df.to_csv(records_csv_path)

## Linking patent IDs to compounds: testing

In [14]:
## Test 1: patent_cpd_edges

# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open("Data/PubchemTesting/patent_cpd_edges_1980-01.p", "rb") as edges_file:
    patent_cpd_edges = pickle.load(edges_file)

# Peek at the first two (patent ID, compound ID list) entries.
print(list(patent_cpd_edges.items())[:2])

# Summary: has a list of all patents in a month, associated with SureChemBL compounds
#
# This works! May be a pain to search over every month, but it's possible...

[('US-4181664-A', ['SCHEMBL1812', 'SCHEMBL2454118', 'SCHEMBL11454132', 'SCHEMBL11470144', 'SCHEMBL11472968', 'SCHEMBL11455057', 'SCHEMBL1816', 'SCHEMBL11476626', 'SCHEMBL11472939', 'SCHEMBL11470688', 'SCHEMBL11462969', 'SCHEMBL11452721', 'SCHEMBL10498831', 'SCHEMBL10940552', 'SCHEMBL9458455', 'SCHEMBL1009808', 'SCHEMBL11483372', 'SCHEMBL11467835', 'SCHEMBL11454317', 'SCHEMBL120896', 'SCHEMBL1247', 'SCHEMBL11454025', 'SCHEMBL11454227', 'SCHEMBL11471395', 'SCHEMBL1967', 'SCHEMBL11476624', 'SCHEMBL10564539', 'SCHEMBL11471592', 'SCHEMBL9489027', 'SCHEMBL11481038', 'SCHEMBL11467833', 'SCHEMBL11475722', 'SCHEMBL2262', 'SCHEMBL11454052', 'SCHEMBL11462594', 'SCHEMBL11460770', 'SCHEMBL1798', 'SCHEMBL11085564', 'SCHEMBL25158', 'SCHEMBL11481104', 'SCHEMBL11453131', 'SCHEMBL11467831', 'SCHEMBL11470679', 'SCHEMBL11470687', 'SCHEMBL11454040', 'SCHEMBL9096638', 'SCHEMBL1586', 'SCHEMBL393249', 'SCHEMBL6966505', 'SCHEMBL9807176', 'SCHEMBL10699570', 'SCHEMBL11451896', 'SCHEMBL11462970', 'SCHEMBL11472941

"\nSummary: has a list of all patents in a month, associated with SureChemBL compounds\n\nThis works! May be a pain to search over every month, but it's possible...\n"

In [None]:
## Testing 2: SureChemBL data files
# Notes-only cell: no data is loaded here. The string literal below records the
# conclusion reached about these files.

"""
Summary: holds all cpd:structure pairs

Useful for linking ids to structures, but nothing else right now...
"""

In [13]:
## Testing 3: index_edgelist_bipartite.p

# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open("Data/PubchemTesting/index_edgelist_bipartite.p", "rb") as edgelist_file:
    index_edgelist = pickle.load(edgelist_file)
print(type(index_edgelist))

# Peek at the first ten entries.
print(index_edgelist[0:10])

# Summary: I think this is the igraph format of the graph structure...using igraph's ids
#
# Not helpful, I'll have to link ids to patent/cpd ids anyway

<class 'list'>
[(21642937, 13500164), (21642937, 13496703), (21642937, 11697068), (21642937, 13000345), (21642937, 13008282), (21642937, 11213625), (21642937, 14644888), (21642937, 8925145), (21642937, 14299252), (21642937, 11867437)]


In [16]:
## Testing 4: patent_summary.csv
patent_summary = pd.read_csv("Data/PubchemTesting/0-patent_summary.csv")

print(patent_summary)

# Summary: holds the numbers of patents (total & new) for each month
#
# Not useful for this, unfortunately


     Unnamed: 0    month  newPatents  totalPatents
0           635  1962-01           1             1
1           634  1962-02           0             1
2           633  1962-03           0             1
3           632  1962-04           0             1
4           631  1962-05           0             1
..          ...      ...         ...           ...
703         703  2020-08       17772       4718060
704         702  2020-09       18737       4736797
705         707  2020-10       22580       4759377
706         706  2020-11       17575       4776952
707         705  2020-12       22717       4799669

[708 rows x 4 columns]


In [19]:
## Testing 5: cpd_patent_edges

# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open("Data/PubchemTesting/cpd_patent_edges_1980-01.p", "rb") as edges_file:
    cpd_patent_edges = pickle.load(edges_file)

# Peek at the first two (compound ID, patent ID) pairs.
print(cpd_patent_edges[:2])

# Summary: holds all cpd:patent id pairs, showing where that particular compound is (first?) found
#
# Not as useful as patent_cpd_edges

[('SCHEMBL11310284', 'US-4186208-A'), ('SCHEMBL7622', 'EP-0007112-A2')]


'\nSummary: holds all cpd:patent id pairs, showing where that particular compounds is (first?) found\n\nNot as useful as patent_cpd_edges\n'