In [1]:
import pickle
import random
import urllib
import time
import json
import pandas as pd
import os
from tqdm import tqdm

import rdkit.Chem as Chem

# Goal: Analyze MA change by authors & assignees (companies)

First test: sample patents, then authors, then all patents (& therefore compounds) made by authors


(do the same as above, but with assignees)

## Find Authors & Assignees

From JSON files, find all authors & assignees

In [78]:
def get_authors_and_assignees(data):
    """ Use the Pubchem-downloaded patent JSON files to get all authors & assignees 

    Args:
        data (json dict): dictionary of patent JSON records
    """
    inventor_dict = [x for x in data["Record"]["Section"] if x["TOCHeading"] == "Inventor"][0]["Information"][0]["Value"]["StringWithMarkup"]
    authors = [x["String"] for x in inventor_dict]
    
    assignee_dict = [x for x in data["Record"]["Section"] if x["TOCHeading"] == "Assignee"][0]["Information"][0]["Value"]["StringWithMarkup"]
    assignees = [x["String"] for x in assignee_dict]

    return authors, assignees


In [82]:
## Get info from all patents in the Data/Patents directory
fp = "Data/Patents/"
# Data/Patents/ also contains sub-directories (e.g., Patent_Author_Records/),
# so only regular files are candidates for parsing
files = [file for file in os.listdir(fp) if os.path.isfile(fp + file)]
authors = []
assignees = []

for file in tqdm(files):

    try:
        # Context manager closes each handle as soon as the record is parsed
        with open(fp + file) as json_file:
            data = json.load(json_file)
    except ValueError:
        # Not a JSON patent record (e.g., the pickles/CSVs this notebook saves
        # into the same directory) - skip it instead of aborting the whole run
        continue

    new_authors, new_assignees = get_authors_and_assignees(data)

    authors.extend(new_authors)
    assignees.extend(new_assignees)


100%|██████████| 74/74 [00:00<00:00, 1113.23it/s]


In [None]:
#Remove duplicates (if present) and save
authors = list(set(authors))
assignees = list(set(assignees))

# Context managers make sure each pickle is flushed and closed once written
with open("Data/Patents/authors.p", "wb") as authors_file:
    pickle.dump(authors, file=authors_file)

with open("Data/Patents/assignees.p", "wb") as assignees_file:
    pickle.dump(assignees, file=assignees_file)

## Find patent ids associated with authors

Patents associated with each author have already been scraped (see *patent_scraping.py*).
The next step is to find all patents (and eventually compounds) associated with each author.

In [5]:
## Getting patent IDs from all JSON author files
fp = "Data/Patents/Patent_Author_Records/"
files = os.listdir(fp)

patent_data = []

for f in tqdm(files):
    try:
        # Context manager closes each of the many record files right after parsing
        with open(file=fp + f) as record_file:
            data = json.load(record_file)
    except ValueError:
        # Unparseable file: treat it as containing no patents
        data = []

    for patent in data:
        # Skip entries that are not patent records (dicts); they have no fields to read
        if not isinstance(patent, dict):
            continue

        #Get all authors (in list form); a missing field becomes a single blank author
        inventors = patent.get("inventors", "")
        authors = inventors if isinstance(inventors, list) else [inventors]

        #Get assignees which are not part of author list (meant to find only businesses/universities/other)
        if "assignees" in patent:
            a = patent["assignees"]
            assignees = a if isinstance(a, list) else [a]
            assignees = list(set(assignees) - set(authors))
        else:
            # NOTE: missing assignees stay "" (not []) so the CSV written from df keeps its format
            assignees = ""

        classification = patent.get("classification", "")

        #One row per (patent, author) pair
        for author in authors:
            patent_data.append({"ID": patent["publicationnumber"], "author": author, "assignees": assignees,
                "classification": classification})


df = pd.DataFrame.from_dict(patent_data)
print(df)


100%|██████████| 21947/21947 [14:30<00:00, 25.22it/s]  


                       ID                         author  \
0           US-8653233-B2  HOLLINGSWORTH MICHAEL ANTHONY   
1           US-8653233-B2                  KOHLGRAF KARL   
2           US-8653233-B2                    CAFFREY TOM   
3        US-2011312290-A1                 BEELER MICHAEL   
4        US-2011312290-A1   CANNON RICHARD HOLLINGSWORTH   
...                   ...                            ...   
7056907     US-8663196-B2          ZYZELEWSKI MARK EDWIN   
7056908     US-9610166-B2              GUNTHER STEPHEN B   
7056909     US-9610166-B2              O'FARRELL DESMOND   
7056910     US-9610166-B2          ZYZELEWSKI MARK EDWIN   
7056911     US-9610166-B2         RODENHOUSE ANDREW JOHN   

                                       assignees  \
0                     [UNIV NEBRASKA MEDICAL CT]   
1                     [UNIV NEBRASKA MEDICAL CT]   
2                     [UNIV NEBRASKA MEDICAL CT]   
3                         [COMTECH EF DATA CORP]   
4                  

In [6]:
# Persist the (patent, author) table so later sections can start from the CSV
df.to_csv(path_or_buf="Data/Patents/patent_author_records.csv")

## Link patent IDs to compounds

Based on the testing cells at the end of this notebook, `patent_cpd_edges` is used to look up the compounds for each patent.

In [2]:
# Reload the (patent, author) table written by the previous section
df = pd.read_csv(filepath_or_buffer="Data/Patents/patent_author_records.csv")

In [3]:
## 1: get all unique patents
patents = set(df["ID"])

### Randomly sampling from patents in order to make this computationally feasible (goal: ~1 million cpds)
# random.sample() needs a sequence: sampling straight from a set is deprecated
# since Python 3.9 and raises TypeError from Python 3.11 on.
patents = list(patents)
# min() keeps the cell working when fewer than 100000 unique patents exist
patents = random.sample(patents, min(100000, len(patents)))

print(len(patents))


100000


In [4]:
# Drop the patent dataframe to free memory; the sampled IDs live on in `patents`
del df

In [50]:
def build_month_increments(start, stop):
    """ Build every month label between two years (both inclusive) as YYYY-MM

    Args:
        start (int): First year to include
        stop (int): Last year to include

    Returns:
        list: strings in the form YYYY-MM (e.g., "1980-01"), in chronological
            order; empty if start is greater than stop
    """
    # "01" ... "12"
    month_labels = [f"{number:02d}" for number in range(1, 13)]

    months = []
    year = start
    while year <= stop:
        months.extend(f"{year}-{label}" for label in month_labels)
        year += 1

    return months

In [60]:
### Tangent - put all patent_cpd_edgesXXX into a single dictionary to search from

#NOTE: ONLY SHOULD BE RUN ONCE

patent_cpd_edges = {}

months = build_month_increments(1980, 2020)

# One pickle of patent -> compound edges per month
edge_dir = "Data/CpdPatentIdsDates/Patent_Cpd_Edges/"
files = [edge_dir + "patent_cpd_edges_" + month + ".p" for month in months]

cpds = []
for edge_path in tqdm(files):
    with open(edge_path, "rb") as handle:
        try:
            monthly_edges = pickle.load(handle)
        except EOFError:
            # Empty or truncated pickle: contributes no edges
            monthly_edges = {}

    # Merge this month's patents into the combined lookup
    patent_cpd_edges.update(monthly_edges)

# pickle.dump(patent_cpd_edges, file=open("Data/CpdPatentIdsDates/Patent_Cpd_Edges/full_patent_cpd_edges.p", "wb"))

100%|██████████| 492/492 [04:44<00:00,  1.73it/s]


In [57]:
## Get all compound IDs - filter down into a non-repeating list

# #NOTE: UNCOMMENT PICKLE.LOAD WHEN RUNNING THIS WITHOUT BLOCK ABOVE
# patent_cpd_edges = pickle.load(file=open("Data/CpdPatentIdsDates/Patent_Cpd_Edges/full_patent_cpd_edges.p", "rb"))

# Reuse the previously saved patent sample when there is one. On a first run
# the file does not exist yet (it is only written by the save cell below), so
# the sample drawn earlier in this notebook (`patents`) is used as-is.
sample_path = "Data/Patents/author_patent_samples.p"
if os.path.exists(sample_path):
    with open(sample_path, "rb") as sample_file:
        patents = pickle.load(sample_file)

# Accumulate straight into a set so duplicates are never stored
cpds = set()
for p in tqdm(patents):
    if p in patent_cpd_edges:
        cpds.update(patent_cpd_edges[p])

cpds = list(cpds)

## NOTE: goal of this is to have ~1 million (change number of patents otherwise)
print(len(cpds))
print(cpds[0:10])

100%|██████████| 100000/100000 [00:00<00:00, 100615.67it/s]


1034394
['SCHEMBL6077928', 'SCHEMBL1050012', 'SCHEMBL2848301', 'SCHEMBL48933', 'SCHEMBL8729586', 'SCHEMBL7002444', 'SCHEMBL12741818', 'SCHEMBL14350314', 'SCHEMBL22645514', 'SCHEMBL7919644']


In [58]:
## Save compounds
# Context managers close (and flush) each pickle even if the dump is interrupted
with open("Data/Patents/author_cpds_sampled.p", "wb") as cpds_file:
    pickle.dump(cpds, file=cpds_file)

## Save sampled patents too
with open("Data/Patents/author_patent_samples.p", "wb") as patents_file:
    pickle.dump(patents, file=patents_file)

KeyboardInterrupt: 

In [None]:
# Drop the patent -> compound edge dictionary to free memory.
# NOTE: Step 3 of the assignee section looks patents up in patent_cpd_edges,
# so the dictionary has to be rebuilt (or reloaded) before that cell is run.
del patent_cpd_edges

## Find patent cpds associated with assignees

Goal - less than 1 million unique cpds (different from patent samples - over 1e6 is fine, as long as unique cpds stay below 1e6)

In [66]:
## Step 1: get all assignees, and find those which are companies (filter by required naming)

# Each record file is named after its assignee; dropping the last 5 characters
# removes the file extension
assignee_dir = "Data/Patents/Patent_Assignee_Records/"
assignees = []
for record_file in os.listdir(assignee_dir):
    assignees.append(record_file[:-5])

print(len(assignees))
print(assignees[:10])

6166
['', '10X_GENOMICS_INC', '21ST_CENTURY_MEDICINE', '3D_ECO_OIL_LTD', '3D_SYSTEMS_INC', 'A-DEC_INC', 'A_P_C_A_ASSEMBLEE_PERMANENTE_DES_CHAMBRES_D_AGRICULTURE', 'AAKESSON_PER', 'AASBERG-PETERSEN_KIM', 'ABAECHERLI_ROGER']


In [67]:
# Filtering - from companiesinc.com (might be better ways to do this?), as well as adding universities
terms = ["CORP", "INC", "CO", "LTD", "LLC", "LLLP", "RLLLP", "CORPORATION", "INCORPORATED", "LIMITED", "COMPANY", "UNIV", "UNIVERSITY"]

# Names are "_"-separated words (e.g., 'KAWASAKI_STEEL_CO'), so compare whole
# words against the terms. A plain substring test would also accept any personal
# name that merely contains "CO" or "INC" somewhere inside a word.
assignees = [a for a in assignees if any(word in terms for word in a.split("_"))]

#Sample 1000 assignees from this list (otherwise too many compounds)
# min() avoids a ValueError when fewer than 1000 names survive the filter
assignees = random.sample(assignees, min(1000, len(assignees)))

print(len(assignees))
print(assignees[0:10])

1000
['UNIV_MAINE_SYSTEM', 'SHIONOGI_&_CO', 'PARK_OHIO_INDUSTRIES_INC', 'KODAK_LTD', 'BIOMEDICAL_RES_GROUP_INC', 'KAWASAKI_STEEL_CO', 'IMMUNEX_CORP', 'NANYA_PLASTICS_CORP', 'POLYSAR_LTD', 'XYLECO_INC']


In [68]:
## Step 2: get all patents from JSON files
fp = "Data/Patents/Patent_Assignee_Records/"

#Use filtered assignee list to narrow down files
# (a set makes each file-name lookup constant-time)
selected_assignees = set(assignees)
files = [f for f in os.listdir(fp) if f[:-5] in selected_assignees]

patent_data = []

for f in tqdm(files):
    try:
        # Context manager closes each record file right after parsing
        with open(file=fp + f) as record_file:
            data = json.load(record_file)
    except ValueError:
        # Unparseable file: treat it as containing no patents
        data = []

    for patent in data:
        # Entries that are not dicts cannot be read by field name; skip them
        # explicitly instead of swallowing the resulting TypeError
        if not isinstance(patent, dict):
            continue

        #Get all authors (in list form); a missing field becomes a single blank author
        inventors = patent.get("inventors", "")
        authors = inventors if isinstance(inventors, list) else [inventors]

        classification = patent.get("classification", "")

        #Save patent data by assignee - one patent per assignee, which is obtained from the file name
        patent_data.append({"ID": patent["publicationnumber"], "author": authors, "assignees": f[:-5],
            "classification": classification})


df = pd.DataFrame.from_dict(patent_data)
print(df)


100%|██████████| 1000/1000 [06:21<00:00,  2.62it/s] 


                      ID                                             author  \
0       WO-2021226290-A1  [MCDONNELL WYATT, STUBBINGTON MICHAEL JOHN, MC...   
1       WO-2021247618-A1  [MCDONNELL WYATT JAMES, PFEIFFER KATHERINE, RA...   
2         US-10544413-B2  [BHARADWAJ RAJIV, SCHNALL-LEVIN MICHAEL, MAKAR...   
3       US-2018312873-A1                                    [ZHENG XINYING]   
4          US-9975122-B2  [MASQUELIER DONALD A, HINDSON BENJAMIN, NESS K...   
...                  ...                                                ...   
340935     US-9592253-B1          [KEYSER DONALD JEFFREY, GUILLEM ALVARO F]   
340936     US-9861658-B2          [KEYSER DONALD JEFFREY, GUILLEM ALVARO F]   
340937     US-9943637-B2          [KEYSER DONALD JEFFREY, GUILLEM ALVARO F]   
340938     US-8344322-B2                                 [EDWARDS OLIVER J]   
340939     US-9212990-B1                                  [MURAVIEV ANDREY]   

               assignees                           

In [69]:
# Persist the (patent, assignee) table built in Step 2
df.to_csv(path_or_buf="Data/Patents/patent_assignee_records.csv")

In [70]:
## Step 3: Get all compound IDs - filter down into a non-repeating list
## NOTE: NEEDS TO BE RUN WITH PATENT_CPD_EDGES BUILDING BLOCK ABOVE

patents = df["ID"].tolist()

# Gather the compounds of every patent that has an entry in patent_cpd_edges;
# the set drops repeats as they arrive
unique_cpds = set()
for patent_id in tqdm(patents):
    unique_cpds.update(patent_cpd_edges.get(patent_id, ()))

cpds = list(unique_cpds)

print(len(cpds))
print(cpds[:10])

100%|██████████| 340940/340940 [00:09<00:00, 36930.20it/s]


2048622
['SCHEMBL2960146', 'SCHEMBL2819623', 'SCHEMBL1564641', 'SCHEMBL9194335', 'SCHEMBL1337775', 'SCHEMBL3560392', 'SCHEMBL10007518', 'SCHEMBL5850481', 'SCHEMBL16528158', 'SCHEMBL48933']


In [71]:
## Step 4: filter cpds with patent author cpds, goal is to have ~1 million unique cpds (otherwise, sample!)
# Context manager closes the pickle once it has been read
with open("Data/Patents/author_cpds_sampled.p", "rb") as author_cpds_file:
    author_cpds = pickle.load(author_cpds_file)

# Keep only compounds that are not already part of the author sample
cpds = list(set(cpds) - set(author_cpds))

print(len(cpds))
print(cpds[0:10])


1751784
['SCHEMBL2960146', 'SCHEMBL2819623', 'SCHEMBL1564641', 'SCHEMBL9194335', 'SCHEMBL1337775', 'SCHEMBL3560392', 'SCHEMBL10007518', 'SCHEMBL5850481', 'SCHEMBL16528158', 'SCHEMBL1941238']


In [72]:
## Save compounds
# Context managers close (and flush) each pickle once it has been written
with open("Data/Patents/assignee_cpds_sampled.p", "wb") as cpds_file:
    pickle.dump(cpds, file=cpds_file)

## Save sampled patents too
with open("Data/Patents/assignee_patent_samples.p", "wb") as patents_file:
    pickle.dump(patents, file=patents_file)

## Link cpds with structures

Will be done regardless of author/assignee origin

In [2]:
## Load compounds (START HERE FOR ID -> STRUCTURES)

# Context manager closes the pickle once it has been read
with open("Data/Patents/assignee_cpds_sampled.p", "rb") as cpds_file:
    cpds = pickle.load(cpds_file)

In [3]:
# Load the pickled table of SureChEMBL compounds; the context manager closes
# the file as soon as it has been read
with open("Data/Cpd_Data/SureChemBL_allCpds.p", "rb") as allcpds_file:
    surechembl_allcpds = pickle.load(allcpds_file)
print(type(surechembl_allcpds))

<class 'pandas.core.frame.DataFrame'>


In [4]:
# Keep only the rows whose SureChEMBL ID is in the sampled compound list
in_sample = surechembl_allcpds["SureChEMBL_ID"].isin(cpds)
cpd_df = surechembl_allcpds[in_sample]

In [5]:
# The full compound table is no longer needed once cpd_df has been extracted
del surechembl_allcpds

In [6]:
# Persist the structures of the sampled assignee compounds
cpd_df.to_csv(path_or_buf="Data/Patents/assignee_cpds_structures.csv")

In [None]:
def inchi_to_mol(inchi, ID, out_dir="Data/AssemblyValues/Patent_Authors/"):
    """ Translates an InChI string to a mol block and saves it as <out_dir>/<ID>.mol

    The conversion is best-effort: if the InChI cannot be converted or the file
    cannot be written, the compound is skipped silently and no error is raised.

    Args:
        inchi (str): InChI description of a molecule
        ID (str): identifier of the molecule, used as the file name
        out_dir (str): directory the .mol file is written to. It is not created
            here, so a missing directory makes every compound get skipped

    Returns:
        None
    """
    try:
        mol = Chem.MolFromInchi(inchi)
        if mol is None:
            # Nothing to write when RDKit could not build a molecule
            return

        mol_block = Chem.MolToMolBlock(mol)

        # Context manager closes the file; print() adds the trailing newline
        with open(os.path.join(out_dir, ID + ".mol"), "w") as mol_file:
            print(mol_block, file=mol_file)
    except Exception:
        # Deliberately Exception, not a bare except: KeyboardInterrupt must still
        # be able to stop a long batch run
        pass

In [None]:
## Get mol files
# Registers progress_apply on pandas objects
tqdm.pandas()


def _write_row_as_mol(row):
    """ Write the mol file for one compound row, using its "InChI" and "SureChEMBL_ID" columns """
    return inchi_to_mol(row["InChI"], row["SureChEMBL_ID"])


cpd_df.progress_apply(_write_row_as_mol, axis=1)
    

  9%|▉         | 535989/5963485 [5:54:14<8389:24:52,  5.56s/it] 

## Linking patent IDs to compounds: testing

In [None]:
## Test 1: patent_cpd_edges

# NOTE: this rebinds patent_cpd_edges to the edges of a single month (1980-01)
# Context manager closes the pickle once it has been read
with open("Data/PubchemTesting/patent_cpd_edges_1980-01.p", "rb") as edges_file:
    patent_cpd_edges = pickle.load(edges_file)

print(list(patent_cpd_edges.items())[:2])

"""
Summary: has a list of all patents in a month, associated with SureChemBL compounds

This works! May be a pain to search over every month, but it's possible...
"""

[('US-4181664-A', ['SCHEMBL1812', 'SCHEMBL2454118', 'SCHEMBL11454132', 'SCHEMBL11470144', 'SCHEMBL11472968', 'SCHEMBL11455057', 'SCHEMBL1816', 'SCHEMBL11476626', 'SCHEMBL11472939', 'SCHEMBL11470688', 'SCHEMBL11462969', 'SCHEMBL11452721', 'SCHEMBL10498831', 'SCHEMBL10940552', 'SCHEMBL9458455', 'SCHEMBL1009808', 'SCHEMBL11483372', 'SCHEMBL11467835', 'SCHEMBL11454317', 'SCHEMBL120896', 'SCHEMBL1247', 'SCHEMBL11454025', 'SCHEMBL11454227', 'SCHEMBL11471395', 'SCHEMBL1967', 'SCHEMBL11476624', 'SCHEMBL10564539', 'SCHEMBL11471592', 'SCHEMBL9489027', 'SCHEMBL11481038', 'SCHEMBL11467833', 'SCHEMBL11475722', 'SCHEMBL2262', 'SCHEMBL11454052', 'SCHEMBL11462594', 'SCHEMBL11460770', 'SCHEMBL1798', 'SCHEMBL11085564', 'SCHEMBL25158', 'SCHEMBL11481104', 'SCHEMBL11453131', 'SCHEMBL11467831', 'SCHEMBL11470679', 'SCHEMBL11470687', 'SCHEMBL11454040', 'SCHEMBL9096638', 'SCHEMBL1586', 'SCHEMBL393249', 'SCHEMBL6966505', 'SCHEMBL9807176', 'SCHEMBL10699570', 'SCHEMBL11451896', 'SCHEMBL11462970', 'SCHEMBL11472941

"\nSummary: has a list of all patents in a month, associated with SureChemBL compounds\n\nThis works! May be a pain to search over every month, but it's possible...\n"

In [None]:
## Testing 2: SureChemBL data files
# No code is run in this cell; the string below only records the conclusion.

"""
Summary: holds all cpd:structure pairs

Useful for linking ids to structures, but nothing else right now...
"""

In [None]:
## Testing 3: index_edgelist_bipartite.p

# Context manager closes the pickle once it has been read
with open("Data/CpdPatentIdsDates/index_edgelist_bipartite.p", "rb") as edgelist_file:
    index_edgelist = pickle.load(edgelist_file)
print(type(index_edgelist))

print(index_edgelist[0:10])

test_list = index_edgelist[0:10]

# dict() keeps only the last pair for each repeated first element
print(dict(test_list))
"""
Summary: I think this is the igraph format of the graph structure...using igraph's ids

Not helpful, I'll have to link ids to patent/cpd ids anyway

"""

<class 'list'>
[(21642937, 13500164), (21642937, 13496703), (21642937, 11697068), (21642937, 13000345), (21642937, 13008282), (21642937, 11213625), (21642937, 14644888), (21642937, 8925145), (21642937, 14299252), (21642937, 11867437)]
{21642937: 11867437}


"\nSummary: I think this is the igraph format of the graph structure...using igraph's ids\n\nNot helpful, I'll have to link ids to patent/cpd ids anyway\n\n"

In [None]:
## Testing 4: patent_summary.csv
summary_csv = "Data/PubchemTesting/0-patent_summary.csv"
patent_summary = pd.read_csv(filepath_or_buffer=summary_csv)

print(patent_summary)

"""
Summary: holds the numbers of patents (total & new) for each month

Not useful for this, unfortunately

"""


     Unnamed: 0    month  newPatents  totalPatents
0           635  1962-01           1             1
1           634  1962-02           0             1
2           633  1962-03           0             1
3           632  1962-04           0             1
4           631  1962-05           0             1
..          ...      ...         ...           ...
703         703  2020-08       17772       4718060
704         702  2020-09       18737       4736797
705         707  2020-10       22580       4759377
706         706  2020-11       17575       4776952
707         705  2020-12       22717       4799669

[708 rows x 4 columns]


In [None]:
## Testing 5: cpd_patent_edges

# Context manager closes the pickle once it has been read
with open("Data/PubchemTesting/cpd_patent_edges_1980-01.p", "rb") as edges_file:
    cpd_patent_edges = pickle.load(edges_file)

print(cpd_patent_edges[:2])

"""
Summary: holds all cpd:patent id pairs, showing where that particular compounds is (first?) found

Not as useful as patent_cpd_edges
"""

[('SCHEMBL11310284', 'US-4186208-A'), ('SCHEMBL7622', 'EP-0007112-A2')]


'\nSummary: holds all cpd:patent id pairs, showing where that particular compounds is (first?) found\n\nNot as useful as patent_cpd_edges\n'

In [None]:
## Testing 6: patent_ID_index_dict

# Context manager closes the pickle once it has been read
with open("Data/patent_ID_index_dict.p", "rb") as index_file:
    patent_ID_index_dict = pickle.load(index_file)

print(list(patent_ID_index_dict.items())[0:10])


""" 
Not useful this is the iGraph IDs, which could potentially be useful, but still needs to loop through everything...

Eh...wait...maybe if I can get a dictionary of all numbers associated with cpds from the master edgelist...
"""



[('EP-0269274-B1', 0), ('EP-1567545-A2', 1), ('EP-0835920-B1', 2), ('US-20050277593-A1', 3), ('EP-1937634-A4', 4), ('EP-1523328-A2', 5), ('EP-1474414-B1', 6), ('JP-H02174-A', 7), ('EP-1897683-B1', 8), ('US-3906010-A', 9)]


In [None]:
# NOTE: this rebinds cpd_df to the master compound/date/index table
# Context manager closes the pickle once it has been read
with open("Data/Cpd_Data/master_cpd_date_index_df.p", "rb") as master_file:
    cpd_df = pickle.load(master_file)

print(cpd_df)

                      Cpd    Month     Index
0          SCHEMBL7356245  1962-01  17422145
1          SCHEMBL7247395  1963-08    992275
2          SCHEMBL7340057  1963-08   8302315
3           SCHEMBL180193  1965-02   9088770
4           SCHEMBL180194  1965-02   4821631
...                   ...      ...       ...
18845995  SCHEMBL21610019  2019-12  20096324
18845996  SCHEMBL21609918  2019-12    563495
18845997  SCHEMBL21609646  2019-12  13053968
18845998  SCHEMBL21609627  2019-12  12059745
18845999  SCHEMBL21610082  2019-12  12689868

[18846000 rows x 3 columns]


In [None]:
# Series.min() is vectorised; the builtin min() would walk all rows one by one in Python
print(cpd_df["Index"].min())

-1
