In [15]:
import pickle
import random
import urllib
import time
import json
import pandas as pd

# Goal: Analyze MA change by authors

First test: sample patents, then authors, then all patents (& therefore compounds) made by authors

## Testing on Nov 2020 unique patents

In [2]:
#Read in unique patents from Nov 2020
data = pickle.load(file=open("Data/CpdPatentIdsDates/unique_patents_2020-11.p", "rb"))

print(data[0:10])

['US-10821293-B2', 'WO-2020237223-A1', 'EP-2981258-B8', 'US-10836113-B2', 'US-20200369598-A1', 'US-10829892-B2', 'US-20200363418-A1', 'EP-3355706-B1', 'US-10822514-B2', 'EP-3733775-A1']


In [3]:
#Sample 10 random patents - will be manually searched on pubchem
test = random.sample(data, 10)
print(test)

['WO-2020221451-A1', 'US-20200359684-A1', 'EP-3736314-A1', 'EP-3733705-A1', 'US-10836813-B2', 'US-20200360496-A1', 'EP-3733791-A1', 'EP-3737711-A1', 'US-10844114-B2', 'EP-3512942-B1']


## Sample PubChem API patent searching

Goal - access & search authors/assignees within patent records, in order to find a list of patents associated with each author/assignee

In [8]:
patent = "EP-3465118-B1"
patentapi = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/patent/{0}/JSON?".format(patent)
try:
    url=urllib.request.urlopen(patentapi)
except urllib.error.HTTPError as err:
    print("tried {} will sleep on it".format(patentapi))
    time.sleep(5)
    try:
        url=urllib.request.urlopen(patentapi)
    except urllib.error.HTTPError as err:
        print(err)
        
patresp = json.loads(url.read().decode('latin-1'))
print(patresp)
# submissiondate = patresp['Record']['Section'][0]["Information"][0]["Value"]["DateISO8601"][0]
# print("{0} {1} {2}".format(patent,submissiondate))

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)>

## Linking patent IDs to compounds: testing

In [14]:
## Test 1: patent_cpd_edges

patent_cpd_edges = pickle.load(file=open("Data/PubchemTesting/patent_cpd_edges_1980-01.p", "rb"))

print(list(patent_cpd_edges.items())[:2])

"""
Summary: has a list of all patents in a month, associated with SureChemBL compounds

This works! May be a pain to search over every month, but it's possible...
"""

[('US-4181664-A', ['SCHEMBL1812', 'SCHEMBL2454118', 'SCHEMBL11454132', 'SCHEMBL11470144', 'SCHEMBL11472968', 'SCHEMBL11455057', 'SCHEMBL1816', 'SCHEMBL11476626', 'SCHEMBL11472939', 'SCHEMBL11470688', 'SCHEMBL11462969', 'SCHEMBL11452721', 'SCHEMBL10498831', 'SCHEMBL10940552', 'SCHEMBL9458455', 'SCHEMBL1009808', 'SCHEMBL11483372', 'SCHEMBL11467835', 'SCHEMBL11454317', 'SCHEMBL120896', 'SCHEMBL1247', 'SCHEMBL11454025', 'SCHEMBL11454227', 'SCHEMBL11471395', 'SCHEMBL1967', 'SCHEMBL11476624', 'SCHEMBL10564539', 'SCHEMBL11471592', 'SCHEMBL9489027', 'SCHEMBL11481038', 'SCHEMBL11467833', 'SCHEMBL11475722', 'SCHEMBL2262', 'SCHEMBL11454052', 'SCHEMBL11462594', 'SCHEMBL11460770', 'SCHEMBL1798', 'SCHEMBL11085564', 'SCHEMBL25158', 'SCHEMBL11481104', 'SCHEMBL11453131', 'SCHEMBL11467831', 'SCHEMBL11470679', 'SCHEMBL11470687', 'SCHEMBL11454040', 'SCHEMBL9096638', 'SCHEMBL1586', 'SCHEMBL393249', 'SCHEMBL6966505', 'SCHEMBL9807176', 'SCHEMBL10699570', 'SCHEMBL11451896', 'SCHEMBL11462970', 'SCHEMBL11472941

"\nSummary: has a list of all patents in a month, associated with SureChemBL compounds\n\nThis works! May be a pain to search over every month, but it's possible...\n"

In [None]:
## Testing 2: SureChemBL data files

"""
Summary: holds all cpd:structure pairs

Useful for linking ids to structures, but nothing else right now...
"""

In [13]:
## Testing 3: index_edgelist_bipartite.p

index_edgelist = pickle.load(file=open("Data/PubchemTesting/index_edgelist_bipartite.p", "rb"))
print(type(index_edgelist))

print(index_edgelist[0:10])

"""
Summary: I think this is the igraph format of the graph structure...using igraph's ids

Not helpful, I'll have to link ids to patent/cpd ids anyway

"""

<class 'list'>
[(21642937, 13500164), (21642937, 13496703), (21642937, 11697068), (21642937, 13000345), (21642937, 13008282), (21642937, 11213625), (21642937, 14644888), (21642937, 8925145), (21642937, 14299252), (21642937, 11867437)]


In [16]:
## Testing 4: patent_summary.csv
patent_summary = pd.read_csv("Data/PubchemTesting/0-patent_summary.csv")

print(patent_summary)

"""
Summary: holds the numbers of patents (total & new) for each month

Not useful for this, unfortunately

"""


     Unnamed: 0    month  newPatents  totalPatents
0           635  1962-01           1             1
1           634  1962-02           0             1
2           633  1962-03           0             1
3           632  1962-04           0             1
4           631  1962-05           0             1
..          ...      ...         ...           ...
703         703  2020-08       17772       4718060
704         702  2020-09       18737       4736797
705         707  2020-10       22580       4759377
706         706  2020-11       17575       4776952
707         705  2020-12       22717       4799669

[708 rows x 4 columns]


In [19]:
## Testing 5: cpd_patent_edges

cpd_patent_edges = pickle.load(file=open("Data/PubchemTesting/cpd_patent_edges_1980-01.p", "rb"))

print(cpd_patent_edges[:2])

"""
Summary: holds all cpd:patent id pairs, showing where that particular compounds is (first?) found

Not as useful as patent_cpd_edges
"""

[('SCHEMBL11310284', 'US-4186208-A'), ('SCHEMBL7622', 'EP-0007112-A2')]


'\nSummary: holds all cpd:patent id pairs, showing where that particular compounds is (first?) found\n\nNot as useful as patent_cpd_edges\n'