In [74]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import os
import numpy as np
import json

In [None]:
#Matplotlib defaults (eventually)

# Step 1: Build data structures

See pg 116 in notebook

* Dataframe, organized by individual patent-author / patent-assignee combinations (with appropriate tags)
    - in Data/Patents/patent_MA_results.csv
* Dictionary linking patents to a list of cpds associated with it
    - in Data/Patents/patent_cpds_links.p
* Dictionary linking patents to a list of MA values associated with it
* Dictionary linking patents to classifications

In [3]:
## Patent/classification dictionary
author_df = pd.read_csv("Data/Patents/patent_author_records.csv")
assignee_df = pd.read_csv("Data/Patents/patent_assignee_records.csv")

In [6]:
#Do author/assignees have different patent ids?

author_ids = list(set(list(author_df["ID"])))
assignee_ids = list(set(list(assignee_df["ID"])))

print(len(author_ids))
print(len(assignee_ids))

print(len(set(author_ids) - set(assignee_ids))) #This should be 0...

744686
332467
660634


In [7]:
patent_classifications = dict(zip(author_df.ID, author_df.classification))

print(len(patent_classifications))

patent_classifications.update(dict(zip(assignee_df.ID, assignee_df.classification)))

print(len(patent_classifications))

# tqdm.pandas()

# print("----- Author Patents ----- ")
# for index, row in tqdm(author_df.iterrows(), total=len(author_df)):
#     patent_classifications[row["ID"]] = row["classification"]

# print("----- Assignee Patents -----")
# for index, row in tqdm(assignee_df.iterrows(), total=len(assignee_df)):
#     patent_classifications[row["ID"]] = row["classification"]



744686
993101


In [8]:
pickle.dump(patent_classifications, file=open("Data/Patents/patent_classification_links.p", "wb"))

# Step 2: Link MAs with authors/companies

(1/9) Currently incomplete - data is still running on agave

In [2]:
results_df = pd.read_csv("Data/Patents/patent_MA_results.csv")

print(results_df.head())

   Unnamed: 0                ID         author    assignees
0           0  US-2016000413-A1  A AMAR OUSAMA  UNIV_BOSTON
1           3  US-2016000413-A1   BIGIO IRVING  UNIV_BOSTON
2           6  US-2016000413-A1  BRIGGS JOHN C  UNIV_BOSTON
3           9  US-2016000413-A1  CHARGIN DAVID  UNIV_BOSTON
4          12  US-2016000413-A1  LEE STEPHANIE  UNIV_BOSTON


In [3]:
patent_cpd_links = pickle.load(file=open("Data/Patents/patent_cpd_links.p", "rb"))

In [5]:
print(list(patent_cpd_links.items())[0:5])

[('WO-2013069771-A1', ['SCHEMBL14948774', 'SCHEMBL14948814', 'SCHEMBL14948780', 'SCHEMBL14948825', 'SCHEMBL14948770', 'SCHEMBL14948779', 'SCHEMBL14948837', 'SCHEMBL14948833', 'SCHEMBL14948841', 'SCHEMBL14948781', 'SCHEMBL14948768', 'SCHEMBL14948832', 'SCHEMBL14948838', 'SCHEMBL14948773', 'SCHEMBL14948775', 'SCHEMBL14948777', 'SCHEMBL14948842', 'SCHEMBL14948776', 'SCHEMBL14948831', 'SCHEMBL14948824', 'SCHEMBL14948819', 'SCHEMBL14948828', 'SCHEMBL14948765', 'SCHEMBL14948767', 'SCHEMBL14948820', 'SCHEMBL14948827', 'SCHEMBL14948830', 'SCHEMBL14948766', 'SCHEMBL14948835', 'SCHEMBL14948839', 'SCHEMBL14948769', 'SCHEMBL14948826', 'SCHEMBL14948818', 'SCHEMBL14948843', 'SCHEMBL14948821', 'SCHEMBL14948772', 'SCHEMBL14948834', 'SCHEMBL14948778', 'SCHEMBL14948764', 'SCHEMBL14948771', 'SCHEMBL14948813', 'SCHEMBL14948836', 'SCHEMBL14948840', 'SCHEMBL14948822', 'SCHEMBL675', 'SCHEMBL14948829', 'SCHEMBL14948823']), ('US-5326574-A', ['SCHEMBL762', 'SCHEMBL762', 'SCHEMBL15065', 'SCHEMBL4591', 'SCHEMBL38

In [6]:
def get_MA(fp):
    """ Get AssemblyGo MA value

    Args:
        fp (str): filepath to a particular .txt AssemblyGo output file

    Returns:
        label (str): label of the compound which was analzyed (empty if a failure)
        MA (int): assemblyGo MA value (-1 if a failure)
    """
    with open(fp) as f:
        lines = f.readlines()

    try:
        #molfile will be the last element in 0th line
        label = lines[0].split()[-1].split("/")[-1].split(".")[0]

        #MA will be last elemnt in -2nd line (will be an int)
        MA = int(lines[-2].split()[-1])

        time = float(lines[-1].split()[-1])
        
        return label, MA, time

    except:
        return None, None, None

In [8]:
fp = "Data/AssemblyValues/AssigneeCpds_Done/"
MA_values = []
for file in tqdm(os.listdir(fp)):
    if file.endswith(".txt"):
        label, MA, time = get_MA(fp + file)
        MA_values.append({
            "label": label,
            "MA_assemblyGo": MA,
            "time": time
        })

MA_df = pd.DataFrame(MA_values)
MA_df.to_csv("Data/AssemblyValues/assigneeCpds_AssemblyGo.csv")

100%|██████████| 332246/332246 [02:11<00:00, 2517.27it/s]


In [9]:
#Link all cpds with MAs
cpd_MA_links = dict(zip(MA_df["label"], MA_df["MA_assemblyGo"]))
print(list(cpd_MA_links.items())[0:5])

[('SCHEMBL10000057', 10.0), ('SCHEMBL10000065', 8.0), ('SCHEMBL10000069', 12.0), ('SCHEMBL1000007', 19.0), ('SCHEMBL10000081', 16.0)]


In [13]:
def get_MA_values(ids, cpd_MA_links):
    """ Given a list of surechembl cpd ids, return a list of MA values associated with those ids

    Args:
        ids (list): list of all cpd ids
        cpd_MA_links (dict): id:MA dictionary

    Returns:
        list: list of all corresponding MA values
    """
    MAs = []
    for id in ids:
        try:
            MAs.append(cpd_MA_links[id])
        except KeyError:
            pass

    return MAs

In [14]:
patent_MA_links = {}

for patent, ids in tqdm(patent_cpd_links.items()):
    patent_MA_links[patent] = get_MA_values(ids, cpd_MA_links)

100%|██████████| 41940/41940 [00:03<00:00, 11108.97it/s]


In [15]:
print(list(patent_MA_links.items())[0:5])

[('WO-2013069771-A1', [22.0, 21.0, 14.0, 22.0, 10.0, 18.0, 11.0]), ('US-5326574-A', []), ('US-7687629-B2', []), ('US-6576689-B2', []), ('EP-1214363-B1', [])]


In [16]:
results_df["MAs"] = results_df["ID"].map(patent_MA_links)

   Unnamed: 0                ID         author    assignees  MAs
0           0  US-2016000413-A1  A AMAR OUSAMA  UNIV_BOSTON  NaN
1           3  US-2016000413-A1   BIGIO IRVING  UNIV_BOSTON  NaN
2           6  US-2016000413-A1  BRIGGS JOHN C  UNIV_BOSTON  NaN
3           9  US-2016000413-A1  CHARGIN DAVID  UNIV_BOSTON  NaN
4          12  US-2016000413-A1  LEE STEPHANIE  UNIV_BOSTON  NaN


In [17]:
print(results_df)

        Unnamed: 0                ID                author          assignees  \
0                0  US-2016000413-A1         A AMAR OUSAMA        UNIV_BOSTON   
1                3  US-2016000413-A1          BIGIO IRVING        UNIV_BOSTON   
2                6  US-2016000413-A1         BRIGGS JOHN C        UNIV_BOSTON   
3                9  US-2016000413-A1         CHARGIN DAVID        UNIV_BOSTON   
4               12  US-2016000413-A1         LEE STEPHANIE        UNIV_BOSTON   
...            ...               ...                   ...                ...   
373735      907679      US-5361239-A     ZOELLER WILLIAM A   BAKER_HUGHES_INC   
373736      907680  US-2021354847-A1        ZWEIG ANDREW M          BOEING_CO   
373737      907681     US-9751966-B2        ZWEIG ANDREW M          BOEING_CO   
373738      907682      US-4914171-A        ZWEIG ANDREW M  ALLIED_SIGNAL_INC   
373739      907683  US-2021009819-A1  ZWEIG ANDREW MICHAEL          BOEING_CO   

                           

In [32]:
testing_df = results_df[results_df["ID"].isin(["US-9751966-B2", "US-4914171-A"])]
print(testing_df)

        Unnamed: 0             ID          author          assignees  \
373737      907681  US-9751966-B2  ZWEIG ANDREW M          BOEING_CO   
373738      907682   US-4914171-A  ZWEIG ANDREW M  ALLIED_SIGNAL_INC   

                                              MAs  MA_avg  
373737  [9.0, 9.0, 8.0, 9.0, 8.0, 7.0, 9.0, 10.0]     NaN  
373738                                 [9.0, 9.0]     NaN  


# Step 3: Statistics over MAs

Include dates, avg / max MA

In [69]:
# testing_df["MA_avg"] = testing_df.explode("MAs").groupby("ID").mean()
#print(testing_df.explode("MAs").groupby("ID").head())
print("-----")
MA_avgs = testing_df.explode("MAs").groupby("ID").MAs.apply(np.mean)
print(MA_avgs)
print(type(MA_avgs))
MA_avgs = dict(MA_avgs)
print(MA_avgs)

print(testing_df)

testing_df["MA_avg"] = testing_df["ID"].map(MA_avgs)
print(testing_df)

-----
ID
US-4914171-A     9.000
US-9751966-B2    8.625
Name: MAs, dtype: float64
<class 'pandas.core.series.Series'>
{'US-4914171-A': 9.0, 'US-9751966-B2': 8.625}
        Unnamed: 0             ID          author          assignees  \
373737      907681  US-9751966-B2  ZWEIG ANDREW M          BOEING_CO   
373738      907682   US-4914171-A  ZWEIG ANDREW M  ALLIED_SIGNAL_INC   

                                              MAs  MA_avg  
373737  [9.0, 9.0, 8.0, 9.0, 8.0, 7.0, 9.0, 10.0]     NaN  
373738                                 [9.0, 9.0]     NaN  
        Unnamed: 0             ID          author          assignees  \
373737      907681  US-9751966-B2  ZWEIG ANDREW M          BOEING_CO   
373738      907682   US-4914171-A  ZWEIG ANDREW M  ALLIED_SIGNAL_INC   

                                              MAs  MA_avg  
373737  [9.0, 9.0, 8.0, 9.0, 8.0, 7.0, 9.0, 10.0]   8.625  
373738                                 [9.0, 9.0]   9.000  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_df["MA_avg"] = testing_df["ID"].map(MA_avgs)


In [71]:
## Add avg MA to results_df
MA_avgs = results_df.explode("MAs").groupby("ID").MAs.apply(np.mean)
MA_avgs = dict(MA_avgs)

results_df["MA_avg"] = results_df["ID"].map(MA_avgs)
print(results_df)

        Unnamed: 0                ID                author          assignees  \
0                0  US-2016000413-A1         A AMAR OUSAMA        UNIV_BOSTON   
1                3  US-2016000413-A1          BIGIO IRVING        UNIV_BOSTON   
2                6  US-2016000413-A1         BRIGGS JOHN C        UNIV_BOSTON   
3                9  US-2016000413-A1         CHARGIN DAVID        UNIV_BOSTON   
4               12  US-2016000413-A1         LEE STEPHANIE        UNIV_BOSTON   
...            ...               ...                   ...                ...   
373735      907679      US-5361239-A     ZOELLER WILLIAM A   BAKER_HUGHES_INC   
373736      907680  US-2021354847-A1        ZWEIG ANDREW M          BOEING_CO   
373737      907681     US-9751966-B2        ZWEIG ANDREW M          BOEING_CO   
373738      907682      US-4914171-A        ZWEIG ANDREW M  ALLIED_SIGNAL_INC   
373739      907683  US-2021009819-A1  ZWEIG ANDREW MICHAEL          BOEING_CO   

                           

In [73]:
#Drop all NaNs (testing for now)
results_df = results_df.dropna()
print(results_df)

        Unnamed: 0              ID             author  \
169            929   US-8440663-B2          AAY NAING   
170            947   US-8440663-B2      ARCALAS ARLYN   
171            965   US-8440663-B2      BROWN S DAVID   
172            983   US-8440663-B2  CHAN WAI KI VICKY   
173           1001   US-8440663-B2          CHEN JEFF   
...            ...             ...                ...   
373699      907631   US-8309497-B1         ZHANG YING   
373732      907673  US-10619065-B2     ZHOU ZHANG-LIN   
373733      907675  US-10619030-B2     ZHOU ZHANG-LIN   
373737      907681   US-9751966-B2     ZWEIG ANDREW M   
373738      907682    US-4914171-A     ZWEIG ANDREW M   

                             assignees  \
169                       EXELIXIS_INC   
170                       EXELIXIS_INC   
171                       EXELIXIS_INC   
172                       EXELIXIS_INC   
173                       EXELIXIS_INC   
...                                ...   
373699     HALLIBURTO

### Add dates to patent MAs

Use JSON files & priority dates (instead of filing date or publication date)

In [119]:
def get_date(ID):
    """ Find the priority date of a given patent

    Args:
        ID (str): patent ID

    Returns:
        str: priority date for a patent
    """
    fp = "Data/Patents/Patent_Records/patent_" + ID + ".json"

    try:
        data = json.load(open(fp))
    except FileNotFoundError as e:
        print(e)

    #Find specific priority date record
    headings = data["Record"]["Section"]
    for head in headings:
        if head["TOCHeading"] == "Important Dates":
            if head["Section"][0]["TOCHeading"] == "Priority Date":
                return head["Section"][0]["Information"][0]["Value"]["DateISO8601"][0]


In [120]:
get_date("US-8637499-B2")

[Errno 2] No such file or directory: 'Data/Patents/Patent_Records/patent_US-8637499-B2.json'


UnboundLocalError: local variable 'data' referenced before assignment

In [117]:
#Get unique patents
patent_ids = list(results_df["ID"].unique())

print(patent_ids[0:10])

['US-8440663-B2', 'US-8012956-B2', 'US-8637499-B2', 'US-8324231-B2', 'US-8242129-B2', 'US-9006153-B2', 'US-9284482-B2', 'US-10011763-B2', 'US-9034802-B2', 'US-7431087-B2']


In [None]:
patent_date_links = {}

for id in tqdm(patent_ids):
    patent_date_links[id] = get_date(id)

In [121]:
print([k for k, v in patent_date_links.items() if v != ""])

['US-7683061-B2', 'US-6387889-B1', 'US-7179836-B2', 'US-9475767-B2', 'EP-3021384-B1', 'US-4296224-A', 'US-8623961-B2', 'US-8728515-B2', 'WO-2007129005-A1', 'US-7491383-B2', 'EP-1041976-B1', 'US-9080009-B2', 'US-6645980-B1', 'US-4221814-A', 'US-9969709-B2', 'EP-0478626-B2', 'US-4639267-A', 'US-8133881-B2', 'US-7482425-B2', 'US-8951722-B1', 'US-8299241-B2', 'US-9833432-B2', 'US-10332693-B2', 'WO-2005040247-A1', 'US-8889706-B2', 'US-8642016-B2', 'WO-2008090382-A1', 'US-10464885-B2', 'US-5366665-A', 'US-9151989-B2', 'US-5562866-A', 'US-7777039-B2', 'US-6921784-B2', 'US-6841601-B2', 'US-6136828-A', 'US-5162305-A', 'US-9464206-B2', 'US-7622563-B2', 'US-8946410-B2', 'US-10308620-B2', 'US-9611460-B2', 'US-10294466-B2', 'US-5432172-A', 'EP-0496409-B1', 'US-7935824-B2', 'US-9370589-B2', 'US-10400137-B2', 'US-7037489-B2', 'US-9987241-B2', 'US-4749521-A', 'US-8394468-B2', 'EP-1426098-B1', 'US-5798032-A', 'US-4428883-A', 'US-6884896-B2', 'US-5891897-A', 'US-8444450-B2', 'EP-0472913-B1', 'US-5084225