In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import os

In [None]:
# TODO: set Matplotlib defaults here (not implemented yet)

# Step 1: Build data structures

See pg 116 in notebook

* Dataframe, organized by individual patent-author / patent-assignee combinations (with appropriate tags)
    - in Data/Patents/patent_MA_results.csv
* Dictionary linking each patent to the list of compounds (cpds) associated with it
    - in Data/Patents/patent_cpd_links.p
* Dictionary linking patents to a list of MA values associated with it
* Dictionary linking patents to classifications
    - in Data/Patents/patent_classification_links.p

In [3]:
## Patent/classification dictionary
# Load the per-author and per-assignee patent record tables in one pass;
# both files are read with pandas' default CSV settings.
author_df, assignee_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Patents/patent_author_records.csv",
        "Data/Patents/patent_assignee_records.csv",
    )
)

In [6]:
# Do the author and assignee tables cover different patent IDs?

# Unique patent IDs from each table (kept as lists, as before).
author_ids = list(set(author_df["ID"]))
assignee_ids = list(set(assignee_df["ID"]))

# One count per line: unique author IDs, then unique assignee IDs.
print(len(author_ids), len(assignee_ids), sep="\n")

# Number of author-table IDs that never appear in the assignee table.
# This should be 0...
print(len(set(author_ids).difference(assignee_ids)))

744686
332467
660634


In [7]:
# Map every patent ID to its classification, starting with the author records.
patent_classifications = dict(zip(author_df["ID"], author_df["classification"]))

print(len(patent_classifications))

# Fold in the assignee records. For an ID present in both tables the
# assignee table's classification overwrites the author table's.
patent_classifications.update(
    zip(assignee_df["ID"], assignee_df["classification"])
)

print(len(patent_classifications))

# (An earlier row-by-row iterrows()/tqdm version of this cell was replaced by
# the zip-based construction above.)



744686
993101


In [8]:
# Persist the patent -> classification mapping. The context manager makes sure
# the file is flushed and closed even if pickling raises part-way through
# (the previous inline open() never closed the handle explicitly).
with open("Data/Patents/patent_classification_links.p", "wb") as out_file:
    pickle.dump(patent_classifications, out_file)

# Step 2: Link MAs with authors/companies

(1/9) Currently incomplete - the jobs producing this data are still running on agave

In [2]:
# One row per patent / author / assignee combination.
# NOTE(review): the file carries an "Unnamed: 0" column, which looks like a
# saved index from an earlier to_csv call — confirm before relying on it.
results_df = pd.read_csv("Data/Patents/patent_MA_results.csv")

# Plain-text preview of the first five rows.
print(results_df.head(n=5))

   Unnamed: 0                ID         author    assignees
0           0  US-2016000413-A1  A AMAR OUSAMA  UNIV_BOSTON
1           3  US-2016000413-A1   BIGIO IRVING  UNIV_BOSTON
2           6  US-2016000413-A1  BRIGGS JOHN C  UNIV_BOSTON
3           9  US-2016000413-A1  CHARGIN DAVID  UNIV_BOSTON
4          12  US-2016000413-A1  LEE STEPHANIE  UNIV_BOSTON


In [3]:
# Load the patent -> list-of-compound-labels mapping.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that were produced by this project.
# The context manager closes the file handle as soon as loading finishes
# (the previous inline open() never closed it explicitly).
with open("Data/Patents/patent_cpd_links.p", "rb") as in_file:
    patent_cpd_links = pickle.load(in_file)

In [5]:
# Peek at the first five (patent, compound-list) pairs to sanity-check the load.
print(list(patent_cpd_links.items())[:5])

[('WO-2013069771-A1', ['SCHEMBL14948774', 'SCHEMBL14948814', 'SCHEMBL14948780', 'SCHEMBL14948825', 'SCHEMBL14948770', 'SCHEMBL14948779', 'SCHEMBL14948837', 'SCHEMBL14948833', 'SCHEMBL14948841', 'SCHEMBL14948781', 'SCHEMBL14948768', 'SCHEMBL14948832', 'SCHEMBL14948838', 'SCHEMBL14948773', 'SCHEMBL14948775', 'SCHEMBL14948777', 'SCHEMBL14948842', 'SCHEMBL14948776', 'SCHEMBL14948831', 'SCHEMBL14948824', 'SCHEMBL14948819', 'SCHEMBL14948828', 'SCHEMBL14948765', 'SCHEMBL14948767', 'SCHEMBL14948820', 'SCHEMBL14948827', 'SCHEMBL14948830', 'SCHEMBL14948766', 'SCHEMBL14948835', 'SCHEMBL14948839', 'SCHEMBL14948769', 'SCHEMBL14948826', 'SCHEMBL14948818', 'SCHEMBL14948843', 'SCHEMBL14948821', 'SCHEMBL14948772', 'SCHEMBL14948834', 'SCHEMBL14948778', 'SCHEMBL14948764', 'SCHEMBL14948771', 'SCHEMBL14948813', 'SCHEMBL14948836', 'SCHEMBL14948840', 'SCHEMBL14948822', 'SCHEMBL675', 'SCHEMBL14948829', 'SCHEMBL14948823']), ('US-5326574-A', ['SCHEMBL762', 'SCHEMBL762', 'SCHEMBL15065', 'SCHEMBL4591', 'SCHEMBL38

In [6]:
def get_MA(fp):
    """ Get the AssemblyGo MA value (and run time) from one output file

    Args:
        fp (str): filepath to a particular .txt AssemblyGo output file

    Returns:
        label (str): label of the compound which was analyzed - the last
            whitespace-separated token of the first line, with any directory
            prefix and everything from the first "." onwards removed
        MA (int): assemblyGo MA value - last token of the second-to-last line
        time (float): last token of the last line

        All three are None if the file could not be parsed (too few lines,
        a blank line where a value is expected, or a non-numeric MA/time).

    Raises:
        OSError: if the file cannot be opened (not handled here)
    """
    with open(fp) as f:
        lines = f.readlines()

    try:
        # The molfile path is the last element of the 0th line; keep only
        # the file name without its extension
        molfile = lines[0].split()[-1]
        label = molfile.split("/")[-1].split(".")[0]

        # MA is the last element of the -2nd line (an int)
        MA = int(lines[-2].split()[-1])

        # Run time is the last element of the final line
        elapsed = float(lines[-1].split()[-1])

    except (IndexError, ValueError):
        # Only parsing problems are treated as a failed run; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed
        return None, None, None

    return label, MA, elapsed

In [None]:
# Collect one record per AssemblyGo output file in the finished-runs folder.
fp = "Data/AssemblyValues/AssigneeCpds_Done/"
MA_values = []
for file in tqdm(os.listdir(fp)):
    # Only the .txt files are AssemblyGo outputs; skip everything else.
    if not file.endswith(".txt"):
        continue

    label, MA, time = get_MA(fp + file)
    record = {"label": label, "MA_assemblyGo": MA, "time": time}
    MA_values.append(record)

# Build the table once from the list of records, then save it.
MA_df = pd.DataFrame(MA_values)
MA_df.to_csv("Data/AssemblyValues/assigneeCpds_AssemblyGo.csv")

In [None]:
# Compound label -> AssemblyGo MA lookup; if a label repeats, the last row wins.
cpd_MA_links = {
    cpd_label: ma_value
    for cpd_label, ma_value in zip(MA_df["label"], MA_df["MA_assemblyGo"])
}
# Show the first five entries as a quick sanity check.
print(list(cpd_MA_links.items())[:5])