In [1]:
import json
import pandas as pd

import util

In [2]:
# Project-level data configuration. The cells below index it by string key to
# obtain the file names they read from and write to.
data_config = util.get_data_config()

## Prepare Ground Truth

In [3]:
# Hand-entered ground truth: trial identifier ('NCT…') -> list of term strings.
# The lists are kept as entered: they contain repeated terms and mixed casing
# (e.g. 'Imatinib' / 'imatinib'); an empty list means no terms for that trial.
# Exact duplicates are collapsed right after this literal; casing and synonym
# variants are only reconciled in the later clean-up step.
# NOTE(review): terms look like drug / intervention names — confirm labelling rules.
ground_truth = {
    'NCT00037648': ['anakinra'],
    'NCT00048542': ['adalimumab', 'methotrexate'],
    'NCT00071487': ['belimumab'],
    'NCT00071812': ['belimumab'],
    'NCT00072839': ['ALX-0600'],
    'NCT00074438': ['rituximab', 'methotrexate'],
    'NCT00078806': ['etanercept'],
    'NCT00078819': ['etanercept'],
    'NCT00079937': ['omalizumab'],
    'NCT00090142': [],
    'NCT00092131': [],
    'NCT00095173': ['BMS-188667', 'Abatacept'],
    'NCT00097370': ['mepolizumab', 'mepolizumab'],
    'NCT00106522': ['tocilizumab', 'methotrexate', 'tocilizumab', 'tocilizumab', 'methotrexate'],
    'NCT00106535': ['tocilizumab', 'methotrexate', 'tocilizumab', 'tocilizumab'],
    'NCT00106548': ['tocilizumab', 'methotrexate', 'tocilizumab', 'tocilizumab'],
    'NCT00109408': ['tocilizumab', 'methotrexate', 'tocilizumab', 'methotrexate'],
    'NCT00109707': ['Imatinib', 'imatinib', 'imatinib', 'imatinib', 'Imatinib', 'imatinib','imatinib','imatinib'],
    'NCT00110916': ['anakinra', 'anakinra'],
    'NCT00111436': ['etanercept', 'etanercept', 'etanercept'],
    'NCT00119678': ['Abatacept', 'prednisone'],
    'NCT00120523': ['pimecrolimus'],
    'NCT00130390': ['nitazoxanide'],
    'NCT00137969': ['rituximab'],
    'NCT00141921': ['etanercept'],
    'NCT00146640': ['prednisone', 'prednisone'],
    'NCT00171860': ['imatinib mesylate', 'imatinib mesylate', 'prednisone', 'hydroxyurea', 'oxyurea'],
    'NCT00175877': ['Certolizumab Pegol'],
    'NCT00195663': ['adalimumab', 'methotrexate', 'adalimumab'],
    'NCT00195702': ['adalimumab', 'adalimumab', 'methotrexate',],
    'NCT00206596': ['Leukine'],
    'NCT00206661': ['sargramostim'],
    'NCT00206700': ['sargramostim'],
    'NCT00206713': ['Leukine', 'Leukine'],
    'NCT00207714': ['Golimumab', 'CNTO 148'],
    'NCT00207740': ['CNTO 148', 'golimumab'],
    'NCT00221026': [],
    'NCT00235820': ['Adalimumab', 'Methotrexate'],
    'NCT00244842': ['voclosporin'],
    'NCT00245570': [],
    'NCT00245765': ['CDP870'],
    'NCT00254293': ['Abatacept'],
    'NCT00264537': ['golimumab', 'methotrexate'],
    'NCT00264550': ['golimumab', 'methotrexate', 'methotrexate'],
    'NCT00265096': ['golimumab'],
    'NCT00265122': ['CNTO 1275'],
    'NCT00266565': [],
    'NCT00267956': ['CNTO 1275', 'ustekinumab'],
    'NCT00267969': ['ustekinumab', 'CNTO 1275'],
    'NCT00269841': [],
    'NCT00269854': [],
}
# Collapse exact duplicate terms per trial (case-sensitive: 'Imatinib' and
# 'imatinib' both survive). The result is sorted because a bare list(set(...))
# follows string hash order, which is randomised per Python process — the JSON
# written below would otherwise differ between kernel restarts.
ground_truth = {nct_id: sorted(set(terms)) for nct_id, terms in ground_truth.items()}


# Persist the de-duplicated (not yet synonym-normalised) ground truth as JSON.
with open(data_config['ground_truth_raw_file_name'], 'w') as fout:
    json.dump(ground_truth, fout)

## Clean up ground truth terms (synonyms only)

In [4]:
# Load the processed synonyms table from the Parquet file named in the data config.
synonyms_df = pd.read_parquet(data_config['processed_synonyms_file_name'])

In [5]:
# Build the synonym lookup maps with the project helper and keep the one used
# below: it is queried with cleaned-up terms and yields the preferred name.
# NOTE(review): any other maps returned by get_synonym_maps are unused in the visible cells.
synonym_maps = util.get_synonym_maps(synonyms_df)
preferred_name_by_term = synonym_maps['preferred_name_by_term']


In [6]:
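# INACTIVE: earlier inline construction of preferred_name_by_term. The live
# mapping now comes from util.get_synonym_maps(...) in the cell above.
# TODO: delete this cell once the helper is confirmed to be equivalent.
# preferred_name_by_term = {}
# for _, row in synonyms_df.iterrows():
#     preferred_name = row['preferred_name']
#     synonyms = row['synonyms']
#     for synonym in synonyms:
#         preferred_name_by_term[util.clean_up_synonym_term(synonym)] = preferred_name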

In [7]:
# Normalise every ground-truth term to its preferred name.
# Each term is cleaned with the same helper used for the synonym keys and then
# looked up; terms that are not known synonyms are dropped on purpose
# ("synonyms only"), so a trial can end up with fewer terms or an empty list.
ground_truth_cleaned = {}
for nct_id, terms in ground_truth.items():
    terms_cleaned = []
    for term in terms:
        cleaned = util.clean_up_synonym_term(term)
        if cleaned not in preferred_name_by_term:
            continue
        terms_cleaned.append(preferred_name_by_term[cleaned])
    # Several raw terms can map to the same preferred name, so de-duplicate.
    # Sorted rather than list(set(...)): set order follows per-process string
    # hashing, which would make the saved JSON differ between runs.
    # Assumes preferred names are mutually comparable (strings) — TODO confirm.
    ground_truth_cleaned[nct_id] = sorted(set(terms_cleaned))

In [8]:
# Persist the synonym-normalised ground truth as a single JSON document.
cleaned_path = data_config['ground_truth_cleaned_file_name']
with open(cleaned_path, 'w') as out_file:
    json.dump(ground_truth_cleaned, out_file)