In [1]:
import pandas as pd
from os.path import join
from collections import Counter
from tqdm.notebook import tqdm
import json

med_kg_cols = {
    'h':'SUBJECT_CUI',
    'r':'PREDICATE',
    't':'OBJECT_CUI',
    'p':'label_proba'
}
edu_kg_cols = {
    'h':'head',
    'r':'rel',
    't':'tail',
    'p':'proba'
}
gen_med_kg_cols = {
    'h':'SUBJECT_CUI',
    'r':'PREDICATE',
    't':'OBJECT_CUI',
    'p':'label_proba',
    'h_type':'SUBJECT_TYPE',
    't_type':'OBJECT_TYPE',
    'r_h1':'PREDICATE_Level_1',
    'r_h0':'PREDICATE_Level_0'
}
# h_type, t_type, r_h1, r_h0
MAIN_DATA_DIR = '/home/pc/Desktop/AdilStuff/Projects/KG Embedding/ProbaKGE/beurre_applications/data'
NET_DIR = '/home/pc/Desktop/AdilStuff/Projects/SemRepMed/umls semantic network/processed_data'
PROBA_MODEL_DATA_PATH = join(MAIN_DATA_DIR, 'medfpkg')
GEN_PROBA_MODEL_DATA_PATH = join(MAIN_DATA_DIR, 'gen_medfpkg')
EDU_KG_DATA_PATH = join(MAIN_DATA_DIR, 'edujobkg')

In [2]:
def save_json(save_path, data):
    with open(save_path, 'w') as f:
        json.dump(data, f)
        
def read_json(save_path):
    with open(save_path, 'r') as f:
        data = json.load(f)
    return data

In [3]:
def get_format_data(df_, kg_map):
    all_ents = list(set(list(df_[kg_map['h']])+list(df_[kg_map['t']])))
    all_ents = {ent:idx for idx, ent in enumerate(all_ents)}
    all_rels = {rel:idx for idx, rel in enumerate(list(set(df_[kg_map['r']])))}
    df_[kg_map['h']] = df_[kg_map['h']].map(all_ents)
    df_[kg_map['t']] = df_[kg_map['t']].map(all_ents)
    df_[kg_map['r']] = df_[kg_map['r']].map(all_rels)
    return df_, all_ents, all_rels

def clean_triples(df_, kg_map):
    # remove triples with entities that appear less than 10 times
    all_ents = Counter(list(df_[kg_map['h']])+list(df_[kg_map['t']]))
    unwanted_ents = [i for i in all_ents if all_ents[i]<10]
    print(len(unwanted_ents))
    return df_[~(df_[kg_map['h']].isin(unwanted_ents) | df_[kg_map['t']].isin(unwanted_ents))]

def get_stratified_score(df_, kg_map):
    all_ents = Counter(list(df_[kg_map['h']])+list(df_[kg_map['t']]))
    all_rels = Counter(list(df_[kg_map['r']]))
    df_['scores'] = df_[kg_map['h']].map(all_ents)+df_[kg_map['t']].map(all_ents)+df_[kg_map['r']].map(all_rels)
#     df_['scores'] = df_[kg_map['h']].map(all_ents)+df_[kg_map['t']].map(all_ents)
#     df_['scores'] = df_[kg_map['r']].map(all_rels)
    return df_

def get_split(df_, kg_map):
#     df_ = clean_triples(df_ = df_, kg_map = kg_map)
    df_ = get_stratified_score(df_ = df_, kg_map = kg_map)
    train_df = df_.sample(n=int(df_.shape[0]*0.8), weights='scores', random_state=666)
    val_test = df_[~df_.index.isin(train_df.index)]
    test_df = val_test.sample(n=val_test.shape[0]//2, weights='scores', random_state=666)
    val_df = val_test[~val_test.index.isin(test_df.index)]
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)

def get_split_grouped_v2(df_, kg_map):
#     df_ = clean_triples(df_ = df_, kg_map = kg_map)
    print('scoring')
    df_ = get_stratified_score(df_ = df_, kg_map = kg_map)
#     return df_
    print('groupe sampling train')
    train_df = df_.groupby([kg_map['r'], kg_map['t']]).sample(frac=0.6, weights=list(df_['scores']), random_state=666)
    print('sampling test val')
    val_test = df_[~df_.index.isin(train_df.index)]
    test_df = val_test.sample(n=val_test.shape[0]//2, weights='scores', random_state=666)
    val_df = val_test[~val_test.index.isin(test_df.index)]
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)

def get_split_grouped(df_, kg_map):
    print('scoring')
    df_ = get_stratified_score(df_ = df_, kg_map = kg_map)
    print('grouping')
    df_grouped = df_.groupby([kg_map['h'], kg_map['t']])
    all_train = []
    all_test = []
    all_val = []
    for group_name, df_group in tqdm(df_grouped):
        print(group_name)
        print(df_group.shape)
        train_df, val_df, test_df = get_split(df_ = df_group, kg_map = kg_map)
        all_train.append(train_df)
        all_val.append(val_df)
        all_test.append(test_df)
    return pd.concat(all_train).reset_index(), pd.concat(all_val).reset_index(), pd.concat(all_test).reset_index()
# def get_split(df_, kg_map):
# #     df_ = clean_triples(df_ = df_, kg_map = kg_map)
#     df_ = get_stratified_score(df_ = df_, kg_map = kg_map)
#     val_df = df_.sample(n=int(df_.shape[0]*0.1), weights='scores', random_state=666).reset_index(drop=True)
#     train_test = df_[~df_.index.isin(val_df.index)]
#     test_df = train_test.sample(n=int(train_test.shape[0]*0.1), weights='scores', random_state=666).reset_index(drop=True)
#     train_df = train_test[~train_test.index.isin(test_df.index)]
#     return train_df, val_df, test_df

def iterative_spliting(df_, kg_map):
    not_complete = True
    while not_complete:
        train_df, val_df, test_df = get_split(df_, kg_map)
        train_ents = set(list(train_df[kg_map['h']])+list(train_df[kg_map['t']]))
        train_rels = set(list(train_df[kg_map['r']]))
        val_ents = set(list(val_df[kg_map['h']])+list(val_df[kg_map['t']]))
        val_rels = set(list(val_df[kg_map['r']]))
        test_ents = set(list(test_df[kg_map['h']])+list(test_df[kg_map['t']]))
        test_rels = set(list(test_df[kg_map['r']]))
        print('ents in val not in train', len(val_ents.difference(train_ents)))
        print('ents in test not in train', len(test_ents.difference(train_ents)))

def get_tvs_format_data(train_df, val_df, test_df, kg_map):
    train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()
    train_ents = set(list(train_df[kg_map['h']])+list(train_df[kg_map['t']]))
    train_rels = set(list(train_df[kg_map['r']]))
    val_ents = set(list(val_df[kg_map['h']])+list(val_df[kg_map['t']]))
    val_rels = set(list(val_df[kg_map['r']]))
    test_ents = set(list(test_df[kg_map['h']])+list(test_df[kg_map['t']]))
    test_rels = set(list(test_df[kg_map['r']]))
    ent_map = list(set(list(train_ents)+list(val_ents)+list(test_ents)))
    rel_map = list(set(list(train_rels)+list(val_rels)+list(test_rels)))
    ent_map = {i:idx for idx, i in enumerate(ent_map)}
    rel_map = {i:idx for idx, i in enumerate(rel_map)}
    
    train_df[kg_map['h']] = train_df[kg_map['h']].map(ent_map)
    train_df[kg_map['t']] = train_df[kg_map['t']].map(ent_map)
    train_df[kg_map['r']] = train_df[kg_map['r']].map(rel_map)

    val_df[kg_map['h']] = val_df[kg_map['h']].map(ent_map)
    val_df[kg_map['t']] = val_df[kg_map['t']].map(ent_map)
    val_df[kg_map['r']] = val_df[kg_map['r']].map(rel_map)

    test_df[kg_map['h']] = test_df[kg_map['h']].map(ent_map)
    test_df[kg_map['t']] = test_df[kg_map['t']].map(ent_map)
    test_df[kg_map['r']] = test_df[kg_map['r']].map(rel_map)
    return train_df, val_df, test_df, ent_map, rel_map

def normalise_col_names(train_df, val_df, test_df, kg_map):
    rev_kg_map = {v:k for k, v in kg_map.items()}
    train_df = train_df[list(kg_map.values())].rename(rev_kg_map, axis='columns')
    val_df = val_df[list(kg_map.values())].rename(rev_kg_map, axis='columns')
    test_df = test_df[list(kg_map.values())].rename(rev_kg_map, axis='columns')
    return train_df, val_df, test_df

def save_splits(train_df, val_df, test_df, ent_map, rel_map, ent_type_map, save_path):
    train_df.to_csv(join(save_path, 'train.csv'), index = False)
    val_df.to_csv(join(save_path, 'val.csv'), index = False)
    test_df.to_csv(join(save_path, 'test.csv'), index = False)
    save_json(join(save_path, 'ent_map.json'), ent_map)
    save_json(join(save_path, 'rel_map.json'), rel_map)
    if ent_type_map is not None:
        save_json(join(save_path, 'ent_type_map.json'), ent_type_map)

def split_process(split_df, kg_map):
    split_ents = set(list(split_df[kg_map['h']])+list(split_df[kg_map['t']]))
    if ('h_type' in kg_map) and ('t_type' in kg_map):
        split_ent_types = set(list(split_df[kg_map['h_type']])+list(split_df[kg_map['t_type']]))
    else:
        split_ent_types = None
        
    split_rels = list(split_df[kg_map['r']])
    for i in range(len(kg_map)):
        if 'r_h'+str(i) in kg_map:
            split_rels += list(split_df[kg_map['r_h'+str(i)]])
    split_rels = set(split_rels)
    return split_ents, split_ent_types, split_rels

def split_maps(split_df, ent_map, rel_map, ent_type_map, kg_map):
    split_df[kg_map['h']] = split_df[kg_map['h']].map(ent_map)
    split_df[kg_map['t']] = split_df[kg_map['t']].map(ent_map)
    split_df[kg_map['r']] = split_df[kg_map['r']].map(rel_map)
    split_df[kg_map['r_h1']] = split_df[kg_map['r_h1']].map(rel_map)
    split_df[kg_map['r_h0']] = split_df[kg_map['r_h0']].map(rel_map)
    if ent_type_map is not None:
        split_df[kg_map['h_type']] = split_df[kg_map['h_type']].map(ent_type_map)
        split_df[kg_map['t_type']] = split_df[kg_map['t_type']].map(ent_type_map)
    return split_df

def organize_split(split_df, kg_map, simple_mode = True):
#     h_type, t_type, r_h1, r_h0
    if simple_mode:
        split_df = split_df[[kg_map['h'],kg_map['r'],kg_map['t'],kg_map['p']]]
    else:
        split_df = split_df[[
            kg_map['h'],kg_map['r'],kg_map['t'],kg_map['p'],kg_map['h_type'],
            kg_map['t_type'],kg_map['r_h1'],kg_map['r_h0']]]
    return split_df

def process_splits(train_df, val_df, test_df, kg_map):
    train_ents, train_ent_types, train_rels = split_process(train_df, kg_map)
    val_ents, val_ent_types, val_rels = split_process(val_df, kg_map)
    test_ents, test_ent_types, test_rels = split_process(test_df, kg_map)
    ent_map = list(set(list(train_ents)+list(val_ents)+list(test_ents)))
    rel_map = list(set(list(train_rels)+list(val_rels)+list(test_rels)))
    if None not in [train_ent_types, val_ent_types, test_ent_types]:
        ent_type_map = list(set(list(train_ent_types)+list(val_ent_types)+list(test_ent_types)))
    else:
        ent_type_map = None
        
    ent_map = {i:idx for idx, i in enumerate(ent_map)}
    rel_map = {i:idx for idx, i in enumerate(rel_map)}
    if ent_type_map is not None:
        ent_type_map = {i:idx for idx, i in enumerate(ent_type_map)}
    
    train_df = split_maps(train_df, ent_map, rel_map, ent_type_map, kg_map)
    val_df = split_maps(val_df, ent_map, rel_map, ent_type_map, kg_map)
    test_df = split_maps(test_df, ent_map, rel_map, ent_type_map, kg_map)
    
    train_df = organize_split(train_df, kg_map, simple_mode = False)
    val_df = organize_split(val_df, kg_map, simple_mode = False)
    test_df = organize_split(test_df, kg_map, simple_mode = False)
    
    return train_df, val_df, test_df, ent_map, rel_map, ent_type_map

def save_train_test_val(train_df, val_df, test_df, save_path):
    ren = {'head':'h', 'rel':'r', 'tail':'t', 'total_c':'p'}
    train_df = train_df[[kg_map['h'],kg_map['r'],kg_map['t'],kg_map['p']]].rename(ren, axis='columns').to_csv(join(save_path, 'train.csv'), index = False)
    val_df = val_df[[kg_map['h'],kg_map['r'],kg_map['t'],kg_map['p']]].rename(ren, axis='columns').to_csv(join(save_path, 'val.csv'), index = False)
    test_df = test_df[[kg_map['h'],kg_map['r'],kg_map['t'],kg_map['p']]].rename(ren, axis='columns').to_csv(join(save_path, 'test.csv'), index = False)

def test_split(train_df, val_df, test_df, kg_map):
    train_ents = set(list(train_df[kg_map['h']])+list(train_df[kg_map['t']]))
    train_rels = set(list(train_df[kg_map['r']]))
    val_ents = set(list(val_df[kg_map['h']])+list(val_df[kg_map['t']]))
    val_rels = set(list(val_df[kg_map['r']]))
    test_ents = set(list(test_df[kg_map['h']])+list(test_df[kg_map['t']]))
    test_rels = set(list(test_df[kg_map['r']]))
    print('ents in val not in train', len(val_ents.difference(train_ents)))
    print('ents in test not in train', len(test_ents.difference(train_ents)))
    print('ents in train not in val', len(train_ents.difference(val_ents)))
    print('ents in train not in test', len(train_ents.difference(test_ents)))

    print('rels in val not in train', len(val_rels.difference(train_rels)))
    print('rels in test not in train', len(test_rels.difference(train_rels)))
    print('rels in train not in val', len(train_rels.difference(val_rels)))
    print('rels in train not in test', len(train_rels.difference(test_rels)))
    print('train size: ', train_df.shape[0])
    print('val size: ', val_df.shape[0])
    print('test size: ', test_df.shape[0])
    sum_num = train_df.shape[0]+val_df.shape[0]+test_df.shape[0]
    print('train %: ', train_df.shape[0]/sum_num)
    print('val %: ', val_df.shape[0]/sum_num)
    print('test %: ', test_df.shape[0]/sum_num)
    print('VOCAB_SIZE: ', len(set(list(train_ents)+list(val_ents)+list(test_ents))))
    print('REL_VOCAB_SIZE: ', len(set(list(train_rels)+list(val_rels)+list(test_rels))))
    train_ents = max(list(train_df[kg_map['h']])+list(train_df[kg_map['t']]))
    print(train_ents)
    train_rels, train_rels_ = max(list(train_df[kg_map['r']])), min(list(train_df[kg_map['r']]))
    print(train_rels, train_rels_)
    val_ents = max(list(val_df[kg_map['h']])+list(val_df[kg_map['t']]))
    print(val_ents)
    val_rels, val_rels_ = max(list(val_df[kg_map['r']])), min(list(val_df[kg_map['r']]))
    print(val_rels, val_rels_)
    test_ents = max(list(test_df[kg_map['h']])+list(test_df[kg_map['t']]))
    print(test_ents)
    test_rels, test_rels_ = max(list(test_df[kg_map['r']])), min(list(test_df[kg_map['r']]))
    print(test_rels, test_rels_)

def process_data(df_, kg_map, save_path):
    df_ = clean_triples(df_ = df_, kg_map = kg_map)
    df_ = get_format_data(df_ = df_, kg_map = kg_map)
    df_ = get_stratified_score(df_, kg_map)
    train_df, val_df, test_df = get_split(df_, kg_map)
    train_df, val_df, test_df = save_split(train_df, val_df, test_df, kg_map)
    test_split(train_df, val_df, test_df, kg_map)
#     save_train_test_val(train_df, val_df, test_df, save_path)
    
    

## Data Processing

### Simple Med KG

In [44]:
df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')

In [45]:
df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [46]:
df.shape

(28416917, 5)

In [47]:
df = clean_triples(df_ = df, kg_map = med_kg_cols)

185977


In [48]:
df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [49]:
df.shape

(27847338, 5)

In [50]:
train_df, val_df, test_df = get_split_grouped_v2(df_ = df, kg_map = med_kg_cols)

scoring
groupe sampling train
sampling test val


In [51]:
train_df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,scores
0,21012863,CAUSES,100033819,C0004391,1.0,1145378
1,22628456,AUGMENTS,100033819,C0007587,1.0,772188
2,22628462,ASSOCIATED_WITH,100033819,C0019158,1.0,1282230
3,21678707,STIMULATES,100033819,C0077157,1.0,1645041
4,21678707,STIMULATES,100033819,C0077157,1.0,1645041


In [57]:
test_split(train_df, val_df, test_df, med_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 37806
ents in train not in test 42100
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 2
train size:  21998120
val size:  2924609
test size:  2924609
train %:  0.789954142115846
val %:  0.10502292894207697
test %:  0.10502292894207697
VOCAB_SIZE:  159013
REL_VOCAB_SIZE:  63
C5548100
same_as ADMINISTERED_TO
C5548100
same_as ADMINISTERED_TO
C5547366
same_as ADMINISTERED_TO


In [53]:
train_df_, val_df_, test_df_, ent_map, rel_map = get_tvs_format_data(train_df, val_df, test_df, med_kg_cols)

In [54]:
test_split(train_df_, val_df_, test_df_, med_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 37806
ents in train not in test 42100
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 2
train size:  21998120
val size:  2924609
test size:  2924609
train %:  0.789954142115846
val %:  0.10502292894207697
test %:  0.10502292894207697
VOCAB_SIZE:  159013
REL_VOCAB_SIZE:  63
159012
62 0
159010
62 0
159012
62 0


In [55]:
train_df_, val_df_, test_df_ = normalise_col_names(train_df_, val_df_, test_df_, med_kg_cols)

In [56]:
save_splits(train_df_, val_df_, test_df_, ent_map, rel_map, PROBA_MODEL_DATA_PATH)

### Typed Hierarchical Med KG

In [18]:
def get_net_data():
    meta_net = pd.read_csv(join(NET_DIR, 'meta_net.csv'))
    rel_heirarchy = pd.read_csv(join(NET_DIR, 'rel_heirarchy.csv'))
    level_2_1 = dict(zip(rel_heirarchy['H2'].str.upper(), rel_heirarchy['H1'].str.upper()))
    level_2_0 = dict(zip(rel_heirarchy['H2'].str.upper(), rel_heirarchy['H0'].str.upper()))
    return level_2_1, level_2_0

def get_triples():
    level_2_1, level_2_0 = get_net_data()
    df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')
    all_rels = list(set(list(df['PREDICATE'])))
    for rel in all_rels:
        if rel not in level_2_1:
            level_2_1[rel] = rel
        if rel not in level_2_0:
            level_2_0[rel] = rel
    all_entity_names = read_json(join('all_triples', 'all_entity_names.json'))
    all_entity_types = read_json(join('all_triples', 'all_entity_types.json'))
    df['SUBJECT_NAME'] = df['SUBJECT_CUI'].map(all_entity_names)
    df['OBJECT_NAME'] = df['OBJECT_CUI'].map(all_entity_names)
    df['SUBJECT_TYPE'] = df['SUBJECT_CUI'].map(all_entity_types)
    df['OBJECT_TYPE'] = df['OBJECT_CUI'].map(all_entity_types)
    df['PREDICATE_Level_1'] = df['PREDICATE'].map(level_2_1)
    df['PREDICATE_Level_0'] = df['PREDICATE'].map(level_2_0)
    return df

In [19]:
all_df = get_triples()

In [20]:
# del all_df

In [21]:
get_net_data()

({'PHYSICALLY_RELATED_TO': 'PHYSICALLY_RELATED_TO',
  'PART_OF': 'PART_OF',
  'CONTAINS': 'CONTAINS',
  'LOCATION_OF': 'LOCATION_OF',
  'TEMPORALLY_RELATED_TO': 'TEMPORALLY_RELATED_TO',
  'CO-OCCURS_WITH': 'CO-OCCURS_WITH',
  'PRECEDES': 'PRECEDES',
  'FUNCTIONALLY_RELATED_TO': 'FUNCTIONALLY_RELATED_TO',
  'PROCESS_OF': 'OCCURS_IN',
  'CARRIES_OUT': 'PERFORMS',
  'INTERACTS_WITH': 'AFFECTS',
  'PRACTICES': 'PERFORMS',
  'PRODUCES': 'BRINGS_ABOUT',
  'EXHIBITS': 'PERFORMS',
  'DISRUPTS': 'AFFECTS',
  'CAUSES': 'BRINGS_ABOUT',
  'PREVENTS': 'AFFECTS',
  'COMPLICATES': 'AFFECTS',
  'MANIFESTATION_OF': 'MANIFESTATION_OF',
  'AFFECTS': 'AFFECTS',
  'OCCURS_IN': 'OCCURS_IN',
  'MANAGES': 'AFFECTS',
  'TREATS': 'AFFECTS',
  'USES': 'USES',
  'INDICATES': 'INDICATES',
  'RESULT_OF': 'RESULT_OF',
  'CONCEPTUALLY_RELATED_TO': 'CONCEPTUALLY_RELATED_TO',
  'PROPERTY_OF': 'PROPERTY_OF',
  'CONCEPTUAL_PART_OF': 'CONCEPTUAL_PART_OF',
  'EVALUATION_OF': 'EVALUATION_OF',
  'MEASURES': 'MEASURES',
  'DI

In [22]:
all_df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_NAME,OBJECT_NAME,SUBJECT_TYPE,OBJECT_TYPE,PREDICATE_Level_1,PREDICATE_Level_0
0,0,PROCESS_OF,C0003725,C0999630,1.0,JUN,Lepus capensis (organism),gngm,mamm,OCCURS_IN,FUNCTIONALLY_RELATED_TO
1,1,ISA,C0039258,C0446169,1.0,Tahyna virus,California Group Viruses,virs,virs,ISA,ISA
2,2,ISA,C0318627,C0206590,1.0,Eyach virus,Coltivirus,virs,virs,ISA,ISA
3,3,ISA,C0446169,C0003725,1.0,California Group Viruses,JUN,virs,gngm,ISA,ISA
4,4,PROCESS_OF,C0012634,C0020114,0.989018,Disease,Human,dsyn,grup,OCCURS_IN,FUNCTIONALLY_RELATED_TO


In [23]:
all_df.isna().any()

ORIGIN_ID            False
PREDICATE            False
SUBJECT_CUI          False
OBJECT_CUI           False
label_proba          False
SUBJECT_NAME         False
OBJECT_NAME          False
SUBJECT_TYPE         False
OBJECT_TYPE          False
PREDICATE_Level_1    False
PREDICATE_Level_0    False
dtype: bool

In [24]:
all_df.shape

(28416917, 11)

In [25]:
all_df = clean_triples(df_ = all_df, kg_map = gen_med_kg_cols)

185977


In [26]:
all_df.shape

(27847338, 11)

In [27]:
train_df, val_df, test_df = get_split_grouped_v2(df_ = all_df, kg_map = gen_med_kg_cols)

scoring
groupe sampling train
sampling test val


In [28]:
test_split(train_df, val_df, test_df, gen_med_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 36931
ents in train not in test 42039
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 2
train size:  21025315
val size:  3411012
test size:  3411011
train %:  0.7550206414702906
val %:  0.12248969721989225
test %:  0.12248966130981712
VOCAB_SIZE:  159013
REL_VOCAB_SIZE:  63
C5548100
same_as ADMINISTERED_TO
C5547366
same_as ADMINISTERED_TO
C5548100
same_as ADMINISTERED_TO


In [29]:
train_df, val_df, test_df, ent_map, rel_map, ent_type_map = process_splits(train_df, val_df, test_df, gen_med_kg_cols)

In [30]:
val_df.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,label_proba,SUBJECT_TYPE,OBJECT_TYPE,PREDICATE_Level_1,PREDICATE_Level_0
0,158031,21,74598,1.0,104,104,21,21
1,55364,21,23254,1.0,104,66,21,21
2,4396,50,8431,0.92434,104,8,7,38
3,141702,40,131291,1.0,107,122,40,27
4,89683,50,101065,0.941358,66,55,7,38


In [31]:
test_split(train_df, val_df, test_df, gen_med_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 36931
ents in train not in test 42039
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 2
train size:  21025315
val size:  3411012
test size:  3411011
train %:  0.7550206414702906
val %:  0.12248969721989225
test %:  0.12248966130981712
VOCAB_SIZE:  159013
REL_VOCAB_SIZE:  63
159012
68 0
159012
68 0
159012
68 0


In [32]:
train_df, val_df, test_df = normalise_col_names(train_df, val_df, test_df, gen_med_kg_cols)

In [33]:
rel_map

{'NEG_STIMULATES': 0,
 'NEG_LOCATION_OF': 1,
 'PRECEDES': 2,
 'PREVENTS': 3,
 'ADMINISTERED_TO': 4,
 'NEG_ASSOCIATED_WITH': 5,
 'NEG_TREATS': 6,
 'BRINGS_ABOUT': 7,
 'NEG_MANIFESTATION_OF': 8,
 'TEMPORALLY_RELATED_TO': 9,
 'AFFECTS': 10,
 'NEG_METHOD_OF': 11,
 'higher_than': 12,
 'CONVERTS_TO': 13,
 'TREATS': 14,
 'same_as': 15,
 'COEXISTS_WITH': 16,
 'CONCEPTUALLY_RELATED_TO': 17,
 'NEG_AUGMENTS': 18,
 'NEG_same_as': 19,
 'NEG_INHIBITS': 20,
 'ISA': 21,
 'lower_than': 22,
 'NEG_PREVENTS': 23,
 'NEG_PRODUCES': 24,
 'NEG_OCCURS_IN': 25,
 'NEG_DISRUPTS': 26,
 'PHYSICALLY_RELATED_TO': 27,
 'INHIBITS': 28,
 'AUGMENTS': 29,
 'NEG_ISA': 30,
 'NEG_COEXISTS_WITH': 31,
 'NEG_PROCESS_OF': 32,
 'COMPLICATES': 33,
 'NEG_USES': 34,
 'MANIFESTATION_OF': 35,
 'INTERACTS_WITH': 36,
 'NEG_CONVERTS_TO': 37,
 'FUNCTIONALLY_RELATED_TO': 38,
 'STIMULATES': 39,
 'PART_OF': 40,
 'METHOD_OF': 41,
 'DIAGNOSES': 42,
 'NEG_PART_OF': 43,
 'NEG_PREDISPOSES': 44,
 'PROCESS_OF': 45,
 'NEG_lower_than': 46,
 'SPATIALL

In [34]:
ent_type_map

{'sosy': 0,
 'rcpt': 1,
 'alga': 2,
 'carb': 3,
 'ffas': 4,
 'bsoj': 5,
 'lbtr': 6,
 'antb': 7,
 'dsyn': 8,
 'eehu': 9,
 'chvs': 10,
 'strd': 11,
 'orch': 12,
 'inpo': 13,
 'diap': 14,
 'orgf': 15,
 'vita': 16,
 'edac': 17,
 'emod': 18,
 'ftcn': 19,
 'euka': 20,
 'idcn': 21,
 'opco': 22,
 'cgab': 23,
 'shro': 24,
 'phsu': 25,
 'hops': 26,
 'anst': 27,
 'clna': 28,
 'ocac': 29,
 'comd': 30,
 'hcpp': 31,
 'vtbt': 32,
 'ocdi': 33,
 'topp': 34,
 'invt': 35,
 'rept': 36,
 'npop': 37,
 'qnco': 38,
 'sbst': 39,
 'chvf': 40,
 'blor': 41,
 'amph': 42,
 'fngs': 43,
 'ortf': 44,
 'fndg': 45,
 'gora': 46,
 'clas': 47,
 'fish': 48,
 'plnt': 49,
 'acty': 50,
 'inbe': 51,
 'socb': 52,
 'pros': 53,
 'drdd': 54,
 'celf': 55,
 'hlca': 56,
 'moft': 57,
 'famg': 58,
 'mobd': 59,
 'mosq': 60,
 'rnlw': 61,
 'aggp': 62,
 'inpr': 63,
 'mamm': 64,
 'evnt': 65,
 'gngm': 66,
 'emst': 67,
 'eico': 68,
 'grup': 69,
 'rich': 70,
 'biof': 71,
 'anab': 72,
 'mnob': 73,
 'food': 74,
 'bird': 75,
 'hcro': 76,
 'mcha': 

In [35]:
ent_map

{'C1512326': 0,
 'C1412401': 1,
 'C0019904': 2,
 'C1283707': 3,
 'C4747809': 4,
 'C0877112': 5,
 'C1254144': 6,
 'C0443310': 7,
 'C0050487': 8,
 'C0061072': 9,
 'C0595903': 10,
 'C0140558': 11,
 'C0278838': 12,
 'C0265511': 13,
 'C1160205': 14,
 'C0030268': 15,
 'C0544220': 16,
 'C0175763': 17,
 'C0557538': 18,
 'C0237784': 19,
 'C0684004': 20,
 'C0865236': 21,
 'C0062675': 22,
 'C1427255': 23,
 'C0206173': 24,
 'C1879654': 25,
 'C0007688': 26,
 'C0761975': 27,
 'C3640049': 28,
 'C0301472': 29,
 'C0120725': 30,
 'C2735350': 31,
 'C0686163': 32,
 'C1494103': 33,
 'C1332554': 34,
 'C0266304': 35,
 'C1161201': 36,
 'C0119206': 37,
 'C0016755': 38,
 'C0057085': 39,
 'C4015192': 40,
 'C0260292': 41,
 'C0157223': 42,
 'C1432195': 43,
 'C1154328': 44,
 'C0001036': 45,
 'C1285181': 46,
 'C0641593': 47,
 'C0042826': 48,
 'C0226476': 49,
 'C1484858': 50,
 'C0023054': 51,
 'C0282924': 52,
 'C0192682': 53,
 'C0169904': 54,
 'C0352909': 55,
 'C0013884': 56,
 'C0234235': 57,
 'C1322993': 58,
 'C0146

In [36]:
save_splits(train_df, val_df, test_df, ent_map, rel_map, ent_type_map, GEN_PROBA_MODEL_DATA_PATH)

In [37]:
train_df.head()

Unnamed: 0,h,r,t,p,h_type,t_type,r_h1,r_h0
0,70773,50,79353,1.0,66,55,7,38
1,70773,29,72509,1.0,66,55,29,29
2,70773,55,128039,1.0,66,8,55,55
3,70773,39,97473,1.0,66,66,39,39
4,70773,39,97473,1.0,66,66,39,39


In [38]:
pd.read_csv(join(GEN_PROBA_MODEL_DATA_PATH, 'train.csv'), sep = '\t')

Unnamed: 0,"h,r,t,p,h_type,t_type,r_h1,r_h0"
0,"70773,50,79353,1.0,66,55,7,38"
1,"70773,29,72509,1.0,66,55,29,29"
2,"70773,55,128039,1.0,66,8,55,55"
3,"70773,39,97473,1.0,66,66,39,39"
4,"70773,39,97473,1.0,66,66,39,39"
...,...
21025310,"34195,59,30067,1.0,34,101,59,38"
21025311,"34195,14,150878,1.0,34,8,10,38"
21025312,"34195,14,158267,1.0,34,102,10,38"
21025313,"34195,14,32488,0.5,34,45,10,38"


### Edu KG

In [4]:
df = pd.read_csv(join(EDU_KG_DATA_PATH, 'kg_triples.csv'))

In [5]:
df.head()

Unnamed: 0,head,tail,rel,time_amin,time_amax,proba
0,coursera_--626KkxEeywagovoAKHOQ,hard_skills_94462,_provides,2005-01-01,2005-01-01,0.12499
1,coursera_-0BI9jXyEeWa2g6sjqf03Q,hard_skills_95557,_provides,2005-01-01,2005-01-01,0.000303
2,coursera_-0wI4W8lEeys9RJMWW48Yw,hard_skills_94743,_provides,2005-01-01,2005-01-01,0.068955
3,coursera_-1YwAnTLEeSjmyIAC0aXFg,hard_skills_94234,_provides,2005-01-01,2005-01-01,0.011288
4,coursera_-1cp1vgjEeyxiRKaoDccyw,hard_skills_94561,_provides,2005-01-01,2005-01-01,0.003658


In [75]:
df.shape

(4571567, 6)

In [76]:
df.shape

(4571567, 6)

In [77]:
df = clean_triples(df_ = df, kg_map = edu_kg_cols)

11902


In [78]:
df.shape

(4506771, 6)

In [79]:
# train_df, val_df, test_df = get_split(df_= df, kg_map = edu_kg_cols)

In [80]:
# df_test = get_split_grouped_v2(df_ = df, kg_map = edu_kg_cols)

In [81]:
# df_test['scores'].sum()

In [82]:
train_df, val_df, test_df = get_split_grouped_v2(df_ = df, kg_map = edu_kg_cols)

scoring
groupe sampling train
sampling test val


In [83]:
# df_1 = get_stratified_score(df_ = df, kg_map = edu_kg_cols)
# print('groupe sampling train')

In [84]:
# train_df = df_1.groupby([edu_kg_cols['r'], edu_kg_cols['t']]).sample(frac=0.6, weights=list(df_1['scores']), random_state=123)
# print('sampling test val')


In [85]:
# train_df

In [86]:
# df_1

In [87]:
# val_test = df_1[~df_1.index.isin(train_df.index)]

In [88]:
# df_1['scores'].sum()

In [89]:
# val_test['scores'].min()

In [90]:
# val_test

In [91]:
# val_test['scores'].max()

In [92]:
# test_df = val_test.sample(n=val_test.shape[0]//2, weights='scores', random_state=666)


In [93]:
# val_df = val_test[~val_test.index.isin(test_df.index)]


In [94]:
# train_df.head()

In [95]:
test_split(train_df, val_df, test_df, edu_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 225
ents in train not in test 6242
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 1
train size:  2733082
val size:  886845
test size:  886844
train %:  0.6064390669062173
val %:  0.19678057749106845
test %:  0.19678035560271423
VOCAB_SIZE:  41281
REL_VOCAB_SIZE:  12
soft_skills_9
_requires _acquired_by
soft_skills_9
_requires _acquired_by
soft_skills_9
_requires _belongs_to


In [96]:
train_df_, val_df_, test_df_, ent_map, rel_map = get_tvs_format_data(train_df, val_df, test_df, edu_kg_cols)

In [97]:
test_split(train_df_, val_df_, test_df_, edu_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 225
ents in train not in test 6242
rels in val not in train 0
rels in test not in train 0
rels in train not in val 0
rels in train not in test 1
train size:  2733082
val size:  886845
test size:  886844
train %:  0.6064390669062173
val %:  0.19678057749106845
test %:  0.19678035560271423
VOCAB_SIZE:  41281
REL_VOCAB_SIZE:  12
41280
11 0
41280
11 0
41280
11 0


In [98]:
train_df_, val_df_, test_df_ = normalise_col_names(train_df_, val_df_, test_df_, edu_kg_cols)

In [99]:
train_df_.head()

Unnamed: 0,h,r,t,p
0,4556,1,12068,0.000303
1,6802,1,26447,0.011288
2,20516,1,35239,0.003658
3,13297,1,18027,0.017533
4,7878,1,15322,0.011753


In [100]:
test_df_.head()

Unnamed: 0,h,r,t,p
0,4239,3,9533,0.001678
1,23893,3,11678,0.000114
2,14445,3,20336,0.000296
3,1837,3,7475,0.001054
4,15997,3,6851,0.00013


In [102]:
save_splits(train_df_, val_df_, test_df_, ent_map, rel_map, None, EDU_KG_DATA_PATH)

## More Tests

In [58]:
read_json(join(PROBA_MODEL_DATA_PATH, 'ent_map.json'))

{'C0914140': 0,
 'C3269138': 1,
 'C1707257': 2,
 'C1418195': 3,
 'C0407816': 4,
 'C0266368': 5,
 'C0005151': 6,
 'C1152369': 7,
 'C1136207': 8,
 'C1149538': 9,
 'C0012141': 10,
 'C2242456': 11,
 'C0474200': 12,
 'C1271482': 13,
 'C0747249': 14,
 'C1158484': 15,
 'C0064091': 16,
 'C0348898': 17,
 'C3712838': 18,
 'C2693418': 19,
 'C0428411': 20,
 'C2715016': 21,
 'C1658150': 22,
 'C0202421': 23,
 'C1160340': 24,
 'C0054538': 25,
 'C0333843': 26,
 'C0679082': 27,
 'C1455072': 28,
 'C0700446': 29,
 'C0051529': 30,
 'C0109958': 31,
 'C1419991': 32,
 'C0181129': 33,
 'C1281526': 34,
 'C0270795': 35,
 'C100653382': 36,
 'C0242620': 37,
 'C0044819': 38,
 'C1522259': 39,
 'C1159468': 40,
 'C1327189': 41,
 'C5380060': 42,
 'C0475307': 43,
 'C0310081': 44,
 'C0766039': 45,
 'C3897005': 46,
 'C0948873': 47,
 'C0597548': 48,
 'C0020713': 49,
 'C0301665': 50,
 'C0057577': 51,
 'C0757193': 52,
 'C0081803': 53,
 'C3387895': 54,
 'C0036043': 55,
 'C0110120': 56,
 'C0001953': 57,
 'C0005330': 58,
 'C02

In [113]:
get_split_grouped(df_ = df, kg_map = med_kg_cols)

  0%|          | 0/18442502 [00:00<?, ?it/s]

('100033819', 'C0004391')
(2, 6)
('100033819', 'C0007587')
(2, 6)
('100033819', 'C0019158')
(2, 6)
('100033819', 'C0077157')
(4, 6)
('100033819', 'C0162638')
(2, 6)
('100033819', 'C0667830')
(2, 6)
('100033819', 'C1197259')
(4, 6)
('100033819', 'C1338772')
(4, 6)
('100033819', 'C1365594')
(4, 6)
('100033819', 'C2610958')
(2, 6)
('100037417', 'C0017262')
(2, 6)
('100038246', 'C0008059')
(2, 6)
('100038246', 'C0009402')
(2, 6)
('100038246', 'C0028754')
(2, 6)
('100038246', 'C1301685')
(2, 6)
('100038246', 'C1997217')
(2, 6)
('100048912', '100422933')
(4, 6)
('100048912', '100885789')
(4, 6)
('100048912', 'C0001511')
(2, 6)
('100048912', 'C0001678')
(4, 6)
('100048912', 'C0003392')
(2, 6)
('100048912', 'C0003486')
(2, 6)
('100048912', 'C0003850')
(4, 6)
('100048912', 'C0004391')
(4, 6)
('100048912', 'C0006142')
(4, 6)
('100048912', 'C0006826')
(4, 6)
('100048912', 'C0007090')
(2, 6)
('100048912', 'C0007103')
(2, 6)
('100048912', 'C0007107')
(2, 6)
('100048912', 'C0007134')
(2, 6)
('100048

('100124700', 'C0029431')
(2, 6)
('100124700', 'C0029433')
(2, 6)
('100124700', 'C0029463')
(2, 6)
('100124700', 'C0030421')
(2, 6)
('100124700', 'C0030567')
(2, 6)
('100124700', 'C0032914')
(8, 6)
('100124700', 'C0033860')
(2, 6)
('100124700', 'C0034786')
(2, 6)
('100124700', 'C0035222')
(2, 6)
('100124700', 'C0035298')
(2, 6)
('100124700', 'C0035668')
(2, 6)
('100124700', 'C0035696')
(2, 6)
('100124700', 'C0036421')
(2, 6)
('100124700', 'C0037083')
(2, 6)
('100124700', 'C0037930')
(2, 6)
('100124700', 'C0039101')
(2, 6)
('100124700', 'C0040300')
(4, 6)
('100124700', 'C0040648')
(2, 6)
('100124700', 'C0040690')
(2, 6)
('100124700', 'C0040845')
(2, 6)
('100124700', 'C0042172')
(6, 6)
('100124700', 'C0042333')
(2, 6)
('100124700', 'C0061202')
(2, 6)
('100124700', 'C0064524')
(4, 6)
('100124700', 'C0069676')
(2, 6)
('100124700', 'C0076080')
(6, 6)
('100124700', 'C0077157')
(4, 6)
('100124700', 'C0079744')
(2, 6)
('100124700', 'C0079904')
(2, 6)
('100124700', 'C0085295')
(2, 6)
('10012470

('100126299', 'C0019868')
(4, 6)
('100126299', 'C0021368')
(2, 6)
('100126299', 'C0027651')
(2, 6)
('100126299', 'C0029045')
(2, 6)
('100126299', 'C0031727')
(2, 6)
('100126299', 'C0035668')
(2, 6)
('100126299', 'C0164207')
(4, 6)
('100126299', 'C0243037')
(2, 6)
('100126299', 'C0475358')
(2, 6)
('100126299', 'C0596290')
(2, 6)
('100126299', 'C0596901')
(2, 6)
('100126299', 'C0596902')
(2, 6)
('100126299', 'C0597032')
(2, 6)
('100126299', 'C1150423')
(2, 6)
('100126299', 'C1167046')
(4, 6)
('100126299', 'C1332838')
(2, 6)
('100126299', 'C1515670')
(2, 6)
('100126299', 'C1519692')
(2, 6)
('100126299', 'C2347610')
(2, 6)
('100126299', 'C2613367')
(2, 6)
('100126299', 'C3715610')
(4, 6)
('100126299', 'C4048328')
(2, 6)
('100126310', 'C1336667')
(4, 6)
('100126311', 'C0001418')
(4, 6)
('100126311', 'C0007137')
(2, 6)
('100126311', 'C0007600')
(2, 6)
('100126311', 'C0021368')
(2, 6)
('100126311', 'C0242488')
(4, 6)
('100126311', 'C0596290')
(2, 6)
('100126311', 'C0819757')
(2, 6)
('10012631

('100128191', 'C1622501')
(2, 6)
('100128191', 'C2239176')
(6, 6)
('100128191', 'C2931822')
(2, 6)
('100128191', 'C3539878')
(2, 6)
('100128191', 'C4038423')
(2, 6)
('100128191', 'C4722463')
(2, 6)
('100128252', '100302276')
(4, 6)
('100128252', 'C0001418')
(2, 6)
('100128252', 'C0003873')
(2, 6)
('100128252', 'C0005684')
(2, 6)
('100128252', 'C0006826')
(2, 6)
('100128252', 'C0007102')
(4, 6)
('100128252', 'C0009402')
(2, 6)
('100128252', 'C0012634')
(2, 6)
('100128252', 'C0018270')
(2, 6)
('100128252', 'C0021760')
(2, 6)
('100128252', 'C0023434')
(2, 6)
('100128252', 'C0023467')
(4, 6)
('100128252', 'C0027051')
(2, 6)
('100128252', 'C0037083')
(2, 6)
('100128252', 'C0039215')
(2, 6)
('100128252', 'C0040300')
(4, 6)
('100128252', 'C0087040')
(4, 6)
('100128252', 'C0162638')
(2, 6)
('100128252', 'C0220633')
(2, 6)
('100128252', 'C0224522')
(2, 6)
('100128252', 'C0225369')
(2, 6)
('100128252', 'C0346429')
(2, 6)
('100128252', 'C0585362')
(4, 6)
('100128252', 'C0596290')
(2, 6)
('1001282

('100129696', 'C0019054')
(2, 6)
('100129696', 'C0020114')
(2, 6)
('100129696', 'C0020456')
(4, 6)
('100129696', 'C0022022')
(2, 6)
('100129696', 'C0023206')
(2, 6)
('100129696', 'C0023884')
(2, 6)
('100129696', 'C0024426')
(2, 6)
('100129696', 'C0026046')
(2, 6)
('100129696', 'C0026285')
(6, 6)
('100129696', 'C0026597')
(2, 6)
('100129696', 'C0026649')
(2, 6)
('100129696', 'C0027361')
(2, 6)
('100129696', 'C0027571')
(2, 6)
('100129696', 'C0027573')
(4, 6)
('100129696', 'C0027575')
(2, 6)
('100129696', 'C0028128')
(2, 6)
('100129696', 'C0028778')
(2, 6)
('100129696', 'C0030551')
(2, 6)
('100129696', 'C0030705')
(2, 6)
('100129696', 'C0030956')
(2, 6)
('100129696', 'C0031448')
(4, 6)
('100129696', 'C0031715')
(2, 6)
('100129696', 'C0032176')
(2, 6)
('100129696', 'C0032183')
(2, 6)
('100129696', 'C0032241')
(2, 6)
('100129696', 'C0032594')
(2, 6)
('100129696', 'C0033339')
(4, 6)
('100129696', 'C0033634')
(2, 6)
('100129696', 'C0033684')
(2, 6)
('100129696', 'C0033809')
(2, 6)
('10012969

('100130958', 'C0057277')
(3, 6)
('100130958', 'C0058120')
(3, 6)
('100130958', 'C0059627')
(3, 6)
('100130958', 'C0059668')
(3, 6)
('100130958', 'C0060505')
(3, 6)
('100130958', 'C0060520')
(3, 6)
('100130958', 'C0060549')
(3, 6)
('100130958', 'C0061516')
(3, 6)
('100130958', 'C0061751')
(3, 6)
('100130958', 'C0061928')
(3, 6)
('100130958', 'C0063211')
(3, 6)
('100130958', 'C0064280')
(3, 6)
('100130958', 'C0064636')
(3, 6)
('100130958', 'C0066014')
(3, 6)
('100130958', 'C0067072')
(3, 6)
('100130958', 'C0069717')
(6, 6)
('100130958', 'C0070558')
(3, 6)
('100130958', 'C0072402')
(3, 6)
('100130958', 'C0072422')
(3, 6)
('100130958', 'C0073096')
(3, 6)
('100130958', 'C0074449')
(4, 6)
('100130958', 'C0075842')
(3, 6)
('100130958', 'C0077157')
(6, 6)
('100130958', 'C0079849')
(3, 6)
('100130958', 'C0080356')
(3, 6)
('100130958', 'C0084721')
(3, 6)
('100130958', 'C0085080')
(3, 6)
('100130958', 'C0085262')
(3, 6)
('100130958', 'C0085542')
(3, 6)
('100130958', 'C0086761')
(3, 6)
('10013095

('100132285', 'C1335781')
(8, 6)
('100132285', 'C1338795')
(8, 6)
('100132285', 'C1522214')
(16, 6)
('100132354', 'C0520459')
(2, 6)
('100132354', 'C1423411')
(4, 6)
('100132406', 'C0001804')
(2, 6)
('100132406', 'C0004641')
(2, 6)
('100132406', 'C0004810')
(2, 6)
('100132406', 'C0005802')
(2, 6)
('100132406', 'C0007634')
(2, 6)
('100132406', 'C0008269')
(2, 6)
('100132406', 'C0008377')
(2, 6)
('100132406', 'C0010352')
(2, 6)
('100132406', 'C0010837')
(2, 6)
('100132406', 'C0011953')
(2, 6)
('100132406', 'C0012634')
(2, 6)
('100132406', 'C0012854')
(2, 6)
('100132406', 'C0013618')
(2, 6)
('100132406', 'C0013935')
(2, 6)
('100132406', 'C0014442')
(2, 6)
('100132406', 'C0014834')
(2, 6)
('100132406', 'C0015283')
(2, 6)
('100132406', 'C0016030')
(2, 6)
('100132406', 'C0016712')
(2, 6)
('100132406', 'C0017337')
(2, 6)
('100132406', 'C0017725')
(2, 6)
('100132406', 'C0017757')
(2, 6)
('100132406', 'C0017758')
(2, 6)
('100132406', 'C0019134')
(4, 6)
('100132406', 'C0021740')
(2, 6)
('1001324

('100187907', 'C0007578')
(2, 6)
('100187907', 'C0007587')
(2, 6)
('100187907', 'C0007601')
(2, 6)
('100187907', 'C0007634')
(12, 6)
('100187907', 'C0008018')
(2, 6)
('100187907', 'C0008139')
(2, 6)
('100187907', 'C0008148')
(2, 6)
('100187907', 'C0008624')
(16, 6)
('100187907', 'C0009013')
(2, 6)
('100187907', 'C0009276')
(2, 6)
('100187907', 'C0009388')
(2, 6)
('100187907', 'C0009768')
(2, 6)
('100187907', 'C0010284')
(2, 6)
('100187907', 'C0010583')
(2, 6)
('100187907', 'C0010654')
(2, 6)
('100187907', 'C0011065')
(4, 6)
('100187907', 'C0011306')
(2, 6)
('100187907', 'C0011860')
(2, 6)
('100187907', 'C0011884')
(4, 6)
('100187907', 'C0011991')
(4, 6)
('100187907', 'C0012222')
(2, 6)
('100187907', 'C0012578')
(2, 6)
('100187907', 'C0012634')
(4, 6)
('100187907', 'C0012854')
(2, 6)
('100187907', 'C0012984')
(2, 6)
('100187907', 'C0013227')
(6, 6)
('100187907', 'C0013846')
(2, 6)
('100187907', 'C0014057')
(2, 6)
('100187907', 'C0014257')
(2, 6)
('100187907', 'C0014442')
(2, 6)
('100187

('100187907', 'C0948380')
(2, 6)
('100187907', 'C0949508')
(2, 6)
('100187907', 'C0950580')
(2, 6)
('100187907', 'C0963102')
(2, 6)
('100187907', 'C0966503')
(2, 6)
('100187907', 'C0999921')
(2, 6)
('100187907', 'C100187907')
(4, 6)
('100187907', 'C1120843')
(2, 6)
('100187907', 'C1134659')
(4, 6)
('100187907', 'C1136102')
(2, 6)
('100187907', 'C1140675')
(2, 6)
('100187907', 'C1140999')
(2, 6)
('100187907', 'C1148756')
(2, 6)
('100187907', 'C1148846')
(6, 6)
('100187907', 'C1152760')
(2, 6)
('100187907', 'C1154401')
(2, 6)
('100187907', 'C1154599')
(2, 6)
('100187907', 'C1155379')
(2, 6)
('100187907', 'C1156814')
(2, 6)
('100187907', 'C1157613')
(2, 6)
('100187907', 'C1158770')
(2, 6)
('100187907', 'C1159978')
(2, 6)
('100187907', 'C1160466')
(2, 6)
('100187907', 'C1166650')
(2, 6)
('100187907', 'C1167622')
(2, 6)
('100187907', 'C1171362')
(2, 6)
('100187907', 'C1177422')
(4, 6)
('100187907', 'C1265608')
(2, 6)
('100187907', 'C1266909')
(2, 6)
('100187907', 'C1272641')
(2, 6)
('100187

('100272147', 'C0752046')
(2, 6)
('100272147', 'C0948896')
(2, 6)
('100272147', 'C1149331')
(5, 6)
('100272147', 'C1520200')
(2, 6)
('100272147', 'C4699378')
(2, 6)
('100272228', 'C0006142')
(2, 6)
('100272228', 'C1426935')
(4, 6)
('100287084', 'C0001962')
(2, 6)
('100287084', 'C0007587')
(2, 6)
('100287084', 'C0017337')
(2, 6)
('100287084', 'C0149784')
(2, 6)
('100287084', 'C0162327')
(2, 6)
('100287084', 'C0343537')
(2, 6)
('100287084', 'C0596263')
(2, 6)
('100287084', 'C2265451')
(2, 6)
('100287084', 'C2610186')
(4, 6)
('100287098', 'C0010054')
(4, 6)
('100287098', 'C0024623')
(2, 6)
('100287098', 'C0040300')
(2, 6)
('100287098', 'C0080983')
(8, 6)
('100287098', 'C0162638')
(2, 6)
('100287098', 'C0475358')
(2, 6)
('100287098', 'C0596290')
(2, 6)
('100287098', 'C2239176')
(2, 6)
('100287098', 'C4721610')
(2, 6)
('100287171', 'C0208355')
(2, 6)
('100287171', 'C0675992')
(2, 6)
('100287171', 'C1326207')
(2, 6)
('100287314', 'C0004609')
(4, 6)
('100287314', 'C0007131')
(4, 6)
('10028731

('100289419', 'C0042760')
(5, 6)
('100289419', 'C0042765')
(5, 6)
('100289419', 'C0042776')
(20, 6)
('100289419', 'C0043016')
(4, 6)
('100289419', 'C0054015')
(5, 6)
('100289419', 'C0055599')
(20, 6)
('100289419', 'C0068808')
(4, 6)
('100289419', 'C0082731')
(10, 6)
('100289419', 'C0085087')
(4, 6)
('100289419', 'C0085381')
(5, 6)
('100289419', 'C0087111')
(4, 6)
('100289419', 'C0162326')
(5, 6)
('100289419', 'C0162638')
(9, 6)
('100289419', 'C0169568')
(5, 6)
('100289419', 'C0175996')
(10, 6)
('100289419', 'C0178774')
(15, 6)
('100289419', 'C0206473')
(9, 6)
('100289419', 'C0206536')
(10, 6)
('100289419', 'C0232804')
(5, 6)
('100289419', 'C0233896')
(5, 6)
('100289419', 'C0235415')
(5, 6)
('100289419', 'C0237401')
(10, 6)
('100289419', 'C0243052')
(5, 6)
('100289419', 'C0254328')
(5, 6)
('100289419', 'C0288171')
(5, 6)
('100289419', 'C0291573')
(4, 6)
('100289419', 'C0318356')
(5, 6)
('100289419', 'C0332124')
(5, 6)
('100289419', 'C0337527')
(5, 6)
('100289419', 'C0341177')
(4, 6)
('1

('100302164', 'C0024117')
(4, 6)
('100302164', 'C0032529')
(2, 6)
('100302164', 'C0960880')
(2, 6)
('100302167', 'C0017337')
(2, 6)
('100302167', 'C0596290')
(2, 6)
('100302167', 'C0598934')
(2, 6)
('100302167', 'C0677626')
(2, 6)
('100302167', 'C0919267')
(2, 6)
('100302167', 'C1411500')
(4, 6)
('100302167', 'C1417768')
(4, 6)
('100302171', 'C1523298')
(2, 6)
('100302172', 'C0006826')
(2, 6)
('100302172', 'C0686619')
(2, 6)
('100302177', 'C0006826')
(2, 6)
('100302177', 'C0007600')
(2, 6)
('100302177', 'C0014859')
(2, 6)
('100302177', 'C0017638')
(4, 6)
('100302177', 'C0040300')
(2, 6)
('100302177', 'C0242379')
(2, 6)
('100302177', 'C0279626')
(2, 6)
('100302177', 'C0346429')
(2, 6)
('100302177', 'C0596290')
(2, 6)
('100302177', 'C1412546')
(4, 6)
('100302177', 'C1455553')
(4, 6)
('100302177', 'C1817666')
(2, 6)
('100302177', 'C2939419')
(2, 6)
('100302179', 'C0006282')
(2, 6)
('100302179', 'C2239176')
(2, 6)
('100302181', 'C0079189')
(2, 6)
('100302181', 'C0665687')
(4, 6)
('10030218

('100316868', 'C0008838')
(6, 6)
('100316868', 'C0009402')
(4, 6)
('100316868', 'C0012634')
(2, 6)
('100316868', 'C0013081')
(2, 6)
('100316868', 'C0014653')
(2, 6)
('100316868', 'C0014859')
(2, 6)
('100316868', 'C0016030')
(2, 6)
('100316868', 'C0016059')
(4, 6)
('100316868', 'C0018270')
(6, 6)
('100316868', 'C0019168')
(2, 6)
('100316868', 'C0020517')
(4, 6)
('100316868', 'C0021368')
(4, 6)
('100316868', 'C0021760')
(2, 6)
('100316868', 'C0024623')
(4, 6)
('100316868', 'C0029408')
(2, 6)
('100316868', 'C0034069')
(2, 6)
('100316868', 'C0035335')
(4, 6)
('100316868', 'C0036421')
(2, 6)
('100316868', 'C0040300')
(2, 6)
('100316868', 'C0040624')
(2, 6)
('100316868', 'C0045093')
(2, 6)
('100316868', 'C0105770')
(2, 6)
('100316868', 'C0153381')
(2, 6)
('100316868', 'C0161479')
(2, 6)
('100316868', 'C0162638')
(6, 6)
('100316868', 'C0165609')
(4, 6)
('100316868', 'C0206698')
(2, 6)
('100316868', 'C0225360')
(2, 6)
('100316868', 'C0229671')
(2, 6)
('100316868', 'C0238463')
(2, 6)
('10031686

('100422830', '100506365')
(4, 6)
('100422836', 'C0002395')
(2, 6)
('100422837', 'C0040300')
(2, 6)
('100422837', 'C1414293')
(4, 6)
('100422838', 'C1460675')
(4, 6)
('100422838', 'C1519595')
(2, 6)
('100422841', 'C1318751')
(4, 6)
('100422851', 'C0078058')
(2, 6)
('100422853', 'C2931822')
(2, 6)
('100422861', 'C0009402')
(2, 6)
('100422861', 'C3584793')
(4, 6)
('100422865', 'C0040300')
(2, 6)
('100422865', 'C1574460')
(4, 6)
('100422869', 'C0077157')
(4, 6)
('100422872', 'C0010346')
(2, 6)
('100422876', 'C0017638')
(4, 6)
('100422876', 'C1622501')
(2, 6)
('100422885', 'C0010346')
(2, 6)
('100422909', 'C0017952')
(2, 6)
('100422909', 'C0596290')
(2, 6)
('100422909', 'C0598934')
(2, 6)
('100422909', 'C1310413')
(4, 6)
('100422910', 'C0027055')
(2, 6)
('100422921', 'C1855593')
(4, 6)
('100422928', 'C0007595')
(2, 6)
('100422928', 'C0007621')
(2, 6)
('100422928', 'C0178874')
(2, 6)
('100422928', 'C0600600')
(2, 6)
('100422932', 'C0016030')
(2, 6)
('100422932', 'C0022548')
(2, 6)
('1004229

('100505633', 'C0040300')
(6, 6)
('100505633', 'C0074414')
(2, 6)
('100505633', 'C0085668')
(2, 6)
('100505633', 'C0103404')
(2, 6)
('100505633', 'C0162638')
(2, 6)
('100505633', 'C0235974')
(2, 6)
('100505633', 'C0343641')
(2, 6)
('100505633', 'C0346429')
(2, 6)
('100505633', 'C0476089')
(4, 6)
('100505633', 'C0596290')
(2, 6)
('100505633', 'C0677886')
(2, 6)
('100505633', 'C1171362')
(2, 6)
('100505633', 'C1172146')
(4, 6)
('100505633', 'C1335302')
(2, 6)
('100505633', 'C1412297')
(4, 6)
('100505633', 'C1419860')
(4, 6)
('100505633', 'C1520113')
(4, 6)
('100505633', 'C2239176')
(4, 6)
('100505633', 'C2350332')
(2, 6)
('100505633', 'C2939419')
(2, 6)
('100505641', '100126340')
(4, 6)
('100505641', 'C0001678')
(4, 6)
('100505641', 'C0006142')
(4, 6)
('100505641', 'C0006667')
(2, 6)
('100505641', 'C0006826')
(8, 6)
('100505641', 'C0007131')
(8, 6)
('100505641', 'C0007134')
(2, 6)
('100505641', 'C0007137')
(2, 6)
('100505641', 'C0007301')
(2, 6)
('100505641', 'C0007595')
(2, 6)
('1005056

('100505994', 'C0279626')
(2, 6)
('100505994', 'C0280100')
(2, 6)
('100505994', 'C0346388')
(2, 6)
('100505994', 'C0440744')
(2, 6)
('100505994', 'C0549473')
(2, 6)
('100505994', 'C0596290')
(6, 6)
('100505994', 'C0600472')
(2, 6)
('100505994', 'C0682523')
(2, 6)
('100505994', 'C0746730')
(2, 6)
('100505994', 'C0815089')
(2, 6)
('100505994', 'C1336093')
(4, 6)
('100505994', 'C1412102')
(4, 6)
('100505994', 'C1415725')
(4, 6)
('100505994', 'C1512505')
(2, 6)
('100505994', 'C1523298')
(6, 6)
('100505994', 'C1948023')
(2, 6)
('100505994', 'C4038423')
(2, 6)
('100505994', 'C4298990')
(2, 6)
('100505994', 'C4511001')
(2, 6)
('100505994', 'C4703567')
(2, 6)
('100505994', 'C5236984')
(2, 6)
('100506013', '100506013')
(8, 6)
('100506013', 'C0002871')
(2, 6)
('100506013', 'C0003009')
(2, 6)
('100506013', 'C0003261')
(4, 6)
('100506013', 'C0003320')
(2, 6)
('100506013', 'C0003483')
(2, 6)
('100506013', 'C0004096')
(4, 6)
('100506013', 'C0006826')
(2, 6)
('100506013', 'C0007222')
(2, 6)
('1005060

('100506311', 'C0040649')
(2, 6)
('100506311', 'C0040690')
(2, 6)
('100506311', 'C0056280')
(4, 6)
('100506311', 'C0086860')
(2, 6)
('100506311', 'C0126037')
(2, 6)
('100506311', 'C0126732')
(2, 6)
('100506311', 'C0162638')
(2, 6)
('100506311', 'C0164207')
(4, 6)
('100506311', 'C0220630')
(2, 6)
('100506311', 'C0220633')
(2, 6)
('100506311', 'C0225369')
(2, 6)
('100506311', 'C0238461')
(4, 6)
('100506311', 'C0238463')
(2, 6)
('100506311', 'C0334094')
(2, 6)
('100506311', 'C0439662')
(2, 6)
('100506311', 'C0504196')
(2, 6)
('100506311', 'C0525006')
(2, 6)
('100506311', 'C0596290')
(6, 6)
('100506311', 'C0596890')
(2, 6)
('100506311', 'C0677626')
(2, 6)
('100506311', 'C0678723')
(2, 6)
('100506311', 'C0686619')
(2, 6)
('100506311', 'C0730328')
(2, 6)
('100506311', 'C1158770')
(2, 6)
('100506311', 'C1326082')
(2, 6)
('100506311', 'C1332146')
(4, 6)
('100506311', 'C1411958')
(4, 6)
('100506311', 'C1413198')
(8, 6)
('100506311', 'C1413609')
(4, 6)
('100506311', 'C1415293')
(12, 6)
('1005063

('100506658', 'C0014609')
(6, 6)
('100506658', 'C0014939')
(2, 6)
('100506658', 'C0015161')
(2, 6)
('100506658', 'C0015350')
(2, 6)
('100506658', 'C0015376')
(6, 6)
('100506658', 'C0016030')
(6, 6)
('100506658', 'C0016712')
(2, 6)
('100506658', 'C0017082')
(2, 6)
('100506658', 'C0017262')
(8, 6)
('100506658', 'C0017337')
(4, 6)
('100506658', 'C0017558')
(4, 6)
('100506658', 'C0017626')
(2, 6)
('100506658', 'C0017710')
(2, 6)
('100506658', 'C0017725')
(2, 6)
('100506658', 'C0017797')
(2, 6)
('100506658', 'C0017822')
(2, 6)
('100506658', 'C0017824')
(2, 6)
('100506658', 'C0017837')
(2, 6)
('100506658', 'C0017911')
(2, 6)
('100506658', 'C0017968')
(2, 6)
('100506658', 'C0018207')
(2, 6)
('100506658', 'C0019080')
(2, 6)
('100506658', 'C0019196')
(2, 6)
('100506658', 'C0019204')
(2, 6)
('100506658', 'C0019693')
(2, 6)
('100506658', 'C0019699')
(2, 6)
('100506658', 'C0019868')
(2, 6)
('100506658', 'C0020114')
(4, 6)
('100506658', 'C0020456')
(2, 6)
('100506658', 'C0020838')
(2, 6)
('10050665

('100506658', 'C1336093')
(8, 6)
('100506658', 'C1336789')
(2, 6)
('100506658', 'C1337010')
(4, 6)
('100506658', 'C1365599')
(4, 6)
('100506658', 'C1367958')
(4, 6)
('100506658', 'C1419069')
(4, 6)
('100506658', 'C1419075')
(4, 6)
('100506658', 'C1419076')
(8, 6)
('100506658', 'C1427291')
(4, 6)
('100506658', 'C1440080')
(4, 6)
('100506658', 'C1443082')
(4, 6)
('100506658', 'C1447124')
(8, 6)
('100506658', 'C1449838')
(2, 6)
('100506658', 'C1451432')
(4, 6)
('100506658', 'C1515670')
(2, 6)
('100506658', 'C1515979')
(2, 6)
('100506658', 'C1516370')
(2, 6)
('100506658', 'C1519619')
(2, 6)
('100506658', 'C1519697')
(4, 6)
('100506658', 'C1519751')
(4, 6)
('100506658', 'C1521475')
(2, 6)
('100506658', 'C1527311')
(2, 6)
('100506658', 'C1565434')
(4, 6)
('100506658', 'C1571705')
(2, 6)
('100506658', 'C1608322')
(4, 6)
('100506658', 'C3896693')
(4, 6)
('100506660', 'C0005684')
(4, 6)
('100506660', 'C0006826')
(2, 6)
('100506660', 'C0007090')
(2, 6)
('100506660', 'C0007131')
(6, 6)
('10050666

('100506994', 'C0677886')
(2, 6)
('100506994', 'C0919759')
(4, 6)
('100506994', 'C1519697')
(2, 6)
('100506994', 'C3494264')
(2, 6)
('100507008', 'C0017337')
(2, 6)
('100507008', 'C0023884')
(2, 6)
('100507008', 'C0058217')
(2, 6)
('100507008', 'C0376358')
(2, 6)
('100507008', 'C2239176')
(2, 6)
('100507027', '100507027')
(35, 6)
('100507027', 'C0000146')
(3, 6)
('100507027', 'C0000959')
(2, 6)
('100507027', 'C0000996')
(2, 6)
('100507027', 'C0001041')
(6, 6)
('100507027', 'C0001044')
(3, 6)
('100507027', 'C0001418')
(6, 6)
('100507027', 'C0001734')
(2, 6)
('100507027', 'C0001927')
(2, 6)
('100507027', 'C0001962')
(2, 6)
('100507027', 'C0001963')
(2, 6)
('100507027', 'C0002069')
(2, 6)
('100507027', 'C0002074')
(3, 6)
('100507027', 'C0002395')
(6, 6)
('100507027', 'C0003232')
(3, 6)
('100507027', 'C0003241')
(2, 6)
('100507027', 'C0003250')
(2, 6)
('100507027', 'C0003297')
(3, 6)
('100507027', 'C0003759')
(3, 6)
('100507027', 'C0003808')
(3, 6)
('100507027', 'C0004096')
(6, 6)
('100507

('100507056', 'C0023810')
(2, 6)
('100507056', 'C0027651')
(4, 6)
('100507056', 'C0029418')
(2, 6)
('100507056', 'C0029463')
(2, 6)
('100507056', 'C0040300')
(2, 6)
('100507056', 'C0040649')
(2, 6)
('100507056', 'C0144576')
(2, 6)
('100507056', 'C0162638')
(4, 6)
('100507056', 'C0270922')
(2, 6)
('100507056', 'C0306657')
(4, 6)
('100507056', 'C0376358')
(2, 6)
('100507056', 'C0391978')
(2, 6)
('100507056', 'C0400966')
(2, 6)
('100507056', 'C0596290')
(4, 6)
('100507056', 'C0598934')
(6, 6)
('100507056', 'C1347689')
(4, 6)
('100507056', 'C1410725')
(4, 6)
('100507056', 'C1419860')
(4, 6)
('100507056', 'C1426624')
(4, 6)
('100507056', 'C1429839')
(4, 6)
('100507056', 'C1516170')
(2, 6)
('100507056', 'C2239176')
(2, 6)
('100507056', 'C2247140')
(2, 6)
('100507056', 'C2937421')
(2, 6)
('100507056', 'C2939419')
(2, 6)
('100507056', 'C3401577')
(4, 6)
('100507056', 'C3820615')
(2, 6)
('100507056', 'C3826170')
(2, 6)
('100507057', 'C0029463')
(2, 6)
('100507057', 'C0035696')
(2, 6)
('10050705

('100507436', 'C0025281')
(2, 6)
('100507436', 'C0028514')
(6, 6)
('100507436', 'C0030705')
(4, 6)
('100507436', 'C0032529')
(2, 6)
('100507436', 'C0033860')
(2, 6)
('100507436', 'C0038013')
(2, 6)
('100507436', 'C0039263')
(2, 6)
('100507436', 'C0040021')
(2, 6)
('100507436', 'C0040649')
(4, 6)
('100507436', 'C0042210')
(2, 6)
('100507436', 'C0062837')
(2, 6)
('100507436', 'C0066503')
(6, 6)
('100507436', 'C0085247')
(2, 6)
('100507436', 'C0086418')
(2, 6)
('100507436', 'C0162326')
(2, 6)
('100507436', 'C0162493')
(2, 6)
('100507436', 'C0225336')
(2, 6)
('100507436', 'C0232804')
(2, 6)
('100507436', 'C0239946')
(2, 6)
('100507436', 'C0263361')
(2, 6)
('100507436', 'C0301872')
(2, 6)
('100507436', 'C0376659')
(2, 6)
('100507436', 'C0393080')
(2, 6)
('100507436', 'C0443640')
(2, 6)
('100507436', 'C0494165')
(2, 6)
('100507436', 'C0518948')
(2, 6)
('100507436', 'C0699748')
(4, 6)
('100507436', 'C0699893')
(4, 6)
('100507436', 'C1167322')
(2, 6)
('100507436', 'C1337641')
(4, 6)
('10050743

('100616100', 'C0021757')
(3, 6)
('100616100', 'C0021759')
(5, 6)
('100616100', 'C0021760')
(12, 6)
('100616100', 'C0021764')
(4, 6)
('100616100', 'C0021770')
(2, 6)
('100616100', 'C0021948')
(2, 6)
('100616100', 'C0022567')
(2, 6)
('100616100', 'C0022646')
(6, 6)
('100616100', 'C0022658')
(7, 6)
('100616100', 'C0022663')
(2, 6)
('100616100', 'C0022917')
(2, 6)
('100616100', 'C0022942')
(2, 6)
('100616100', 'C0023516')
(6, 6)
('100616100', 'C0023530')
(4, 6)
('100616100', 'C0023545')
(4, 6)
('100616100', 'C0023546')
(2, 6)
('100616100', 'C0023553')
(22, 6)
('100616100', 'C0023558')
(4, 6)
('100616100', 'C0023562')
(4, 6)
('100616100', 'C0023810')
(6, 6)
('100616100', 'C0023823')
(2, 6)
('100616100', 'C0023828')
(2, 6)
('100616100', 'C0023837')
(2, 6)
('100616100', 'C0023884')
(2, 6)
('100616100', 'C0023907')
(2, 6)
('100616100', 'C0024109')
(3, 6)
('100616100', 'C0024117')
(4, 6)
('100616100', 'C0024141')
(2, 6)
('100616100', 'C0024198')
(2, 6)
('100616100', 'C0024353')
(4, 6)
('100616

('100616100', 'C0687129')
(2, 6)
('100616100', 'C0699748')
(6, 6)
('100616100', 'C0699918')
(4, 6)
('100616100', 'C0699949')
(2, 6)
('100616100', 'C0700198')
(2, 6)
('100616100', 'C0733439')
(4, 6)
('100616100', 'C0744471')
(2, 6)
('100616100', 'C0750016')
(2, 6)
('100616100', 'C0751982')
(2, 6)
('100616100', 'C0752312')
(2, 6)
('100616100', 'C0752320')
(2, 6)
('100616100', 'C0754893')
(2, 6)
('100616100', 'C0814002')
(2, 6)
('100616100', 'C0815089')
(2, 6)
('100616100', 'C0817124')
(4, 6)
('100616100', 'C0857121')
(2, 6)
('100616100', 'C0871786')
(2, 6)
('100616100', 'C0876973')
(2, 6)
('100616100', 'C0887899')
(2, 6)
('100616100', 'C0910167')
(2, 6)
('100616100', 'C0917798')
(2, 6)
('100616100', 'C0947693')
(2, 6)
('100616100', 'C0947751')
(2, 6)
('100616100', 'C0947858')
(2, 6)
('100616100', 'C0948393')
(2, 6)
('100616100', 'C0948853')
(2, 6)
('100616100', 'C0963088')
(2, 6)
('100616100', 'C0971858')
(4, 6)
('100616100', 'C100861467')
(2, 6)
('100616100', 'C105375913')
(2, 6)
('1006

('100616443', 'C0291573')
(3, 6)
('100616443', 'C0295247')
(3, 6)
('100616443', 'C0301641')
(3, 6)
('100616443', 'C0301871')
(3, 6)
('100616443', 'C0311474')
(3, 6)
('100616443', 'C0317872')
(3, 6)
('100616443', 'C0376358')
(3, 6)
('100616443', 'C0385463')
(3, 6)
('100616443', 'C0486805')
(3, 6)
('100616443', 'C0544910')
(3, 6)
('100616443', 'C0565125')
(15, 6)
('100616443', 'C0580247')
(3, 6)
('100616443', 'C0596290')
(6, 6)
('100616443', 'C0596922')
(3, 6)
('100616443', 'C0596981')
(4, 6)
('100616443', 'C0597295')
(3, 6)
('100616443', 'C0597357')
(3, 6)
('100616443', 'C0598312')
(3, 6)
('100616443', 'C0598496')
(4, 6)
('100616443', 'C0598934')
(7, 6)
('100616443', 'C0599155')
(3, 6)
('100616443', 'C0599781')
(3, 6)
('100616443', 'C0599893')
(3, 6)
('100616443', 'C0599946')
(6, 6)
('100616443', 'C0600388')
(3, 6)
('100616443', 'C0600493')
(3, 6)
('100616443', 'C0600680')
(3, 6)
('100616443', 'C0600688')
(3, 6)
('100616443', 'C0678572')
(3, 6)
('100616443', 'C0695728')
(12, 6)
('100616

('100616496', 'C0036773')
(4, 6)
('100616496', 'C0036974')
(2, 6)
('100616496', 'C0037080')
(4, 6)
('100616496', 'C0037144')
(2, 6)
('100616496', 'C0037179')
(2, 6)
('100616496', 'C0037224')
(2, 6)
('100616496', 'C0037380')
(4, 6)
('100616496', 'C0037949')
(2, 6)
('100616496', 'C0037994')
(2, 6)
('100616496', 'C0038172')
(2, 6)
('100616496', 'C0038838')
(2, 6)
('100616496', 'C0039194')
(14, 6)
('100616496', 'C0039195')
(8, 6)
('100616496', 'C0039231')
(2, 6)
('100616496', 'C0039286')
(2, 6)
('100616496', 'C0039597')
(2, 6)
('100616496', 'C0040053')
(2, 6)
('100616496', 'C0040079')
(2, 6)
('100616496', 'C0040300')
(2, 6)
('100616496', 'C0040624')
(2, 6)
('100616496', 'C0040648')
(4, 6)
('100616496', 'C0040649')
(14, 6)
('100616496', 'C0040669')
(2, 6)
('100616496', 'C0040711')
(8, 6)
('100616496', 'C0041538')
(4, 6)
('100616496', 'C0042210')
(2, 6)
('100616496', 'C0042216')
(2, 6)
('100616496', 'C0042333')
(2, 6)
('100616496', 'C0042338')
(2, 6)
('100616496', 'C0042720')
(4, 6)
('100616

('100616496', 'C1411044')
(4, 6)
('100616496', 'C1412099')
(4, 6)
('100616496', 'C1414301')
(4, 6)
('100616496', 'C1416066')
(4, 6)
('100616496', 'C1419252')
(4, 6)
('100616496', 'C1445451')
(4, 6)
('100616496', 'C1456409')
(2, 6)
('100616496', 'C1457124')
(4, 6)
('100616496', 'C1474518')
(2, 6)
('100616496', 'C1511698')
(2, 6)
('100616496', 'C1511760')
(2, 6)
('100616496', 'C1512301')
(2, 6)
('100616496', 'C1512796')
(2, 6)
('100616496', 'C1512977')
(2, 6)
('100616496', 'C1513019')
(2, 6)
('100616496', 'C1513410')
(2, 6)
('100616496', 'C1514570')
(4, 6)
('100616496', 'C1514716')
(2, 6)
('100616496', 'C1514926')
(2, 6)
('100616496', 'C1515670')
(2, 6)
('100616496', 'C1516511')
(2, 6)
('100616496', 'C1516924')
(2, 6)
('100616496', 'C1519346')
(2, 6)
('100616496', 'C1519591')
(6, 6)
('100616496', 'C1519595')
(4, 6)
('100616496', 'C1522642')
(2, 6)
('100616496', 'C1522909')
(2, 6)
('100616496', 'C1523014')
(2, 6)
('100616496', 'C1523347')
(2, 6)
('100616496', 'C1527360')
(2, 6)
('10061649

('100859930', 'C1284424')
(4, 6)
('100859930', 'C1406961')
(4, 6)
('100859930', 'C1411024')
(4, 6)
('100859930', 'C1413068')
(4, 6)
('100859930', 'C1425818')
(8, 6)
('100859930', 'C1515670')
(2, 6)
('100859930', 'C1516119')
(2, 6)
('100859930', 'C2239176')
(2, 6)
('100859930', 'C2931822')
(2, 6)
('100859930', 'C3539878')
(2, 6)
('100861402', 'C0040300')
(2, 6)
('100861402', 'C0162638')
(4, 6)
('100861402', 'C0170936')
(2, 6)
('100861402', 'C0235974')
(4, 6)
('100861402', 'C0596263')
(2, 6)
('100861402', 'C0596290')
(4, 6)
('100861402', 'C0598934')
(2, 6)
('100861402', 'C1427532')
(4, 6)
('100861402', 'C1512505')
(2, 6)
('100861402', 'C2239176')
(2, 6)
('100861402', 'C4729223')
(8, 6)
('100861467', 'C0003261')
(2, 6)
('100861467', 'C0003313')
(2, 6)
('100861467', 'C0004561')
(2, 6)
('100861467', 'C0006675')
(2, 6)
('100861467', 'C0007452')
(2, 6)
('100861467', 'C0008495')
(2, 6)
('100861467', 'C0009498')
(4, 6)
('100861467', 'C0009528')
(2, 6)
('100861467', 'C0014644')
(2, 6)
('10086146

('100862685', 'C0006933')
(10, 6)
('100862685', 'C0007004')
(20, 6)
('100862685', 'C0007009')
(10, 6)
('100862685', 'C0007033')
(10, 6)
('100862685', 'C0007075')
(10, 6)
('100862685', 'C0007090')
(72, 6)
('100862685', 'C0007097')
(118, 6)
('100862685', 'C0007102')
(10, 6)
('100862685', 'C0007103')
(69, 6)
('100862685', 'C0007104')
(28, 6)
('100862685', 'C0007115')
(40, 6)
('100862685', 'C0007124')
(55, 6)
('100862685', 'C0007130')
(14, 6)
('100862685', 'C0007131')
(57, 6)
('100862685', 'C0007137')
(10, 6)
('100862685', 'C0007188')
(10, 6)
('100862685', 'C0007222')
(10, 6)
('100862685', 'C0007284')
(10, 6)
('100862685', 'C0007295')
(20, 6)
('100862685', 'C0007301')
(20, 6)
('100862685', 'C0007332')
(30, 6)
('100862685', 'C0007341')
(20, 6)
('100862685', 'C0007367')
(10, 6)
('100862685', 'C0007382')
(30, 6)
('100862685', 'C0007422')
(10, 6)
('100862685', 'C0007427')
(10, 6)
('100862685', 'C0007428')
(10, 6)
('100862685', 'C0007447')
(10, 6)
('100862685', 'C0007450')
(20, 6)
('100862685',

('100862685', 'C0017534')
(20, 6)
('100862685', 'C0017636')
(20, 6)
('100862685', 'C0017658')
(20, 6)
('100862685', 'C0017710')
(14, 6)
('100862685', 'C0017725')
(20, 6)
('100862685', 'C0017776')
(10, 6)
('100862685', 'C0017786')
(10, 6)
('100862685', 'C0017797')
(10, 6)
('100862685', 'C0017801')
(10, 6)
('100862685', 'C0017817')
(10, 6)
('100862685', 'C0017837')
(20, 6)
('100862685', 'C0017842')
(10, 6)
('100862685', 'C0017857')
(10, 6)
('100862685', 'C0017861')
(20, 6)
('100862685', 'C0017890')
(10, 6)
('100862685', 'C0017916')
(10, 6)
('100862685', 'C0017952')
(10, 6)
('100862685', 'C0017968')
(50, 6)
('100862685', 'C0017973')
(30, 6)
('100862685', 'C0017982')
(10, 6)
('100862685', 'C0018023')
(14, 6)
('100862685', 'C0018150')
(20, 6)
('100862685', 'C0018183')
(10, 6)
('100862685', 'C0018188')
(10, 6)
('100862685', 'C0018207')
(56, 6)
('100862685', 'C0018213')
(14, 6)
('100862685', 'C0018270')
(159, 6)
('100862685', 'C0018284')
(30, 6)
('100862685', 'C0018330')
(10, 6)
('100862685',

KeyboardInterrupt: 

In [97]:
train_df, val_df, test_df = get_split(df_ = df, kg_map = med_kg_cols)

In [99]:
train_df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,scores
0,21012863,CAUSES,100033819,C0004391,1.0,1164785
1,21012863,CAUSES,100033819,C0004391,1.0,1164785
2,22628456,AUGMENTS,100033819,C0007587,1.0,779216
3,22628456,AUGMENTS,100033819,C0007587,1.0,779216
4,22628462,ASSOCIATED_WITH,100033819,C0019158,1.0,1294627


In [98]:
test_split(train_df, val_df, test_df, med_kg_cols)

ents in val not in train 0
ents in test not in train 0
ents in train not in val 277501
ents in train not in test 273383
rels in val not in train 0
rels in test not in train 0
rels in train not in val 5
rels in train not in test 7
train size:  26444737
val size:  986090
test size:  986090
train %:  0.930598382646506
val %:  0.034700808676747026
test %:  0.034700808676747026
VOCAB_SIZE:  344991
REL_VOCAB_SIZE:  68
Patients
same_as 1532
C5542959
same_as ADMINISTERED_TO
C5544407
same_as ADMINISTERED_TO


In [38]:
df.shape

(28416917, 5)

In [89]:
process_data(df_ = df, kg_map = med_kg_cols, save_path = '')

185977


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_[kg_map['h']] = df_[kg_map['h']].map(all_ents)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_[kg_map['t']] = df_[kg_map['t']].map(all_ents)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_[kg_map['r']] = df_[kg_map['r']].map(all_rels)
A value is trying to be set on a copy of a slice from a

ents in val not in train 12
ents in test not in train 11
ents in train not in val 23715
ents in train not in test 15044
rels in val not in train 1
rels in test not in train 0
rels in train not in val 0
rels in train not in test 0
train size:  22277870
val size:  2784734
test size:  2784734
train %:  0.79999998563597
val %:  0.10000000718201503
test %:  0.10000000718201503
VOCAB_SIZE:  159013
REL_VOCAB_SIZE:  63
159012
62 0
159012
62 0
159012
62 0


## Tests

In [39]:
clean_triples(df_ = df, kg_map = med_kg_cols).shape

185977


(27847338, 5)

In [40]:
get_format_data(df_u = df, kg_map = med_kg_cols).shape

TypeError: get_format_data() got an unexpected keyword argument 'df_u'