In [1]:
# The objective is to split the sentence files and aux_predications into multiple files;
# each file needs to contain only the sentences and aux_preds of the same triple.
import pandas as pd
from glob import glob
from os.path import join
import gc
import gzip
from csv import reader
from collections import Counter
import os
from tqdm.notebook import tqdm
import numpy as np
import json
from multiprocessing.pool import Pool, ThreadPool
from pathos.multiprocessing import ProcessingPool
from concurrent.futures import ProcessPoolExecutor
from more_itertools import batched

# Column names applied (via ``names=``) when reading the header-less, gzipped
# sentence CSV splits.
SENTENCE_COLS = [
    'SENTENCE_ID','PMID','TYPE','NUMBER','SENT_START_INDEX',
    'SENTENCE','SENT_END_INDEX','NORMALIZED_SECTION_HEADER','SECTION_HEADER']
# Column names applied when reading the header-less auxiliary-predication CSV
# splits; the *_START_INDEX / *_END_INDEX columns are the ones handed to
# mask_triple further down.
PREDICATION_AUX_COLS = ['PREDICATION_AUX_ID' ,'PREDICATION_ID' ,'SUBJECT_TEXT','SUBJECT_DIST'
,'SUBJECT_MAXDIST','SUBJECT_START_INDEX'
,'SUBJECT_END_INDEX','SUBJECT_SCORE'
,'INDICATOR_TYPE','PREDICATE_START_INDEX'
,'PREDICATE_END_INDEX','OBJECT_TEXT','OBJECT_DIST'
,'OBJECT_MAXDIST','OBJECT_START_INDEX'
,'OBJECT_END_INDEX','OBJECT_SCORE'
,'CURR_TIMESTAMP']

# Column names applied when reading the header-less predication CSV splits.
# NOTE(review): 'unk1'..'unk3' are placeholders for trailing columns whose
# meaning is not established anywhere in this notebook.
PREDICATION_COLS = ['PREDICATION_ID','SENTENCE_ID','PMID',
                        'PREDICATE','SUBJECT_CUI','SUBJECT_NAME','SUBJECT_SEMTYPE',
                       'SUBJECT_NOVELTY','OBJECT_CUI','OBJECT_NAME','OBJECT_SEMTYPE','OBJECT_NOVELTY','unk1','unk2','unk3']

# Directories (relative to the working directory) that are globbed for '*.gz'
# splits and that receive the generated '*_index.csv' files.
SENTENCE_DIR = 'SENTENCE'
PREDICATION_DIR = 'PREDICATION'
PREDICATION_AUX_DIR = 'PREDICATION_AUX'


In [2]:
# !pip install pathos
# !pip install more-itertools
# list(batched(list(range(20)), 3))
# !pip install -U pandas

In [3]:
pd.__version__

'1.5.3'

In [4]:
import gc

gc.collect()

0

In [5]:
def write_json_lines(file_name,dict_data):
    """Append ``dict_data`` to ``file_name`` as a single JSON-lines record.

    The value is serialised before the file is opened, and the file is opened
    in append mode, so repeated calls accumulate one JSON document per line.
    """
    serialized = json.dumps(dict_data)
    with open(file_name, 'a') as out_file:
        out_file.write("{}\n".format(serialized))
        
def read_json_lines(file_name):
    """Read a JSON-lines file and return its records as a list.

    Parameters
    ----------
    file_name : str
        Path of a text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded documents, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_name) as file_in:
        for line in file_in:
            # Blank lines (e.g. an extra trailing newline) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

def save_json(data_path, data):
    """Serialise ``data`` as JSON into ``data_path``, replacing any existing file."""
    with open(data_path, 'w') as out_file:
        json.dump(data, out_file)
def read_json(data_path):
    """Load and return the single JSON document stored in ``data_path``."""
    with open(data_path) as in_file:
        return json.load(in_file)


In [6]:
def manual_separation(bad_line):
    """Repair a row that the CSV parser split into too many fields.

    Intended as an ``on_bad_lines`` callback for ``pd.read_csv``: pandas passes
    the offending row as a list of strings split on the separator. The first
    five and the last three fields are kept as they are, and everything in
    between is glued back into one field.

    The pieces are re-joined with a bare ``','`` (the separator they were split
    on) so the recovered text is character-for-character what was in the file;
    joining with ``', '`` would insert spaces and shift every character offset
    into the sentence.

    Parameters
    ----------
    bad_line : list of str
        The fields of the malformed row.

    Returns
    -------
    list of str
        Five leading fields, the re-joined middle field, three trailing fields.
    """
    leading = bad_line[:5]
    trailing = bad_line[-3:]
    merged_sentence = ','.join(bad_line[5:-3])
    return leading + [merged_sentence] + trailing
def test_line(bad_line):
    print(bad_line)
    return bad_line

def create_sentence_indexes():
    """Write a ``<split>_index.csv`` for every gzipped split in ``SENTENCE_DIR``.

    Each index file has two columns: the ``SENTENCE_ID`` values found in the
    split and the path of the split they were read from. Rows with too many
    fields are repaired by ``manual_separation``. The index is written
    uncompressed.
    """
    sentence_files = glob(join(SENTENCE_DIR,'*.gz'))
    sentence_files.sort()
    print(len(sentence_files))
    for split_path in tqdm(sentence_files):
        split_df = pd.read_csv(
            split_path,
            header=None,
            names=SENTENCE_COLS,
            compression='gzip',
            encoding='ISO-8859-1',
            engine='python',
            on_bad_lines=manual_separation,
        )
        split_df['file_name'] = split_path
        # Everything before the first '.' in the path is the stem of the index file.
        index_path = split_path.split('.')[0] + '_index.csv'
        split_df[['SENTENCE_ID', 'file_name']].to_csv(index_path, index = False)

def get_all_index(main_dir):
    """Concatenate every per-split ``*_index.csv`` in ``main_dir`` into one frame.

    Parameters
    ----------
    main_dir : str
        Directory holding the ``<split>_index.csv`` files.

    Returns
    -------
    pandas.DataFrame or None
        All index rows with a fresh 0..n-1 index, or ``None`` when the
        directory contains no index file.

    Notes
    -----
    * Files whose name starts with ``all_`` are skipped: the aggregated outputs
      this notebook saves into the same directories (``all_index.csv``,
      ``all_combo_index.csv``) also match ``*_index.csv`` and would otherwise
      be folded back into the result on a re-run.
    * Some index files in this notebook are written gzip-compressed under a
      plain ``.csv`` name, which pandas cannot infer from the extension, so the
      compression is detected from the gzip magic bytes.
    """
    def _compression_of(path):
        # gzip streams always start with the two bytes 0x1f 0x8b.
        with open(path, 'rb') as raw:
            return 'gzip' if raw.read(2) == b'\x1f\x8b' else None

    all_files = sorted(
        fil for fil in glob(join(main_dir,'*_index.csv'))
        if not os.path.basename(fil).startswith('all_')
    )
    # Read everything first and concatenate once; growing a frame with
    # pd.concat inside the loop re-copies all earlier rows on every step.
    frames = [pd.read_csv(fil, compression=_compression_of(fil)) for fil in tqdm(all_files)]
    if not frames:
        return None
    return pd.concat(frames, ignore_index = True)

def create_pred_aux_indexes():
    """Write a ``<split>_index.csv`` for every gzipped split in ``PREDICATION_AUX_DIR``.

    Each index file lists ``PREDICATION_AUX_ID``, ``PREDICATION_ID`` and the
    path of the split the row came from. Malformed rows are printed by
    ``test_line`` and passed through. Note that the index is written
    gzip-compressed even though its name ends in ``.csv``.
    """
    aux_files = glob(join(PREDICATION_AUX_DIR,'*.gz'))
    aux_files.sort()
    print(len(aux_files))
    index_cols = ['PREDICATION_AUX_ID', 'PREDICATION_ID', 'file_name']
    for split_path in tqdm(aux_files):
        split_df = pd.read_csv(
            split_path,
            header=None,
            names=PREDICATION_AUX_COLS,
            compression='gzip',
            encoding='ISO-8859-1',
            engine='python',
            on_bad_lines=test_line,
        )
        split_df['file_name'] = split_path
        # Everything before the first '.' in the path is the stem of the index file.
        index_path = split_path.split('.')[0] + '_index.csv'
        split_df[index_cols].to_csv(index_path, compression= 'gzip', index = False)

def create_pred_indexes():
    """Write a ``<split>_index.csv`` for every gzipped split in ``PREDICATION_DIR``.

    Each index file lists ``SENTENCE_ID``, ``PREDICATION_ID`` and the path of
    the split the row came from. The previous column selection asked for
    ``PREDICATION_AUX_ID``, which is not one of ``PREDICATION_COLS`` and made
    the function raise ``KeyError``; the predication table links to sentences
    through ``SENTENCE_ID``, which is what the index now carries.

    Malformed rows are printed by ``test_line`` and passed through. The index
    is written gzip-compressed even though its name ends in ``.csv``.
    """
    index_cols = ['SENTENCE_ID', 'PREDICATION_ID', 'file_name']
    all_files = sorted(glob(join(PREDICATION_DIR,'*.gz')))
    print(len(all_files))
    for fil in tqdm(all_files):
        df = pd.read_csv(fil, on_bad_lines=test_line,
                 compression= 'gzip', encoding='ISO-8859-1', header=None, names = PREDICATION_COLS, engine='python'
        )
        df['file_name'] = fil
        # Everything before the first '.' in the path is the stem of the index file.
        df[index_cols].to_csv(fil.split('.')[0]+'_index.csv',compression= 'gzip', index = False)

def create_indexes(main_dir, index_cols, names=None):
    """Write a ``<split>_index.csv`` for every gzipped split in ``main_dir``.

    Parameters
    ----------
    main_dir : str
        Directory that is globbed for ``*.gz`` splits.
    index_cols : list of str
        Columns to keep in the index. ``'file_name'`` (the path of the split)
        is added to every frame and may be listed here.
    names : list of str, optional
        Column names of the header-less splits. Defaults to
        ``PREDICATION_COLS``, which was previously hard-coded, so existing
        two-argument calls behave as before.

    Notes
    -----
    Malformed rows are printed by ``test_line`` and passed through. The index
    is written gzip-compressed even though its name ends in ``.csv``.
    """
    if names is None:
        names = PREDICATION_COLS
    all_files = sorted(glob(join(main_dir,'*.gz')))
    print(len(all_files))
    for fil in tqdm(all_files):
        df = pd.read_csv(fil, on_bad_lines=test_line,
                 compression= 'gzip', encoding='ISO-8859-1', header=None, names = names, engine='python'
        )
        df['file_name'] = fil
        # Everything before the first '.' in the path is the stem of the index file.
        df[index_cols].to_csv(fil.split('.')[0]+'_index.csv',compression= 'gzip', index = False)

def mask_triple(sentence, sent_end, s_end, s_start, p_end, p_start, o_end, o_start):
    """Replace the subject, predicate and object spans of a sentence with placeholders.

    Parameters
    ----------
    sentence : str
        The sentence text.
    sent_end : int
        Offset subtracted from every span index to turn it into a position
        inside ``sentence``. Despite its name, the caller in this notebook
        passes the sentence's start index here.
    s_end, s_start : int
        End / start index of the subject span.
    p_end, p_start : int
        End / start index of the predicate span.
    o_end, o_start : int
        End / start index of the object span.

    Returns
    -------
    str
        ``sentence`` with the three spans, taken in order of their start
        index, replaced by ``@SUBJECT$``, ``@PREDICAT$`` and ``@OBJECT$``.

    Notes
    -----
    The previous implementation paired the role labels ``['s', 'o', 'p']`` with
    spans listed in subject/predicate/object order, so the predicate was
    masked as ``@OBJECT$`` and the object as ``@PREDICAT$``. Each span is now
    tagged with its own role.
    """
    placeholders = {'s': '@SUBJECT$', 'p': '@PREDICAT$', 'o': '@OBJECT$'}
    # Left-to-right order of the spans; ties on start fall back to end, then role.
    spans = sorted([
        (int(s_start), int(s_end), 's'),
        (int(p_start), int(p_end), 'p'),
        (int(o_start), int(o_end), 'o'),
    ])
    offset = int(sent_end)
    pieces = []
    cursor = None  # None -> the first slice starts at the beginning of the sentence
    for start, end, role in spans:
        pieces.append(sentence[cursor:start - offset])
        pieces.append(placeholders[role])
        cursor = end - offset
    pieces.append(sentence[cursor:])
    return ''.join(pieces)

def get_predicate_sent_factuality(df_sent_idx):
    """Build masked sentences for the auxiliary predications and save them in chunks.

    For every gzipped split in ``PREDICATION_AUX_DIR`` the rows are inner-joined
    with ``df_sent_idx`` on ``PREDICATION_AUX_ID``, grouped by the sentence
    split they point at (``file_name``), and each row's subject / predicate /
    object spans are masked in its sentence with ``mask_triple``.

    Parameters
    ----------
    df_sent_idx : pandas.DataFrame
        Index frame providing at least ``PREDICATION_AUX_ID``, ``SENTENCE_ID``
        and ``file_name`` (path of the gzipped sentence split that holds the
        sentence).

    Returns
    -------
    None
        Results are written to ``Format_sents/form_sent_<n>.csv`` (gzip
        content) with the columns ``PREDICATION_AUX_ID``, ``SENTENCE``,
        ``FORMATED_SENTENCE`` and ``file_name``.

    Notes
    -----
    Changes compared with the previous version:

    * the chunk counter is advanced after *every* write; the flush at the end
      of a sentence split used to leave it untouched, so the next chunk
      overwrote the file that had just been written;
    * only the per-record lookup/masking is guarded, and the failure is
      reported with the actual exception; write errors are no longer swallowed;
    * sentences are looked up through an index instead of scanning the whole
      split for every record (the first row per ``SENTENCE_ID`` is used, as
      before);
    * the output directory is created if it does not exist.
    """
    out_dir = 'Format_sents'
    os.makedirs(out_dir, exist_ok=True)

    def _flush(records, counter):
        """Write ``records`` as chunk number ``counter`` and return the next chunk number."""
        pd.DataFrame(records).to_csv(
            join(out_dir, 'form_sent_{}.csv'.format(counter)), compression = 'gzip' , index = False)
        return counter + 1

    all_files = sorted(glob(join(PREDICATION_AUX_DIR,'*.gz')))
    print(len(all_files))
    save_data = []
    save_counter = 0
    for fil in tqdm(all_files):
        df = pd.read_csv(
            fil, compression= 'gzip', on_bad_lines=test_line, encoding='ISO-8859-1',
            header=None, names = PREDICATION_AUX_COLS, engine='python'
        )
        df = df.merge(df_sent_idx, left_on='PREDICATION_AUX_ID', right_on='PREDICATION_AUX_ID', how = 'inner')
        for file_name, pred_df in df.groupby('file_name'):
            sent_df = pd.read_csv(
                file_name, compression= 'gzip', on_bad_lines=manual_separation, encoding='ISO-8859-1',
                header=None, names = SENTENCE_COLS, engine='python'
            )
            # First row per SENTENCE_ID, addressable by label.
            sent_lookup = sent_df.drop_duplicates(subset='SENTENCE_ID').set_index('SENTENCE_ID')
            for _, rec in pred_df.iterrows():
                sentence_id = rec.get('SENTENCE_ID')
                try:
                    sent_data = sent_lookup.loc[sentence_id]
                    sentence = sent_data['SENTENCE']
                    # The sentence's start index is the offset that maps the
                    # span indices to positions inside the sentence text.
                    formated_sent = mask_triple(
                        sentence, sent_data['SENT_START_INDEX'],
                        s_start=rec['SUBJECT_START_INDEX'], s_end=rec['SUBJECT_END_INDEX'],
                        p_start=rec['PREDICATE_START_INDEX'], p_end=rec['PREDICATE_END_INDEX'],
                        o_start=rec['OBJECT_START_INDEX'], o_end=rec['OBJECT_END_INDEX'])
                except Exception as err:
                    # Best effort: report the record that could not be masked and move on.
                    print(sentence_id, ' : ', repr(err))
                    continue
                save_data.append({
                    'PREDICATION_AUX_ID': rec['PREDICATION_AUX_ID'],
                    'SENTENCE': sentence,
                    'FORMATED_SENTENCE': formated_sent,
                    'file_name': file_name,
                })
                if len(save_data)>100:
                    save_counter = _flush(save_data, save_counter)
                    save_data = []
            # Flush what is left for this sentence split before loading the next one.
            if len(save_data)>0:
                save_counter = _flush(save_data, save_counter)
                save_data = []
def get_format_sentences_examples():
    """Unfinished helper: only collects the ``Format_sents/*.csv`` paths and returns ``None``."""
    csv_pattern = join('Format_sents', '*.csv')
    all_paths = glob(csv_pattern)
    

In [7]:
'split_01.csv.gz'.split('.')[0]

'split_01'

## Splitting zipped CSVs into multiple files

In [None]:
# gzip -dc semmedVER43_2021_R_SENTENCE.csv | split -C 21G -d - split_ --filter='gzip > $FILE.csv.gz'
# pigz -dc semmedVER43_2021_R_SENTENCE.csv | split -C 20M -d - split_ --filter='pigz > $FILE.csv.gz'
# pigz -dc semmedVER43_2021_R_PREDICATION_AUX.csv | split -C 20M -d - split_ --filter='pigz > $FILE.csv.gz'

## Creating Sentence File Index

In [None]:
create_sentence_indexes()

In [None]:
df = get_all_index(SENTENCE_DIR)

In [None]:
df.shape

In [None]:
# Written gzip-compressed (like the other all_index.csv files in this notebook)
# because the "Combine Preds and Sentences" section reads it back with compression='gzip'.
df.to_csv(join(SENTENCE_DIR, 'all_index.csv'),compression= 'gzip',index=False)

## Pred AUX Index Creation

In [None]:
create_pred_aux_indexes()

In [None]:
df = get_all_index(PREDICATION_AUX_DIR)

In [None]:
df.to_csv(join(PREDICATION_AUX_DIR, 'all_index.csv'),compression= 'gzip',index=False)

## Pred index Creation

In [None]:
create_indexes(PREDICATION_DIR, ['SENTENCE_ID', 'PREDICATION_ID', 'file_name'])

In [None]:
df = get_all_index(PREDICATION_DIR)

In [None]:
df.to_csv(join(PREDICATION_DIR, 'all_index.csv'),compression= 'gzip',index=False)

## Combine Pred and Pred AUX Indices

In [None]:
# Both files are saved with compression='gzip' under a plain .csv name, which
# pandas cannot infer from the extension, so the compression is given explicitly.
df_aux = pd.read_csv(join(PREDICATION_AUX_DIR, 'all_index.csv'), compression= 'gzip')
df = pd.read_csv(join(PREDICATION_DIR, 'all_index.csv'), compression= 'gzip')

In [None]:
df_aux.head()

In [None]:
df.head()

In [None]:
df_aux = df_aux.merge(df, left_on='PREDICATION_ID', right_on='PREDICATION_ID', how = 'inner')


In [None]:
df.shape

In [None]:
df_aux.shape

In [None]:
df_aux.to_csv(join(PREDICATION_AUX_DIR, 'all_combo_index.csv'),compression= 'gzip',index=False)

## Combine Preds and Sentences

In [None]:
df_preds = pd.read_csv(join(PREDICATION_AUX_DIR, 'all_combo_index.csv'),compression= 'gzip')

In [None]:
del df_preds['file_name_x']
del df_preds['file_name_y']

In [None]:
df_preds.head()

In [None]:
df_sents = pd.read_csv(join(SENTENCE_DIR, 'all_index.csv'),compression= 'gzip')

In [None]:
# df_sents.to_csv(join(SENTENCE_DIR, 'all_index.csv'),compression= 'gzip',index=False)

In [None]:
df_preds = df_preds.merge(df_sents, left_on='SENTENCE_ID', right_on='SENTENCE_ID', how = 'inner')

In [None]:
df_preds.shape

In [None]:
df_preds.head()

In [None]:
df_preds.to_csv(join(PREDICATION_AUX_DIR, 'all_combo_index_v2.csv'),compression= 'gzip',index=False)

## Creating Sentence Predications

In [None]:
df_index = pd.read_csv(join(PREDICATION_AUX_DIR, 'all_combo_index_v2.csv'),compression= 'gzip')

In [None]:
df_index.head()

In [None]:
df = get_predicate_sent_factuality(df_index)

In [None]:
df

In [None]:
df[0]

In [None]:
df[1]

In [None]:
aa = mask_triple(
    df[0]['SENTENCE'], df[0]['SENT_START_INDEX'] ,
    s_start=df[1]['SUBJECT_START_INDEX'], s_end=df[1]['SUBJECT_END_INDEX'],
    p_start=df[1]['PREDICATE_START_INDEX'], p_end=df[1]['PREDICATE_END_INDEX'],
    o_start=df[1]['OBJECT_START_INDEX'], o_end=df[1]['OBJECT_END_INDEX'])

In [None]:
aa

In [None]:
df[0]['SENTENCE'][69-21:80-21]

In [None]:
for i in df.groupby('file_name'):
    print(i[-1].shape)

In [None]:
df[1]

In [None]:
df.head()

## Explore formatted sentences

In [None]:
all_sent_files = glob(join('Format_sents', '*.csv'))

In [None]:
all_sent_files = os.listdir(join('Format_sents'))

In [None]:
all_sent_files

In [None]:
max([int(i.split('_')[-1].split('.')[0]) for i in all_sent_files])

In [None]:
pd.read_csv(join('Format_sents',all_sent_files[0]), compression = 'gzip')

In [None]:
pd.read_csv(join('Format_sents','labeled_sent'+'_'+all_sent_files[0].split('_')[-1]), compression = 'gzip')

In [5]:
FACTUALITY_INT = ['Uncommitted', 'Fact',
                  'Probable', 'Possible', 'Counterfact',
                  'Doubtful', 'Conditional']
# , 'Conditional':[]
def get_factuality_sentence_samples(max_sample = 20, total_runs = 1126784):
    """Collect up to ``max_sample`` example sentences for each factuality label.

    Chunk ``i`` is read from ``Format_sents/form_sent_<i>.csv`` (sentences) and
    ``Format_sents/labeled_sent_<i>.csv`` (labels); labels are attached to the
    sentences through ``PREDICATION_AUX_ID``. Chunks are scanned in order
    until every label has ``max_sample`` examples or ``total_runs`` chunks
    have been read.

    Parameters
    ----------
    max_sample : int, default 20
        Number of examples wanted per label.
    total_runs : int, default 1126784
        Number of chunk pairs to scan at most (previously a hard-coded local).

    Returns
    -------
    dict
        Maps each of 'Uncommitted', 'Fact', 'Probable', 'Possible',
        'Counterfact' and 'Doubtful' to a list of sentence records (dicts).

    Notes
    -----
    The number of rows drawn from a chunk is now
    ``min(still missing, rows available in this chunk)``. The earlier formula
    also subtracted the number already collected from the rows available, so
    once a few examples had been gathered, chunks with fewer matching rows
    than that were skipped entirely.
    """
    all_data = {'Uncommitted':[], 'Fact':[],
                  'Probable':[], 'Possible':[], 'Counterfact':[],
                  'Doubtful':[]}
    pbar = tqdm(total = total_runs)
    try:
        for i in range(total_runs):
            sents = pd.read_csv(join('Format_sents', 'form_sent_{}.csv'.format(i)), compression ='gzip')
            labels = pd.read_csv(join('Format_sents', 'labeled_sent_{}.csv'.format(i)), compression ='gzip')
            fact_map = dict(zip(labels['PREDICATION_AUX_ID'], labels['label']))
            sents['labels'] = sents['PREDICATION_AUX_ID'].map(fact_map)
            for label, collected in all_data.items():
                label_df = sents[sents['labels'] == label]
                sample_size = min(max_sample - len(collected), label_df.shape[0])
                if sample_size>0:
                    collected.extend(label_df.sample(sample_size).to_dict('records'))

            counts = {label: len(rows) for label, rows in all_data.items()}
            pbar.set_description('|'.join([k+':'+str(v) for k, v in counts.items()]))
            # Stop as soon as every label has reached its quota.
            if all(count >= max_sample for count in counts.values()):
                break
            pbar.update()
    finally:
        # Release the progress bar even when a chunk is missing or unreadable.
        pbar.close()

    return all_data

In [None]:
class_data = get_factuality_sentence_samples()

In [None]:
from IPython.display import display

In [7]:
# !pip install pyarrow

In [None]:
for c_data in class_data:
    print(c_data)
    display(pd.DataFrame(class_data[c_data]))

In [None]:
pd.read_csv(join('all_triples', 'origin_predication.csv'), compression = 'gzip')

In [5]:
pred_aux_index = pd.read_csv(join('PREDICATION_AUX', 'all_combo_index_v2.csv'), compression = 'gzip')

In [6]:
pred_aux_index = pred_aux_index[['PREDICATION_AUX_ID', 'PREDICATION_ID']]

In [7]:
# pred_aux_index = pred_aux_index['PREDICATION_ID'].to_dict()

In [8]:
# save_json(join('all_triples', 'pred_aux_pred_map.json'), pred_aux_index)
pred_aux_index.to_csv(join('all_triples', 'pred_aux_pred_map.csv'), compression = 'gzip', index = False)

In [9]:
pd.read_csv(join('all_triples', 'pred_aux_pred_map.csv'), compression = 'gzip')

Unnamed: 0.1,Unnamed: 0,PREDICATION_AUX_ID,PREDICATION_ID
0,0,10592600,10592604
1,1,10592679,10592697
2,2,10592713,10592728
3,3,10592749,10592759
4,4,10592816,10592832
...,...,...,...
114827822,114827822,197488007,197481959
114827823,114827823,197488008,197481960
114827824,114827824,197488009,197481961
114827825,114827825,197488010,197481962


In [11]:
del pred_aux_index

In [12]:
# pred_aux_index = dict(zip(pred_aux_index['PREDICATION_AUX_ID'], pred_aux_index['PREDICATION_ID']))


In [13]:
o_pred_df = pd.read_csv(join('all_triples', 'origin_predication.csv'), compression = 'gzip')

In [15]:
# o_pred_df = o_pred_df.set_index(['PREDICATION_ID', 'ORIGIN_ID'])
o_pred_df = o_pred_df[['PREDICATION_ID', 'ORIGIN_ID']]

In [16]:
o_pred_df

Unnamed: 0,PREDICATION_ID,ORIGIN_ID
0,76299252,1775772
1,74851732,1302567
2,76177799,1737760
3,72444077,380736
4,15350637,29711
...,...,...
115525914,195239203,23425206
115525915,195239206,23425206
115525916,197412490,23841294
115525917,197412498,23841294


In [17]:
# o_pred_df = o_pred_df['ORIGIN_ID'].to_dict()

In [18]:
# save_json(join('all_triples', 'pred_origin_map.json'), o_pred_df)
o_pred_df.to_csv(join('all_triples', 'pred_origin_map.csv'), compression = 'gzip', index = False)

In [19]:
pd.read_csv(join('all_triples', 'pred_origin_map.csv'), compression = 'gzip')

Unnamed: 0,PREDICATION_ID,ORIGIN_ID
0,76299252,1775772
1,74851732,1302567
2,76177799,1737760
3,72444077,380736
4,15350637,29711
...,...,...
115525914,195239203,23425206
115525915,195239206,23425206
115525916,197412490,23841294
115525917,197412498,23841294


In [20]:
del o_pred_df

In [None]:
o_pred_df = read_json(join('all_triples', 'pred_origin_map.json'))
pred_aux_index = read_json(join('all_triples', 'pred_aux_pred_map.json'))

In [None]:
o_pred_df = {k: o_pred_df[v] for k, v in pred_aux_index.items()}

In [None]:
save_json(join('all_triples', 'pred_aux_origin_map.json'), o_pred_df)

In [5]:
def get_map():
    """Attach the matching ``PREDICATION_AUX_ID`` to every row of the origin map.

    Reads ``all_triples/origin_predication.csv`` (gzip), indexes it by
    ``PREDICATION_ID`` and then walks ``PREDICATION_AUX/all_index.csv`` in
    chunks of one million rows, copying each chunk's ``PREDICATION_AUX_ID``
    onto the rows with the same ``PREDICATION_ID``.

    Returns
    -------
    pandas.DataFrame
        The origin map indexed by ``PREDICATION_ID`` with an added
        ``PREDICATION_AUX_ID`` column; rows without a match stay ``NaN``.

    Notes
    -----
    * Only predication ids present in the origin map are assigned. A ``.loc``
      assignment with a list of labels raises ``KeyError`` as soon as one label
      is missing from the target index.
    * When a chunk lists the same ``PREDICATION_ID`` more than once, the last
      row is used, so the right-hand side can be aligned unambiguously.
    * NOTE(review): elsewhere in this notebook the aux index is saved with
      gzip compression under a ``.csv`` name, so the compression is detected
      from the file's magic bytes instead of being assumed.
    """
    aux_index_path = join('PREDICATION_AUX', 'all_index.csv')
    with open(aux_index_path, 'rb') as raw:
        # gzip streams always start with the two bytes 0x1f 0x8b.
        aux_compression = 'gzip' if raw.read(2) == b'\x1f\x8b' else None
    pred_aux_index = pd.read_csv(aux_index_path, chunksize = 1000000, compression = aux_compression)
    o_pred_df = pd.read_csv(join('all_triples', 'origin_predication.csv'), compression = 'gzip')
    o_pred_df = o_pred_df.set_index('PREDICATION_ID')
    for df_ in tqdm(pred_aux_index):
        df_ = df_.dropna()
        df_ = df_[['PREDICATION_ID', 'PREDICATION_AUX_ID']]
        # Blank out anything that is not numeric, drop those rows, then cast to int.
        df_ = df_[df_.apply(pd.to_numeric, errors='coerce').notna()].dropna().astype(int)
        df_ = df_.set_index('PREDICATION_ID')
        df_ = df_[~df_.index.duplicated(keep='last')]

        common = df_.index.intersection(o_pred_df.index)
        o_pred_df.loc[common,'PREDICATION_AUX_ID'] = df_.loc[common, 'PREDICATION_AUX_ID']
    return o_pred_df


In [6]:
or_map_df = get_map()

0it [00:00, ?it/s]

  or_map_df = get_map()


In [7]:
or_map_df

Unnamed: 0_level_0,ORIGIN_ID,SENTENCE_ID,PREDICATION_AUX_ID
PREDICATION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
76299252,1775772,10378470,76299234.0
74851732,1302567,24206336,74851718.0
76177799,1737760,26954710,76177784.0
72444077,380736,19247628,72444062.0
15350637,29711,84375,15350618.0
...,...,...,...
195239203,23425206,371250407,195244650.0
195239206,23425206,371250413,195244653.0
197412490,23841294,376355376,197418519.0
197412498,23841294,376355386,197418527.0


In [8]:
or_map_df[or_map_df['PREDICATION_AUX_ID'].isna()]

Unnamed: 0_level_0,ORIGIN_ID,SENTENCE_ID,PREDICATION_AUX_ID
PREDICATION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75774500,36006,26116576,
75191084,1418341,24913947,
73742492,862517,21921506,
18299998,47956,135276,
58683523,208098,105126770,
...,...,...,...
196192228,23601739,373470218,
195607944,21383156,372101926,
196175424,23599671,373430428,
197391120,23836422,376304793,


In [9]:
or_map_df.to_csv(join('all_triples', 'origin_pred_pred_aux_map.csv'), compression = 'gzip')

In [11]:
pd.read_csv(join('all_triples', 'origin_pred_pred_aux_map.csv'), compression = 'gzip' )

Unnamed: 0,PREDICATION_ID,ORIGIN_ID,SENTENCE_ID,PREDICATION_AUX_ID
0,76299252,1775772,10378470,76299234.0
1,74851732,1302567,24206336,74851718.0
2,76177799,1737760,26954710,76177784.0
3,72444077,380736,19247628,72444062.0
4,15350637,29711,84375,15350618.0
...,...,...,...,...
115525914,195239203,23425206,371250407,195244650.0
115525915,195239206,23425206,371250413,195244653.0
115525916,197412490,23841294,376355376,197418519.0
115525917,197412498,23841294,376355386,197418527.0


In [8]:
or_map_df = pd.read_csv(join('index_maps','origin_pred_pred_aux_sent_map.csv'), compression = 'gzip', engine = 'pyarrow')

In [9]:
or_map_df

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID,PREDICATION_AUX_ID
0,1775772,76299252,10378470,76299234
1,1302567,74851732,24206336,74851718
2,1737760,76177799,26954710,76177784
3,380736,72444077,19247628,72444062
4,29711,15350637,84375,15350618
...,...,...,...,...
115511865,23425206,195239203,371250407,195244650
115511866,23425206,195239206,371250413,195244653
115511867,23841294,197412490,376355376,197418519
115511868,23841294,197412498,376355386,197418527


In [10]:
# or_map_df = or_map_df[~or_map_df['PREDICATION_AUX_ID'].isna()]

In [11]:
# or_map_df['PREDICATION_AUX_ID'] = or_map_df['PREDICATION_AUX_ID'].astype(int)

In [12]:
or_map_df = or_map_df.set_index('PREDICATION_AUX_ID')

In [13]:
or_map_df

Unnamed: 0_level_0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID
PREDICATION_AUX_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
76299234,1775772,76299252,10378470
74851718,1302567,74851732,24206336
76177784,1737760,76177799,26954710
72444062,380736,72444077,19247628
15350618,29711,15350637,84375
...,...,...,...
195244650,23425206,195239203,371250407
195244653,23425206,195239206,371250413
197418519,23841294,197412490,376355376
197418527,23841294,197412498,376355386


In [20]:
or_map_df.loc[92150424, :]

ORIGIN_ID            24342
PREDICATION_ID    92150440
SENTENCE_ID        6915448
Name: 92150424, dtype: int64

In [11]:
# aux_to_origin_map = dict(zip(or_map_df['PREDICATION_AUX_ID'], or_map_df['ORIGIN_ID']))

# aux_to_origin_map
# or_map_df.to_dict()
KeyError: '[92150156, 92150158, 92150159, 92150160, 92150161, 92150162, 92150163, 92150165, 92150166, 92150167, 92150170, 92150171, 92150172, 92150174, 92150175, 92150179, 92150182, 92150183, 92150184, 92150185, 92150186, 92150187, 92150188, 92150189, 92150191, 92150193, 92150194, 92150195, 92150196, 92150198, 92150199, 92150200, 92150202, 92150203, 92150204, 92150205, 92150208, 92150209, 92150210, 92150211, 92150212, 92150213, 92150214, 92150215, 92150216, 92150217, 92150218, 92150219, 92150220, 92150221, 92150223, 92150225, 92150226, 92150227, 92150228, 92150229, 92150231, 92150233, 92150235, 92150236, 92150237, 92150241, 92150243, 92150244, 92150246, 92150247, 92150249, 92150250, 92150251, 92150252, 92150253, 92150254, 92150255, 92150258, 92150260, 92150261, 92150262, 92150263, 92150265, 92150266, 92150267, 92150268, 92150272, 92150273, 92150274, 92150275, 92150276, 92150277, 92150278, 92150282, 92150283, 92150284, 92150285, 92150286, 92150287, 92150289, 92150290, 92150291, 92150292, 92150293, 92150295, 92150296, 92150297, 92150299, 92150300, 92150301, 92150304, 92150306, 92150307, 92150308, 92150309, 92150310, 92150311, 92150312, 92150313, 92150316, 92150317, 92150319, 92150320, 92150321, 92150322, 92150323, 92150324, 92150325, 92150326, 92150327, 92150329, 92150334, 92150336, 92150341, 92150343, 92150344, 92150345, 92150346, 92150347, 92150348, 92150349, 92150351, 92150352, 92150353, 92150356, 92150359, 92150360, 92150361, 92150363, 92150365, 92150366, 92150367, 92150369, 92150370, 92150374, 92150375, 92150377, 92150378, 92150379, 92150380, 92150381, 92150382, 92150383, 92150384, 92150386, 92150387, 92150388, 92150392, 92150394, 92150400, 92150401, 92150402, 92150403, 92150404, 92150405, 92150406, 92150408, 92150409, 92150410, 92150412, 92150413, 92150414, 92150415, 92150416, 92150417, 92150419, 92150420, 92150421, 92150423, 92150424, 92150425, 92150426, 92150427, 92150428, 92150432, 92150436, 92150437, 92150440, 92150442, 92150443, 92150444, 92150445, 
92150446, 92150447, 92150448, 92150450, 92150452, 92150453, 92150454, 92150455, 92150456, 92150457, 92150458, 92150459, 92150460, 92150461, 92150463, 92150465, 92150467, 92150468, 92150470, 92150471, 92150472, 92150474, 92150475, 92150478, 92150480, 92150481, 92150482, 92150483, 92150485, 92150486, 92150487, 92150488, 92150489, 92150490, 92150493, 92150494, 92150495, 92150497, 92150498, 92150499, 92150500, 92150501, 92150503, 92150504, 92150506, 92150507, 92150508, 92150511, 92150513, 92150514, 92150515, 92150517, 92150518, 92150519, 92150520, 92150521, 92150522, 92150524, 92150525, 92150527, 92150528, 92150529, 92150530, 92150531, 92150532, 92150534, 92150537, 92150538, 92150539, 92150544, 92150545, 92150546, 92150547, 92150549, 92150551, 92150552, 92150555, 92150556, 92150557, 92150558, 92150559, 92150560, 92150562, 92150565, 92150566, 92150568, 92150569, 92150570, 92150571, 92150574, 92150575, 92150576, 92150578, 92150579, 92150580, 92150581, 92150582, 92150583, 92150587, 92150588, 92150590, 92150591, 92150592, 92150594, 92150595, 92150596, 92150598, 92150599, 92150602, 92150603, 92150605, 92150606, 92150608, 92150611, 92150612, 92150614, 92150615, 92150616, 92150617, 92150618, 92150619, 92150620, 92150621, 92150622, 92150623, 92150625, 92150626, 92150628, 92150630, 92150635, 92150636, 92150637, 92150638, 92150639, 92150640, 92150641, 92150642, 92150643, 92150644, 92150645, 92150646, 92150647, 92150649, 92150650, 92150651, 92150652, 92150657, 92150658, 92150659, 92150661, 92150662, 92150663, 92150665, 92150670, 92150671, 92150673, 92150674, 92150675, 92150676, 92150677, 92150678, 92150679, 92150680, 92150681, 92150682, 92150684, 92150685, 92150688, 92150689, 92150690, 92150691, 92150693, 92150695, 92150696, 92150697, 92150698, 92150699, 92150700, 92150701, 92150702, 92150703, 92150704, 92150706, 92150709, 92150711, 92150713, 92150715, 92150718, 92150719, 92150720, 92150721, 92150722, 92150723, 92150724, 92150727, 92150729, 92150730, 92150732, 92150733, 92150734, 
92150735, 92150736, 92150737, 92150739, 92150740, 92150741, 92150742, 92150744, 92150745, 92150746, 92150747, 92150748, 92150752, 92150753, 92150755, 92150756, 92150759, 92150760, 92150761, 92150762, 92150763, 92150765, 92150767, 92150769, 92150773, 92150774, 92150776, 92150777, 92150778, 92150779, 92150780, 92150781, 92150783, 92150784, 92150785, 92150787, 92150788, 92150789, 92150791, 92150792, 92150794, 92150795, 92150796, 92150797, 92150798, 92150799, 92150800, 92150801, 92150802, 92150804, 92150809, 92150810, 92150811, 92150812, 92150814, 92150815, 92150816] not in index'

In [14]:
def get_triple_evidences(or_map_df):
    """Append every labelled, masked sentence to the JSON-lines file of its origin triple.

    The ``Format_sents`` chunks are processed ten at a time: the
    ``form_sent_<i>.csv`` and ``labeled_sent_<i>.csv`` files of a batch are
    read in a process pool, the labels are attached to the sentences, the ids
    from ``or_map_df`` are joined in, and each resulting record is appended to
    ``triple_evidences/<ORIGIN_ID>.jsonl``.

    Parameters
    ----------
    or_map_df : pandas.DataFrame
        Map indexed by ``PREDICATION_AUX_ID`` with the columns ``ORIGIN_ID``,
        ``SENTENCE_ID`` and ``PREDICATION_ID``.

    Returns
    -------
    None

    Notes
    -----
    Changes compared with the previous version:

    * sentences whose ``PREDICATION_AUX_ID`` is not in ``or_map_df`` are
      skipped (and counted in a printed message) instead of aborting the whole
      run with ``KeyError`` from ``.loc``;
    * labels are matched through ``PREDICATION_AUX_ID`` rather than through
      the row index, which repeats after concatenating several chunks;
    * the output directory is created if it does not exist;
    * superseded, commented-out experiments were removed.
    """
    # Every chunk consists of two files (form_sent_<i> and labeled_sent_<i>).
    total_runs = len(glob('Format_sents/*'))//2
    os.makedirs('triple_evidences', exist_ok=True)
    id_columns = or_map_df[['ORIGIN_ID', 'SENTENCE_ID', 'PREDICATION_ID']]

    def make_reader(meta_path):
        """Return a function that reads chunk ``i`` of the ``meta_path`` template."""
        def read_chunk(i):
            return pd.read_csv(meta_path.format(i), compression ='gzip', engine = 'pyarrow')
        return read_chunk

    def read_batched_df(batch):
        """Read the label and sentence chunks of ``batch`` and concatenate each kind."""
        meta_sent = join('Format_sents', 'form_sent_{}.csv')
        meta_lab = join('Format_sents', 'labeled_sent_{}.csv')
        with ProcessingPool(10) as pool:
            sents = pd.concat(list(pool.map(make_reader(meta_sent), batch)))
            labels = pd.concat(list(pool.map(make_reader(meta_lab), batch)))
        return labels, sents

    def save_rec(rec):
        """Append one record to the JSON-lines file named after its ORIGIN_ID."""
        ORIGIN_ID = rec['ORIGIN_ID']
        write_json_lines(join('triple_evidences', '{}.jsonl'.format(ORIGIN_ID)),rec)

    for batch_i in tqdm(list(batched(list(range(total_runs)), 10))):
        labels, sents = read_batched_df(batch_i)
        label_by_aux = dict(zip(labels['PREDICATION_AUX_ID'], labels['label']))
        sents['labels'] = sents['PREDICATION_AUX_ID'].map(label_by_aux)

        n_before = len(sents)
        # The inner join keeps only sentences whose aux id is known to the map.
        sents = sents.join(id_columns, on='PREDICATION_AUX_ID', how='inner')
        n_skipped = n_before - len(sents)
        if n_skipped > 0:
            print('{} sentence(s) skipped: PREDICATION_AUX_ID not in or_map_df'.format(n_skipped))

        del sents['file_name']
        with ThreadPool(100) as pool_:
            pool_.map(save_rec, sents.to_dict('records'))

######################
#     or_map_df = or_map_df.copy().reset_index()
#     for i in range(total_runs):
#         sents = pd.read_csv(join('Format_sents', 'form_sent_{}.csv'.format(i)), compression ='gzip')
#         labels = pd.read_csv(join('Format_sents', 'labeled_sent_{}.csv'.format(i)), compression ='gzip')
#         sents['labels'] = labels['label']
#         index_select = or_map_df['PREDICATION_AUX_ID'].isin(sents['PREDICATION_AUX_ID'])
#         sents['ORIGIN_ID'] = list(or_map_df.loc[index_select, :]['ORIGIN_ID'])
# #         sents['SENTENCE_ID'] = or_map_df.loc[index_select, 'SENTENCE_ID']
# #         sents['PREDICATION_ID'] = or_map_df.loc[index_select, 'PREDICATION_ID']
#         del sents['file_name']
#         return sents
#         sents = sents.reset_index()
#         with ThreadPool(8) as pool_:
#             pool_.map(save_rec, sents.to_dict('records'))
# #         for i, rec in sents.iterrows():
# #             ORIGIN_ID = rec['ORIGIN_ID']
# #             write_json_lines(join('triple_evidences', '{}.jsonl'.format(ORIGIN_ID)),rec.to_dict())
# #         return sents
        

In [15]:
get_triple_evidences(or_map_df)

  0%|          | 0/112679 [00:00<?, ?it/s]

Process ForkPoolWorker-3:
Process ForkPoolWorker-10:
Process ForkPoolWorker-5:
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/vortex/anaconda3/envs/TestEnv/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/vortex/anaconda3/envs/TestEnv/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/vortex/anaconda3/envs/TestEnv/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/vortex/anaconda3/envs/TestEnv/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/home/vo

KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt


KeyboardInterrupt: 

In [7]:
len(glob('Format_sents/*'))

KeyboardInterrupt: 

In [None]:
# os.listdir takes a directory path and does not expand wildcards.
os.listdir('Format_sents')

In [23]:
or_map_df[or_map_df['PREDICATION_ID']==92150811]

Unnamed: 0_level_0,PREDICATION_ID,ORIGIN_ID,SENTENCE_ID
PREDICATION_AUX_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
KeyError: '[92150156, 92150158, 92150159, 92150160, 92150161, 92150162, 92150163, 92150165, 92150166, 92150167, 92150170, 92150171, 92150172, 92150174, 92150175, 92150179, 92150182, 92150183, 92150184, 92150185, 92150186, 92150187, 92150188, 92150189, 92150191, 92150193, 92150194, 92150195, 92150196, 92150198, 92150199, 92150200, 92150202, 92150203, 92150204, 92150205, 92150208, 92150209, 92150210, 92150211, 92150212, 92150213, 92150214, 92150215, 92150216, 92150217, 92150218, 92150219, 92150220, 92150221, 92150223, 92150225, 92150226, 92150227, 92150228, 92150229, 92150231, 92150233, 92150235, 92150236, 92150237, 92150241, 92150243, 92150244, 92150246, 92150247, 92150249, 92150250, 92150251, 92150252, 92150253, 92150254, 92150255, 92150258, 92150260, 92150261, 92150262, 92150263, 92150265, 92150266, 92150267, 92150268, 92150272, 92150273, 92150274, 92150275, 92150276, 92150277, 92150278, 92150282, 92150283, 92150284, 92150285, 92150286, 92150287, 92150289, 92150290, 92150291, 92150292, 92150293, 92150295, 92150296, 92150297, 92150299, 92150300, 92150301, 92150304, 92150306, 92150307, 92150308, 92150309, 92150310, 92150311, 92150312, 92150313, 92150316, 92150317, 92150319, 92150320, 92150321, 92150322, 92150323, 92150324, 92150325, 92150326, 92150327, 92150329, 92150334, 92150336, 92150341, 92150343, 92150344, 92150345, 92150346, 92150347, 92150348, 92150349, 92150351, 92150352, 92150353, 92150356, 92150359, 92150360, 92150361, 92150363, 92150365, 92150366, 92150367, 92150369, 92150370, 92150374, 92150375, 92150377, 92150378, 92150379, 92150380, 92150381, 92150382, 92150383, 92150384, 92150386, 92150387, 92150388, 92150392, 92150394, 92150400, 92150401, 92150402, 92150403, 92150404, 92150405, 92150406, 92150408, 92150409, 92150410, 92150412, 92150413, 92150414, 92150415, 92150416, 92150417, 92150419, 92150420, 92150421, 92150423, 92150424, 92150425, 92150426, 92150427, 92150428, 92150432, 92150436, 92150437, 92150440, 92150442, 92150443, 92150444, 92150445, 
92150446, 92150447, 92150448, 92150450, 92150452, 92150453, 92150454, 92150455, 92150456, 92150457, 92150458, 92150459, 92150460, 92150461, 92150463, 92150465, 92150467, 92150468, 92150470, 92150471, 92150472, 92150474, 92150475, 92150478, 92150480, 92150481, 92150482, 92150483, 92150485, 92150486, 92150487, 92150488, 92150489, 92150490, 92150493, 92150494, 92150495, 92150497, 92150498, 92150499, 92150500, 92150501, 92150503, 92150504, 92150506, 92150507, 92150508, 92150511, 92150513, 92150514, 92150515, 92150517, 92150518, 92150519, 92150520, 92150521, 92150522, 92150524, 92150525, 92150527, 92150528, 92150529, 92150530, 92150531, 92150532, 92150534, 92150537, 92150538, 92150539, 92150544, 92150545, 92150546, 92150547, 92150549, 92150551, 92150552, 92150555, 92150556, 92150557, 92150558, 92150559, 92150560, 92150562, 92150565, 92150566, 92150568, 92150569, 92150570, 92150571, 92150574, 92150575, 92150576, 92150578, 92150579, 92150580, 92150581, 92150582, 92150583, 92150587, 92150588, 92150590, 92150591, 92150592, 92150594, 92150595, 92150596, 92150598, 92150599, 92150602, 92150603, 92150605, 92150606, 92150608, 92150611, 92150612, 92150614, 92150615, 92150616, 92150617, 92150618, 92150619, 92150620, 92150621, 92150622, 92150623, 92150625, 92150626, 92150628, 92150630, 92150635, 92150636, 92150637, 92150638, 92150639, 92150640, 92150641, 92150642, 92150643, 92150644, 92150645, 92150646, 92150647, 92150649, 92150650, 92150651, 92150652, 92150657, 92150658, 92150659, 92150661, 92150662, 92150663, 92150665, 92150670, 92150671, 92150673, 92150674, 92150675, 92150676, 92150677, 92150678, 92150679, 92150680, 92150681, 92150682, 92150684, 92150685, 92150688, 92150689, 92150690, 92150691, 92150693, 92150695, 92150696, 92150697, 92150698, 92150699, 92150700, 92150701, 92150702, 92150703, 92150704, 92150706, 92150709, 92150711, 92150713, 92150715, 92150718, 92150719, 92150720, 92150721, 92150722, 92150723, 92150724, 92150727, 92150729, 92150730, 92150732, 92150733, 92150734, 
92150735, 92150736, 92150737, 92150739, 92150740, 92150741, 92150742, 92150744, 92150745, 92150746, 92150747, 92150748, 92150752, 92150753, 92150755, 92150756, 92150759, 92150760, 92150761, 92150762, 92150763, 92150765, 92150767, 92150769, 92150773, 92150774, 92150776, 92150777, 92150778, 92150779, 92150780, 92150781, 92150783, 92150784, 92150785, 92150787, 92150788, 92150789, 92150791, 92150792, 92150794, 92150795, 92150796, 92150797, 92150798, 92150799, 92150800, 92150801, 92150802, 92150804, 92150809, 92150810, 92150811, 92150812, 92150814, 92150815, 92150816] not in index'

In [10]:
# Open the PREDICATION_AUX index lazily: passing `chunksize` makes read_csv hand the
# file back in pieces of up to 1,000,000 rows instead of one large DataFrame.
pred_aux_index = pd.read_csv(
    join('PREDICATION_AUX', 'all_index.csv'),
    chunksize=1000000,
)

In [11]:
# Inspect the reader object. Because read_csv was called with `chunksize`, this is a
# TextFileReader (an iterator over DataFrame chunks), not a DataFrame, so it has to be
# looped over rather than indexed with [].
pred_aux_index

TypeError: 'TextFileReader' object is not subscriptable

In [1]:
# Scan every chunk of the index for one id and report whether it occurs as a
# PREDICATION_AUX_ID and/or as a PREDICATION_ID.
# NOTE: `pred_aux_index` is a chunked reader; iterating consumes it, so it must be
# re-created with pd.read_csv(...) before this cell can be run again.
target_id = 92150804
for chunk in tqdm(pred_aux_index):
    # Compare numerically. read_csv infers a numeric dtype for id columns, and an
    # integer column compared against the string '92150804' never matches, so the
    # original string comparison could silently report nothing. to_numeric also
    # copes with the columns having been read as strings.
    aux_ids = pd.to_numeric(chunk['PREDICATION_AUX_ID'], errors='coerce')
    pred_ids = pd.to_numeric(chunk['PREDICATION_ID'], errors='coerce')
    i_i = chunk[aux_ids == target_id]
    i_j = chunk[pred_ids == target_id]
    if i_i.shape[0] > 0:
        print('found i_i')
    if i_j.shape[0] > 0:
        print('found i_j')

NameError: name 'tqdm' is not defined

In [16]:
def read_batched_df(i, data_dir='Format_sents'):
    """Load the formatted-sentence and labeled-sentence tables of one batch.

    Parameters
    ----------
    i : int or str
        Batch identifier; it is substituted into the file names
        ``form_sent_{i}.csv`` and ``labeled_sent_{i}.csv``.
    data_dir : str, optional
        Directory that holds the batch files. Defaults to ``'Format_sents'``,
        the location that was previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(meta_sent, meta_lab)``: the contents of the ``form_sent`` file and
        of the ``labeled_sent`` file, in that order.

    Raises
    ------
    FileNotFoundError
        If either batch file does not exist in ``data_dir``.
    """
    # The files carry a plain ``.csv`` extension but are gzip-compressed, so the
    # compression has to be stated explicitly instead of inferred from the name.
    sent_path = join(data_dir, 'form_sent_{}.csv'.format(i))
    lab_path = join(data_dir, 'labeled_sent_{}.csv'.format(i))
    meta_sent = pd.read_csv(sent_path, compression='gzip')
    meta_lab = pd.read_csv(lab_path, compression='gzip')
    return meta_sent, meta_lab

In [17]:
# Load batch 1 and display the returned (meta_sent, meta_lab) pair of DataFrames.
read_batched_df(1)

(     PREDICATION_AUX_ID                                           SENTENCE  \
 0              10603322  There were 582 patients with primary hyperpara...   
 1              10603447  This rate did not exceed the incidence of deve...   
 2              10603511  Although thyroid cancer was about 4 times the ...   
 3              10603623  None of the 126 patients with osteosarcoma had...   
 4              10603759  Since PTH may contribute to tumor invasiveness...   
 ..                  ...                                                ...   
 96             10618109  Patients with chronic renal failure without co...   
 97             10618137  Patients with chronic renal failure without co...   
 98             10618180  Patients with chronic renal failure without co...   
 99             10618454  Altered intracellular amino acid transport kin...   
 100            10618585  An intradialytic increase in albumin and fibri...   
 
                                      FORMATED_SEN