In [1]:
# the objective is to splite sentence files and aux_predications into multiple files
# each file needs to contains only sentences and aux_preds of the same triple
import pandas as pd
from glob import glob
from os.path import join
import gc
import gzip
from csv import reader
from collections import Counter
import os
from tqdm.notebook import tqdm

PREDICATION_COLS = [
    'PREDICATION_ID', 'SENTENCE_ID', 'PMID', 'PREDICATE', 
    'SUBJECT_CUI', 'SUBJECT_NAME', 'SUBJECT_SEMTYPE', 'SUBJECT_NOVELTY', 'OBJECT_CUI',
    'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY', 'Nan1', 'Nan2', 'Nan3']
SENTENCE_COLS = ['SENTENCE_ID','PMID','TYPE','NUMBER','SENT_START_INDEX','SENTENCE',
                 'SENT_END_INDEX','NORMALIZED_SECTION_HEADER','SECTION_HEADER']
CITATIONS_COLS = ['PMID','ISSN','DP','EDAT','PYEAR']

PREDICATION_DIR = 'PREDICATION'
SENTENCE_DIR = 'SENTENCE'

TRIPLES_DIR = 'all_triples'
TRIP_SENT_DIR = 'TRIPLE_SENTENCES'
CITATIONS_DIR = 'CITATIONS'

In [4]:
def collect_garbage():
    print(gc.get_count())
    gc.collect()
    print(gc.get_count())


In [5]:
collect_garbage()

(182, 0, 0)
(18, 0, 0)


In [3]:
or_preds = pd.read_csv(join(TRIPLES_DIR,'origin_predication.csv'), compression = 'gzip')

In [5]:
or_preds.shape

(115525919, 3)

In [6]:
or_preds.drop_duplicates().shape

(115525919, 3)

In [23]:
def manual_separation(bad_line):
    good_1 = bad_line[:5]
    good_2 = bad_line[-3:]
    sent_parts = bad_line[5:-3]
    sent_parts = [', '.join(sent_parts)]
    all_data = good_1+ sent_parts + good_2
    return all_data

def get_triple_sentences(groups_save):
    all_files = sorted(glob(join(SENTENCE_DIR,'*.gz')))
    print(len(all_files))
    for fil in tqdm(all_files):
        df = pd.read_csv(fil, on_bad_lines=manual_separation,
                 compression= 'gzip', encoding='ISO-8859-1', header=None, names = SENTENCE_COLS, engine='python'
                        )
        df = pd.merge(or_preds,df,on='SENTENCE_ID')
        for name, group in df.groupby('ORIGIN_ID'):
            group_dir = str(name)
            if not os.path.exists(join(groups_save,group_dir)):
                os.mkdir(join(groups_save,group_dir))
            num_files = len(glob(join(groups_save,group_dir,'*')))
            save_file = str(num_files + 1)+'.csv'
#             print('*'*15,name)
#             print('num_files:',num_files)
#             print(group)
            
            group.to_csv(join(groups_save,group_dir,save_file), compression = 'gzip')

def map_sentences_2_files():
    all_files = sorted(glob(join(SENTENCE_DIR,'*.gz')))
    all_res = []
    for fil in tqdm(all_files):
        f_name = fil.split('/')[-1].split('.')[0]
        df = pd.read_csv(fil, on_bad_lines=manual_separation,
                 compression= 'gzip', encoding='ISO-8859-1', header=None, names = SENTENCE_COLS, engine='python'
        )
        df['f_name'] = f_name
        all_res.append(df[['f_name', 'SENTENCE_ID']])
    return pd.concat(all_res, ignore_index=False)

In [24]:
sent_file_map = map_sentences_2_files()

  0%|          | 0/87 [00:00<?, ?it/s]

In [27]:
sent_file_map.to_csv(join(TRIPLES_DIR, 'sentence_index.csv'), index = False, compression = 'gzip')

In [28]:
#test_ = get_triple_sentences(TRIP_SENT_DIR)

In [190]:
test_

Unnamed: 0,SENTENCE_ID,PMID,TYPE,NUMBER,SENT_START_INDEX,SENTENCE,SENT_END_INDEX,NORMALIZED_SECTION_HEADER,SECTION_HEADER
0,6,16530473,ti,1,21,Fluoride-selective colorimetric sensor based o...,119,,
1,7,16530473,ab,1,125,"A structurally simple colorimetric sensor, N-4...",302,,
2,8,16530473,ab,2,302,"In acetonitrile, the addition of F(-) changed ...",385,,
3,9,16530473,ab,3,385,In the presence of other anions such as CH(3)C...,578,,
4,10,16530473,ab,4,578,The association constants of anionic complexes...,757,,
...,...,...,...,...,...,...,...,...,...
3199461,3725090,3057670,ab,8,1163,Depressed release of insulin was seen in 58% (...,1328,,
3199462,3725091,3417639,ab,6,968,The sensitivity of Ca2+ is expressed in two st...,1125,,
3199463,3725092,4663183,ti,1,20,[Effect of individual unfavorable factors on t...,170,,
3199464,3725093,4297905,ti,1,20,Japanese encephalitis vaccine including a prel...,122,,


## Triple Time Validity

In [9]:
all_files = glob(join(TRIP_SENT_DIR, '*', '*'))

KeyboardInterrupt: 

In [10]:
all_files = glob(join('TRIPLE_SENTENCES/1805287', '*'))

In [13]:
pd.read_csv(all_files[0], compression = 'gzip')

Unnamed: 0.1,Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID,PMID,TYPE,NUMBER,SENT_START_INDEX,SENTENCE,SENT_END_INDEX,NORMALIZED_SECTION_HEADER,SECTION_HEADER
0,1361658,1805287,90768563,3772671,1346806,ab,1,77,We have compared the kinetic properties of NMD...,210,,


In [14]:
or_pred = pd.read_csv(join(TRIPLES_DIR, 'origin_predication.csv'), compression = 'gzip')

In [4]:
or_pred.head()

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID
0,1775772,76299252,10378470
1,1302567,74851732,24206336
2,1737760,76177799,26954710
3,380736,72444077,19247628
4,29711,15350637,84375


In [11]:
citations = pd.read_csv(join(CITATIONS_DIR, 'semmedVER43_2021_R_CITATIONS.csv'), compression = 'gzip',
                        encoding='ISO-8859-1', header = None, names = CITATIONS_COLS)

In [12]:
citations.head()

Unnamed: 0,PMID,ISSN,DP,EDAT,PYEAR
0,1,0006-2944,1975 Jun,1975-6-1,1975
1,10,1873-2968,1975 Sep 01,1975-9-1,1975
2,100,0547-6844,1975,1975-1-1,1975
3,1000,0264-6021,1975 Sep,1975-9-1,1975
4,10000,0006-3002,1976 Sep 28,1976-9-28,1976


In [13]:
citations['EDAT'].str.split('-')

0            [1975, 6, 1]
1            [1975, 9, 1]
2            [1975, 1, 1]
3            [1975, 9, 1]
4           [1976, 9, 28]
                ...      
33404961    [1991, 8, 15]
33404962    [1991, 8, 15]
33404963    [1991, 8, 15]
33404964    [1991, 8, 15]
33404965    [1991, 8, 15]
Name: EDAT, Length: 33404966, dtype: object