In [14]:
import pandas as pd
from os.path import join
from glob import glob
from tqdm.notebook import tqdm
from multiprocessing import Pool # for reading the CSVs faster

from multiprocessing.pool import ThreadPool
import json

TRIPLES_DIR = 'all_triples'
CITATIONS_DIR = 'CITATIONS'
TRIP_SENT_DIR = 'TRIPLE_SENTENCES'
CITATIONS_COLS = ['PMID','ISSN','DP','EDAT','PYEAR']


In [2]:
proba_df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')

In [3]:
proba_df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [4]:
def process_citations():
    cit_df = pd.read_csv(join(CITATIONS_DIR, 'semmedVER43_2021_R_CITATIONS.csv'), compression = 'gzip',
                        encoding='ISO-8859-1', header = None, names = CITATIONS_COLS)
    cit_df_ = cit_df[~cit_df['EDAT'].isna()]
    cit_df_['EDAT'] = pd.to_datetime(cit_df_['EDAT'])
    cit_df_no = cit_df[cit_df['EDAT'].isna()]
    cit_df_no['EDAT'] = pd.to_datetime(cit_df_no['DP'])
    return pd.concat([cit_df_, cit_df_no])

In [5]:
clean_cite = process_citations()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cit_df_['EDAT'] = pd.to_datetime(cit_df_['EDAT'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cit_df_no['EDAT'] = pd.to_datetime(cit_df_no['DP'])


In [6]:
clean_cite.shape

(33404966, 5)

In [7]:
clean_cite.head()

Unnamed: 0,PMID,ISSN,DP,EDAT,PYEAR
0,1,0006-2944,1975 Jun,1975-06-01,1975
1,10,1873-2968,1975 Sep 01,1975-09-01,1975
2,100,0547-6844,1975,1975-01-01,1975
3,1000,0264-6021,1975 Sep,1975-09-01,1975
4,10000,0006-3002,1976 Sep 28,1976-09-28,1976


In [8]:
cite_date_map = dict(zip(clean_cite['PMID'], clean_cite['EDAT']))

In [9]:
cite_date_map

{1: Timestamp('1975-06-01 00:00:00'),
 10: Timestamp('1975-09-01 00:00:00'),
 100: Timestamp('1975-01-01 00:00:00'),
 1000: Timestamp('1975-09-01 00:00:00'),
 10000: Timestamp('1976-09-28 00:00:00'),
 100000: Timestamp('1978-09-01 00:00:00'),
 1000000: Timestamp('1976-12-01 00:00:00'),
 10000000: Timestamp('1991-08-15 00:00:00'),
 10000001: Timestamp('1991-08-15 00:00:00'),
 10000002: Timestamp('1991-08-15 00:00:00'),
 10000003: Timestamp('1991-08-15 00:00:00'),
 10000004: Timestamp('1991-08-15 00:00:00'),
 10000005: Timestamp('1991-08-15 00:00:00'),
 10000006: Timestamp('1991-08-15 00:00:00'),
 10000007: Timestamp('1991-08-15 00:00:00'),
 10000008: Timestamp('1991-08-15 00:00:00'),
 10000009: Timestamp('1991-08-15 00:00:00'),
 1000001: Timestamp('1976-12-01 00:00:00'),
 10000010: Timestamp('1991-08-15 00:00:00'),
 10000011: Timestamp('1991-08-15 00:00:00'),
 10000012: Timestamp('1991-08-15 00:00:00'),
 10000013: Timestamp('1991-08-15 00:00:00'),
 10000014: Timestamp('1991-08-15 00:00:

In [10]:
clean_cite['EDAT'].min()

Timestamp('1865-01-01 00:00:00')

In [11]:
clean_cite['EDAT'].max()

Timestamp('2021-11-28 00:00:00')

In [27]:
def write_json_lines(file_name,dict_data):
    json_string = json.dumps(dict_data)
    with open(file_name, 'a') as f:
        f.write(json_string+"\n")
        
def read_json_lines(file_name):
    lines = []
    with open(file_name) as file_in:
        for line in file_in:
            lines.append(json.loads(line))
    return lines

def my_read_csv(filename):
    # Helper function for the parellel load_csvs
    return pd.read_csv(filename, compression = 'gzip')

def load_csvs(parent_path):
    """Reads and joins all our CSV files into one big dataframe.
    We do it in parallel to make it faster, since otherwise it takes some time.
    Idea from: https://stackoverflow.com/questions/36587211/easiest-way-to-read-csv-files-with-multiprocessing-in-pandas
    
    """
    # set up your pool
    pool = ThreadPool(10) 
    file_list = glob(join(parent_path, '*'))
    df_list = pool.map(my_read_csv, file_list)
    if len(df_list)>0:
        return pd.concat(df_list, ignore_index=True)
    else:
        return None

def get_all_csvs(parent_path):
    all_dfs = []
    for i in glob(join(parent_path, '*')):
        df_ = pd.read_csv(i, compression = 'gzip')
        all_dfs.append(df_)
    if len(all_dfs)>0:
        return pd.concat(all_dfs)
    else:
        return None
    
def get_el(search_dict):
    def get_el_(i):
        return search_dict[i]
    return get_el_
def time_validity_triples(p_df, pmid_time_map, save_file):
    res = []
    for ix, ORIGIN_ID in enumerate(tqdm(list(set(p_df['ORIGIN_ID'])))):
        res_ = {}
#         all_df = get_all_csvs(join(TRIP_SENT_DIR, str(ORIGIN_ID)))
        all_df = load_csvs(join(TRIP_SENT_DIR, str(ORIGIN_ID)))
        res_['ORIGIN_ID'] = ORIGIN_ID
        if all_df is not None:
            mapped_time = all_df['PMID'].apply(get_el(pmid_time_map))
            res_['time_min'] = str(mapped_time.min())
            res_['time_max'] = str(mapped_time.max())
            res_['true_time'] = True
        else:
            res_['time_min'] = ''
            res_['time_max'] = ''
            res_['true_time'] = False
#         res.append(res_)
        write_json_lines(file_name = save_file,dict_data = res_)
#     return pd.DataFrame(res)

In [28]:
origin_time = time_validity_triples(p_df = proba_df,
                                    pmid_time_map = cite_date_map, save_file=join(TRIPLES_DIR, 'origin_time.jsonl'))

  0%|          | 0/23601734 [00:00<?, ?it/s]

In [22]:
str(origin_time)

'2006-03-15 00:00:00'