In [1]:
import pandas as pd
import glob
import json

In [2]:
"""
Preprocessing Cord-19 text files

Goal: Filter text and vector files containing publications (1 sentence per row) to publications 
relevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors
belonging to specified papers/sections.

Steps:
    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:
        -Title or Abstract contains 1 or more Covid-19 synonyms.
        -Publication date is after 2019-10-01.
        This has been moved to notebook covid_vt_contra_metadata_filtering.ipynb
        
    2.  Load all full text files (parquet format) as pandas dataframes and truncate them to:
            -Rows containing cord_uids from step 1
            -Columns cord_uid, sentence_id, sentence, and section

        Write truncated text dataframes to csv files.

    3.  Merge truncated text dataframes and extract list of sentence_ids for truncating vector files.
    
    4.  Load all vector files (parquet format) as pandas dataframes and truncate them to:
            -Rows containing sentence_ids from step 3.
        
        Write truncated vector dataframes to csv files.
    
    5.  Merge truncated vector dataframes and join with merged truncated text files (merged_text_vector_df).
    
    6.  Write merged, truncated vector and text dataframe to csv file.
"""

'\nPreprocessing Cord-19 text files\n\nGoal: Filter text and vector files containing publications (1 sentence per row) to publications \nrelevant to Covid-19 and specified columns.  Merge into a single file for extraction of sentences/vectors\nbelonging to specified papers/sections.\n\nSteps:\n    1. Filter metadata.csv (from Kaggle Cord-19 dataset) to paper cord_uids for papers where:\n        -Title or Abstract contains 1 or more Covid-19 synonyms.\n        -Publication date is after 2019-10-01.\n        This has been moved to notebook covid_vt_contra_filtering_input_metadata.ipynb\n        \n    2.  Load all full text files (parquet format) as pandas dataframes and truncate them to:\n            -Rows containing cord_uids from step 1\n            -Columns cord_uid, sentence_id, sentence, and section\n\n        Write truncated text dataframes to csv files.\n\n    3.  Merge truncated text dataframes and extract list of sentence_ids for truncating vector files.\n    \n    4.  Load all 

In [3]:
"""
Input and Output file paths.  This will be turned into a config file that can be passed as an argument.
"""

#Input
metadata_csv_path = 'resources/covid19_date_filt_metadata_200430.csv'
covid_19_term_list_file = 'resources/covid_19_terms_200427.txt'

pq_text_file_dir = 'resources/v8_preprocessed/'
pq_vec_file_dir = 'resources/v8_vectors/'

pub_date_cutoff = '2019-10-01'

#Columns to truncate pq full text files to
pq_text_files_cols_oi = ['cord_uid', 'sentence_id', 'section', 'sentence']

#Output

covid19_filtered_uid_list_file = 'resources/covid19_cord_uids_200429.txt'

csv_trunc_text_file_dir = 'resources/v8_truncated_text_files/'
csv_trunc_vec_file_dir = 'resources/v8_truncated_vec_files/'

full_merged_text_vec_df_outfile_path = 'resources/full_merged_text_vector_df_200430.csv'
filt_merged_text_vec_df_outfile_path = 'resources/filt_merged_text_vector_df_200430.csv'

cord_uid_text_file_map = 'resources/v8_id_file_maps/cord_uid_text_file_map.json'
sent_id_text_file_map = 'resources/v8_id_file_maps/sent_id_text_file_map.json'
sent_id_vec_file_map = 'resources/v8_id_file_maps/sent_id_vec_file_map.json'

In [4]:
#Step 1
#Separated this into notebook covid_vt_contra_metadata_filtering.ipynb
"""
def extract_filtered_uid_list_from_metadata_csv(metadata_csv_path, covid_19_term_list_file, pub_date_cutoff):
    Filter metadata.csv (from Kaggle Cord-19 dataset) to:
        -Papers where 1 or more Covid-19 synonyms is used in title or abstract column.
        -Papers published after pub_date_cutoff.
    
    Return uid_list
    metadata_df = pd.read_csv(metadata_csv_path)
    metadata_df = metadata_df.fillna('')
    metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title'].str.lower() + ' ' + metadata_df.loc[:, 'abstract'].str.lower()
    metadata_df.loc[:, 'title_abstract'] = metadata_df.loc[:, 'title_abstract'].fillna('')

    with open(covid_19_term_list_file) as f:
        covid_19_terms = f.read().splitlines()
        covid_19_term_pattern = '|'.join([i.lower() for i in covid_19_terms])

    covid19_df = metadata_df.loc[metadata_df.title_abstract.str.contains(covid_19_term_pattern)]

    covid19_uid_list = covid19_df.cord_uid.tolist()
    print("Covid-19 uids filtered by use of Covid-19 synonyms: %d" % len(covid19_uid_list))
    
    covid19_date_filtered_df = covid19_df.loc[covid19_df['publish_time'] > pub_date_cutoff]
    print("Covid-19 df filtered to publication dates after %s: %d" % (pub_date_cutoff, len(covid19_date_filtered_df)))

    covid19_date_incl_cord_uids = set(covid19_date_filtered_df.cord_uid.tolist())
    date_excl_uids = set(covid19_uid_list) - set(covid19_date_incl_cord_uids)

    print("Papers included by date cutoff filter: %d" % len(covid19_date_incl_cord_uids))
    print("Papers excluded by date cutoff filter: %d" % len(date_excl_uids))
    
    return covid19_date_incl_cord_uids"""

'\ndef extract_filtered_uid_list_from_metadata_csv(metadata_csv_path, covid_19_term_list_file, pub_date_cutoff):\n    Filter metadata.csv (from Kaggle Cord-19 dataset) to:\n        -Papers where 1 or more Covid-19 synonyms is used in title or abstract column.\n        -Papers published after pub_date_cutoff.\n    \n    Return uid_list\n    metadata_df = pd.read_csv(metadata_csv_path)\n    metadata_df = metadata_df.fillna(\'\')\n    metadata_df.loc[:, \'title_abstract\'] = metadata_df.loc[:, \'title\'].str.lower() + \' \' + metadata_df.loc[:, \'abstract\'].str.lower()\n    metadata_df.loc[:, \'title_abstract\'] = metadata_df.loc[:, \'title_abstract\'].fillna(\'\')\n\n    with open(covid_19_term_list_file) as f:\n        covid_19_terms = f.read().splitlines()\n        covid_19_term_pattern = \'|\'.join([i.lower() for i in covid_19_terms])\n\n    covid19_df = metadata_df.loc[metadata_df.title_abstract.str.contains(covid_19_term_pattern)]\n\n    covid19_uid_list = covid19_df.cord_uid.tolis

In [5]:
#Step 2

def truncate_pq_text_file(pq_text_file, pq_text_files_cols_oi, uid_list):
    """
    Load text file as dataframe, filter to relevant columns (cord_uid, sentence_id, section, sentence) and 
    cord_uids for Covid-19 papers.    
    
    Return truncated dataframe.
    
    """
    print("Loading dataframe from %s" % pq_text_file)
    text_file_df = pd.read_parquet(pq_text_file)
    text_file_trunc_df = text_file_df[pq_text_files_cols_oi]
    text_file_trunc_df = text_file_trunc_df.loc[text_file_trunc_df.cord_uid.isin(uid_list)]
    
    return text_file_trunc_df
    
def truncate_pq_text_files(pq_text_file_dir, pq_text_files_cols_oi, csv_trunc_text_file_dir, uid_list):
    """
    For each full text parquet file in pq_text_file_dir, load as dataframe, 
    filter to specified cord_uids and columns of interest, and write to csv file in csv_trunc_text_file_dir
    """
    
    print("Parquet text files input directory: %s" % pq_text_file_dir)
    pq_text_files = glob.glob('%s*' % pq_text_file_dir)
    print("Files in text files input directory: %d" % len(pq_text_files))

    print("Parquet text files output directory: %s" % csv_trunc_text_file_dir)
    print('\n')

    for pq_text_file in pq_text_files:
        
        text_file_trunc_df = truncate_pq_text_file(pq_text_file, pq_text_files_cols_oi, uid_list)
        text_file_name = pq_text_file.split('.')[0].split('/')[-1]
        outfile = '%s%s_trunc.csv' %(csv_trunc_text_file_dir, text_file_name)
    
        print("Writing truncated text dataframe to %s" % outfile)
        text_file_trunc_df.to_csv(outfile)

In [6]:
#Step 3, 5

def concat_dataframes_from_dir_csvs(trunc_csv_dir):

    trunc_csv_files = glob.glob('%s*' % trunc_csv_dir)
    print("Files in truncated csv output directory: %d" % len(trunc_csv_files))

    trunc_dfs = []
    for trunc_csv_file in trunc_csv_files:
        trunc_dfs.append(pd.read_csv(trunc_csv_file, index_col=0))
    
    merged_trunc_df = pd.concat(trunc_dfs)
    return merged_trunc_df

In [7]:
#Step 4

def truncate_pq_vec_file(pq_vec_file, sentence_ids):
    vec_file_df = pd.read_parquet(pq_vec_file)
    vec_file_trunc_df = vec_file_df.loc[vec_file_df.sentence_id.isin(sentence_ids)]
    
    return vec_file_trunc_df

def truncate_pq_vec_files(pq_vec_file_dir, sentence_ids, csv_trunc_vec_file_dir):
    pq_vec_files = glob.glob('%s*' % pq_vec_file_dir)
    print("Files in vector file input directory: %d" % len(pq_vec_files))

    for pq_vec_file in pq_vec_files:
        vec_file_trunc_df = truncate_pq_vec_file(pq_vec_file, sentence_ids)
        
        vec_file_name = pq_vec_file.split('.')[0].split('/')[-1]
        outfile = '%s%s_trunc.csv' % (csv_trunc_vec_file_dir, vec_file_name)

        print("Writing truncated vector dataframe to %s" % outfile)
        vec_file_trunc_df.to_csv(outfile)

In [8]:
#Main

print("Step 1: Filtering metadata csv by Covid-19 synoynms and publication date...")
#uid_list = extract_filtered_uid_list_from_metadata_csv(metadata_csv_path, covid_19_term_list_file, pub_date_cutoff)
#print('\n')

#Replacing this step with the output of notebook covid_vt_contra_metadata_filtering.ipynb
covid19_df = pd.read_csv(metadata_csv_path)
uid_list = covid19_df.cord_uid.tolist()

print("Step 2: Loading full text dataframes, filtering to extracted uids, and writing to csv files...")
truncate_pq_text_files(pq_text_file_dir, pq_text_files_cols_oi, csv_trunc_text_file_dir, uid_list)    
print('\n')

Step 1: Filtering metadata csv by Covid-19 synoynms and publication date...
Step 2: Loading full text dataframes, filtering to extracted uids, and writing to csv files...
Parquet text files input directory: resources/v8_preprocessed/
Files in text files input directory: 20
Parquet text files output directory: resources/v8_truncated_text_files/


Loading dataframe from resources/v8_preprocessed/v8processedText1.parquet
Writing truncated text dataframe to resources/v8_truncated_text_files/v8processedText1_trunc.csv
Loading dataframe from resources/v8_preprocessed/v8processedText4.parquet
Writing truncated text dataframe to resources/v8_truncated_text_files/v8processedText4_trunc.csv
Loading dataframe from resources/v8_preprocessed/v8processedText8.parquet
Writing truncated text dataframe to resources/v8_truncated_text_files/v8processedText8_trunc.csv
Loading dataframe from resources/v8_preprocessed/v8processedText10.parquet
Writing truncated text dataframe to resources/v8_truncated_text_

In [9]:
print("Step 3: Merging truncated text dataframes and extracting sentence ids...")
concat_text_trunc_df = concat_dataframes_from_dir_csvs(csv_trunc_text_file_dir)
text_sentence_ids = concat_text_trunc_df.sentence_id.tolist()
print("Number of sentence ids: %d" % len(text_sentence_ids))
print("Number of unique sentence ids: %d" % len(set(text_sentence_ids)))
print('\n')

Step 3: Merging truncated text dataframes and extracting sentence ids...
Files in truncated csv output directory: 20
Number of sentence ids: 294597
Number of unique sentence ids: 294597




In [10]:
print("Step 4: Loading vector dataframes, filtering to extracted sentence ids, and writing to csv files...")
truncate_pq_vec_files(pq_vec_file_dir, text_sentence_ids, csv_trunc_vec_file_dir)
print('\n')

Step 4: Loading vector dataframes, filtering to extracted sentence ids, and writing to csv files...
Files in vector file input directory: 20
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs16_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs19_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs1_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs18_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs17_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs4_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs5_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/v8processedVecs2_trunc.csv
Writing truncated vector dataframe to resources/v8_truncated_vec_files/

In [11]:
print("Step 5: Merging truncated vector dataframes...")
concat_vec_trunc_df = concat_dataframes_from_dir_csvs(csv_trunc_vec_file_dir)
print('\n')

print("Step 6: Merging concatenated text dataframe and concatenated vector dataframe on sentence id...")
merged_text_vec_df = pd.DataFrame.merge(concat_text_trunc_df, concat_vec_trunc_df, on='sentence_id')
merged_text_vec_df.to_csv(full_merged_text_vec_df_outfile_path)

Step 5: Merging truncated vector dataframes...
Files in truncated csv output directory: 20


Step 6: Merging concatenated text dataframe and concatenated vector dataframe on sentence id...


In [12]:
"""
Functions to generate regex match patterns from synonymous words/phrases for filtering subject headers
"""

import re

def extract_regex_pattern(section_list, pattern):
    r = re.compile(pattern, re.IGNORECASE)
    extracted_list = list(filter(r.match, section_list))
    remaining_list = list(set(section_list) - set(extracted_list))
    
    return remaining_list, extracted_list

def construct_regex_match_pattern(terms_dict):
    fuzzy_terms = ['.*%s.*' % i for i in terms_dict['fuzzy']]
    exact_terms = terms_dict['exact']
    
    fuzzy_pattern = '|'.join(fuzzy_terms)
    #exact_pattern = '|'.join(exact_terms)
    
    full_pattern = fuzzy_pattern
    
    return full_pattern

In [13]:
"""
Extract putative discussion headers.
"""

#Construct regex pattern for discussion header terms and extract list of matching headers.

disc_terms_dict = {
    'exact': [],
    'fuzzy' : [
        'conclusion',
        'discussion',
        'interpretation',
        'added value of this study',
        'research in context',
        'concluding',
        'closing remarks',
        'summary of findings',
        'outcome'
    ]
}

In [14]:
conc_pattern = construct_regex_match_pattern(disc_terms_dict)
print(conc_pattern)

.*conclusion.*|.*discussion.*|.*interpretation.*|.*added value of this study.*|.*research in context.*|.*concluding.*|.*closing remarks.*|.*summary of findings.*|.*outcome.*


In [15]:
import pprint

unique_sections = set(merged_text_vec_df.section.tolist())

rem_header_list, ext_header_list = extract_regex_pattern(unique_sections, conc_pattern)

print("Number of discussion headers: %d" % len(ext_header_list))
print("Example discussion headers:")
pprint.pprint(ext_header_list[:20])


Number of discussion headers: 375
Example discussion headers:
['Study Outcomes ::: Methods',
 'Conclusion & Future Directions 132',
 'Milling and Optical Transparency ::: 3.2. Fabrication ::: 3. Results and '
 'Discussion',
 'Final Concluding Remarks based on our COVID-19 Predictions in China',
 'IMAGINE INTERPRETATION',
 'V. DISCUSSION',
 'Discussions',
 'Interpretation:',
 'Summary of policy measures on key outcome measures',
 '4. Conclusions',
 'CONCLUSIONS',
 '3.3. Metal Adhesion ::: 3. Results and Discussion',
 'What is the causative agent of disease? ::: DISCUSSION',
 'Discussion 173',
 'Efficacy until Day 6 outcomes',
 'Overall clinical features and outcome',
 'Outcome of patients treated with CP as compared to a recent historic control '
 'groupAhistoric',
 'Outcome definitions',
 'Graph interpretation',
 'Outcomes pool']


In [16]:
#For now, lower-casing all section headers for matching

section_exact_match_list = set([i.lower() for i in ext_header_list])
section_exact_match_list.add('abstract')
section_exact_match_list.add('title')

print("Number of sections filtering to: %d" % len(section_exact_match_list))

Number of sections filtering to: 336


In [20]:
merged_text_vec_df = pd.read_csv(full_merged_text_vec_df_outfile_path, index_col=0)

In [27]:
merged_df_filtered_to_sections_oi = merged_text_vec_df.loc[merged_text_vec_df.section.str.lower().isin(section_exact_match_list)]

merged_df_filtered_to_sections_oi.to_csv(filt_merged_text_vec_df_outfile_path, index=False)

In [19]:
"""
QC:
    -sentence_id and w2vVector pairings match original vector files
    -sentence_id and sentence pairings match original text files
    -sentence_id and cord_uid pairings matches original text files

    -sentences in df are in appropriate order
    -filtered sections are logical

    -recalculate number of cord_uids with title, abstract, and discussion sections
"""

import random 

#Random sample of sentence_ids
sample_sent_ids = random.sample(merged_df_filtered_to_sections_oi.sentence_id.tolist(), 20)
sample_cord_uids = random.sample(merged_df_filtered_to_sections_oi.cord_uid.tolist(), 20)

In [None]:
"""
Map cord_uids to text files and write to json.
"""

pq_text_files = glob.glob('%s/*' % pq_text_file_dir)

text_file_cord_uids_dict = {}
cord_uids_text_file_dict = {}

sent_id_sentence_dict = {}
sentence_sent_id_dict = {}

for pq_text_file in pq_text_files:
    print(pq_text_file)
    pq_text_df = pd.read_parquet(pq_text_file)
    
    text_file_uids = list(set(pq_text_df.cord_uid.tolist()))
    
    for text_file_uid in text_file_uids:
        text_file_cord_uids_dict.setdefault(pq_text_file, set([])).add(text_file_uid)
        cord_uids_text_file_dict.setdefault(text_file_uid, set([])).add(pq_text_file)
        
with open(cord_uid_text_file_map, 'w') as f:
    json.dump(cord_uids_text_file_dict, f)

resources/v8_preprocessed/v8processedText1.parquet


In [33]:
sample_text_files = set([])

for cord_uid in sample_cord_uids:
    sample_text_files.add(list(cord_uids_text_file_dict[cord_uid])[0])

for text_file in list(sample_text_files):
    print(text_file)
    text_file_df = pd.read_parquet(text_file)
    
    subset_df = text_file_df.loc[text_file_df.cord_uid.isin(sample_cord_uids)]
    print(subset_df[['cord_uid', 'sentence_id', 'sentence']])

resources/v8_preprocessed/v8processedText16.parquet
      cord_uid     sentence_id  \
1146  g8saag2o   g8saag2o01146   
1147  g8saag2o   g8saag2o01147   
1148  g8saag2o   g8saag2o01148   
1149  g8saag2o   g8saag2o01149   
1150  g8saag2o   g8saag2o01150   
...        ...             ...   
1279  g8saag2o   g8saag2o71279   
1280  g8saag2o  g8saag2o111280   
1281  g8saag2o  g8saag2o111281   
1282  g8saag2o  g8saag2o121282   
1283  g8saag2o  g8saag2o121283   

                                               sentence  
1146  Care of patients with liver disease during the...  
1147  [{'text': 'The coronavirus disease 2019 pandem...  
1148  Older patients and those with pre-existing med...  
1149  It remains unclear at this point to what exten...  
1150  However, patients with advanced liver disease ...  
...                                                 ...  
1279  Routine laboratory testing can be performed lo...  
1280  General considerations Care should be maintain...  
1281  Listing for

KeyboardInterrupt: 

In [97]:
pq_vec_files = glob.glob('%s/*' % pq_vec_file_dir)

vec_file_sent_ids_dict = {}
sent_ids_vec_file_dict = {}

for pq_vec_file in pq_vec_files:
    print(pq_vec_file)
    pq_vec_df = pd.read_parquet(pq_vec_file)
    
    vec_sent_ids = list(set(pq_vec_df.sentence_id.tolist()))
    
    for vec_sent_id in vec_sent_ids:
        vec_file_sent_ids_dict.setdefault(pq_vec_file, set([])).add(vec_sent_id)
        sent_ids_vec_file_dict.setdefault(vec_sent_id, set([])).add(pq_vec_file)

resources/v8_vectors/v8processedVecs16.parquet
resources/v8_vectors/v8processedVecs19.parquet
resources/v8_vectors/v8processedVecs1.parquet
resources/v8_vectors/v8processedVecs18.parquet
resources/v8_vectors/v8processedVecs17.parquet
resources/v8_vectors/v8processedVecs4.parquet
resources/v8_vectors/v8processedVecs5.parquet
resources/v8_vectors/v8processedVecs2.parquet
resources/v8_vectors/v8processedVecs14.parquet
resources/v8_vectors/v8processedVecs11.parquet
resources/v8_vectors/v8processedVecs9.parquet
resources/v8_vectors/v8processedVecs10.parquet
resources/v8_vectors/v8processedVecs7.parquet
resources/v8_vectors/v8processedVecs3.parquet
resources/v8_vectors/v8processedVecs12.parquet
resources/v8_vectors/v8processedVecs13.parquet
resources/v8_vectors/v8processedVecs15.parquet
resources/v8_vectors/v8processedVecs0.parquet
resources/v8_vectors/v8processedVecs8.parquet
resources/v8_vectors/v8processedVecs6.parquet


In [4]:
#Check uniqueness of sentence ids
csv_text_files = glob.glob('%s/*' % csv_trunc_text_file_dir)

for csv_text_file in csv_text_files:
    
    text_df = pd.read_csv(csv_text_file)
    
    print(csv_text_file)
    print(len(text_df.sentence_id.tolist()))
    print(len(set(text_df.sentence_id.tolist())))

    mask = text_df.sentence_id.duplicated(keep=False)
    display(text_df[mask])

resources/v8_truncated_text_files/v8processedText3_trunc.csv
4408
4408


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText1_trunc.csv
66
66


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText11_trunc.csv
178895
178895


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText16_trunc.csv
2709
2709


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText9_trunc.csv
369
369


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText14_trunc.csv
1808
1808


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText17_trunc.csv
2508
2508


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText13_trunc.csv
845
845


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText6_trunc.csv
539
539


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText2_trunc.csv
922
922


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText7_trunc.csv
2721
2721


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText18_trunc.csv
2508
2508


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText5_trunc.csv
1306
1306


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText4_trunc.csv
2753
2753


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText8_trunc.csv
3075
3075


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText10_trunc.csv
20455
20455


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText19_trunc.csv
34003
34003


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText0_trunc.csv
6240
6240


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText12_trunc.csv
2257
2257


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


resources/v8_truncated_text_files/v8processedText15_trunc.csv
2879
2879


Unnamed: 0.1,Unnamed: 0,cord_uid,sentence_id,section,sentence


In [8]:
pq_text_files = glob.glob('%s/*' % pq_text_file_dir)

for pq_text_file in pq_text_files:
    print(pq_text_file)
    text_df = pd.read_parquet(pq_text_file)
    
    mask = text_df.sentence_id.duplicated(keep=False)
    display(text_df[mask])

resources/v8_preprocessed/v8processedText1.parquet


Unnamed: 0,cord_uid,language,sentence_id,section,subsection,sentence,lemma,UMLS,translated,GGP,...,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE
99,thobsldp,en,thobsldp4199,Competitive binding between host immune compon...,41,The statistical significance was determined fr...,"[statistical, significance, determine, Student...","[Statistical Significance, Package Dosing Unit]",False,[],...,[’s],[],[],[],[],[],[],[],[],[]
199,thobsldp,en,thobsldp4199,Confirmation of the immunoglobulin-binding beh...,4,Their interaction with Ig was analyzed using a...,"[confirm, Ig-binding, behavior, protein, ident...","[Drug Interactions, Intramuscular immunoglobul...",False,[],...,[Ig],[],[],[],[],[],[],[],[],[]


resources/v8_preprocessed/v8processedText4.parquet


Unnamed: 0,cord_uid,language,sentence_id,section,subsection,sentence,lemma,UMLS,translated,GGP,...,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE
20,a3k56ulv,en,a3k56ulv2120,Purification and cleavage of SUMO-FGF23 ::: Re...,21,According to the isoelectric point of fusion p...,"[According, isoelectric, point, fusion, protei...","[Isoelectric Point, Fusion protein, 2-diethyla...",False,[],...,[SUMO-FGF23],[],[],[],[],[],[],[],[],[]
120,a3k56ulv,en,a3k56ulv2120,Introduction,2,"Thus, we also cloned a SUMO fragment and const...","[recent, year, small, ubiquitin-related, modif...","[Clone Cells, SUMO-1 Protein, Fragment, Gene E...",False,"[SUMO fragment, SUMO]",...,[],[],[],[],[],[],[],[],[],[]
51,3djfwd0y,en,3djfwd0y1251,History of Sewage Disposal in Antarctica ::: S...,12,"More recently, increasing station populations ...","[recently, increase, station, population, comm...","[Increasing, geographic population, Increase, ...",False,[],...,[Reed],"[piped, Arcone]",[],[],[],[],[],[],[],[]
52,3djfwd0y,en,3djfwd0y1252,History of Sewage Disposal in Antarctica ::: S...,12,"At coastal bases these were, and largely remai...","[recently, increase, station, population, comm...","[Surface, Body Fluid Discharge, Ascend (action...",False,[],...,[Reed],[],[],[Bleasel],[],[],[],[],[],[]
251,3djfwd0y,en,3djfwd0y1251,Sewage and Wastewater ::: Sewage Disposal from...,1,"For the purposes of this discussion, the term ...","[purpose, discussion, term, sewage, refer, hum...","[Purpose, Discussion (procedure), Sewage, Homo...",False,[],...,[],[],[],[],[],[],[],[],[],[]
252,3djfwd0y,en,3djfwd0y1252,Sewage and Wastewater ::: Sewage Disposal from...,1,"Elsewhere in the world, sewage treatment plant...","[purpose, discussion, term, sewage, refer, hum...","[World, Sewage, Treatment Plan, Wastewater, In...",False,[],...,[],[EPA],[],[],[],[],[],[surface],[],[]


resources/v8_preprocessed/v8processedText8.parquet


Unnamed: 0,cord_uid,language,sentence_id,section,subsection,sentence,lemma,UMLS,translated,GGP,...,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE


resources/v8_preprocessed/v8processedText10.parquet


Unnamed: 0,cord_uid,language,sentence_id,section,subsection,sentence,lemma,UMLS,translated,GGP,...,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE
0,hdf2mwpc,en,hdf2mwpc180,Foal Diarrhea 679,18,The diagnosis of Salmonella infection is tradi...,"[diagnosis, Salmonella, infection, traditional...","[Diagnosis Study, Salmonella infections, Stool...",False,[],...,[],[],[],[],[],[],[],[],[],[]
1,hdf2mwpc,en,hdf2mwpc181,Foal Diarrhea 679,18,Samples should be transported using suitable t...,"[diagnosis, Salmonella, infection, traditional...","[Specimen, Membrane Transport Proteins, Transp...",False,[],...,[],[],[],[],[],[],[],[],[],[]
2,hdf2mwpc,en,hdf2mwpc182,Foal Diarrhea 679,18,Samples can be transported in selenite broth i...,"[diagnosis, Salmonella, infection, traditional...","[Specimen, Membrane Transport Proteins, Collec...",False,[],...,[],[selenite],[],[],[],[],[],[],[],[]
3,hdf2mwpc,en,hdf2mwpc183,Foal Diarrhea 679,18,Blood culture is worthwhile in foals less than...,"[diagnosis, Salmonella, infection, traditional...","[Blood culture, month, Age]",False,[],...,[],[],[],[],[],[],[],[],[],[]
80,hdf2mwpc,en,hdf2mwpc180,FOAL HEAT DIARRHEA,1,The basis of foal heat diarrhea is not certain...,"[Foal, heat, diarrhea, arguably, common, cause...","[Horse under one year old, Biologic Developmen...",False,[],...,[],[],[],[],[],[],[],[],[],[]
81,hdf2mwpc,en,hdf2mwpc181,FOAL HEAT DIARRHEA,1,"The temporal association between coprophagy, t...","[Foal, heat, diarrhea, arguably, common, cause...","[Temporal - Regional site descriptor, Relation...",False,[],...,[],[],[],[],[],[],[],[],[intestinal flora],[]
82,hdf2mwpc,en,hdf2mwpc182,FOAL HEAT DIARRHEA,1,The diarrhea does not appear to be associated ...,"[Foal, heat, diarrhea, arguably, common, cause...","[Diarrhea, Associated with, Alteration, Compos...",False,[],...,[],[],[],[],[],[],[],[],[],[]
83,hdf2mwpc,en,hdf2mwpc183,FOAL HEAT DIARRHEA,1,Anecdotal reports suggest that the feeding of ...,"[Foal, heat, diarrhea, arguably, common, cause...","[Anecdotal Report, Feeding patient, Biotin, In...",False,[],...,[],[],[],[],[],[],[],[],[],[]


resources/v8_preprocessed/v8processedText3.parquet


KeyboardInterrupt: 

In [13]:
csv_vec_files = glob.glob('%s/*' % csv_trunc_vec_file_dir)

for csv_vec_file in csv_vec_files:
    
    text_df = pd.read_csv(csv_vec_file)

    print(csv_vec_file)
    
    mask = text_df.sentence_id.duplicated(keep=False)
    display(text_df[mask])

UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 12-13: invalid continuation byte

In [17]:
pq_vec_files = glob.glob('%s/*' % pq_vec_file_dir)

for pq_vec_file in pq_vec_files:
    
    text_df = pd.read_parquet(pq_vec_file)

    sentence_ids = text_df.sentence_id.tolist()
    print(len(sentence_ids))
    if "thobsldp4199" in text_df.sentence_id.tolist():
        print(pq_vec_file)

328552
328552
330021
328552
328552
330021
resources/v8_vectors/v8processedVecs4.parquet
330021
330021
328553
328553
330020
328553
330020
330021
328553
328553
328553
330021
330020
330020


In [18]:
vec_df = pd.read_parquet('resources/v8_vectors/v8processedVecs4.parquet')

In [21]:
#Compare prior and current dataframe

prior_df = pd.read_csv('resources/filtered_text_vec_df_200429.csv')
current_df = pd.read_csv('resources/filt_merged_text_vector_df_200430.csv')


In [28]:
print(len(set(current_df.cord_uid.tolist())))
print(len(set(prior_df.cord_uid.tolist())))

4677
3393
