In [1]:
import pandas as pd
import numpy as np
import os, json

from os import listdir
from fuzzywuzzy import fuzz



In [46]:
DATAPATH = 'data'
VERSION = 'cord19_v20'
META_PATH ="/".join([DATAPATH, VERSION, 'metadata.csv'])

In [47]:
meta_df = pd.read_csv(META_PATH)
print(meta_df.shape)
# meta_df.head()

(63571, 18)


In [48]:
cuid_dup = meta_df[meta_df.duplicated(['cord_uid'])]
cuid_dup_set = set(cuid_dup['cord_uid'])
print("All cuid duplicate records:", len(cuid_dup))

All cuid duplicate records: 44


In [49]:
for uid in cuid_dup_set:
    dup_df = meta_df[meta_df['cord_uid'] == uid]
    if len(dup_df['pubmed_id'].unique()) != 1 and len(dup_df['doi'].unique()) != 1:
        print(dup_df['pubmed_id'].unique())
        print(dup_df['doi'].unique())

In [50]:
has_pdfpar_df = meta_df[meta_df['pdf_json_files'].isna() == False]
pdfpar_dup = has_pdfpar_df[has_pdfpar_df.duplicated(['pdf_json_files'])]
print("Duplicate pdf paths:", len(pdfpar_dup))
print("Total pdf paths", len(has_pdfpar_df))

Duplicate pdf paths: 13
Total pdf paths 48924


In [51]:
has_pmcpar_df = meta_df[meta_df['pmc_json_files'].isna() == False]
pmcpar_dup = has_pmcpar_df[has_pmcpar_df.duplicated(['pmc_json_files'])]
print("Duplicte pmc paths:", len(pmcpar_dup))
print("Total pmc paths", len(has_pmcpar_df))

Duplicte pmc paths: 0
Total pmc paths 33503


In [52]:
has_sha_df = meta_df[meta_df['sha'].isna() == False]
mult_sha_df = has_sha_df[has_sha_df['sha'].str.len() != 40]
sing_sha_df = has_sha_df[has_sha_df['sha'].str.len() == 40]
print("# of records with multiple associated sha values:", len(mult_sha_df))

# of records with multiple associated sha values: 2517


In [53]:
has_pmc_df = meta_df[meta_df['pmcid'].isna() == False]
print("# of weird pmcid values:", len(has_pmc_df[(has_pmc_df['pmcid'].str.len() != 9) & (has_pmc_df['pmcid'].str.len() != 10) & (has_pmc_df['pmcid'].str.len() != 8)]))

# of weird pmcid values: 0


In [54]:
PARSE_PATH ="/".join([DATAPATH, VERSION, 'document_parses'])

In [55]:
pdf_path = "/".join([PARSE_PATH, 'pdf_json'])
pmc_path = "/".join([PARSE_PATH, 'pmc_json'])
pdf_parses = listdir(pdf_path)
pmc_parses = listdir(pmc_path)
print("Total # pdfs:", len(pmc_parses))
print("Total # pmcs:", len(pdf_parses))

Total # pdfs: 33503
Total # pmcs: 51868


In [56]:
sha_list = []
for row in mult_sha_df.iterrows():
    new_l = row[1].sha.split("; ")
    sha_list.extend(new_l)

multi_sha_list = sha_list.copy()
print("All sha values in multi sha record list:", len(multi_sha_list))
print("All UNIQUE sha values in multi sha record list:", len(set(multi_sha_list)))

print("All sha values in single sha record list:", len(list(sing_sha_df.sha)))
print("All UNIQUE sha values in single sha record list:", len(set(sing_sha_df.sha)))

sha_list.extend(list(sing_sha_df.sha))
print("All sha values in combined sha record list:", len(sha_list))
print("All UNIQUE sha values in combined sha record list:", len(set(sha_list)))
total_unique_sha = len(set(sha_list))

All sha values in multi sha record list: 5475
All UNIQUE sha values in multi sha record list: 5475
All sha values in single sha record list: 46407
All UNIQUE sha values in single sha record list: 46394
All sha values in combined sha record list: 51882
All UNIQUE sha values in combined sha record list: 51868


In [57]:
has_path_df = meta_df[(meta_df["pdf_json_files"].isna() == False) | (meta_df["pmc_json_files"].isna() == False)]
print("Total full text records:", len(has_path_df))

Total full text records: 49862


In [58]:
def get_text_and_ratios(data, record, field):
    if field == 'title':
        json_text = data['metadata'][field]
    else:
        if field not in data.keys():
            json_text = ''
        else:
            json_text = ''
            for sub_dict in data[field]:
                json_text  = json_text + sub_dict['text'] + ' '
    
    json_text = json_text.lower()
        
    try:
        meta_text = record[field].lower()
    except:
        meta_text = ''
    try:    
        full_ratio = fuzz.ratio(meta_text, json_text)
        partial_ratio = fuzz.partial_ratio(meta_text, json_text)
    except:
        print('META:   ', meta_text)
        print('JSON:   ', json_text)
        print('P_ID:   ', data['paper_id'] )
        
    return json_text, meta_text, full_ratio, partial_ratio

def set_text_and_ratios(sub_dict, field, json_text, meta_text, full_ratio, partial_ratio):
    sub_dict[field]['meta_text'] = meta_text
    sub_dict[field]['json_text'] = json_text
    sub_dict[field]['full_ratio'] = full_ratio
    sub_dict[field]['partial_ratio'] = partial_ratio
    
    if full_ratio >= 85 or partial_ratio >= 85:
        sub_dict[field]['is_error'] = False
    else:
        sub_dict[field]['is_error'] = True

    

In [59]:
errors_dict = dict()
for row in has_path_df.iterrows():
    record = row[1]
    if record.cord_uid in errors_dict.keys():
        continue
    else:
        errors_dict[record.cord_uid] = {}
    
    uid_dict = errors_dict[record.cord_uid]
    
    pmc_path = None
    pmcid = None
    pdf_paths = None
    pdf_ids = None
    data = None
    
    if not isinstance(record.pmc_json_files, float):
        pmc_path = "/".join([DATAPATH, VERSION, record.pmc_json_files])
        pmcid = record.pmcid
    
    if not isinstance(record.pdf_json_files, float):
        pdf_paths = record.pdf_json_files.split("; ")
        pdf_paths = ["/".join([DATAPATH, VERSION, x]) for x in pdf_paths]
        pdf_ids = record.sha.split("; ")
    
    if pmcid is not None:
        uid_dict[pmcid] = {'title': {}}
        with open(pmc_path) as f:
            data = json.load(f)
        
        json_text, meta_text, full_ratio, partial_ratio = get_text_and_ratios(data, record, 'title')
        set_text_and_ratios(uid_dict[pmcid], 'title', json_text, meta_text, full_ratio, partial_ratio)
    
    if pdf_ids is not None:
        for i, path in enumerate(pdf_paths):
            
            uid_dict[pdf_ids[i]] = {'title': {}, 'abstract':{}}
            with open(path) as f:
                data = json.load(f)
            
            json_text, meta_text, full_ratio, partial_ratio = get_text_and_ratios(data, record, 'title')
            set_text_and_ratios(uid_dict[pdf_ids[i]], 'title', json_text, meta_text, full_ratio, partial_ratio)
            
            json_text, meta_text, full_ratio, partial_ratio = get_text_and_ratios(data, record, 'abstract')
            set_text_and_ratios(uid_dict[pdf_ids[i]], 'abstract', json_text, meta_text, full_ratio, partial_ratio)
            
print(len(errors_dict))

49849


In [60]:
for record in meta_df[meta_df['sha'] == '4b0e97f3c2c4402b3174623db30ce1a1cde46b50'].iterrows():
    meta_text = record[1]['abstract'].lower()
    
    path = "/".join([DATAPATH, VERSION, record[1]['pdf_json_files']])
    with open(path) as f:
        data = json.load(f)
    
    json_text = ''
    for sub_dict in data['abstract']:
        json_text  = json_text + sub_dict['text'] + ' '
    
    json_text = json_text.lower()
    
    full_ratio = fuzz.ratio(meta_text, json_text)
    partial_ratio = fuzz.partial_ratio(json_text, meta_text)

In [61]:
def get_errors(errors_dict, main_dict, pid):
    if main_dict['is_error']:
        errors_dict['total_errors'] += 1
        if main_dict['meta_text'] == '':
            errors_dict['empty_meta_text'] += 1
        if main_dict['json_text'] == '':
            errors_dict['empty_json_text'] += 1
        if errors_dict['empty_meta_text'] != '' and main_dict['json_text'] !='':
            errors_dict['other_errors'] += 1
        if len(pid) == 40:
            errors_dict['sha_errors'] += 1
        if len(pid) != 40:
            errors_dict['pmc_errors'] += 1

In [62]:
title_errors_dict = {"total_errors": 0, "empty_json_text": 0, "empty_meta_text": 0,
                     "other_errors": 0, "sha_errors": 0, "pmc_errors": 0}

abstract_errors_dict = {"total_errors": 0, "empty_json_text": 0, "empty_meta_text": 0,
                     "other_errors": 0, "sha_errors": 0, "pmc_errors": 0}

for cord_uid in errors_dict:
    for pid in errors_dict[cord_uid]:
        title_dict = errors_dict[cord_uid][pid]['title']
        get_errors(title_errors_dict, title_dict, pid)
        
        if len(pid) == 40:
            abstract_dict = errors_dict[cord_uid][pid]['abstract']
            get_errors(abstract_errors_dict, abstract_dict, pid)
        
print(title_errors_dict)
print(abstract_errors_dict)

{'total_errors': 9066, 'empty_json_text': 5911, 'empty_meta_text': 6, 'other_errors': 3155, 'sha_errors': 9045, 'pmc_errors': 21}
{'total_errors': 13602, 'empty_json_text': 8952, 'empty_meta_text': 1043, 'other_errors': 4650, 'sha_errors': 13602, 'pmc_errors': 0}


In [63]:
if not os.path.isdir('qc_results'):
    os.mkdir('qc_results')
if not os.path.isdir('qc_results/' + VERSION):
    os.mkdir('qc_results/' + VERSION)

RESULTS_PATH = 'qc_results/' + VERSION + '/'

all_titles_and_abstracts_path = RESULTS_PATH + 'all_titles_and_abstracts.json'
with open(all_titles_and_abstracts_path, 'w') as fp:
    json.dump(errors_dict, fp)

title_errors_info_path = RESULTS_PATH + 'title_errors_info.json'
with open(title_errors_info_path, 'w') as fp:
    json.dump(title_errors_dict, fp)
    
abstract_errors_info_path = RESULTS_PATH + 'abstract_errors_info.json'
with open(abstract_errors_info_path, 'w') as fp:
    json.dump(abstract_errors_dict, fp)

In [64]:
log_path = RESULTS_PATH + "quality_check_results_log.txt"
f = open(log_path, "w")
f.write("Total records in metadata: {}\n".format(meta_df.shape[0]))
f.write("Total full text records: {}\n".format(len(has_path_df)))
f.write("All cuid duplicate records: {}\n".format(len(cuid_dup)))

f.write("\nTotal pdf paths: {}\n".format(len(has_pdfpar_df)))
f.write("Duplicte pdf paths: {}\n".format(len(pdfpar_dup)))
f.write("Total pmc paths: {}\n".format(len(has_pmcpar_df)))
f.write("Duplicte pmc paths: {}\n".format(len(pmcpar_dup)))

f.write("\nRecords with multiple sha values: {}\n".format(len(mult_sha_df)))
f.write("Record with weird pmcid values: {}\n".format(len(has_pmc_df[(has_pmc_df['pmcid'].str.len() != 9) & (has_pmc_df['pmcid'].str.len() != 10) & (has_pmc_df['pmcid'].str.len() != 8)])))

f.write("\nTotal # of pdfs: {}\n".format(len(pmc_parses)))
f.write("Total # of pmcs: {}\n".format(len(pdf_parses)))

f.write("\nAll sha values in multi sha record list: {}\n".format(len(multi_sha_list)))
f.write("All UNIQUE sha values in multi sha record list: {}\n".format(len(set(multi_sha_list))))
f.write("All sha values in single sha record list: {}\n".format(len(list(sing_sha_df.sha))))
f.write("All UNIQUE sha values in single sha record list: {}\n".format(len(set(sing_sha_df.sha))))
f.write("All sha values in combined sha record list: {}\n".format(len(sha_list)))
f.write("All UNIQUE sha values in combined sha record list: {}\n".format(len(set(sha_list))))

f.write("\nAll title discrepencies between metadata and jsons: {}\n".format(title_errors_dict['total_errors']))
f.write("All empty title text fields in jsons: {}\n".format(title_errors_dict['empty_json_text']))
f.write("All empty title text fields in metadata: {}\n".format(title_errors_dict['empty_meta_text']))
f.write("All other title text field discrepencies: {}\n".format(title_errors_dict['other_errors']))
f.write("All title errors that occured with pdfs: {}\n".format(title_errors_dict['sha_errors']))
f.write("All title errors that occured with pmcs: {}\n".format(title_errors_dict['pmc_errors']))

f.write("\nAll abstract discrepencies between metadata and jsons: {}\n".format(abstract_errors_dict['total_errors']))
f.write("All empty abstract text fields in jsons: {}\n".format(abstract_errors_dict['empty_json_text']))
f.write("All empty absract text fields in metadata: {}\n".format(abstract_errors_dict['empty_meta_text']))
f.write("All other abstract text field discrepencies: {}\n".format(abstract_errors_dict['other_errors']))

f.close()