## Imports

In [128]:
from pathlib import Path
import string
from glob import glob
import json
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm
from collections import Counter

from ipywidgets import interact

## Read Data

In [2]:
data_dir = Path("../input/CORD-19-research-challenge")
assert data_dir.exists()

In [11]:
all_json_files = glob(str(data_dir / "**/*.json"), recursive=True)
len(all_json_files)

29315

In [12]:
all_json_files

['../input/CORD-19-research-challenge/custom_license/custom_license/ab680d5dbc4f51252da3473109a7885dd6b5eb6f.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/6599ebbef3d868afac9daa4f80fa075675cf03bc.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/eb5c7f3ff921ad6469b79cc8a3c122648204ece4.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/b87b790c96c75faa22a085cb560f7b3d8e018b24.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/68c0bb1989b6ca2b38da32a0d992027db39f80bc.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/5ad06be75e9c306aa10fea704f93d4ba90623a15.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/6c9a692eb00e9563f550ab57838c7b29d6731e54.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/45e7c863c8a0bf2f373a64c3d7ba1546ca26d672.json',
 '../input/CORD-19-research-challenge/custom_license/custom_license/4539

In [13]:
class JsonDoc:
    
    def __init__(self, filename):
        with open(filename, 'r') as f:
            self.raw_contents = json.load(f)
            self.paper_id = self.raw_contents['paper_id']
            self.title = self.raw_contents['metadata']['title']
            abstract_paragraphs = [entry['text'] for entry in self.raw_contents['abstract']]
            self.abstract = '\n'.join(abstract_paragraphs)
            text_paragraphs = [entry['text'] for entry in self.raw_contents['body_text']]
            self.body_text = '\n'.join(text_paragraphs)
            all_sections = set([entry['section'] for entry in self.raw_contents['body_text']])
            self.sections = '\n'.join(all_sections)
            all_ref_captions = [entry['text'] for entry in self.raw_contents['ref_entries'].values()]
            self.ref_captions = '\n'.join(all_ref_captions)
            
    def __repr__(self):
        return f"{self.paper_id}: {self.title}"

In [18]:
all_jsons = []
for json_filename in tqdm(all_json_files):
    all_jsons.append(JsonDoc(json_filename))

HBox(children=(FloatProgress(value=0.0, max=29315.0), HTML(value='')))




In [None]:
all_jsons = [JsonDoc(json_filename) for json_filename in all_json_files]

Read CSV file

In [32]:
meta_df = pd.read_csv(data_dir / "metadata.csv", dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str, 
    'doi': str
})
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file
0,,Elsevier,Intrauterine virus infections and congenital h...,10.1016/0002-8703(72)90077-4,,4361535,els-covid,Abstract The etiologic basis for the vast majo...,1972-12-31,"Overall, James C.",American Heart Journal,,,False,custom_license
1,,Elsevier,Coronaviruses in Balkan nephritis,10.1016/0002-8703(80)90355-5,,6243850,els-covid,,1980-03-31,"Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;...",American Heart Journal,,,False,custom_license
2,,Elsevier,Cigarette smoking and coronary heart disease: ...,10.1016/0002-8703(80)90356-7,,7355701,els-covid,,1980-03-31,"Friedman, Gary D",American Heart Journal,,,False,custom_license
3,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,Clinical and immunologic studies in identical ...,10.1016/0002-9343(73)90176-9,,4579077,els-covid,"Abstract Middle-aged female identical twins, o...",1973-08-31,"Brunner, Carolyn M.; Horwitz, David A.; Shann,...",The American Journal of Medicine,,,True,custom_license
4,,Elsevier,Epidemiology of community-acquired respiratory...,10.1016/0002-9343(85)90361-4,,4014285,els-covid,Abstract Upper respiratory tract infections ar...,1985-06-28,"Garibaldi, Richard A.",The American Journal of Medicine,,,False,custom_license


In [38]:
meta_df.columns

Index(['sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license',
       'abstract', 'publish_time', 'authors', 'journal',
       'Microsoft Academic Paper ID', 'WHO #Covidence', 'has_full_text',
       'full_text_file'],
      dtype='object')

In [39]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha                            28462 non-null object
source_x                       44220 non-null object
title                          43996 non-null object
doi                            40750 non-null object
pmcid                          23319 non-null object
pubmed_id                      22943 non-null object
license                        44220 non-null object
abstract                       35806 non-null object
publish_time                   34197 non-null object
authors                        41074 non-null object
journal                        33173 non-null object
Microsoft Academic Paper ID    964 non-null object
WHO #Covidence                 1767 non-null object
has_full_text                  44220 non-null bool
full_text_file                 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 4.8+ MB


In [66]:
any(meta_df[meta_df.has_full_text == True].sha.isna())

False

In [59]:
sum(meta_df.has_full_text == 'TRUE'

0        False
1        False
2        False
3        False
4        False
         ...  
44215    False
44216    False
44217    False
44218    False
44219    False
Name: has_full_text, Length: 44220, dtype: bool

## Create Dataframe

In [20]:
json_doc = all_jsons[0]

In [27]:
json_df = pd.DataFrame([{
    'sha': json_doc.paper_id,
    'title': json_doc.title,
    'abstract': json_doc.abstract,
    'text': json_doc.body_text,
    'sections': json_doc.sections,
    'ref_captions': json_doc.ref_captions
} for json_doc in all_jsons])

In [28]:
len(json_df)

29315

In [29]:
json_df.head()

Unnamed: 0,sha,title,abstract,text,sections,ref_captions
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,,The evolutionary history of humans is characte...,Using Phylodyanmics to Study Disease Reporting...,Figure 2 Estimates from genomic data suggest t...
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",Comparison with previous projections\nRecent t...,Total international and domestic revenue tonne...
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,2 Mechanisms of diarrhoea,,Acute infections of the gastrointestinal tract...,Heat-labile enterotoxin (LT)\nCholera toxin (C...,and chloride absorption take place through two...
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,Features of Host Cells: Cellular and Molecular...,,"There are three domains of life-Bacteria, Arch...",PROMOTION OF VIRAL TRANSCRIPTION AND TRANSLATI...,A typical eukaryotic cell.\nElectron micrograp...
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Spring 2020 | 1 Beijing's Hard and Soft Repres...,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...",\nHard and Soft Repression\nProspects\nTargeti...,"32 Shibani Mahtani, Timothy McLaughlin, Tiffan..."


In [36]:
json_df.to_pickle("json_df.pickle")
json_df.to_csv("json_df.csv")

In [42]:
json_df.drop_duplicates(['abstract', 'text'], inplace=True)

In [43]:
len(json_df)

29123

In [48]:
json_df_small = json_df.iloc[:1000]

## Do something with dataset

Add new column with all text

**Task: Gather word counts and statistics (co-occurrence, ...) on mentions of entities of interest in CORD-19**

- Word counts
    - 

In [72]:
keywords = [
    'corona',
    'coronavirus',
    'glyco',
    'covid',
    'risk',
]

columns=['abstract', 'text', 'sections', 'ref_captions', 'all_text']

In [139]:
json_df_small['all_text'] = json_df_small[
    ['abstract', 'text', 'sections', 'ref_captions']].agg('\n'.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [73]:
json_df_small.head()

Unnamed: 0,sha,title,abstract,text,sections,ref_captions,all_text
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,Evolutionary Medicine IV. Evolution and Emerge...,,The evolutionary history of humans is characte...,Using Phylodyanmics to Study Disease Reporting...,Figure 2 Estimates from genomic data suggest t...,\nThe evolutionary history of humans is charac...
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,International aviation emissions to 2025: Can ...,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...",Comparison with previous projections\nRecent t...,Total international and domestic revenue tonne...,"International aviation is growing rapidly, res..."
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,2 Mechanisms of diarrhoea,,Acute infections of the gastrointestinal tract...,Heat-labile enterotoxin (LT)\nCholera toxin (C...,and chloride absorption take place through two...,\nAcute infections of the gastrointestinal tra...
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,Features of Host Cells: Cellular and Molecular...,,"There are three domains of life-Bacteria, Arch...",PROMOTION OF VIRAL TRANSCRIPTION AND TRANSLATI...,A typical eukaryotic cell.\nElectron micrograp...,"\nThere are three domains of life-Bacteria, Ar..."
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Spring 2020 | 1 Beijing's Hard and Soft Repres...,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...",\nHard and Soft Repression\nProspects\nTargeti...,"32 Shibani Mahtani, Timothy McLaughlin, Tiffan...",Hong Kong's new Police Commissioner Chris Tang...


In [74]:
def get_counts(keyword, column):
    df = json_df_small
    counts = df[column].str.contains(keyword, case=False).sum()
    return counts / len(df)

get_counts(keywords[0], 'text')

0.656

In [75]:
df_counts = pd.DataFrame(index=keywords, columns=columns)
df_counts.head()

Unnamed: 0,abstract,text,sections,ref_captions,all_text
corona,,,,,
coronavirus,,,,,
glyco,,,,,
covid,,,,,
risk,,,,,


In [76]:
for keyword in keywords:
    for column in columns:
        df_counts.loc[keyword, column] = get_counts(keyword, column)

In [77]:
df_counts

Unnamed: 0,abstract,text,sections,ref_captions,all_text
corona,0.12,0.656,0.057,0.141,0.696
coronavirus,0.116,0.606,0.051,0.119,0.639
glyco,0.035,0.274,0.019,0.049,0.282
covid,0.017,0.008,0.001,0.004,0.026
risk,0.07,0.474,0.034,0.103,0.502


Co-occurence

In [85]:
def get_cooccurrence(kw1, kw2, column):
    df = json_df_small
    
    kw1_in = df[column].str.contains(kw1, case=False)
    kw2_in = df[column].str.contains(kw2, case=False)
    
    return sum(kw1_in & kw2_in) / len(df)

get_cooccurrence('corona', 'glyco', 'abstract')

0.004

In [82]:
df_cooccurrence = pd.DataFrame(index=keywords, columns=keywords)
df_cooccurrence

Unnamed: 0,corona,coronavirus,glyco,covid,risk
corona,,,,,
coronavirus,,,,,
glyco,,,,,
covid,,,,,
risk,,,,,


In [89]:
df_cooccurrence

Unnamed: 0,corona,coronavirus,glyco,covid,risk
corona,0.696,0.639,0.23,0.017,0.332
coronavirus,0.639,0.639,0.211,0.015,0.3
glyco,0.23,0.211,0.282,0.006,0.131
covid,0.017,0.015,0.006,0.026,0.014
risk,0.332,0.3,0.131,0.014,0.502


In [91]:
results_list = []

for kw1 in df_cooccurrence.index:
    for kw2 in df_cooccurrence.columns:
        for column in df_counts.columns:
            result = get_cooccurrence(kw1, kw2, column)
            results_list.append({'kw1': kw1, 'kw2': kw2, 'column': column, 'result': result})

In [92]:
df_cooccurrence_all = pd.DataFrame(results_list)

In [97]:
df_cooccurrence_all.head(50)

Unnamed: 0,kw1,kw2,column,result
0,corona,corona,abstract,0.12
1,corona,corona,text,0.656
2,corona,corona,sections,0.057
3,corona,corona,ref_captions,0.141
4,corona,corona,all_text,0.696
5,corona,coronavirus,abstract,0.116
6,corona,coronavirus,text,0.606
7,corona,coronavirus,sections,0.051
8,corona,coronavirus,ref_captions,0.119
9,corona,coronavirus,all_text,0.639


In [96]:
df_cooccurrence_all.groupby(['kw1', 'kw2', 'column'])['result'].sum()

kw1     kw2     column      
corona  corona  abstract        0.120
                all_text        0.696
                ref_captions    0.141
                sections        0.057
                text            0.656
                                ...  
risk    risk    abstract        0.070
                all_text        0.502
                ref_captions    0.103
                sections        0.034
                text            0.474
Name: result, Length: 125, dtype: float64

In [102]:
df_cooccurrence_all[df_cooccurrence_all.kw1 == df_cooccurrence_all.kw2].groupby(['kw1', 'column'])['result']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x3347df198>

In [107]:
Out[102].sum().unstack()

column,abstract,all_text,ref_captions,sections,text
kw1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
corona,0.12,0.696,0.141,0.057,0.656
coronavirus,0.116,0.639,0.119,0.051,0.606
covid,0.017,0.026,0.004,0.001,0.008
glyco,0.035,0.282,0.049,0.019,0.274
risk,0.07,0.502,0.103,0.034,0.474


In [108]:
df_counts

Unnamed: 0,abstract,text,sections,ref_captions,all_text
corona,0.12,0.656,0.057,0.141,0.696
coronavirus,0.116,0.606,0.051,0.119,0.639
glyco,0.035,0.274,0.019,0.049,0.282
covid,0.017,0.008,0.001,0.004,0.026
risk,0.07,0.474,0.034,0.103,0.502


In [110]:
df_cooccurrence_all[df_cooccurrence_all.kw1 == df_cooccurrence_all.kw2].drop(columns=['kw2'])

Unnamed: 0,kw1,column,result
0,corona,abstract,0.12
1,corona,text,0.656
2,corona,sections,0.057
3,corona,ref_captions,0.141
4,corona,all_text,0.696
30,coronavirus,abstract,0.116
31,coronavirus,text,0.606
32,coronavirus,sections,0.051
33,coronavirus,ref_captions,0.119
34,coronavirus,all_text,0.639


In [117]:
@interact
def _(column=columns):
    df = df_cooccurrence_all[df_cooccurrence_all.column == column].drop(columns=['column'])
    df = df.groupby(['kw1', 'kw2'])['result'].mean().unstack()
    return df

interactive(children=(Dropdown(description='column', options=('abstract', 'text', 'sections', 'ref_captions', …

**Next**:
- count number of occurences / cooccurrences (instead of booleans)


In [127]:
Counter(json_df_small.loc[0, 'text'].lower().replace("[]".split())['and']

101

In [164]:
def tokenize(s):
    s = s.lower()
    s = s.replace('\n', ' ')
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = s.split()
    
    return s

In [181]:
def get_kw_count(kw, text):
#     counts = Counter(tokenize(text))
#     return counts[kw]
    tokens = tokenize(text)
    return sum(kw in token for token in tokens)

In [153]:
s = json_df_small.loc[0, 'text'].lower()
get_kw_count('the', s)

159

In [184]:
for kw in keywords:
    res = json_df_small['all_text'].apply(lambda s: get_kw_count(kw, s))
    json_df_small[kw] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [186]:
json_df_small.sort_values('coronavirus', ascending=False)

Unnamed: 0,sha,title,abstract,text,sections,ref_captions,all_text,corona,coronavirus,glyco,covid,risk
715,888df3a1646290458e16ab2ad3add5ef41b02ecc,SARS-Coronavirus ancestor's foot-prints in Sou...,One of the great challenges in the ecology of ...,One of the great challenges in the ecology of ...,Sampling pressure and host species involved\nS...,Phylogenetic reconstruction of main coronaviru...,One of the great challenges in the ecology of ...,197,182,0,0,1
319,751ffc336c6d7846e4d9018ad8ad6234e06056ed,RNA structure analysis of alphacoronavirus ter...,Coronavirus genome replication is mediated by ...,"Coronaviruses are enveloped, positive-strand R...",3' 5'\nPK-SL1\n3'\nConclusions\nIdentification...,Alignment-based secondary structure prediction...,Coronavirus genome replication is mediated by ...,165,155,0,0,0
796,edc9be58ec035cfcc86c647ebc021d59214d4fd1,MOLECULAR INTERACTIONS IN THE ASSEMBLY OF CORO...,,Viruses are multimolecular assemblies that ran...,Localization of Budding\nA. Viral Budding\nN-R...,FIG 2. The coronavirus life cycle. The replica...,\nViruses are multimolecular assemblies that r...,147,115,30,0,0
114,4cb9c6ef889605b3149ab8b59c8258074067ba04,Detection of Group 1 Coronaviruses in Bats in ...,The epidemic of severe acute respiratory syndr...,E merging diseases are frequently zoonoses cau...,\nRNA Extraction and Reverse Transcription (RT...,Nucleotide sequence alignment of amplicons fro...,The epidemic of severe acute respiratory syndr...,76,75,1,0,0
629,c014fba3340c136cad82c7be51768c6b31c5c962,Detection of human coronavirus 229E-specific a...,Human coronaviruses are known to be a common c...,Human coronaviruses are a common cause of resp...,Western blot analysis using recombinant protei...,"shows a Coomassie blue stained, PAGE analysis ...",Human coronaviruses are known to be a common c...,66,63,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
469,442a1332feedd98d5b471efc73ea917c9451f267,"Separation and analysis of glycyrrhizin, 18␤-g...",Glycyrrhizin is the main active compound of Gl...,Glycirrhiza glabra (liquorice) roots and rhizo...,Analysis of purified GA\nMethod validation 2.4...,"Chemical structures of: (a) glycyrrhizin, (b) ...",Glycyrrhizin is the main active compound of Gl...,1,0,11,0,0
467,494b65b83c62d007f6ebd38d7ae83516a8fd6971,Structural Insights into 5' Flap DNA Unwinding...,Human FANCD2-associated nuclease 1 (FAN1) is a...,FAN1 possesses 5' flap endonuclease and 5'-3' ...,Crystallization and structure determination\nS...,R420E/R424E/K425E/K433E and K482E/N490E/Q492E/...,Human FANCD2-associated nuclease 1 (FAN1) is a...,0,0,0,0,0
462,e7c33e28e57b649ac477f8290b35a3b10e2a1353,"Knowledge, perception, performance, and attitu...","We assessed the current status of knowledge, p...",Health care−associated infections (HAIs) criti...,\nDISCUSSION\nUnivariate analysis\nHH percepti...,"§ 97.4 months, 54.9% of participants had < 24 ...","We assessed the current status of knowledge, p...",0,0,0,0,1
460,5e87d99a155e3a0d518695c7806b8e2c1409e207,A survey of SNOMED CT implementations,The Systematised Nomenclature of Medicine Clin...,"Countries such as the United States, United Ki...",Post-coordination.\nTowards a successful SNOME...,Summary of results of interviews.\nSubsets and...,The Systematised Nomenclature of Medicine Clin...,0,0,0,0,1
