5.4.20 
Risk - Link ontologies to entity output in v9 using snowmed/meddra

In [0]:
# Linking with GDrive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
%%capture
!pip install owlready2
!pip install pyDataverse 
!pip install ijson

In [0]:
import pandas as pd
import numpy as np
import collections, re, string, ast, os, pickle
from tqdm.notebook import tqdm
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
tqdm.pandas()

#ulms setup
default_world.set_backend(filename = "pym.sqlite3")
diction = [ "SNOMEDCT_US", "CUI", 'MDR'] #'LNC',"ICD10", if needed
import_umls("gdrive/My Drive/Data archive/umls-2019AB-metathesaurus.zip",
            terminologies = diction)
default_world.save()
PYM = get_ontology("http://PYM/").load()
SNOMEDCT_US = PYM["SNOMEDCT_US"]
CUI = PYM["CUI"]
MDR = PYM['MDR']
#LOINC = PYM['LNC']

Importing UMLS from Zip file gdrive/My Drive/Data archive/umls-2019AB-metathesaurus.zip with Python version 3.6...
  Parsing 2019AB/META/MRRANK.RRF as MRRANK
  Parsing 2019AB/META/MRCONSO.RRF as MRCONSO
  Parsing 2019AB/META/MRDEF.RRF as MRDEF
  Parsing 2019AB/META/MRREL.RRF as MRREL
  Parsing 2019AB/META/MRSAT.RRF as MRSAT
Breaking ORIG cycles...
    SNOMEDCT_US : 0 cycles found: 
    MDR : 0 cycles found: 
    SRC : 0 cycles found: 
Finalizing only properties and restrictions...
Finalizing CUI - ORIG mapping...
FTS Indexing...


In [0]:
# dictionary of all categories and meddra/snowmed
codeBank = {'SNOWMED':{'heart'   : '106063007',
                       'lung'    : '50043002',
                       'diabetes': '73211009'},
            'MEDDRA':{'heart'    : '10007541',
                      'lung'     : '10038738',
                      'diabetes' : '10012653'}}
# generates set of all codes
def codePull(framework, code):
  return set([ next(iter(framework[i.name] >> CUI )).name for i in framework[code].descendant_concepts()] )

In [0]:
#Extracts all descendants to a set 
#SNOMEDCT_US['106063007'].descendant_concepts()
hrt_sm_ext = set([ next(iter(SNOMEDCT_US[i.name] >> CUI )).name for i in SNOMEDCT_US['106063007'].descendant_concepts()] )
hrt_mdr_ext = set([ next(iter(MDR[i.name] >> CUI )).name for i in MDR['10007541'].descendant_concepts()] ) 
hrt_sm_ext.update(  [ next(iter(SNOMEDCT_US[i.name] >> CUI )).name for i in SNOMEDCT_US['73211009'].descendant_concepts()]  )
hrt_mdr_ext.update([ next(iter(MDR[i.name] >> CUI )).name for i in MDR['10012653'].descendant_concepts()] )
lung_sm_ext = set([ next(iter(SNOMEDCT_US[i.name] >> CUI )).name for i in SNOMEDCT_US['50043002'].descendant_concepts()] )
lung_mdr_ext = set([ next(iter(MDR[i.name] >> CUI )).name for i in MDR['10038738'].descendant_concepts()] ) 

Code Connecting Dataverse to Colab - Not used for now

In [0]:
from pyDataverse.api import Api
from pyDataverse.models import Dataverse
import io, gzip, ijson, os
from itertools import islice,chain

#Creating my chunks generator
def chunks(iterable, size):
    iterator = iter(iterable)
    for first in iterator:
        yield chain([first], islice(iterator, size - 1))

#Reading in the JSON and creating chunks
def read_big_json(f, chunk_size=10000):
  obs = ijson.items(f, 'item')
  return chunks(obs,chunk_size)

def preprocess_chunk(original_fun):
  def wrapper(c):
    l=[]
    for el in c:
      l.append(el)
    df = pd.DataFrame(l)
    return original_fun(df)
  return wrapper

@preprocess_chunk
def returndf(df):
  return df

In [0]:
def processes_v(url_doi, filename):
  # Establish connection to Dataverse
  base_url = 'http://datasets.coronawhy.org'
  api = Api(base_url)
  print('Dataverse connection status: ', api.status)

  response = api.get_dataset(url_doi) # in Dataverse 
  dataset_id = response.json()['data']['id']
  print('Dataset ID: ', dataset_id, '\n')

  datafile_id = response.json()['data']['latestVersion']['files'][0]['dataFile']['id']
  print('Datafile ID: ', datafile_id, '\n')

  # Download datafile
  response = api.get_datafile(datafile_id)
  print('File size: ', len(response.content))
  
  #Write the response content to the specified filename
  with open(filename, 'wb') as f: 
    f.write(response.content)

  !gunzip $filename
  print('Files in current directory: ')
  os.listdir()

  #Unzipping the file this way may lead to a better result (no error with position '0x8b')
  buf = io.BytesIO(response.content)
  print('Converted with BytesIO')
  gzip_f = gzip.GzipFile(fileobj=buf)
  print('Unzipped')
  # cntnt = gzip_f.read()

  f = open(filename[:-3])
  return read_big_json(f,10000)

def json_to_df(json):
  l=[]
  c=0
  for chunk in json:
    l.append(returndf(chunk))
    c+=1
    if (c==5):
      break
  return l

In [0]:
v9_text = processes_v('doi:10.5072/FK2/DKZOAH', 'v9text.json.gz')
#v8_meta = processes_v('doi:10.5072/FK2/22MMBN', 'v8_dataset.json.gz') #unprocessed
#v9_meta = processes_v('doi:10.5072/FK2/22MMBN', 'v9_dataset.json.gz') #unprocessed

Dataverse connection status:  OK
Dataset ID:  52 

Datafile ID:  53 

File size:  964007304
Files in current directory: 
Converted with BytesIO
Unzipped


In [0]:
v9 = json_to_df(v9_text)

In [21]:
code_extract = {'SNOWMED':{'heart':[],
                           'lung':[],
                           'diabetes':[]},
                'MEDDRA':{'heart':[],
                           'lung':[],
                           'diabetes':[]}}
for i in range(len(v9)):
  for j,k in codeBank.items():
    for m,n in k.items():
      if j = 'SNOWMED':
        tempCodes = codePull(SNOMEDCT_US,n)
      elif j = 'MEDDRA':
        tempCodes = codePull(MDR,n)
      v9[i]['{}_{}'.format(m,j)] = v9[i]['UMLS_IDS'].progress_apply(lambda x:  code_extract[j][m].append([set(x).intersection(tempCodes)]))

heart
106063007
lung
50043002
diabetes
73211009
heart
10007541
lung
10038738
diabetes
10012653


In [0]:
hrt_sm = []
hrt_mdr = []
lung_sm = []
lung_mdr = []

for i in range(len(v9)):
    v9[i]['hrt_snow'] = v9[i]['UMLS_IDS'].progress_apply(lambda x: set(x).intersection(hrt_sm_ext))
    v9[i]['hrt_mdr'] = v9[i]['UMLS_IDS'].progress_apply(lambda x: set(x).intersection(hrt_mdr_ext))
    v9[i]['lung_snow'] = v9[i]['UMLS_IDS'].progress_apply(lambda x: set(x).intersection(lung_sm_ext))
    v9[i]['lung_mdr'] = v9[i]['UMLS_IDS'].progress_apply(lambda x: set(x).intersection(lung_mdr_ext))
    temp = v9[i]
    hrt_sm.append(temp[temp['hrt_snow'] != set()])
    hrt_mdr.append(temp[temp['hrt_mdr'] != set()])
    lung_sm.append(temp[temp['lung_snow'] != set()])
    lung_mdr.append(temp[temp['lung_mdr'] != set()])

In [0]:
def export_df(snow_mdr, df):
  t = pd.DataFrame(columns = [snow_mdr, 'code'])
  for i in range(len(df)):
    for j in df[i][snow_mdr]:
      t= t.append( pd.DataFrame([[CUI[k].label[0],k] for k in list(j)] , columns = [snow_mdr,'code']))
  t.groupby([snow_mdr, 'code']).size().to_csv('{}.csv'.format(snow_mdr))

In [0]:
export_df('hrt_snow',hrt_sm)
export_df('hrt_mdr',hrt_mdr)
export_df('lung_snow',lung_sm)
export_df('lung_mdr',lung_mdr)

GCP to collab - Can't use as it doesn't have ULMS CUI


In [0]:
pd.DataFrame(t).groupby([0]).size()#.to_csv('freq.csv')

0
Abdominal aortic aneurysm                           1
Acute cor pulmonale                                 1
Acute coronary syndrome                             3
Acute febrile mucocutaneous lymph node syndrome    15
Acute generalized exanthematous pustulosis          6
                                                   ..
Varicella                                           2
Vasculitis                                          5
Vasculitis of large artery                          1
Venous stasis syndrome                              1
Venous thrombosis                                   2
Length: 124, dtype: int64

In [0]:
root_path = 'gdrive/My Drive/Data archive/pkls/'

for p in os.listdir(root_path):
  df = pd.read_pickle(root_path + p, compression="gzip")
  df = df[['ULMS', 'ULMS_IDS']]

In [0]:
 root_path = 'gdrive/My Drive/Data archive/pkls/'
 df = pd.read_pickle(root_path + 'v8processedText0.pkl')

In [0]:
df.head()

Unnamed: 0,cord_uid,language,sentence_id,section,subsection,sentence,lemma,UMLS,translated,GGP,SO,TAXON,CHEBI,GO,CL,DNA,CELL_TYPE,CELL_LINE,RNA,PROTEIN,DISEASE,CHEMICAL,CANCER,ORGAN,TISSUE,ORGANISM,CELL,AMINO_ACID,GENE_OR_GENE_PRODUCT,SIMPLE_CHEMICAL,ANATOMICAL_SYSTEM,IMMATERIAL_ANATOMICAL_ENTITY,MULTI-TISSUE_STRUCTURE,DEVELOPING_ANATOMICAL_STRUCTURE,ORGANISM_SUBDIVISION,CELLULAR_COMPONENT,PATHOLOGICAL_FORMATION,ORGANISM_SUBSTANCE
0,in7he5o4,en,in7he5o4170,Performance characteristics of the STI-MS assa...,17,To further evaluate the clinical performance o...,"[evaluate, clinical, performance, assay, clini...","[Evaluation, performance - action, Biological ...",False,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[STI-MS],[],[],[],[],[],[],[],[],[]
1,in7he5o4,en,in7he5o4171,Performance characteristics of the STI-MS assa...,17,The primers and probes used in the monoplex re...,"[evaluate, clinical, performance, assay, clini...","[Oligonucleotide Primers, Probes, Real-Time Po...",False,[],"[primers, probes]",[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
2,in7he5o4,en,in7he5o4172,Performance characteristics of the STI-MS assa...,17,"The performance characteristics (sensitivity, ...","[evaluate, clinical, performance, assay, clini...","[performance - action, Characteristics, Statis...",False,[],[],[microorganism],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
3,in7he5o4,en,in7he5o4173,Performance characteristics of the STI-MS assa...,17,"Overall, there was very good sensitivity (rang...","[evaluate, clinical, performance, assay, clini...","[Statistical sensitivity, Diagnostic Specifici...",False,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
4,in7he5o4,en,in7he5o4174,Performance characteristics of the STI-MS assa...,17,Because the STI-MS has a lower LOD than real-t...,"[evaluate, clinical, performance, assay, clini...","[Limit of Detection, Real-Time Polymerase Chai...",False,[],[],[],[],[],[],[STI-MS],[],[],[],[],[],[],[],[],[],[],[],[],[STI-MS],[],[],[],[],[],[],[],[],[]


In [0]:
# Connecting bioportal's annotater
import urllib.request, urllib.error, urllib.parse
import json
import os
from pprint import pprint

REST_URL = "http://data.bioontology.org"
API_KEY = "7698d65d-9394-4323-b953-25fce0735891"

In [0]:
def get_json(url):
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + API_KEY)]
    return json.loads(opener.open(url).read())

def print_annotations(annotations, get_class=True):
    for result in annotations:
        class_details = result["annotatedClass"]
        if get_class:
            try:
                class_details = get_json(result["annotatedClass"]["links"]["self"])
            except urllib.error.HTTPError:
                print(f"Error retrieving {result['annotatedClass']['@id']}")
                continue
        print("Class details")
        print("\tid: " + class_details["@id"])
        print("\tprefLabel: " + class_details["prefLabel"])
        print("\tontology: " + class_details["links"]["ontology"])

        print("Annotation details")
        for annotation in result["annotations"]:
            print("\tfrom: " + str(annotation["from"]))
            print("\tto: " + str(annotation["to"]))
            print("\tmatch type: " + annotation["matchType"])

        if result["hierarchy"]:
            print("\n\tHierarchy annotations")
            for annotation in result["hierarchy"]:
                try:
                    class_details = get_json(annotation["annotatedClass"]["links"]["self"])
                except urllib.error.HTTPError:
                    print(f"Error retrieving {annotation['annotatedClass']['@id']}")
                    continue
                pref_label = class_details["prefLabel"] or "no label"
                print("\t\tClass details")
                print("\t\t\tid: " + class_details["@id"])
                print("\t\t\tprefLabel: " + class_details["prefLabel"])
                print("\t\t\tontology: " + class_details["links"]["ontology"])
                print("\t\t\tdistance from originally annotated class: " + str(annotation["distance"]))

        print("\n\n")

In [0]:
text_to_annotate = 'BACKGROUND: Isolation of cases and contact tracing is used to control outbreaks of infectious diseases, and has been used for coronavirus disease 2019 (COVID-19). FINDINGS: Simulated outbreaks starting with five initial cases, an R(0) of 1¬∑5, and 0% transmission before symptom onset could be controlled even with low contact tracing probability; however, the probability of controlling an outbreak decreased with the number of initial cases, when R(0) was 2¬∑5 or 3¬∑5 and with more transmission before symptom onset. INTERPRETATION: In most scenarios, highly effective contact tracing and case isolation is enough to control a new outbreak of COVID-19 within 3 months. FUNDING: Wellcome Trust, Global Challenges Research Fund, and Health Data Research UK.'
annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate) + "&ontologies=OCRE,OBCS")
print_annotations(annotations)

Class details
	id: http://purl.org/net/OCRe/OCRe.owl#OCRE50034
	prefLabel: Funding
	ontology: http://data.bioontology.org/ontologies/OCRE
Annotation details
	from: 673
	to: 679
	match type: PREF



Class details
	id: http://purl.obolibrary.org/obo/IAO_0000027
	prefLabel: data item
	ontology: http://data.bioontology.org/ontologies/OBCS
Annotation details
	from: 742
	to: 745
	match type: SYN





In [0]:
#default example code 
text_to_annotate = "Melanoma is a malignant tumor of melanocytes which are found predominantly in skin but also in the bowel and the eye."

# Annotate using the provided text
annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate))

# Print out annotation details
print_annotations(annotations)

# Annotate with hierarchy information
annotations = get_json(REST_URL + "/annotator?max_level=3&text=" + urllib.parse.quote(text_to_annotate))
print_annotations(annotations)

# Annotate with prefLabel, synonym, definition returned
annotations = get_json(REST_URL + "/annotator?include=prefLabel,synonym,definition&text=" + urllib.parse.quote(text_to_annotate))
print_annotations(annotations, False)

[Task Risk]Bigram/Trigram Extraction

https://docs.google.com/document/d/1a6yrR9nF4uqS0YT5_TFF7MJ5EwXRr_m_zB4MlN2cxzA/edit

This notebook takes the raw sentences from the v7 dataset and outputs relevant bigrams/trigrams related to risk factors of a particular subdomain in medicine. Each sentence is put through Allen Ai's python nlp package for biomedical/scientific/clinical data processing. The package processes each sentence, tokenizing only parts of the sentence that represents some useful named entity (NER: named entity recognition). Then those list of entities are processed by sci scapys "UmlsEntityLinker". This cross reference the named entity with any entity in the Unified Medical Language System (UMLS) which is developed by the National Institutes of Health. The purpose of this is to correctly distinguish any useful entity with something that has a medical connotation. If an entity receives a medical concept id, it then is then convert into an ICD code. ICD code is International Statistical Classification of Diseases and Related Health Problems, one of many standardized medical terminology bases. The useful thing about ICD code is that it presents any medical term in a hierarchical organization, such that anything related to heart disease will be group together and have a common code denomination. 
 The notebook then captures only the ICD codes that fit within the ICD code range pertaining to what the user requires bigrams/trigrams of. 
The advantages of such a complicated search for bigrams/trigrams is that the output is something that can be traced back and is also backed by actual medical terminology. Furthermore, simpler methods of ngram search wouldn’t find as many bigrams given that medical terminology don’t follow conventional naming and would be hard press to grab such granularity.


# [OLD]Below is simply icd pull for v6 data 

In [0]:
# Restart Runtime after running cell 
!pip install owlready2
#!pip install spacy_langdetect
!pip install nltk
#!pip install scispacy
#!python -m spacy download en_core_web_lg

In [0]:
import pandas as pd
import numpy as np
import multiprocessing as mp
from queue import Empty
import nltk, collections, re, string, ast, spacy, os, scispacy, pickle
from collections import Counter
from tqdm.notebook import tqdm
from spacy_langdetect import LanguageDetector

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

#nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser",'textcat','sentencizer','merge_noun_chunks','merge_entities','merge_subtokens'])
import en_core_web_lg
nlp = en_core_web_lg.load(disable=["tagger", "parser",'textcat','sentencizer','merge_noun_chunks','merge_entities','merge_subtokens'])

from scispacy.umls_linking import UmlsEntityLinker
linker = UmlsEntityLinker(resolve_abbreviations=True, max_entities_per_mention= 1)
nlp.add_pipe(linker)

from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *

default_world.set_backend(filename = "pym.sqlite3")

In [0]:
# Initializes ULM Meta metathesaurus 
import_umls("gdrive/My Drive/Data archive/umls-2019AB-metathesaurus.zip", terminologies = ["ICD10", "SNOMEDCT_US", "CUI"])
default_world.save()
PYM = get_ontology("http://PYM/").load()
SNOMEDCT_US = PYM["SNOMEDCT_US"]
CUI = PYM["CUI"]
ICD = PYM['ICD10']

Importing UMLS from Zip file gdrive/My Drive/Data archive/umls-2019AB-metathesaurus.zip with Python version 3.6...
  Parsing 2019AB/META/MRRANK.RRF as MRRANK
  Parsing 2019AB/META/MRCONSO.RRF as MRCONSO
  Parsing 2019AB/META/MRDEF.RRF as MRDEF
  Parsing 2019AB/META/MRREL.RRF as MRREL
  Parsing 2019AB/META/MRSAT.RRF as MRSAT
Breaking ORIG cycles...
    SNOMEDCT_US : 0 cycles found: 
    ICD10 : 0 cycles found: 
    SRC : 0 cycles found: 
Finalizing only properties and restrictions...
Finalizing CUI - ORIG mapping...
FTS Indexing...


In [0]:
# path for my drive
root_path = 'gdrive/My Drive/Data archive/'

c = Counter() # starts the counter for ngrams 
# setups up words to remove as well as punctuations
customized_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'fig', 'fig.', 'al.', 'q', 'license',
    'di', 'la', 'il', 'del', 'le', 'della', 'dei', 'delle', 'una', 'da',  'dell',  'non', 'si', 'holder',
    'p', 'h'
]
stop_words = list(stopwords.words('english')) + customized_stop_words
punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"

In [0]:
# Function takes in a list of strings of icd ranges and then compares the in put if it fits with that region
def true_list(range, value):
  temp = range[0].split('-')
  range_cond = (float(temp[0][1:]) <= float(value[1:]) <= float(temp[1][1:]))
  if range_cond:
    return True 
  elif len(range) > 1:
    return true_list(range[1:],value)
  return range_cond

# Function that takes in a list of tokens and updates the counter if it fits within an icd code 
def update_grams(umls_ent, letter,icd_range):
  if (umls_ent._.umls_ents != []):
    cui_temp = CUI[umls_ent._.umls_ents[0][0]]
    if (cui_temp != None):
      icd = cui_temp >> ICD
      contains = False
      for x in icd:
        if letter in  x.name: 
          if len([i for i in x.name.split('-') if true_list(icd_range,i)]):
            contains = True
      if contains:
        gramtemp = cui_temp.label[0].lower().split(' ')
        if len(gramtemp) <= 3:
          finder = nltk.collocations.TrigramCollocationFinder.from_words(gramtemp)
          gram_measures = nltk.collocations.TrigramAssocMeasures()
          c.update(dict(finder.score_ngrams(gram_measures.raw_freq)).keys())
        if len(gramtemp) == 2:
          finder = nltk.collocations.BigramCollocationFinder.from_words(gramtemp)
          gram_measures = nltk.collocations.BigramAssocMeasures()
          c.update(dict(finder.score_ngrams(gram_measures.raw_freq)).keys())

In [0]:
# Goes through all the pkl files then reads each sentence and processes them
# For every entity that is picked out it is processed to find if it meets the ICD range 
lung_icd_range = ['J40-J47','J60-J70','J30-J34']
heart_icd_Range = ['I5-I52']
letter, icd_range_select = 'J', lung_icd_range

for p in os.listdir(root_path):
  df = pd.read_pickle(root_path + p, compression="gzip")
  print('{0} - Uploaded'.format(p))
  df = df['sentence'].str.replace('[!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~]','')
  for row in tqdm(nlp.pipe(list(df.dropna()) , n_threads=16), total = len(df)):
      [update_grams(x,letter,icd_range_select) for x in row.ents]
  print(c.most_common())

v6_text_7.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=377526), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 2), (('chronic', 'sinusitis'), 1)]
v6_text_8.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=377526), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 3), (('chronic', 'sinusitis'), 2), (('pulmonary', 'emphysema'), 1), (('intrinsic', 'asthma'), 1)]
v6_text_9.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=377526), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 6), (('chronic', 'sinusitis'), 2), (('intrinsic', 'asthma'), 2), (('pulmonary', 'emphysema'), 1)]
v6_text_10.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 7), (('chronic', 'sinusitis'), 4), (('intrinsic', 'asthma'), 2), (('pulmonary', 'emphysema'), 1), (('unilateral', 'emphysema'), 1)]
v6_text_11.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 11), (('chronic', 'sinusitis'), 5), (('intrinsic', 'asthma'), 2), (('unilateral', 'emphysema'), 2), (('pulmonary', 'emphysema'), 1), (('chronic', 'bronchitis'), 1)]
v6_text_12.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 16), (('chronic', 'sinusitis'), 6), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('unilateral', 'emphysema'), 2), (('pulmonary', 'siderosis'), 2), (('chronic', 'bronchitis'), 1)]
v6_text_13.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 17), (('chronic', 'sinusitis'), 6), (('unilateral', 'emphysema'), 4), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('pulmonary', 'siderosis'), 2), (('chronic', 'bronchitis'), 1)]
v6_text_14.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 21), (('chronic', 'sinusitis'), 11), (('unilateral', 'emphysema'), 4), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('chronic', 'bronchitis'), 2), (('pulmonary', 'siderosis'), 2)]
v6_text_15.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 28), (('chronic', 'sinusitis'), 14), (('unilateral', 'emphysema'), 4), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('chronic', 'bronchitis'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1)]
v6_text_16.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 29), (('chronic', 'sinusitis'), 16), (('unilateral', 'emphysema'), 4), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('chronic', 'bronchitis'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1)]
v6_text_17.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371321), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 32), (('chronic', 'sinusitis'), 16), (('unilateral', 'emphysema'), 4), (('pulmonary', 'emphysema'), 2), (('intrinsic', 'asthma'), 2), (('chronic', 'bronchitis'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1)]
v6_text_18.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371320), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 35), (('chronic', 'sinusitis'), 16), (('unilateral', 'emphysema'), 4), (('chronic', 'bronchitis'), 4), (('pulmonary', 'emphysema'), 3), (('intrinsic', 'asthma'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1)]
v6_text_19.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=371320), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 36), (('chronic', 'sinusitis'), 18), (('unilateral', 'emphysema'), 4), (('chronic', 'bronchitis'), 4), (('pulmonary', 'emphysema'), 3), (('intrinsic', 'asthma'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1)]
v6_text_0.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=377527), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 37), (('chronic', 'sinusitis'), 21), (('unilateral', 'emphysema'), 4), (('chronic', 'bronchitis'), 4), (('pulmonary', 'emphysema'), 3), (('intrinsic', 'asthma'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1), (('allergic', 'rhinitis'), 1)]
v6_text_1.pkl - Uploaded


HBox(children=(IntProgress(value=0, max=377526), HTML(value='')))


[(('coal', "workers'", 'pneumoconiosis'), 38), (('chronic', 'sinusitis'), 21), (('unilateral', 'emphysema'), 4), (('chronic', 'bronchitis'), 4), (('pulmonary', 'emphysema'), 3), (('intrinsic', 'asthma'), 2), (('pulmonary', 'siderosis'), 2), (('seasonal', 'allergy'), 1), (('allergic', 'rhinitis'), 1)]
