In [None]:
##########################################################################################
#                                                                                        #
#  888888b.  8888888 .d88888b.   .d8888b.     .d8888b.   .d8888b.      d8888      d8888  #
#  888  "88b   888  d88P" "Y88b d88P  Y88b   d88P  Y88b d88P  Y88b    d8P888     d8P888  #
#  888  .88P   888  888     888 Y88b.        888        888          d8P 888    d8P 888  #
#  8888888K.   888  888     888  "Y888b.     888d888b.  888d888b.   d8P  888   d8P  888  #
#  888  "Y88b  888  888     888     "Y88b.   888P "Y88b 888P "Y88b d88   888  d88   888  #
#  888    888  888  888     888       "888   888    888 888    888 8888888888 8888888888 #
#  888   d88P  888  Y88b. .d88P Y88b  d88P   Y88b  d88P Y88b  d88P       888        888  #
#  8888888P" 8888888 "Y88888P"   "Y8888P"     "Y8888P"   "Y8888P"        888        888  #
#                                                                                        # 
##########################################################################################
#
# Wrangle CDC "Rare Diseases" data
#
##########################################################################################

In [18]:
##################################################################################################################
#
# YouDo:
#    1) Make a copy of this notebook with your name as a prefix:  
#       YourName_BIOS6644_XML_RareDiseases.ipynb
#    2) Do all work in this new notebook.
#    3) Submit completed work via GitHub
#
##################################################################################################################


# Setup Local Environment
- Ensure you have pandas and xml2dict installed (via pip, conda, etc.)
- Make a folder in the same directory as this notebook to hold the data.  Mine is ./Data

In [45]:
# set up local file tree
#!mkdir Data

## Get Data data.gov

Data available at https://data.cdc.gov/api/views/45b4-9j7u/rows.xml



In [46]:
# Download with:
# Info: 
#     https://data.cdc.gov/NNDSS/NNDSS-Table-I-infrequently-reported-notifiable-dis/45b4-9j7u
#
# Also available here: https://data.cdc.gov/api/odata/v4/45b4-9j7u

# The following lines are prefixed with a "!" which tells jupyter that these are shell 
# commands and not meant for python.
# Wget is a command line application that fetches webpages and files
# mv just moves the downloaded file to the Data folder
# 
# You can do this part manually if you prefer

! wget https://data.cdc.gov/api/views/45b4-9j7u/rows.xml
! mv rows.xml ./Data/


Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/james/.wget-hsts'. HSTS will be disabled.
--2024-03-24 12:55:27--  https://data.cdc.gov/api/views/45b4-9j7u/rows.xml
Resolving data.cdc.gov (data.cdc.gov)... 52.206.140.205, 52.206.68.26, 52.206.140.199
Connecting to data.cdc.gov (data.cdc.gov)|52.206.140.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/xml]
Saving to: ‘rows.xml’

rows.xml                [     <=>            ]   2.26M  2.38MB/s    in 0.9s    

2024-03-24 12:55:29 (2.38 MB/s) - ‘rows.xml’ saved [2373297]



XML is basically a generaliztion of HTML, which describes most web pages

Thus, web browsers can display XML in a relatively easy-to-read way.  

Open rows.xml with your browser: [open the file](./Data/rows.xml)

XML has all kinds of "features" which make it ugly and complicated and which have never been useful to me.   You can use a library called [ElementTree] if you want to get into those nuances, but it's not very fun.

I use a library called xmltodict which converts the xml data into native-to-python dictionaries.  


## Review:  Python Dictionaries

In [76]:

# empty dictionary

d = dict()
print(d)

{}


In [77]:
{'firstname':'James', 'surname':'King'}

{'firstname': 'James', 'surname': 'King'}

In [78]:
# Dictionaries make 1-to-1 maps from "keys" to "values"

# keys can be anything that's "hashable" (strings, numbers, dicts, ...)
# Values can be any Python type (including custom-made)

name={'firstname':'James', 'surname':'King'}

d['name'] = name
d['age'] = 'Younger than Greg'
d['Favorite number'] = 3
print(d)


{'name': {'firstname': 'James', 'surname': 'King'}, 'age': 'Younger than Greg', 'Favorite number': 3}


In [83]:
# handy dict methods

print(f'The keys are: {d.keys()}')
print(f'The values are: {d.values()}')

The keys are: dict_keys(['name', 'age', 'Favorite number'])
The values are: dict_values([{'firstname': 'James', 'surname': 'King'}, 'Younger than Greg', 3])


In [80]:
# XML stores data in a similar, but much more difficult-to-read way:
X="""
<Favorite_number>3</Favorite_number>
<age>Younger than Greg</age>
<name>
  <firstname>James</firstname>
  <surname>King</surname>
</name>
"""

In [84]:
# xmltodict reads in XML files and converts them to dicts, which are easier in Python

In [47]:
# Read in the data file with xmltodict

In [87]:
fname = './Data/rows.xml'
with open(fname, 'rb') as thefile:
    dd = xmltodict.parse(thefile)

In [13]:
# Notice there are two levels with the tag "row"  This sort of thing is common in 
# XML and really annoying.  
# This sort of display makes it stand out, though.
# We'll need to remember that for later.

In [91]:

# This data seems to be organized by an attribute called "row", each of 
#  which has several children (probably the columns)

print(dd.keys())

dict_keys(['response'])


In [93]:
print(dd['response'].keys())



dict_keys(['row'])


In [94]:
print(dd['response']['row'].keys())

dict_keys(['row'])


In [95]:
print(dd['response']['row']['row'].keys())

AttributeError: 'list' object has no attribute 'keys'

In [97]:
print(dd['response']['row']['row'])

[{'@_id': 'row-skhd-7gcj-bknc', '@_uuid': '00000000-0000-0000-8068-71B30911A10A', '@_position': '0', '@_address': 'https://data.cdc.gov/resource/_45b4-9j7u/row-skhd-7gcj-bknc', 'disease': 'Anthrax', 'mmwr_year': '2017', 'mmwr_week': '1', 'current_week_flag': '-', 'cum_2017_flag': '-', '_5_year_weekly_average_flag': '-', 'total_cases_reported_2016_flag': '-', 'total_cases_reported_2015_flag': '-', 'total_cases_reported_2014_flag': '-', 'total_cases_reported_2013_flag': '-', 'total_cases_reported_2012_flag': '-'}, {'@_id': 'row-vyec_dmte-gwmz', '@_uuid': '00000000-0000-0000-F7D9-05250D543B3D', '@_position': '0', '@_address': 'https://data.cdc.gov/resource/_45b4-9j7u/row-vyec_dmte-gwmz', 'disease': 'Arboviral diseases¶,** Chikungunya virus disease', 'mmwr_year': '2017', 'mmwr_week': '1', 'current_week_flag': '-', 'cum_2017_flag': '-', '_5_year_weekly_average': '6', 'total_cases_reported_2016': '167', 'total_cases_reported_2015': '896', 'total_cases_reported_2014_flag': 'NN', 'total_cases_

In [98]:
# Ok--here's where the data starts

In [99]:
datlist = dd['response']['row']['row']

In [105]:
len(datlist), type(datlist[0]), datlist[0].keys()

(3329,
 dict,
 dict_keys(['@_id', '@_uuid', '@_position', '@_address', 'disease', 'mmwr_year', 'mmwr_week', 'current_week_flag', 'cum_2017_flag', '_5_year_weekly_average_flag', 'total_cases_reported_2016_flag', 'total_cases_reported_2015_flag', 'total_cases_reported_2014_flag', 'total_cases_reported_2013_flag', 'total_cases_reported_2012_flag']))

In [108]:
# One of the keys is "disease" which might contain the name of the disease
datlist[0]['disease']

'Anthrax'

In [110]:


##################################################################################################################
#
# YouDo:
#
# 1) Loop through the list of nodes and extract the text attribute of each node into a list called diz
#    
#   Stretch goal:  do this with a list comprehension
#
# 2) Make a second list called udiz containing only the unique members of this data set 

#######################################  BEGIN STUDENT CODE  #####################################################

# 1)
diz = [xx['disease'] for xx in datlist]
#print(diz)

#2)

#  Enter the Set Comprehension.  dun. Dun. DUUUUUUN.
udiz = {xx['disease'] for xx in datlist}
udiz
#######################################   END STUDENT CODE   #####################################################

{'Anthrax',
 'Arboviral diseases¶,** Chikungunya virus disease',
 'Arboviral diseases¶,** Chikungunya virus††',
 'Arboviral diseases¶,** Eastern equine encephalitis virus',
 'Arboviral diseases¶,** Eastern equine encephalitis virus disease',
 'Arboviral diseases¶,** Jamestown Canyon virus disease',
 'Arboviral diseases¶,** Jamestown Canyon virus§§',
 'Arboviral diseases¶,** La Crosse virus disease',
 'Arboviral diseases¶,** La Crosse virus§§',
 'Arboviral diseases¶,** Powassan virus',
 'Arboviral diseases¶,** Powassan virus disease',
 'Arboviral diseases¶,** St. Louis encephalitis virus',
 'Arboviral diseases¶,** St. Louis encephalitis virus disease',
 'Arboviral diseases¶,** Western equine encephalitis virus',
 'Arboviral diseases¶,** Western equine encephalitis virus disease',
 'Botulism, foodborne',
 'Botulism, infant',
 'Botulism, other (wound and unspecified)',
 'Botulism, total',
 'Brucellosis',
 'Chancroid',
 'Cholera',
 'Cyclosporiasis**',
 'Diphtheria',
 'Haemophilus influenza

In [113]:
# A list of shallow dicts can be imported directly into pandas

df = pd.DataFrame(datlist)
df

Unnamed: 0,@_id,@_uuid,@_position,@_address,disease,mmwr_year,mmwr_week,current_week_flag,cum_2017_flag,_5_year_weekly_average_flag,...,total_cases_reported_2012_flag,_5_year_weekly_average,total_cases_reported_2016,total_cases_reported_2015,total_cases_reported_2014,total_cases_reported_2013,total_cases_reported_2012,current_week,cum_2017,states_reporting_cases_during_current_week_no
0,row-skhd-7gcj-bknc,00000000-0000-0000-8068-71B30911A10A,0,https://data.cdc.gov/resource/_45b4-9j7u/row-s...,Anthrax,2017,1,-,-,-,...,-,,,,,,,,,
1,row-vyec_dmte-gwmz,00000000-0000-0000-F7D9-05250D543B3D,0,https://data.cdc.gov/resource/_45b4-9j7u/row-v...,"Arboviral diseases¶,** Chikungunya virus disease",2017,1,-,-,,...,NN,6,167,896,,,,,,
2,row-hdxc_43ap~n6yz,00000000-0000-0000-0F1A-DE0D93986652,0,https://data.cdc.gov/resource/_45b4-9j7u/row-h...,"Arboviral diseases¶,** Eastern equine encephal...",2017,1,-,-,,...,,0,6,6,8,8,15,,,
3,row-t9wt_thhz_94py,00000000-0000-0000-E750-7726E41C0121,0,https://data.cdc.gov/resource/_45b4-9j7u/row-t...,"Arboviral diseases¶,** Jamestown Canyon virus ...",2017,1,-,-,-,...,,,4,11,11,22,2,,,
4,row-bkgt~e7aa.8vhi,00000000-0000-0000-84F0-A42719F7BCF1,0,https://data.cdc.gov/resource/_45b4-9j7u/row-b...,"Arboviral diseases¶,** La Crosse virus disease",2017,1,-,-,-,...,,,34,55,80,85,78,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3324,row-wcgm~868t.94ae,00000000-0000-0000-A93D-857517E6E098,0,https://data.cdc.gov/resource/_45b4-9j7u/row-w...,Trichinellosis**,2017,52,-,,,...,,1,26,14,14,22,18,,14,
3325,row-k8nd-iu8c_pd4z,00000000-0000-0000-0FAD-23CDA12DFDA2,0,https://data.cdc.gov/resource/_45b4-9j7u/row-k...,"Hantavirus**, Hantavirus Pulmonary Syndrome (HPS)",2017,52,-,,,...,,0,31,21,32,21,30,,8,
3326,row-er6u~tvsv-s3mi,00000000-0000-0000-7224-762A5A5F1A4F,0,https://data.cdc.gov/resource/_45b4-9j7u/row-e...,"Poliomyelitis, paralytic",2017,52,-,-,-,...,-,,,,,1,,,,
3327,row-3k5p.gsvb~vap3,00000000-0000-0000-F4A7-C02DDA0683C6,0,https://data.cdc.gov/resource/_45b4-9j7u/row-3...,Haemophilus influenzae invasive disease (age <...,2017,51,-,,,...,,1,30,29,40,31,30,,21,


In [139]:
##################################################################################################################
#
# YouDo:  try reading the reports dictionary into a data frame, try importing
# reports into a data frame and sorting it.

#######################################  BEGIN STUDENT CODE  #####################################################

dcounts = df[['disease', 'total_cases_reported_2016']].copy()
dcounts.dropna(inplace=True)
dcounts.drop_duplicates(inplace=True)
dcounts.sort_values('total_cases_reported_2016', inplace=True)
#dcounts.iloc[0:5]
dcounts
#######################################   END STUDENT CODE   #####################################################

Unnamed: 0,disease,total_cases_reported_2016
52,Vancomycin-resistant Staphylococcus aureus**,1
13,Chancroid,10
372,Meningococcal disease (Neisseria meningitidis)...,100
370,Vancomycin-intermediate Staphylococcus aureus**,100
403,Vancomycin-intermediate Staphylococcus aureus**,101
...,...,...
123,Meningococcal disease (Neisseria meningitidis)...,97
85,Vancomycin-intermediate Staphylococcus aureus**,97
51,Vancomycin-intermediate Staphylococcus aureus**,98
132,Meningococcal disease (Neisseria meningitidis)...,98


In [124]:
dcounts.index

Index(['Anthrax', 'Hansen's disease**', 'Leptospirosis**', 'Listeriosis',
       'Plague', 'Poliomyelitis, paralytic',
       'Polio virus Infection, nonparalytic**', 'Psittacosis**',
       'Q fever, total**', 'Q fever, acute**', 'Q fever, chronic**',
       'Rabies, human', 'SARS-CoV', 'Smallpox',
       'Streptococcal toxic-shock syndrome**', 'Trichinellosis**',
       'Vancomycin-intermediate Staphylococcus aureus**',
       'Vancomycin-resistant Staphylococcus aureus**', 'Yellow fever',
       'Hemolytic uremic syndrome, postdiarrheal**', 'Tularemia',
       'Botulism, total', 'Botulism, foodborne', 'Botulism, infant',
       'Botulism, other (wound and unspecified)', 'Brucellosis', 'Chancroid',
       'Cholera', 'Cyclosporiasis**', 'Diphtheria',
       'Haemophilus influenzae invasive disease (age <5 yrs)††, unknown serotype',
       'Arboviral diseases¶,** Powassan virus disease',
       'Typhoid fever (caused by Salmonella typhi)',
       'Arboviral diseases¶,** La Crosse virus

In [121]:
df.iloc[:5,:]

Unnamed: 0,@_id,@_uuid,@_position,@_address,disease,mmwr_year,mmwr_week,current_week_flag,cum_2017_flag,_5_year_weekly_average_flag,...,total_cases_reported_2012_flag,_5_year_weekly_average,total_cases_reported_2016,total_cases_reported_2015,total_cases_reported_2014,total_cases_reported_2013,total_cases_reported_2012,current_week,cum_2017,states_reporting_cases_during_current_week_no
0,row-skhd-7gcj-bknc,00000000-0000-0000-8068-71B30911A10A,0,https://data.cdc.gov/resource/_45b4-9j7u/row-s...,Anthrax,2017,1,-,-,-,...,-,,,,,,,,,
1,row-vyec_dmte-gwmz,00000000-0000-0000-F7D9-05250D543B3D,0,https://data.cdc.gov/resource/_45b4-9j7u/row-v...,"Arboviral diseases¶,** Chikungunya virus disease",2017,1,-,-,,...,NN,6.0,167.0,896.0,,,,,,
2,row-hdxc_43ap~n6yz,00000000-0000-0000-0F1A-DE0D93986652,0,https://data.cdc.gov/resource/_45b4-9j7u/row-h...,"Arboviral diseases¶,** Eastern equine encephal...",2017,1,-,-,,...,,0.0,6.0,6.0,8.0,8.0,15.0,,,
3,row-t9wt_thhz_94py,00000000-0000-0000-E750-7726E41C0121,0,https://data.cdc.gov/resource/_45b4-9j7u/row-t...,"Arboviral diseases¶,** Jamestown Canyon virus ...",2017,1,-,-,-,...,,,4.0,11.0,11.0,22.0,2.0,,,
4,row-bkgt~e7aa.8vhi,00000000-0000-0000-84F0-A42719F7BCF1,0,https://data.cdc.gov/resource/_45b4-9j7u/row-b...,"Arboviral diseases¶,** La Crosse virus disease",2017,1,-,-,-,...,,,34.0,55.0,80.0,85.0,78.0,,,


In [140]:
##################################################################################################################
#
# YouDo: 
#  Make a visualization showing week vs cum_2017 for all of the top 5 diseases
#  Ensure it has appropriate labels, etc. for pulblication
#######################################  BEGIN STUDENT CODE  #####################################################

#######################################   END STUDENT CODE   #####################################################