<h1>Exploring infections through data: Mapping Measles, Mumps, and Rubella in the USA</h1>

In this notebook I'm going to explore creating an interactive map of the historical incidence rates (per 100,000 people) of measles, mumps, and rubella in the USA. The data comes from Project Tycho, a collection of National Notifiable Disease Surveillance System reports, and is available for free from <a href="https://www.kaggle.com/pitt/contagious-diseases/home">Kaggle</a>. There are multiple libraries in Python that can be used to create interactive visualisations of data, but in this notebook I will focus on the Bokeh library.

In [1]:
#Dependencies
import pandas as pd
import numpy as np

  return f(*args, **kwds)
  return f(*args, **kwds)


In [94]:
#Import Bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.io import show
from bokeh.models import LogColorMapper, ColumnDataSource, HoverTool, LinearColorMapper, ColorBar
from bokeh.palettes import Viridis6 as palette
from bokeh.plotting import figure
from bokeh.models.widgets import Select, Slider
from bokeh.layouts import column, row, WidgetBox
from bokeh.palettes import Oranges
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application

In [3]:
# Route Bokeh output to the notebook so later show() calls render inline.
output_notebook()

In [4]:
#Import US state data from Bokeh sample data
from bokeh.sampledata.us_states import data as States

In [5]:
# Load the measles, mumps and rubella tables, one CSV file per disease.
measles, mumps, rubella = (
    pd.read_csv(csv_path)
    for csv_path in ("measles.csv", "mumps.csv", "rubella.csv")
)

In [6]:
# Preview the first measles rows to check the column layout.
measles.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,192801,AL,ALABAMA,MEASLES,97,3.67
1,192801,AR,ARKANSAS,MEASLES,76,4.11
2,192801,AZ,ARIZONA,MEASLES,8,1.9
3,192801,CA,CALIFORNIA,MEASLES,74,1.38
4,192801,CO,COLORADO,MEASLES,85,8.38


In [7]:
# Preview the first mumps rows; the columns should match the measles table.
mumps.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,196801,AK,ALASKA,MUMPS,7,2.46
1,196801,AL,ALABAMA,MUMPS,39,1.13
2,196801,AZ,ARIZONA,MUMPS,19,1.13
3,196801,CA,CALIFORNIA,MUMPS,247,1.27
4,196801,DC,DISTRICT OF COLUMBIA,MUMPS,1,0.13


In [8]:
# Preview the first rubella rows; the columns should match the other two tables.
rubella.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,196601,AL,ALABAMA,RUBELLA,7,0.2
1,196601,AZ,ARIZONA,RUBELLA,29,1.8
2,196601,CA,CALIFORNIA,RUBELLA,7,0.04
3,196601,CT,CONNECTICUT,RUBELLA,11,0.38
4,196601,HI,HAWAII,RUBELLA,1,0.14


In [9]:
# Combine measles and mumps with an outer join on the columns they share.
mmr = measles.merge(mumps, how="outer")

In [10]:
# Add the rubella rows to the combined table with the same outer join.
mmr = mmr.merge(rubella, how="outer")

In [11]:
# Summary statistics of the numeric columns of the combined table.
mmr.describe()

Unnamed: 0,week,cases,incidence_per_capita
count,268126.0,268126.0,268126.0
mean,197008.487208,74.360838,2.541002
std,1893.399835,295.665568,8.426552
min,192801.0,0.0,0.0
25%,195716.0,0.0,0.0
50%,197332.0,3.0,0.11
75%,198406.0,27.0,1.07
max,200252.0,10402.0,683.06


In [12]:
# Check the dtypes and the non-null count of every column.
mmr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268126 entries, 0 to 268125
Data columns (total 6 columns):
week                    268126 non-null int64
state                   268126 non-null object
state_name              268126 non-null object
disease                 268126 non-null object
cases                   268126 non-null int64
incidence_per_capita    268126 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 14.3+ MB


In [13]:
# `week` is an int64 in YYYYWW form (e.g. 192801), so integer division by 100
# yields the year without a per-row Python string conversion.
mmr["year"] = mmr["week"] // 100

In [14]:
# `week` is an int64 in YYYYWW form (e.g. 192801), so the remainder modulo 100
# is the week number within the year.
mmr["week_num"] = mmr["week"] % 100

In [15]:
# Re-check the summary statistics now that year and week_num have been added.
mmr.describe()

Unnamed: 0,week,cases,incidence_per_capita,year,week_num
count,268126.0,268126.0,268126.0,268126.0,268126.0
mean,197008.487208,74.360838,2.541002,1969.805528,27.934408
std,1893.399835,295.665568,8.426552,18.914972,14.502689
min,192801.0,0.0,0.0,1928.0,1.0
25%,195716.0,0.0,0.0,1957.0,16.0
50%,197332.0,3.0,0.11,1973.0,28.0
75%,198406.0,27.0,1.07,1984.0,40.0
max,200252.0,10402.0,683.06,2002.0,52.0


In [16]:
# Preview the combined table including the derived year and week_num columns.
mmr.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita,year,week_num
0,192801,AL,ALABAMA,MEASLES,97,3.67,1928,1
1,192801,AR,ARKANSAS,MEASLES,76,4.11,1928,1
2,192801,AZ,ARIZONA,MEASLES,8,1.9,1928,1
3,192801,CA,CALIFORNIA,MEASLES,74,1.38,1928,1
4,192801,CO,COLORADO,MEASLES,85,8.38,1928,1


In [17]:
# Group the weekly rows by year, disease and state so they can be averaged per group.
mmr_grouped = mmr.groupby(by=["year", "disease", "state_name"])

In [18]:
# Mean of the weekly case counts within each (year, disease, state) group.
mean_cases_year = mmr_grouped["cases"].mean()

In [19]:
# Mean of the weekly incidence values within each (year, disease, state) group.
mean_incidence_year = mmr_grouped["incidence_per_capita"].mean()

In [20]:
# Turn both grouped means back into flat tables and join them on the shared
# year / disease / state_name columns.
incidence_table = mean_incidence_year.reset_index()
cases_table = mean_cases_year.reset_index()
mmr_summary = incidence_table.merge(cases_table)

<h2>Get state location data</h2>

In [21]:
# Index the Bokeh state records by upper-cased name so they can be looked up with
# the upper-case `state_name` values used in the disease tables. Hawaii and Alaska
# are left out of both the lookup and the summary table.
states = {
    state["name"].upper(): state
    for state in States.values()
    if state["name"] not in ["Hawaii", "Alaska"]
}
# .copy() makes the filtered table an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
mmr_summary = mmr_summary[~mmr_summary["state_name"].isin(["HAWAII", "ALASKA"])].copy()

In [22]:
# Attach each state's outline coordinates. assign() returns a new frame rather
# than writing into the filtered one, which avoids SettingWithCopyWarning and
# makes the cell safe to re-run.
mmr_summary = mmr_summary.assign(
    lons=mmr_summary["state_name"].map(lambda name: states[name]["lons"]),
    lats=mmr_summary["state_name"].map(lambda name: states[name]["lats"]),
)

In [23]:
# Confirm that every row now carries its state's lons and lats lists.
mmr_summary.head()

Unnamed: 0,year,disease,state_name,incidence_per_capita,cases,lons,lats
0,1928,MEASLES,ALABAMA,6.442115,170.057692,"[-87.58552, -87.59581, -87.62123, -87.6272, -8...","[30.99763, 30.94243, 30.89199, 30.85923, 30.83..."
1,1928,MEASLES,ARIZONA,4.779762,20.166667,"[-114.63332, -114.63349, -114.63423, -114.6089...","[34.87057, 35.00186, 35.00332, 35.07971, 35.11..."
2,1928,MEASLES,ARKANSAS,9.832041,181.612245,"[-94.26958, -94.30425, -94.34879, -94.40149, -...","[33.56679, 33.56656, 33.55479, 33.55887, 33.57..."
3,1928,MEASLES,CALIFORNIA,1.331154,71.115385,"[-123.00111, -122.99754, -122.99509, -122.9874...","[37.77205, 37.77078, 37.76913, 37.76387, 37.75..."
4,1928,MEASLES,COLORADO,4.312083,43.729167,"[-109.04984, -109.06017, -109.06015, -109.0565...","[38.215, 38.40118, 38.60929, 38.81393, 38.9578..."


In [24]:
# Summary statistics of the yearly means; the incidence maximum informs the colour scale.
mmr_summary.describe()

Unnamed: 0,year,incidence_per_capita,cases
count,6070.0,6070.0,6070.0
mean,1971.533937,2.272933,66.266948
std,19.332392,4.84461,183.008354
min,1928.0,0.0,0.0
25%,1959.0,0.013333,0.411765
50%,1975.0,0.157853,4.483811
75%,1986.0,1.981827,40.966346
max,2002.0,70.828333,2545.038462


<h2>Make the dataset</h2>

In [25]:
def create_data(disease, year):
    """Build the Bokeh data source for one disease in one year.

    Parameters
    ----------
    disease : str
        Disease name in any letter case (e.g. "measles"); it is upper-cased
        before being compared with the ``disease`` column of ``mmr_summary``.
    year : int
        Year to select from the ``year`` column of ``mmr_summary``.

    Returns
    -------
    ColumnDataSource
        Source wrapping the matching rows of ``mmr_summary``.

    Raises
    ------
    AssertionError
        If no row matches the requested disease and year.
    """
    disease_df = mmr_summary[(mmr_summary["disease"] == disease.upper()) & (mmr_summary["year"] == year)]
    # Check the filtered selection, not the whole summary table (which is never empty).
    assert len(disease_df) > 0, "No data for this disease and year combination"
    return ColumnDataSource(disease_df)

<h2>Build a map</h2>

In [26]:
def build_map(src):
    """Draw a map of the states coloured by incidence.

    Parameters
    ----------
    src : ColumnDataSource
        Must provide the columns referenced below: ``lons`` and ``lats``
        (state outlines), ``incidence_per_capita`` (fill colour and tooltip)
        and ``state_name`` (tooltip).

    Returns
    -------
    The configured Bokeh figure.
    """
    # The notebook's import cell does not bring Blues8 into scope, so import it here.
    from bokeh.palettes import Blues8

    TOOLS = "pan,wheel_zoom,reset,hover,save"

    # Fixed colour scale: 71 sits just above the largest incidence_per_capita
    # value reported by mmr_summary.describe() (70.83).
    color_mapper = LinearColorMapper(palette=Blues8, low=0, high=71)

    p = figure(
        title="US States", tools=TOOLS,
        x_axis_location=None, y_axis_location=None,
        tooltips=[
            ("Name", "@state_name"), ("Average incidences per capita", "@incidence_per_capita"), ("(Long, Lat)", "($x, $y)")
        ], plot_width=1000, plot_height=600)
    p.grid.grid_line_color = None
    p.hover.point_policy = "follow_mouse"

    # Longitudes are the horizontal (x) coordinates and latitudes the vertical (y)
    # ones; passing them the other way round draws the map transposed.
    p.patches('lons', 'lats', source=src,
              fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
              fill_alpha=0.7, line_color="white", line_width=0.5)

    return p

<h2>Make widgets for controlling the map</h2>

In [27]:
# Drop-down for the disease and a slider for the year; the slider covers
# 1928-2002, the range of years present in the combined table.
choose_disease = Select(title="Disease:", value="Measles", options=["Measles", "Mumps", "Rubella"])
choose_year = Slider(start=1928, end=2002, value=1928, step = 1, title = "Year")

In [28]:
# Render the two controls on their own to check how they look.
show(WidgetBox(choose_disease, choose_year))

In [29]:
def update_map(attr, old, new):
    """Bokeh ``on_change`` callback that reloads the map data from the widgets.

    The three arguments are supplied by Bokeh and are not used; the current
    selection is read from the module-level ``choose_disease`` and
    ``choose_year`` widgets, and the result is written into the module-level
    data source ``src``.
    """
    chosen_disease = choose_disease.value
    chosen_year = choose_year.value
    new_data = create_data(chosen_disease, chosen_year)
    # Copy the freshly built columns into the source the map is drawn from.
    src.data.update(new_data.data)

In [30]:
# Call update_map whenever either widget's value changes.
# NOTE(review): Python callbacks presumably only fire when the layout is served
# as a Bokeh application, which the notebook sets up further down - confirm.
choose_disease.on_change("value", update_map)
choose_year.on_change("value", update_map)

In [31]:
controls = WidgetBox(choose_disease, choose_year)
# update_map writes into the module-level name `src`, so the source the map is
# drawn from must be bound to that name for the callback to reach it.
src = create_data("measles", 1928)
start_data = src  # original name kept as an alias
p = build_map(src)
layout = row(controls, p)

<h2>Package it all together!</h2>

In [113]:
def mapping_mmr(app):
    """Build the interactive MMR incidence map and attach it to ``app``.

    The layout holds a disease drop-down, a year slider and a map of the states
    coloured by ``incidence_per_capita``. Changing either widget reloads the
    map data, rescales the colours to the new selection and updates the title.
    A disease/year combination without any rows shows an empty map flagged
    "(no data)" instead of raising inside the callback.

    Reads the module-level ``mmr_summary`` table.

    Parameters
    ----------
    app
        Object the finished layout is attached to through ``app.add_root``.
    """
    default_disease = "Measles"
    default_year = 1928
    colors = ["#A7D49B", "#92AC86", "#696047", "#55251D", "#5A1807"]

    def create_data(disease, year):
        """Return the map columns for one disease and year as a plain dict.

        Every column is an empty list when nothing matches the combination.
        """
        matches = (mmr_summary["disease"] == disease.upper()) & (mmr_summary["year"] == year)
        selection = mmr_summary[matches].dropna()
        return dict(
            state_name=selection["state_name"].tolist(),
            x=selection["lons"].tolist(),
            y=selection["lats"].tolist(),
            incidence_per_capita=selection["incidence_per_capita"].tolist(),
        )

    def color_range(incidences):
        """Return the (low, high) bounds of the colour scale for the given values."""
        if not incidences:
            return 0.0, 1.0
        low, high = min(incidences), max(incidences)
        if high == low:
            # Avoid a zero-width range when every state has the same value.
            high = low + 1.0
        return low, high

    def map_title(disease, year, has_data):
        """Return the figure title describing the current selection."""
        title = "US States: {} in {}".format(disease.title(), int(year))
        return title if has_data else title + " (no data)"

    def build_map(src, color_mapper, title):
        """Create the figure and draw one patch per state from ``src``."""
        TOOLS = "pan,wheel_zoom,reset,hover,save"
        p = figure(
            title=title, tools=TOOLS,
            x_axis_location=None, y_axis_location=None,
            tooltips=[
                ("Name", "@state_name"), ("Average incidences per capita", "@incidence_per_capita"), ("(Long, Lat)", "($x, $y)")
            ], plot_width=1000, plot_height=600)
        p.grid.grid_line_color = None
        p.hover.point_policy = "follow_mouse"
        p.patches('x', 'y', source=src,
                  fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
                  fill_alpha=0.7, line_color="white", line_width=0.5)
        return p

    def update_map(attr, old, new):
        """Widget callback: reload data, colour range and title for the selection."""
        disease, year = choose_disease.value, choose_year.value
        new_data = create_data(disease, year)
        incidences = new_data["incidence_per_capita"]
        # Rescale the colours; otherwise they stay tied to the first year shown.
        color_mapper.low, color_mapper.high = color_range(incidences)
        p.title.text = map_title(disease, year, bool(incidences))
        # Replace every column in a single assignment so the source never holds
        # columns of different lengths part-way through an update.
        src.data = new_data

    # Define widgets
    choose_disease = Select(title="Disease:", value=default_disease, options=["Measles", "Mumps", "Rubella"])
    choose_year = Slider(start=1928, end=2002, value=default_year, step=1, title="Year")
    choose_disease.on_change('value', update_map)
    choose_year.on_change('value', update_map)

    # Select starting data
    start_data = create_data(default_disease, default_year)
    start_incidences = start_data["incidence_per_capita"]
    src = ColumnDataSource(start_data)
    low, high = color_range(start_incidences)
    color_mapper = LinearColorMapper(palette=colors, low=low, high=high)

    # Init plot and set layout
    controls = WidgetBox(choose_disease, choose_year)
    p = build_map(src, color_mapper, map_title(default_disease, default_year, bool(start_incidences)))
    layout = row(controls, p)

    app.add_root(layout)
    

In [114]:
# Set up the application: wrap mapping_mmr in a FunctionHandler and build an
# Application from that handler.
handler = FunctionHandler(mapping_mmr)
app = Application(handler)

In [116]:
# Display the packaged application.
show(app)

In [65]:
# Check what kind of object one entry of the coordinate columns is.
# (`county_xs` is never defined in this notebook, so inspect mmr_summary instead.)
type(mmr_summary["lons"].iloc[0])

list

In [70]:
def create_data(disease, year):
    """Build the Bokeh data source for one disease in one year.

    Parameters
    ----------
    disease : str
        Disease name in any letter case; it is upper-cased before being
        compared with the ``disease`` column of ``mmr_summary``.
    year : int
        Year to select from the ``year`` column of ``mmr_summary``.

    Returns
    -------
    ColumnDataSource
        Source with the columns ``state_name``, ``x`` (longitudes), ``y``
        (latitudes) and ``incidence_per_capita`` for the matching rows.

    Raises
    ------
    AssertionError
        If no row matches the requested disease and year.
    """
    disease_df = mmr_summary[(mmr_summary["disease"] == disease.upper()) & (mmr_summary["year"] == year)]
    disease_df = disease_df.dropna()
    # Check the filtered selection, not the whole summary table (which is never empty).
    assert len(disease_df) > 0, "No data for this disease and year combination"

    data = dict(
        state_name = disease_df["state_name"],
        # x is the horizontal axis, so it takes the longitudes; y takes the latitudes.
        x = disease_df['lons'].values.tolist(),
        y = disease_df['lats'].values.tolist(),
        incidence_per_capita = disease_df["incidence_per_capita"]
    )

    return ColumnDataSource(data)

In [71]:
# Build the starting data source (measles in 1928) under the module-level name `src`.
src = create_data("measles", 1928)

In [77]:
# Look at the summary table again to check the columns create_data reads.
mmr_summary.head()

Unnamed: 0,year,disease,state_name,incidence_per_capita,cases,lons,lats
0,1928,MEASLES,ALABAMA,6.442115,170.057692,"[-87.58552, -87.59581, -87.62123, -87.6272, -8...","[30.99763, 30.94243, 30.89199, 30.85923, 30.83..."
1,1928,MEASLES,ARIZONA,4.779762,20.166667,"[-114.63332, -114.63349, -114.63423, -114.6089...","[34.87057, 35.00186, 35.00332, 35.07971, 35.11..."
2,1928,MEASLES,ARKANSAS,9.832041,181.612245,"[-94.26958, -94.30425, -94.34879, -94.40149, -...","[33.56679, 33.56656, 33.55479, 33.55887, 33.57..."
3,1928,MEASLES,CALIFORNIA,1.331154,71.115385,"[-123.00111, -122.99754, -122.99509, -122.9874...","[37.77205, 37.77078, 37.76913, 37.76387, 37.75..."
4,1928,MEASLES,COLORADO,4.312083,43.729167,"[-109.04984, -109.06017, -109.06015, -109.0565...","[38.215, 38.40118, 38.60929, 38.81393, 38.9578..."
