<h1>Exploring infections through data: Mapping Measles</h1>

In this notebook I'm going to explore creating an interactive map of the historical incidence rates (per 100,000 people) of measles in the USA. The data comes from Project Tycho, a colleciton of National Notifiable Disease Surveillance System reports, and is available for free from <a href="https://www.kaggle.com/pitt/contagious-diseases/home">Kaggle</a>. There are multiple libraries in Python that can be used to create interactive visualisations of data, but in this notebook I will focus on the Bokeh JS API.

In [1]:
#Dependencies
import pandas as pd
import numpy as np

In [2]:
#Import Bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.io import show
from bokeh.models import LogColorMapper, ColumnDataSource, HoverTool, LinearColorMapper, ColorBar
from bokeh.palettes import Viridis6 as palette
from bokeh.plotting import figure
from bokeh.models.widgets import Select, Slider
from bokeh.layouts import column, row, WidgetBox
from bokeh.palettes import Oranges
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application

In [3]:
output_notebook()

In [4]:
#Import US state data from Bokeh sample data
from bokeh.sampledata.us_states import data as States

In [5]:
#Import MMR data
measles = pd.read_csv("measles.csv")

In [6]:
measles.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,192801,AL,ALABAMA,MEASLES,97,3.67
1,192801,AR,ARKANSAS,MEASLES,76,4.11
2,192801,AZ,ARIZONA,MEASLES,8,1.9
3,192801,CA,CALIFORNIA,MEASLES,74,1.38
4,192801,CO,COLORADO,MEASLES,85,8.38


In [9]:
measles["year"] = measles["week"].apply(lambda x: int(str(x)[0:4]))

In [10]:
measles["week_num"] = measles["week"].apply(lambda x: int(str(x)[4:7]))

In [11]:
def summarise(df):
    grouped = df.groupby(by = ["year", "disease", "state_name"])
    avg_cases_year = grouped["cases"].mean()
    avg_incidence_year = grouped["incidence_per_capita"].mean()
    return pd.merge(pd.DataFrame(avg_cases_year).reset_index(), pd.DataFrame(avg_incidence_year).reset_index())

In [12]:
measles_summary = summarise(measles)

<h2>Get state location data</h2>

In [13]:
states = {
        state["name"].upper(): state for code, state in States.items() if state["name"] not in ["Hawaii", "Alaska"]
    }

def state_data(df):
    df_ = df[~df["state_name"].isin(["HAWAII", "ALASKA"])]
    for z in ["lons", "lats"]:
        df_[z] = df_["state_name"].apply(lambda x: states[x][z])
    return df_

In [14]:
measles_summary = state_data(measles_summary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
measles_summary.head()

Unnamed: 0,year,disease,state_name,cases,incidence_per_capita,lons,lats
0,1928,MEASLES,ALABAMA,170.057692,6.442115,"[-87.58552, -87.59581, -87.62123, -87.6272, -8...","[30.99763, 30.94243, 30.89199, 30.85923, 30.83..."
1,1928,MEASLES,ARIZONA,20.166667,4.779762,"[-114.63332, -114.63349, -114.63423, -114.6089...","[34.87057, 35.00186, 35.00332, 35.07971, 35.11..."
2,1928,MEASLES,ARKANSAS,181.612245,9.832041,"[-94.26958, -94.30425, -94.34879, -94.40149, -...","[33.56679, 33.56656, 33.55479, 33.55887, 33.57..."
3,1928,MEASLES,CALIFORNIA,71.115385,1.331154,"[-123.00111, -122.99754, -122.99509, -122.9874...","[37.77205, 37.77078, 37.76913, 37.76387, 37.75..."
4,1928,MEASLES,COLORADO,43.729167,4.312083,"[-109.04984, -109.06017, -109.06015, -109.0565...","[38.215, 38.40118, 38.60929, 38.81393, 38.9578..."


<h2>Plotting with simple bars and lines</h2>
TODO

<h1>Making maps in Bokeh!</h1>

<h2>Make the Measles dataset</h2>

In [40]:
def create_data(year):
    df = measles_summary[measles_summary["year"] == year]
    df = df.dropna()
    assert len(df) > 0, "No data for this disease and year combination"

    data = dict(
        state_name = df["state_name"],
        x = df['lons'].values.tolist(),
        y = df['lats'].values.tolist(),
        incidence_per_capita = df["incidence_per_capita"],
        cases = df["cases"]
    )

    return ColumnDataSource(data)

<h2>Build a map</h2>

In [17]:
def build_map(src):

    TOOLS = "pan,wheel_zoom,reset,hover,save"
    colors = ["#A7D49B", "#92AC86", "#696047", "#55251D", "#5A1807"]
    color_mapper = LinearColorMapper(palette=colors, low=src.data["incidence_per_capita"].min(), high=src.data["incidence_per_capita"].max())
    p = figure(
        title="US States", tools=TOOLS,
        x_axis_location=None, y_axis_location=None,
        tooltips=[
            ("Name", "@state_name"), ("Average incidences per capita", "@incidence_per_capita"), ("(Long, Lat)", "($x, $y)")
        ], plot_width=1000, plot_height=600)
    p.grid.grid_line_color = None
    p.hover.point_policy = "follow_mouse"
    p.patches('x', 'y', source=src,
              fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
              fill_alpha=0.7, line_color="white", line_width=0.5)

    return p

<h2>Make widgets for controlling the map</h2>

In [18]:
#Starting data
src = create_data(1928)

In [19]:
choose_year = Slider(start=measles_summary["year"].min(), end=measles_summary["year"].max(), value=measles_summary["year"].min(), step = 1, title = "Year")

In [20]:
show(choose_year)

In [21]:
def update_map(attr, old, new):
    choosen_year = choose_year.value
    new_data = create_data(measles_summary, chosen_year)
    src.data.update(new_data)

In [22]:
choose_year.on_change("value", update_map)

<h2>Package it all together!</h2>

In [41]:
def mapping_mmr(app):
    
    def create_data(year):
        df = measles_summary[measles_summary["year"] == year]
        df = df.dropna()
        assert len(df) > 0, "No data for this disease and year combination"
        
        data = dict(
            state_name = df["state_name"],
            x = df['lons'].values.tolist(),
            y = df['lats'].values.tolist(),
            incidence_per_capita = df["incidence_per_capita"],
            cases = df["cases"]
        )
        
        return ColumnDataSource(data)
    
    def build_map(src):

        TOOLS = "pan,wheel_zoom,reset,hover,save"
        #colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
        colors = ["#A7D49B", "#92AC86", "#696047", "#55251D", "#5A1807"]
        color_mapper = LinearColorMapper(palette=colors, low=src.data["incidence_per_capita"].min(), high=src.data["incidence_per_capita"].max())
        p = figure(
            title="US States", tools=TOOLS,
            x_axis_location=None, y_axis_location=None,
            tooltips=[
                ("Name", "@state_name"), ("Average incidences per capita", "@incidence_per_capita"), 
                ("Average # of cases", "@cases")
            ], plot_width=1000, plot_height=600)
        p.grid.grid_line_color = None
        p.hover.point_policy = "follow_mouse"
        p.patches('x', 'y', source=src,
                  fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
                  fill_alpha=0.7, line_color="white", line_width=0.5)

        return p
    
    def update_map(attr, old, new):
        chosen_year = choose_year.value
        new_data = create_data(chosen_year)
        src.data.update(new_data.data)
        
    #Define Widgets
    choose_year = Slider(start=1928, end=2002, value=1928, step = 1, title = "Year")
    choose_year.on_change('value', update_map)
    
    #Select starting data
    src = create_data(1928)
    
    #Init plot and set layout
    controls = WidgetBox(choose_year)
    p = build_map(src)
    layout = column(controls, p)
    
    app.add_root(layout)
    

In [42]:
handler = FunctionHandler(mapping_mmr)
app = Application(handler)
show(app)

<h1>Creating a complete dashboard!</h1>