<h1>Exploring infections through data: Mapping Measles</h1>

In this notebook I'm going to explore creating an interactive map of the historical incidence rates (per 100,000 people) of measles in the USA. The data comes from Project Tycho, a colleciton of National Notifiable Disease Surveillance System reports, and is available for free from <a href="https://www.kaggle.com/pitt/contagious-diseases/home">Kaggle</a>. There are multiple libraries in Python that can be used to create interactive visualisations of data, but in this notebook I will focus on the Bokeh JS API.

In [1]:
#Dependencies
import pandas as pd
import numpy as np

In [2]:
#Import Bokeh
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.io import show
from bokeh.models import LogColorMapper, ColumnDataSource, HoverTool, LinearColorMapper, ColorBar
from bokeh.palettes import Viridis6 as palette
from bokeh.plotting import figure
from bokeh.models.widgets import Select, Slider
from bokeh.layouts import column, row, WidgetBox
from bokeh.palettes import Oranges
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application

In [3]:
output_notebook()

<h1>Data Wrangling</h1>

In [4]:
#Import US state data from Bokeh sample data
from bokeh.sampledata.us_states import data as States

In [5]:
#Import MMR data
measles = pd.read_csv("measles.csv")

In [6]:
measles.head()

Unnamed: 0,week,state,state_name,disease,cases,incidence_per_capita
0,192801,AL,ALABAMA,MEASLES,97,3.67
1,192801,AR,ARKANSAS,MEASLES,76,4.11
2,192801,AZ,ARIZONA,MEASLES,8,1.9
3,192801,CA,CALIFORNIA,MEASLES,74,1.38
4,192801,CO,COLORADO,MEASLES,85,8.38


In [7]:
measles["year"] = measles["week"].apply(lambda x: int(str(x)[0:4]))

In [8]:
measles["week_num"] = measles["week"].apply(lambda x: int(str(x)[4:7]))

In [15]:
measles.drop("week", axis = 1, inplace = True)

In [16]:
measles.head()

Unnamed: 0,state,state_name,disease,cases,incidence_per_capita,year,week_num
0,AL,ALABAMA,MEASLES,97,3.67,1928,1
1,AR,ARKANSAS,MEASLES,76,4.11,1928,1
2,AZ,ARIZONA,MEASLES,8,1.9,1928,1
3,CA,CALIFORNIA,MEASLES,74,1.38,1928,1
4,CO,COLORADO,MEASLES,85,8.38,1928,1


In [9]:
def summarise(df):
    #Group data
    grouped = df.groupby(by = ["year", "disease", "state_name"])
    #Summarise data as Series then convert back to Dataframe
    cases_sum = pd.DataFrame(grouped["cases"].sum()).reset_index()
    cases_avg = pd.DataFrame(grouped["cases"].mean()).reset_index()
    avg_incidence_year = pd.DataFrame(grouped["incidence_per_capita"].mean()).reset_index()
    #Give columns sensible names
    avg_incidence_year = avg_incidence_year.rename(columns = {"incidence_per_capita": "avg_incidence_per_week"})
    cases_sum = cases_sum.rename(columns = {"cases": "total_cases_per_year"})
    cases_avg = cases_avg.rename(columns = {"cases": "avg_cases_per_week"})
    #Merge dataframes
    cases = pd.merge(cases_avg, cases_sum)
    new_df = pd.merge(avg_incidence_year, cases)
    return new_df

In [12]:
measles_yealy_data = summarise(measles)

In [13]:
measles_yealy_data.head()

Unnamed: 0,year,disease,state_name,avg_incidence_per_week,avg_cases_per_week,total_cases_per_year
0,1928,MEASLES,ALABAMA,6.442115,170.057692,8843
1,1928,MEASLES,ARIZONA,4.779762,20.166667,847
2,1928,MEASLES,ARKANSAS,9.832041,181.612245,8899
3,1928,MEASLES,CALIFORNIA,1.331154,71.115385,3698
4,1928,MEASLES,COLORADO,4.312083,43.729167,2099


<h2>Get state location data</h2>

In [17]:
states = {
        state["name"].upper(): state for code, state in States.items() if state["name"] not in ["Hawaii", "Alaska"]
    }

def state_data(df):
    df_ = df[~df["state_name"].isin(["HAWAII", "ALASKA"])]
    for z in ["lons", "lats"]:
        df_[z] = df_["state_name"].apply(lambda x: states[x][z])
    return df_

In [18]:
measles_summary = state_data(measles_summary)
measles = state_data(measles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
measles_summary.head()

Unnamed: 0,year,disease,state_name,avg_incidence_per_week,avg_cases_per_week,total_cases_per_year,lons,lats
0,1928,MEASLES,ALABAMA,6.442115,170.057692,8843,"[-87.58552, -87.59581, -87.62123, -87.6272, -8...","[30.99763, 30.94243, 30.89199, 30.85923, 30.83..."
1,1928,MEASLES,ARIZONA,4.779762,20.166667,847,"[-114.63332, -114.63349, -114.63423, -114.6089...","[34.87057, 35.00186, 35.00332, 35.07971, 35.11..."
2,1928,MEASLES,ARKANSAS,9.832041,181.612245,8899,"[-94.26958, -94.30425, -94.34879, -94.40149, -...","[33.56679, 33.56656, 33.55479, 33.55887, 33.57..."
3,1928,MEASLES,CALIFORNIA,1.331154,71.115385,3698,"[-123.00111, -122.99754, -122.99509, -122.9874...","[37.77205, 37.77078, 37.76913, 37.76387, 37.75..."
4,1928,MEASLES,COLORADO,4.312083,43.729167,2099,"[-109.04984, -109.06017, -109.06015, -109.0565...","[38.215, 38.40118, 38.60929, 38.81393, 38.9578..."


<h1>Exploring the data with simple bars and lines</h1>

In [27]:
def bar_plot(src):
    states = src["state_name"] 
    bar = figure(plot_width=800, plot_height=500, title="Total cases of Measles in the United States {}".format(src["year"][0]),
           x_range=states, toolbar_location=None, tools="")
    bar.xgrid.grid_line_color = None
    bar.xaxis.axis_label = "States"
    bar.xaxis.major_label_orientation = 1.2

    bar.vbar(x='state_name', top='total_cases_per_year', width=1, source=src,
           line_color="white", fill_color="#3d84f7", fill_alpha = "avg_incidence_per_week", 
           hover_line_color="black", hover_fill_color="#4a81db")

    return bar

In [28]:
def line_plot(src):
    weeks = src["week_num"]
    line = figure(xrange = weeks, plot_width=800, plot_height=500,  title="Incidence of Measles in the United States", 
               toolbar_location=None, tools="")
    
    line.line(x="week_num", y="incidence_per_capita", color = "state_name", line_width=2, source = src)
    
    return line

In [29]:
show(bar_plot(measles_yealy_data[measles_yealy_data["year"] == 1928]))

<h1>Making maps in Bokeh!</h1>

<h2>Make the Measles dataset</h2>

In [14]:
def create_data(year):
    df = measles_summary[measles_summary["year"] == year]
    df = df.dropna()
    assert len(df) > 0, "No data for this disease and year combination"

    data = dict(
        state_name = df["state_name"],
        x = df['lons'].values.tolist(),
        y = df['lats'].values.tolist(),
        incidence_per_capita = df["incidence_per_capita"],
        cases = df["cases"]
    )

    return ColumnDataSource(data)

<h2>Build a map</h2>

In [15]:
def build_map(src):

    TOOLS = "pan,wheel_zoom,reset,hover,save"
    colors = ["#A7D49B", "#92AC86", "#696047", "#55251D", "#5A1807"]
    color_mapper = LinearColorMapper(palette=colors, low=src.data["incidence_per_capita"].min(), high=src.data["incidence_per_capita"].max())
    p = figure(
        title="US States", tools=TOOLS,
        x_axis_location=None, y_axis_location=None,
        tooltips=[
            ("Name", "@state_name"), ("Average incidences per capita", "@incidence_per_capita"), ("(Long, Lat)", "($x, $y)")
        ], plot_width=1000, plot_height=600)
    p.grid.grid_line_color = None
    p.hover.point_policy = "follow_mouse"
    p.patches('x', 'y', source=src,
              fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
              fill_alpha=0.7, line_color="white", line_width=0.5)

    return p

<h2>Make widgets for controlling the map</h2>

In [16]:
#Starting data
src = create_data(1928)

In [17]:
choose_year = Slider(start=measles_summary["year"].min(), end=measles_summary["year"].max(), value=measles_summary["year"].min(), step = 1, title = "Year")

In [18]:
show(choose_year)

In [19]:
def update_map(attr, old, new):
    choosen_year = choose_year.value
    new_data = create_data(measles_summary, chosen_year)
    src.data.update(new_data)

In [20]:
choose_year.on_change("value", update_map)

<h2>Package it all together!</h2>

In [69]:
def mapping_mmr(app):
    
    def create_data(year):
        df = measles_summary[measles_summary["year"] == year]
        df = df.dropna()
        assert len(df) > 0, "No data for this disease and year combination"
        
        data = dict(
            state_name = df["state_name"],
            x = df['lons'].values.tolist(),
            y = df['lats'].values.tolist(),
            incidence_per_capita = df["avg_incidence_per_week"],
            total_cases = df["total_cases_per_year"],
            avg_cases = df["avg_cases_per_week"]
        )
        
        return ColumnDataSource(data)
    
    def build_map(src):

        TOOLS = "pan,wheel_zoom,reset,hover,save"
        #colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
        colors = ["#A7D49B", "#92AC86", "#696047", "#55251D", "#5A1807"]
        color_mapper = LinearColorMapper(palette=colors, low=src.data["incidence_per_capita"].min(), high=src.data["incidence_per_capita"].max())
        p = figure(
            title="US States", tools=TOOLS,
            x_axis_location=None, y_axis_location=None,
            tooltips=[
                ("Name", "@state_name"), ("Average incidences per capita per week", "@incidence_per_capita{1.11}"), 
                ("Average # of cases per week", "@avg_cases{1.11}"), ("Total cases in year", "@total_cases{1.11}")
            ], plot_width=1000, plot_height=600)
        p.grid.grid_line_color = None
        p.hover.point_policy = "follow_mouse"
        p.patches('x', 'y', source=src,
                  fill_color={'field': 'incidence_per_capita', 'transform': color_mapper},
                  fill_alpha=0.7, line_color="white", line_width=0.5)

        return p
    
    def update_map(attr, old, new):
        chosen_year = choose_year.value
        new_data = create_data(chosen_year)
        src.data.update(new_data.data)
        
    #Define Widgets
    choose_year = Slider(start=1928, end=2002, value=1928, step = 1, title = "Year")
    choose_year.on_change('value', update_map)
    
    #Select starting data
    src = create_data(1928)
    
    #Init plot and set layout
    controls = WidgetBox(choose_year)
    p = build_map(src)
    layout = column(controls, p)
    
    app.add_root(layout)
    

In [70]:
handler = FunctionHandler(mapping_mmr)
app = Application(handler)
show(app)

<h1>Creating a complete dashboard!</h1>