In [None]:
import numpy as np
import pandas as pd

from bokeh.charts import Histogram, Bar, BoxPlot
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

In [None]:
# Read the outbreak table from CSV. index_col=False tells pandas not to use
# the first CSV column as the row index.
ebola_df = pd.read_csv(
    "data/out/ebola_outbreaks_before_2014-geometry_fixed.csv",
    encoding="utf-8",
    index_col=False,
)

# Working frame for the rest of the notebook: the columns at positions
# 0, 1, 2 and 9 of the raw frame are dropped (by label).
ebola_data = ebola_df.drop(
    labels=ebola_df.columns[[0, 1, 2, 9]],
    axis=1,
)

### Data set description

In [None]:
# Column labels of the working frame, kept as a plain list.
cols = list(ebola_data.columns)

# Print a 1-based numbered listing of the columns.
# enumerate() gives the position directly, so duplicate column names are
# numbered correctly and no repeated list search is needed.
# A single pre-formatted string prints the same text on Python 2 and Python 3.
for position, column in enumerate(cols, 1):
    print("column %d : %s" % (position, column))

**=> locations info, reported cases, date/time**

### Number of outbreaks per country

In [None]:
# Count the rows for each country and display them from fewest to most.
# Series.order() no longer exists in pandas; sort_values() is its replacement
# and also sorts in ascending order by default.
ebola_data.groupby(["country_name"])["country_name"].count().sort_values()

In [None]:
# Bokeh does not seem to cope with the circumflex-accented 'o' in the
# "Côte d'Ivoire" label, so that one label is swapped for an unaccented
# spelling; every other country name is kept as it is.
countries_list = [
    u"Cote d'Ivoire (Ivory Coast)" if name == u"C\xf4te d'Ivoire (Ivory Coast)" else name
    for name in ebola_data["country_name"]
]

In [None]:
# One entry per row of the data set: the country label paired with a constant 1,
# so that summing the values per label gives the number of outbreaks per country.
data_nb = {
    "countries": countries_list,
    "ebola outbreaks": [1] * len(countries_list),
}

bar_nb = Bar(
    data_nb,
    values="ebola outbreaks",
    label="countries",
    agg="sum",
    color="#3B6849",
    title="Number of ebola outbreak(s) per country",
    plot_width=600,
    plot_height=500,
)

# Send Bokeh output to the notebook, then render the chart.
output_notebook()
show(bar_nb)

### The average number of days for an outbreak per country

In [None]:
# Mean of duration_days for each country, displayed from shortest to longest.
# Series.order() no longer exists in pandas; sort_values() is its replacement
# and also sorts in ascending order by default.
ebola_data.groupby(["country_name"])["duration_days"].mean().sort_values()

In [None]:
# Pair every country label with the duration_days value of the same row;
# the chart then averages the durations per country (agg="mean").
data_dur = {
    "countries": countries_list,
    "outbreaks duration": list(ebola_data["duration_days"]),
}

bar_dur = Bar(
    data_dur,
    values="outbreaks duration",
    label="countries",
    agg="mean",
    color="#586996",
    title="Durations of ebola outbreaks (days) per country",
    plot_width=600,
    plot_height=500,
)

# Send Bokeh output to the notebook, then render the chart.
output_notebook()
show(bar_dur)

### Ebola virus subtypes

In [None]:
# Count the rows for each virus subtype and display them from fewest to most.
# Series.order() no longer exists in pandas; sort_values() is its replacement
# and also sorts in ascending order by default.
ebola_data.groupby(["ebola_subtype"])["ebola_subtype"].count().sort_values()

In [None]:
# Bokeh does not seem to cope with the diaeresis on the 'i' in
# "Taï Forest virus", so that one label is swapped for an unaccented
# spelling; every other subtype name is kept as it is.
eb_virus_types = [
    u"Tai Forest virus" if subtype == u"Ta\xef Forest virus" else subtype
    for subtype in ebola_data["ebola_subtype"]
]

In [None]:
# One entry per row of the data set: the subtype label paired with a constant 1,
# so that summing the values per label gives the number of outbreaks per subtype.
data_vir = {
    "ebola virus subtypes": eb_virus_types,
    "number of ebola outbreaks": [1] * len(eb_virus_types),
}

bar_vir = Bar(
    data_vir,
    values="number of ebola outbreaks",
    label="ebola virus subtypes",
    agg="sum",
    color="#E2AE7A",
    title="Number of ebola outbreak(s) per virus subtype",
    plot_width=600,
    plot_height=400,
    bar_width=0.5,
)

# Send Bokeh output to the notebook, then render the chart.
output_notebook()
show(bar_vir)

In [None]:
# Show which virus subtype goes with each row, ordered by country name.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement
# and returns a new, sorted frame without modifying ebola_data.
ebola_data[["country_name", "ebola_subtype"]].sort_values(["country_name"])

### Duration of epidemic in days by virus subtype

In [None]:
# Mean of duration_days for each virus subtype, displayed from shortest to longest.
# Series.order() no longer exists in pandas; sort_values() is its replacement
# and also sorts in ascending order by default.
ebola_data.groupby(["ebola_subtype"])["duration_days"].mean().sort_values()

__=> Bundibugyo virus seems to be correlated with shorter outbreaks__

### Human cases for DRC (country with the most cases)

In [None]:
# Keep only the rows whose country is the Democratic Republic of the Congo.
df_drc = ebola_data[ebola_data.country_name == "Democratic Republic of the Congo"]

# Narrow the subset to the subtype, the date range and the case/death counts.
drc = df_drc[[
    "ebola_subtype",
    "start_date",
    "end_date",
    "reported_number_of_human_cases",
    "reported_number_of_deaths_among_cases",
]]

# Display the DRC rows ordered by start date.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
# NOTE(review): the visible load step does not parse start_date as a date, so this
# ordering is chronological only if the stored format sorts that way - confirm.
drc.sort_values(["start_date"])