In [1]:
%matplotlib inline

import json

from pathlib import Path
import os

import matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append(os.path.join(os.getcwd(), os.pardir, 'src'))

%load_ext autoreload

%autoreload 2
from visualization.visualize import *

pd.set_option('display.max_rows', 101)

In [3]:
from math import pi
import pandas as pd

from bokeh.io import output_notebook, show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
)
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data


In [4]:
output_notebook()

### Load metrics

In [5]:
who_df = pd.read_csv(Path('../data/interim/calc_cols_added.csv'), index_col=0)
code_to_name = (who_df[['Iso Code', 'Country Name']].drop_duplicates()
                                                    .set_index('Iso Code')
                                                    .to_dict()['Country Name'])

In [6]:
all_series = []
for f in Path("../data/processed/scores/").glob('*.csv'):
    print(f)
    all_series.append(pd.read_csv(f, index_col=0))
    
all_scores = pd.concat(all_series, axis=1)
all_scores.head(2)

../data/processed/scores/country-wb-infant_mortality.csv
../data/processed/scores/country-nasa_pct_diff_of_agg.csv
../data/processed/scores/country-internal-coef_of_var_denom.csv
../data/processed/scores/country-nasa_mean_pct_diff.csv
../data/processed/scores/country-internal-dtp-dropout-agg.csv
../data/processed/scores/country-internal-dtp-dropout-averaged.csv
../data/processed/scores/country-wb-crude_birth_rate.csv
../data/processed/scores/country-internal-coef_of_var.csv
../data/processed/scores/country-weunic_diff_national.csv


Unnamed: 0,infant_mortality_wb_diff,nasa_pct_diff_of_agg,coef_of_var_denom,nasa_mean_pct_diff,dropout_aggregated,dropout_averaged,crude_birth_rate_wb_diff,coef_of_var,wuenic_mean_difference_from_national
AFG,18.28617,0.228407,0.020994,0.548219,11.649611,14.536052,-3.921798,0.065588,15.333333
AGO,2.472073,0.174677,0.05682,0.173109,12.408592,18.459532,-10.282817,0.213387,10.545455


In [10]:
ranks_and_total = (all_scores.rank(ascending=False)
            .assign(total=lambda x: (x.mean(axis=1) * (10**pd.notnull(x).sum(axis=1))).rank()))

In [15]:
rank_df = (pd.melt(ranks_and_total.reset_index(),
             id_vars=['index'],
             value_name='rate',
             var_name='measure')
           .replace({np.inf: np.nan, -np.inf: np.nan})
           .rename(columns={'index': 'ISO_Code'})
           .replace({'ISO_Code': code_to_name}))


# this is the colormap from the original NYTimes plot
colors = [matplotlib.colors.to_hex(c) for c in plt.cm.RdYlGn(np.linspace(0, 1, 100))]
mapper = LinearColorMapper(palette=colors,
                           low=rank_df.rate.min(),
                           high=rank_df.rate.max())

source = ColumnDataSource(rank_df)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

ordered_countries = (ranks_and_total.sort_values('total', ascending=True, na_position='first')
                                    .rename(index=code_to_name)
                                    .index.values.tolist())


p = figure(title="",
           x_range=rank_df.measure.unique().tolist(),
           y_range=ordered_countries,
           x_axis_location="above",
           plot_width=600,
           plot_height=1500,
           tools=TOOLS, toolbar_location='below')

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 2.5

p.rect(x="measure",
       y="ISO_Code",
       width=1,
       height=1,
       source=source,
       fill_color={'field': 'rate', 'transform': mapper},
       line_color=None)


p.select_one(HoverTool).tooltips = [
     ('Country', '@ISO_Code'),
     ('rate', '@rate'),
]

show(p)      # show the plot