# Sankey Diagram

Creates Sankey plots and quantifies hierarchy.

## Mount Drive

In [None]:
# mount Google Drive
# Mounts Drive at /content/drive; on Colab this prompts interactively for an authorisation code.
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted Drive; the data, class and output paths used below are built from it.
root = '/content/drive/My Drive/Project/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Libraries

In [None]:
import numpy as np
import pandas as pd

import holoviews as hv
from holoviews import opts

import branca.colormap as cm
import matplotlib.colors as colors


# import classes
import sys
sys.path.append(root + 'Classes')
from Stability_class import Stability

## Data Processing

Load in data for processing.

In [None]:
# Load the stability results from the .mat file through the project's Stability class.
stability_data = Stability(root + 'Stability Data/longrun.mat')

# N x T array of cluster labels.
# (column idx is read below as the node labels of the partition at times[idx])
C = stability_data.C
# Array of number of communities.
# (k[idx] is read below as the number of communities of the partition in column idx)
k = stability_data.k
# Array of Markov times
times = stability_data.t

Assigns the chosen Markov times and rounds all Markov times to 4 significant figures (s.f.).

In [None]:
# Markov Times to 4 s.f
# Chosen Markov times; each one is looked up by exact equality in the rounded `times`
# array, so every value must match an entry of `times` once rounded to 4 s.f.
my_times = [7.499e3,3.350e3,188.4,19.95,0.5623]

# rounds a single value to 4 significant figures
def sf4(x):
    """Return ``x`` rounded to 4 significant figures as a Python float."""
    # fractional=False makes `precision` count significant digits, not decimal places.
    as_text = np.format_float_positional(
        x, precision=4, unique=False, fractional=False
    )
    return float(as_text)


# element-wise version of sf4 for arrays
vec_sf4 = np.vectorize(sf4)

# convert to 4 s.f. so that the chosen times in my_times can be found by exact equality
times = vec_sf4(times)

Gets the cluster labels at each chosen time and builds a categorical colour list with one colour per cluster.

In [None]:
labels_lst = []  # node labels of each chosen partition
cmaps = []       # hex colours of each chosen partition
my_k = []        # number of communities of each chosen partition
for time in my_times:
    # position of the chosen time in the rounded `times` array (first exact match)
    idx = int(np.where(times == float(time))[0][0])

    # community count and node labels of that partition
    n = k[idx]
    labels = C[:, idx]

    # step colour map on the Spectral palette with bin edges -0.5, 0.5, ... (np.arange(-0.5, n))
    catcolormap = cm.linear.Spectral_11.to_step(index=np.arange(-0.5,n))
    hex_map = [colors.rgb2hex(color) for color in catcolormap.colors]

    # keep the results for this partition
    labels_lst.append(labels)
    cmaps.append(hex_map)
    my_k.append(n)

Create Color Map.

In [None]:
from string import ascii_uppercase

# Node key "<letter>_<cluster index>" -> hex colour, where the letter is the
# position of the partition in `cmaps` (A for the first, B for the second, ...).
colormap = {
    ascii_uppercase[level] + '_' + str(cluster): hex_colour
    for level, palette in enumerate(cmaps)
    for cluster, hex_colour in enumerate(palette)
}

Create dataframe for Sankey plot.

In [None]:
# Columns of the Sankey table: one row per (source cluster, target cluster) pair
# of consecutive partitions, including pairs that share no nodes (count 0).
source_col, target_col, count_col = [], [], []
for idx in range(len(labels_lst) - 1):
    # labels of the two consecutive partitions being compared
    upper = labels_lst[idx]
    lower = labels_lst[idx + 1]
    source_prefix = ascii_uppercase[idx] + '_'
    target_prefix = ascii_uppercase[idx + 1] + '_'

    for source_idx in range(my_k[idx]):
        # nodes that sit in this source cluster
        in_source = (upper == source_idx)

        for target_idx in range(my_k[idx + 1]):
            # nodes that sit in this target cluster
            in_target = (lower == target_idx)

            source_col.append(source_prefix + str(source_idx))
            target_col.append(target_prefix + str(target_idx))
            # number of nodes that are in both clusters
            count_col.append(np.count_nonzero(in_source & in_target))

data = pd.DataFrame({'source': source_col, 'target': target_col, 'count': count_col})
data.head()

Unnamed: 0,source,target,count
0,A_0,B_0,0
1,A_0,B_1,0
2,A_0,B_2,0
3,A_0,B_3,0
4,A_0,B_4,516


## Sankey Plot

Plot Sankey

In [None]:
# bokeh backend and its renderer (the renderer is used later to save the plot)
hv.extension('bokeh')
renderer = hv.renderer('bokeh')

# Sankey built from the source/target/count table
sankey = hv.Sankey(data)
sankey.opts(
    # colours: nodes from the per-cluster colormap, edges follow their source node
    cmap=colormap,
    edge_color="source",
    edge_line_width=0,
    # nodes and labels
    label_position='right',
    node_sort=True,
    node_alpha=1.0,
    node_width=20,
    show_values=False,
    # figure geometry
    width=1840,
    height=2600,
    padding=0,
    margin=0,
    # toolbar
    tools=['pan','box_zoom','save','reset'],
    toolbar='left',
)
sankey

Output hidden; open in https://colab.research.google.com to view.

Save plot.

In [None]:
# Write the Sankey plot to the Plots folder on Drive; the path carries no file extension
# (presumably the renderer appends one — confirm).
renderer.save(sankey, root + 'Plots/Sankey')

Fine Sankey: the same plot drawn with thin edge outlines (`edge_line_width=0.01`).

In [None]:
# bokeh backend and its renderer
hv.extension('bokeh')
renderer = hv.renderer('bokeh')

# Same Sankey as before, except that edges get a thin outline (edge_line_width=0.01)
sankey = hv.Sankey(data)
sankey.opts(
    # colours: nodes from the per-cluster colormap, edges follow their source node
    cmap=colormap,
    edge_color="source",
    edge_line_width=0.01,
    # nodes and labels
    label_position='right',
    node_sort=True,
    node_alpha=1.0,
    node_width=20,
    show_values=False,
    # figure geometry
    width=1840,
    height=2600,
    padding=0,
    margin=0,
    # toolbar
    tools=['pan','box_zoom','save','reset'],
    toolbar='left',
)
sankey

## Calculate Cluster Concentrations

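Concentrations: for each target cluster, compute the HHI (sum of squared shares) of the node counts flowing into it from the source clusters.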

In [None]:
# array of targets
# NOTE(review): `targets` is not referenced again in the visible cells — possibly unused.
targets = data['target'].values

# concentration of a vector of counts
def HHI(x):
    """Return the sum of squared shares of ``x``.

    Each entry of ``x`` is divided by the total of ``x`` and the squared
    shares are summed, so a vector with a single non-zero entry gives 1.
    """
    shares = x / sum(x)
    return np.sum(shares ** 2)

# calculate concentrations
# Group the Sankey rows by target cluster and apply HHI to the incoming counts.
# The 'count' column is selected explicitly: aggregating the whole frame also
# hands the string 'source' column to HHI, which older pandas silently dropped
# but pandas >= 2.0 raises on.
group = data.groupby(['target'])
# one-column frame, indexed by target key, holding the HHI of each target cluster
hhis = group['count'].agg(HHI).to_frame('HHI')

hhis.head()

Unnamed: 0_level_0,HHI
target,Unnamed: 1_level_1
B_0,0.971692
B_1,1.0
B_2,1.0
B_3,1.0
B_4,0.918299


Creates partition lists.

In [None]:
# (position of the letter in ascii_uppercase, number of clusters) for partitions B to E.
# NOTE(review): the sizes are hard-coded; presumably they have to match my_k[1:] — confirm.
B_keys, C_keys, D_keys, E_keys = [
    [ascii_uppercase[level] + '_' + str(cluster) for cluster in range(size)]
    for level, size in ((1, 9), (2, 25), (3, 54), (4, 148))
]
# key lists in partition order, B first
partitions = [B_keys, C_keys, D_keys, E_keys]

Returns the 3 most dilute clusters (lowest HHI) for each partition.

In [None]:
# For every partition, keep the rows of hhis whose key belongs to it and take
# the three with the smallest HHI; results follows the order of `partitions`.
results = [
    hhis[hhis.index.isin(keys)].nsmallest(3, 'HHI')
    for keys in partitions
]

In [None]:
# three lowest-HHI clusters of the first partition (B keys)
results[0].head()

Unnamed: 0_level_0,HHI
target,Unnamed: 1_level_1
B_4,0.918299
B_0,0.971692
B_7,0.995941


In [None]:
# mean HHI over the three lowest-HHI clusters of the first partition (B keys)
results[0].mean()

HHI    0.961977
dtype: float64

In [None]:
# three lowest-HHI clusters of the second partition (C keys)
results[1].head()

Unnamed: 0_level_0,HHI
target,Unnamed: 1_level_1
C_5,0.514099
C_2,0.598313
C_19,0.706633


In [None]:
# mean HHI over the three lowest-HHI clusters of the second partition (C keys)
results[1].mean()

HHI    0.606348
dtype: float64

In [None]:
# three lowest-HHI clusters of the third partition (D keys)
results[2].head()

Unnamed: 0_level_0,HHI
target,Unnamed: 1_level_1
D_6,0.364292
D_20,0.506584
D_13,0.573964


In [None]:
# mean HHI over the three lowest-HHI clusters of the third partition (D keys)
results[2].mean()

HHI    0.481614
dtype: float64

In [None]:
# three lowest-HHI clusters of the fourth partition (E keys)
results[3].head()

Unnamed: 0_level_0,HHI
target,Unnamed: 1_level_1
E_135,0.457798
E_82,0.542535
E_102,0.575617


In [None]:
# mean HHI over the three lowest-HHI clusters of the fourth partition (E keys)
results[3].mean()

HHI    0.525317
dtype: float64

## MSOA HHI 

HHI for MSOAs across clusters: for each MSOA, the squared HHI of the cluster it belongs to, averaged over the partitions.

In [None]:
# Per-node average, over the partitions, of the squared HHI of the cluster the
# node belongs to. The (1, N) shape is kept because later cells index row 0.
# N is taken from C (one row per node) instead of being hard-coded.
MSOA_HHI = np.zeros((1, C.shape[0]))
n_partitions = 0
for time, part in zip(my_times[1:], partitions):
    # gets index
    idx = int(np.where(times == float(time))[0][0])

    # labels
    labels = C[:, idx]

    # HHI of every cluster of this partition, ordered as in `part`
    # (NaN for a key of `part` that is missing from hhis)
    part_hhi = hhis['HHI'].reindex(part).values

    # concentrations of the cluster a node is in.
    # The labels are used as positions into `part`, so the lookup is done on the
    # plain array: indexing the string-indexed Series with integers relied on a
    # positional fallback that pandas has deprecated.
    # assumes labels are integers in 0..len(part)-1 — TODO confirm
    s_i = part_hhi[labels]
    MSOA_HHI += s_i**2
    n_partitions += 1

# average over the partitions actually visited (guarded so an empty loop leaves zeros)
MSOA_HHI = MSOA_HHI/max(n_partitions, 1)

100 MSOAs with Lowest HHI 

In [None]:
# Positions of the 100 smallest values in row 0 of MSOA_HHI (argsort sorts ascending).
top_100 = MSOA_HHI.argsort()[0][:100]

In [None]:
%%capture
# load MSOAs
# geopandas import
# NOTE(review): the install is unpinned and sits mid-notebook; consider moving it to
# the top of the notebook and pinning the version.
!pip install geopandas
import geopandas as gpd
# Only the 'msoa11nm' column of the shapefile is kept (presumably the MSOA names — confirm).
MSOAs = gpd.read_file(root + 'MSOAs/MSOAs.shp')['msoa11nm']

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/f7/a4/e66aafbefcbb717813bf3a355c8c4fc3ed04ea1dd7feb2920f2f4f868921/geopandas-0.8.1-py2.py3-none-any.whl (962kB)
[K     |████████████████████████████████| 972kB 2.8MB/s 
[?25hCollecting fiona
[?25l  Downloading https://files.pythonhosted.org/packages/35/72/7de5a2179b75242c69a97ef62b8b569197253e3ff85fe70ade62936608fe/Fiona-1.8.14-cp36-cp36m-manylinux1_x86_64.whl (14.7MB)
[K     |████████████████████████████████| 14.8MB 278kB/s 
[?25hCollecting pyproj>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c3/071e080230ac4b6c64f1a2e2f9161c9737a2bc7b683d2c90b024825000c0/pyproj-2.6.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (10.9MB)
[K     |████████████████████████████████| 10.9MB 48.3MB/s 
Collecting munch
  Downloading https://files.pythonhosted.org/packages/cc/ab/85d8da5c9a45e072301beb37ad7f833cd344e04c817d97e0cc75681d248f/munch-2.5.0-py2.py3-none-any.whl
Collecting cligj>=0.5
  Download

In [None]:
# Export the top_100 entries of MSOAs; reindex puts the rows in top_100's order (ascending score).
# assumes the MSOAs index holds the same 0-based positions that the columns of MSOA_HHI refer to — TODO confirm
MSOAs[MSOAs.index.isin(top_100)].reindex(top_100).to_csv(root + 'top_100.csv')

HHI data added to MSOAs

In [None]:
# Ascending ordering of all positions in row 0 of MSOA_HHI.
order = MSOA_HHI.argsort()[0]
# Wrap the selected rows in a DataFrame so that a score column can be attached.
# NOTE(review): this rebinds MSOAs from the loaded column to a DataFrame, so re-running
# the top_100 export cell afterwards would also write the HHI column.
MSOAs = pd.DataFrame(MSOAs[MSOAs.index.isin(order)])
# Positional assignment: row i receives MSOA_HHI[0][i], so the frame must have as many
# rows as MSOA_HHI has columns, in the same order.
MSOAs['HHI'] = MSOA_HHI[0]
# Save all rows sorted by ascending score.
MSOAs.sort_values('HHI').to_csv(root + 'ordered.csv')