In [1]:
from pathlib import Path # reads paths in the current OS
import pandas as pd
import re
import json
import yaml
import utils as ut
from ipysankeywidget import SankeyWidget
import floweaver as fw
from floweaver import *
import numpy as np



In [2]:
# Read the project paths from conf/paths.yaml, then load the two input tables
# from the configured input directory.
paths_file = Path('conf') / 'paths.yaml'
with open(paths_file) as config_file:
    config = yaml.full_load(config_file)

input_dir = Path(config['input_path'])

# Missing values are replaced by empty strings in both tables.
data = pd.read_csv(input_dir / "all_data.csv").fillna('')

locations_df = pd.read_csv(input_dir / 'municipis_merge.csv').fillna('')


In [3]:
# ---------------------------------------------------------------------------
# Run parameters (must be set before running the notebook)
#
# specific_subset = False -> the diagram is built from the whole dataset.
# specific_subset = True  -> the diagram is built only from a subset of the
#                            producers, described by:
#     on_fields  : fields which must necessarily be present (i.e. > 0)
#     off_fields : fields which must be absent (i.e. = 0)
#     label      : tag which will appear in the name of the png file
# ---------------------------------------------------------------------------
specific_subset = False

if not specific_subset:
    label = 'all'
else:
    # The three values below must be set by hand when specific_subset is True.
    on_fields = ['n_main_prod']
    off_fields = ['n_other_prod']
    label = 'onlyMainProd'

    print('The sankey diagram will be produced considering only a subset of the whole dataset.')
    print('Especifically, considering just those producers who do sell: ', on_fields)
    if off_fields:
        print('and do not sell: ', off_fields)

    
    
    
    
# Build the 0/1 column ``final_flag`` marking the producers kept for the diagram.
#
# With specific_subset enabled, one helper 0/1 column is added per criterion:
#   * every field in on_fields  must be > 0
#   * every field in off_fields must be == 0
# and final_flag is 1 only for the rows where every helper column is 1.
# Without a subset, every row is flagged.
#
# The helper columns are written by direct assignment (not pd.concat), so the
# cell can be re-run safely: the columns are overwritten instead of being
# duplicated, which would otherwise make the row sums below count them twice.
if specific_subset:
    n_flags_on = len(on_fields)
    n_flags_off = len(off_fields)

    # Column naming kept from the original notebook: when there are no
    # off_fields the "on" flags are called 'flag_<i>', otherwise 'flag_on_<i>'.
    on_prefix = 'flag_on_' if n_flags_off > 0 else 'flag_'

    col_list = []
    for i, col in enumerate(on_fields):
        flag_col = on_prefix + str(i)
        data[flag_col] = np.where(data[col].gt(0), 1, 0)
        col_list.append(flag_col)

    for i, col in enumerate(off_fields):
        flag_col = 'flag_off_' + str(i)
        data[flag_col] = np.where(data[col].eq(0), 1, 0)
        col_list.append(flag_col)

    # A row is selected when all of its criteria flags are set.
    data['final_flag'] = np.where(
        data[col_list].sum(axis=1) == n_flags_on + n_flags_off, 1, 0
    )
else:
    data['final_flag'] = 1



#data.tail()

 

## Connections between comarcas
Create the DataFrame needed for the Sankey diagram, i.e. the list of all the edges between two comarcas.
It has the following columns: `source`, `target`, `value`.

#### Selecting only the desired subset of producers

In [4]:
# Keep the rows flagged for the diagram, dropping those whose comarca_origin
# contains the marker 'NOTFOUND'.
is_flagged = data['final_flag'] == 1
origin_not_found = data['comarca_origin'].str.contains('NOTFOUND')
data_sel = data[is_flagged & ~origin_not_found]

print('Dimension of the subset: ', data_sel.shape)

##print(data.shape)
##data.drop(data[data.flag == 0].index, inplace=True)
##print(data[data.flag==1].shape)

Dimension of the subset:  (541, 36)


#### Extracting all the target comarcas from the field 'DONDE' 

In [5]:
# Build the edge list: one (source, target, value=1) record per producer and
# per target comarca listed in its comma-separated 'DONDE' field.

# A producer listing at least this many targets is recorded as a single edge
# towards 'Catalunya' instead of one edge per listed comarca.
# NOTE(review): the value 40 comes from the original code — presumably it means
# "delivers (almost) everywhere in Catalonia"; confirm the intended threshold.
CATALUNYA_MIN_TARGETS = 40

edge_records = []
for origin, destinations in zip(data_sel['comarca_origin'], data_sel['DONDE']):
    targets = destinations.split(", ")
    if len(targets) >= CATALUNYA_MIN_TARGETS:
        edge_records.append((origin, 'Catalunya', 1))
    else:
        edge_records.extend((origin, target, 1) for target in targets)

# Make the comarca names uniform between the producers dataset and the Catalan
# comarcas dataset (typographic apostrophes -> plain ones, merged comarca name).
standard_names = {
    'Osona / Lluçanès': 'Osona',
    'Ribera d’Ebre': "Ribera d'Ebre",
    'Pla de l’Estany': "Pla de l'Estany",
    'Pla d’Urgell': "Pla d'Urgell",
}

df = pd.DataFrame(edge_records, columns=('source', 'target', 'value'))

# Drop the records which have no info in the target or in the source field,
# then normalise the names. Filtering and assigning in a single chain returns
# a new frame, so no column is ever set on a slice of another DataFrame
# (which is what triggers pandas' SettingWithCopyWarning).
has_both_ends = df['target'].ne('') & df['source'].ne('')
df = df.loc[has_both_ends].assign(
    target=lambda edges: edges['target'].replace(standard_names),
    source=lambda edges: edges['source'].replace(standard_names),
)

#### Creating the final DataFrame by grouping by (source, target) pairs

In [6]:
# Normalisation factor: the total number of connections per comarca of origin.
df_norm = (
    df.groupby(['source'])['value']
    .sum()
    .reset_index(name='norm_factor')
)

# One row per (source, target) pair with its number of connections,
# sorted by decreasing value.
df_edges = (
    df.groupby(['source', 'target'])['value']
    .sum()
    .reset_index(name='value')
    .sort_values(['value'], ascending=False)
)

# Attach the normalisation factor of the source comarca to every edge and
# express the edge value as a percentage of that factor.
df_edges = df_edges.merge(df_norm, how='inner', on='source')
edge_value = df_edges['value'].astype(float)
source_total = df_edges['norm_factor'].astype(float)
df_edges['norm_value'] = edge_value / source_total * 100


#print(df_edges.head())
#print(df_edges.shape)
#df_edges.sort_values(by=['value'], ascending=False).head(10)



## Sankey diagram

In [7]:
# Sankey definition adapted from:
# https://github.com/psychemedia/parlihacks/blob/master/notebooks/MigrantFlow.ipynb

# floweaver only needs the source/target/value columns of the edge table.
flows = df_edges[['source', 'target', 'value']]

# Size (in pixels) passed to the widget.
size = dict(width=870, height=1000)

# The same list of comarcas is used on both sides of the diagram; it is
# computed once here instead of once per use.
comarcas = list(locations_df.Comarca.unique())

# One partition per side, so that every comarca is drawn as its own node.
comarcas_out = Partition.Simple('process', comarcas)
comarcas_in = Partition.Simple('process', comarcas)

nodes = {
    'Comarcas_productoras': ProcessGroup(list(comarcas), partition=comarcas_out),
    'Comarcas_entrega': ProcessGroup(list(comarcas), partition=comarcas_in),
}

ordering = [
    ['Comarcas_productoras'],   # put "Comarcas_productoras" on the left...
    ['Comarcas_entrega'],       # ... and "Comarcas_entrega" on the right.
]

bundles = [
    Bundle('Comarcas_productoras', 'Comarcas_entrega'),
]

sdd = SankeyDefinition(nodes, bundles, ordering)

# Draw the diagram (links coloured by their value) and save it as a png whose
# name carries the subset label. The widget is the last expression of the
# cell, so it is also displayed in the notebook.
(
    weave(sdd, flows, link_color=QuantitativeScale('value'), measures='value')
    .to_widget(**size)
    .auto_save_png('SankeyDiag_2datasets_' + label + '.png')
)




SankeyWidget(groups=[{'id': 'Comarcas_productoras', 'type': 'process', 'title': '', 'nodes': ['Comarcas_produc…