In [1]:
from pathlib import Path # reads paths in the current OS
import pandas as pd
import re
import json
import yaml
import utils as ut
from ipysankeywidget import SankeyWidget
import floweaver as fw
from floweaver import *
import numpy as np

In [2]:
# Read the project paths from the YAML configuration file.
# NOTE(review): yaml.full_load accepts more than plain scalars/mappings; if
# paths.yaml only holds simple keys and strings, yaml.safe_load would be the
# safer loader — confirm before switching.
with open('paths.yaml') as config_file:
    config = yaml.full_load(config_file)

# Both input tables live in the directory configured under 'input_path'.
input_dir = Path(config['input_path'])

# Producers table; missing values are replaced by empty strings.
data = pd.read_csv(input_dir / "pagesos_clean.csv", sep=",").fillna('')

# Comarcas table (semicolon-separated file).
comarcas_df = pd.read_csv(input_dir / "comarcas_catalunya_2019.csv", sep=";")


In [3]:
## Set to True to keep only the producers matching on_fields / off_fields below
## (must be set when running the script).
specific_subset = False

if specific_subset:

    ## Columns that must be > 0 for a producer to be kept
    ## (must be set when running the script, if specific_subset=True).
    on_fields = ['fruit', 'vegetables']
    ## Columns that must be == 0 for a producer to be kept
    ## (must be set when running the script, if specific_subset=True).
    off_fields = ['n_other_prod', 'meat']

    # The frame was loaded with fillna(''), so these columns may hold empty
    # strings; coerce them to numbers (blank -> NaN, which matches neither
    # "> 0" nor "== 0").
    # NOTE(review): confirm that a blank off-field should exclude the producer.
    on_values = data[on_fields].apply(pd.to_numeric, errors='coerce')

    # Reduce row-wise so that one boolean per producer is obtained. The previous
    # version produced a 2-D array for the on-fields and used the builtin all(),
    # which iterates over column labels and is therefore always True.
    # NOTE(review): every on-field is required here; use .any(axis=1) instead
    # if a single on-field should be enough.
    selected = on_values.gt(0).all(axis=1)

    if len(off_fields) > 0:
        off_values = data[off_fields].apply(pd.to_numeric, errors='coerce')
        selected = selected & off_values.eq(0).all(axis=1)

    data['flag'] = selected.astype(int)
else:
    # No subset requested: every producer is kept.
    data['flag'] = 1

## Connections between comarcas
Create the dataframe needed for the Sankey diagram, i.e. the list of all the edges between two comarcas.
It has the following columns: source, target, value.

#### Selecting only the desired subset of producers

In [4]:
# Keep the rows flagged with 1 whose comarca of origin does not contain
# the 'NOTFOUND' marker.
is_flagged = data.flag == 1
origin_not_found = data.comarca_origin.str.contains('NOTFOUND')
data_sel = data[is_flagged & ~origin_not_found]

print(data_sel.shape)

(373, 46)


#### Extracting all the target comarcas from the field 'DONDE' 

In [5]:
## A producer listing at least this many targets in 'DONDE' is recorded as a
## single edge towards 'Tota Catalunya' instead of one edge per target.
ALL_CATALONIA_MIN_TARGETS = 40

## One (source, target, value) record per producer/target connection.
edge_records = []
for origin, where in zip(data_sel.comarca_origin, data_sel.DONDE):
    targets = where.split(", ")

    if len(targets) >= ALL_CATALONIA_MIN_TARGETS:
        edge_records.append((origin, 'Tota Catalunya', 1))
    else:
        edge_records.extend((origin, target, 1) for target in targets)

df = pd.DataFrame(edge_records, columns=['source', 'target', 'value'])

## Removing records which have no info in the target or in the source field.
## The explicit copy keeps the column assignments below from raising
## SettingWithCopyWarning on the filtered frame.
df = df[(df.target != '') & (df.source != '')].copy()

## Uniformizing names of comarcas between the pagesos dataset and the
## Catalonian comarcas dataset (comarcas_df): typographic apostrophes are
## replaced by plain ones and 'Osona / Lluçanès' is mapped to 'Osona'.
standard_names = {
    'Osona / Lluçanès': 'Osona',
    'Ribera d’Ebre': "Ribera d'Ebre",
    'Pla de l’Estany': "Pla de l'Estany",
    'Pla d’Urgell': "Pla d'Urgell",
}
df['target'] = df['target'].replace(standard_names)
df['source'] = df['source'].replace(standard_names)

#### Creating the final dataframe by grouping by (source, target) pairs

In [6]:
## Normalization factor: the total number of connections per comarca of origin.
df_norm = (
    df.groupby(['source'])['value']
      .sum()
      .reset_index(name='norm_factor')
)

## Number of connections for every (source, target) pair, largest first.
df_edges = (
    df.groupby(['source', 'target'])['value']
      .sum()
      .reset_index(name='value')
      .sort_values(['value'], ascending=False)
)

## Attach the per-source total to each edge and express the edge value
## as a percentage of that total.
df_edges = df_edges.merge(df_norm, how='inner', on='source')
edge_counts = df_edges['value'].astype(float)
source_totals = df_edges['norm_factor'].astype(float)
df_edges['norm_value'] = edge_counts / source_totals * 100

## Show the ten edges with the highest number of connections.
df_edges.sort_values(by=['value'], ascending=False).head(10)



Unnamed: 0,source,target,value,norm_factor,norm_value
0,Vallès Oriental,Vallès Oriental,20,55,36.363636
10,Baix Llobregat,Baix Llobregat,17,42,40.47619
19,Alt Penedès,Tota Catalunya,15,64,23.4375
34,Bages,Bages,13,43,30.232558
51,Priorat,Tota Catalunya,12,24,50.0
20,Alt Penedès,Alt Penedès,11,64,17.1875
61,Alt Empordà,Alt Empordà,11,47,23.404255
80,Anoia,Anoia,10,49,20.408163
99,Noguera,Tota Catalunya,9,12,75.0
102,Osona,Osona,9,40,22.5


## Sankey diagram

In [7]:
## Reference: https://github.com/psychemedia/parlihacks/blob/master/notebooks/MigrantFlow.ipynb

## Edge list given to floweaver: one row per (source, target) pair with its count.
flows = df_edges[['source', 'target', 'value']]

## Size of the rendered widget, in pixels.
size = dict(width=870, height=1000)

## Comarca names from the reference table, computed once and shared by both
## sides of the diagram.
## NOTE(review): a flow whose source or target is missing from this list
## (possibly the 'Tota Catalunya' target) is presumably not drawn as a link
## between the two groups — confirm.
comarca_names = list(comarcas_df.Comarca.unique())

## One partition per side, so that each comarca becomes its own node.
comarcas_out = fw.Partition.Simple('process', comarca_names)
comarcas_in = fw.Partition.Simple('process', comarca_names)

nodes = {
    'Comarcas_productoras': fw.ProcessGroup(comarca_names, partition=comarcas_out),
    'Comarcas_entrega': fw.ProcessGroup(comarca_names, partition=comarcas_in),
}

ordering = [
    ['Comarcas_productoras'],  # put "Comarcas_productoras" on the left...
    ['Comarcas_entrega'],      # ... and "Comarcas_entrega" on the right.
]

bundles = [
    fw.Bundle('Comarcas_productoras', 'Comarcas_entrega'),
]

sdd = fw.SankeyDefinition(nodes, bundles, ordering)

## Render the diagram, colouring links by 'value', and save it as a PNG.
## The widget is the last expression so that the notebook displays it.
fw.weave(sdd, flows, link_color=fw.QuantitativeScale('value'), measures='value') \
    .to_widget(**size) \
    .auto_save_png('mySankey.png')




SankeyWidget(groups=[{'id': 'Comarcas_productoras', 'type': 'process', 'title': '', 'nodes': ['Comarcas_produc…