In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.offline as pyo
from scipy import signal
import scipy.cluster.hierarchy as spc
from pandas import read_excel
from ipywidgets import widgets
from ipywidgets import interactive, HBox, VBox
import plotly.io as pio

In [63]:
# Three-letter country codes considered by this analysis (used with
# locationmode='ISO-3' when drawing the map).
country_list_iso = ["FRA","BEL","BGR","CYP","CZE","DEU","DNK","EST","ESP","FIN","AUT","GBR","GRC","HRV","HUN","IRL","ITA","LTU","LUX","LVA","MLT","NLD","POL","PRT","ROU","SWE","SVN","SVK"]

# All input CSVs are expected to live in this single folder.
PATH = "data_nico_quarterly/"
# Keep only names that really end in ".csv" (a substring test would also accept
# e.g. "x.csv.bak"), and sort them: os.listdir() returns entries in arbitrary
# order, which would otherwise make the merge/column order vary between runs.
names = sorted(name for name in os.listdir(PATH) if name.endswith(".csv"))

In [39]:
def proc_cols(df, value_name):
    """Rename the ``Value`` column and strip the metadata columns.

    Args:
        df: DataFrame to process; it must contain the columns ``INDICATOR``,
            ``SUBJECT``, ``MEASURE``, ``FREQUENCY`` and ``Flag Codes``
            (``drop`` raises ``KeyError`` otherwise).
        value_name: New name given to the ``Value`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    metadata_columns = ['INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'Flag Codes']
    renamed = df.rename(columns={"Value": value_name})
    return renamed.drop(columns=metadata_columns)

def generate_df(csv_names,path):
    """Read several CSV files and outer-merge them on ``LOCATION`` and ``TIME``.

    Each file contributes one value column, named after the file without its
    extension (e.g. ``HUR.csv`` -> ``HUR``).

    Args:
        csv_names: Iterable of CSV file names located in ``path``.
        path: Folder containing the files; a trailing separator is optional.

    Returns:
        The merged DataFrame, or ``None`` when ``csv_names`` is empty.
    """
    merged = None
    for name in csv_names:
        # splitext removes the extension regardless of its length.
        value_name = os.path.splitext(name)[0]
        # os.path.join inserts the separator when `path` lacks a trailing one.
        frame = proc_cols(pd.read_csv(os.path.join(path, name)), value_name)
        if merged is None:
            merged = frame
        else:
            # Outer join so (LOCATION, TIME) pairs missing from one file are kept.
            merged = pd.merge(merged, frame, how='outer', on=['LOCATION','TIME'])
    return merged

def df_date_format(df):
    """Turn quarter suffixes into first-day-of-quarter date suffixes.

    ``-Q1``/``-Q2``/``-Q3``/``-Q4`` become ``-01-01``/``-04-01``/``-07-01``/
    ``-10-01`` (so ``2004-Q2`` becomes ``2004-04-01``). The substitution is a
    regex replace applied to the whole frame, not only to one column.

    Returns:
        A new DataFrame with the replacements applied.
    """
    first_month_of_quarter = {1: '01', 2: '04', 3: '07', 4: '10'}
    replacements = {
        '-Q%d' % quarter: '-%s-01' % month
        for quarter, month in first_month_of_quarter.items()
    }
    return df.replace(replacements, regex=True)

In [40]:
# Load every CSV listed in `names` from PATH and merge them into one table
# keyed by LOCATION and TIME.
df = generate_df(names,PATH)

In [41]:
# Preview the first rows of the merged table.
df.head()

Unnamed: 0,LOCATION,TIME,GDP_growth,HUR
0,GBR,2004-Q1,0.543784,4.7
1,GBR,2004-Q2,0.357844,4.7
2,GBR,2004-Q3,0.165857,4.6
3,GBR,2004-Q4,0.329016,4.7
4,GBR,2005-Q1,0.840414,4.633333


In [45]:
# Split the merged table per indicator and rewrite the quarter labels in TIME
# (e.g. "2004-Q1") as first-day-of-quarter date strings ("2004-01-01").
df_hur = df_date_format(df[['LOCATION','TIME','HUR']])
df_gdp = df_date_format(df[['LOCATION','TIME','GDP_growth']])

In [46]:
# Reshape to wide format: one row per TIME value, one column per LOCATION.
# The wide tables are built straight from `df` instead of from the previous
# values of `df_hur`/`df_gdp`: this cell rebinds those very names, so pivoting
# them in place made a second run of the cell fail with a KeyError.
df_hur = df_date_format(df[['LOCATION','TIME','HUR']]).pivot_table(values='HUR', index='TIME', columns='LOCATION')
df_gdp = df_date_format(df[['LOCATION','TIME','GDP_growth']]).pivot_table(values='GDP_growth', index='TIME', columns='LOCATION')

In [47]:
# Preview the wide HUR table (rows: TIME, columns: LOCATION).
df_hur.head()

LOCATION,AUT,BEL,CZE,DEU,DNK,ESP,EST,FIN,FRA,GBR,...,LTU,LUX,LVA,NLD,OECD,POL,PRT,SVK,SVN,SWE
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01,5.633333,8.533334,8.433333,10.06667,5.666667,11.2,10.1,9.0,8.8,4.7,...,11.4,4.733333,11.76667,5.433333,7.030766,19.9,7.266667,18.9,6.5,7.333333
2004-04-01,5.366667,7.766667,8.333333,10.3,5.466667,11.2,9.733334,8.966666,8.8,4.7,...,10.9,5.0,11.43333,5.766667,6.945943,19.33333,7.666667,18.8,6.333333,7.366667
2004-07-01,5.466667,8.866667,8.233334,10.36667,5.6,10.96667,10.2,8.7,8.8,4.6,...,10.76667,5.066667,11.76667,5.666667,6.889487,18.86667,8.0,18.03333,6.166667,7.433333
2004-10-01,5.5,8.366667,8.266666,10.6,5.333333,10.53333,9.766666,8.666667,8.9,4.7,...,10.5,5.0,12.0,5.766667,6.823681,18.43333,8.166667,17.7,6.3,7.366667
2005-01-01,5.433333,8.466666,8.066667,10.83333,5.233333,9.9,9.033334,8.6,8.7,4.633333,...,9.533334,4.766667,11.0,5.966667,6.710604,18.2,8.4,17.13333,6.433333,7.366667


In [7]:
def plot_lines(df,value_smooth):
    """Build a Plotly figure with one line trace per column of ``df``.

    Args:
        df: DataFrame whose index supplies the x values and whose columns
            supply one line each.
        value_smooth: ``0`` plots the raw values; any other value is passed as
            the window length of a Savitzky-Golay filter of polynomial order 3
            that is applied to each column before plotting.

    Returns:
        The figure (it is not shown here).
    """
    figure = go.Figure()
    smoothing_enabled = value_smooth != 0
    for column in df.columns:
        y_values = df[column]
        if smoothing_enabled:
            y_values = signal.savgol_filter(y_values, value_smooth, 3)
        figure.add_trace(go.Scatter(x=df.index, y=y_values, mode='lines', name=column))
    return figure

In [84]:
# Unsmoothed (value_smooth=0) line charts, one trace per column.
plot_lines(df_hur,0).show()
plot_lines(df_gdp,0).show()

In [108]:
# Pairwise Spearman rank correlation between the columns of df_hur.
df_corr_hur = df_hur.corr(method ='spearman')
# Disabled alternative: correlate the GDP growth table instead.
# df_corr_hur = df_gdp.corr(method ='spearman')

In [109]:
# Correlation matrix that is handed to the clustering step.
corr = df_corr_hur

In [110]:
def getIdx(corr,variableTweak):
    """Cluster the rows of ``corr`` hierarchically and return flat cluster ids.

    Pairwise distances between the rows are computed with ``pdist`` (default
    metric), linked with complete linkage, and the tree is cut at
    ``variableTweak`` times the largest pairwise distance.

    Args:
        corr: 2-D array-like whose rows are the observations to cluster.
        variableTweak: Fraction of the maximum pairwise distance used as the
            cut height.

    Returns:
        Array with one cluster id per row of ``corr``.
    """
    row_distances = spc.distance.pdist(corr)
    tree = spc.linkage(row_distances, method='complete')
    cut_height = variableTweak * row_distances.max()
    return spc.fcluster(tree, cut_height, 'distance')

def getGroup(number,idx):
    """Return the positions in ``idx`` whose value equals ``number``.

    Args:
        number: Cluster id to look for.
        idx: Sequence of cluster ids.

    Returns:
        List of integer positions, in ascending order (empty if no match).
    """
    return [position for position, label in enumerate(idx) if label == number]


def plotDifferentGroup(idx,dfPlot,indicePlot):
    """Show one line chart per cluster.

    Args:
        idx: Sequence of cluster ids (1-based), one per column of ``dfPlot``
            and in the same order as those columns.
        dfPlot: DataFrame whose columns are the series to plot.
        indicePlot: Smoothing argument forwarded to ``plot_lines``
            (``0`` means no smoothing).

    Nothing is plotted when ``idx`` is empty.
    """
    n_groups = max(idx) if len(idx) else 0
    # The titles deliberately exchange the labels of clusters 2 and 4. Only do
    # so when both clusters exist; otherwise a chart would be titled with a
    # group number that does not exist.
    label_swap = {2: 4, 4: 2} if n_groups >= 4 else {}
    for group_id in range(1, n_groups + 1):
        # getGroup yields column positions, so select the columns positionally.
        members = dfPlot[dfPlot.columns[getGroup(group_id, idx)]]
        label = label_swap.get(group_id, group_id)
        title = 'Visualization of unemployment for group ' + str(label)
        plot_lines(members, indicePlot).update_layout(title_text=title).show()

# Cluster the rows of `corr`, cutting the tree at 65% of the largest pairwise
# row distance.
idx = getIdx(corr,0.65)

In [111]:
# One unsmoothed (indicePlot=0) line chart per cluster of df_hur columns.
plotDifferentGroup(idx,df_hur,0)

In [112]:
def plotMapStatic(idx, locations=None):
    """Show a choropleth map of Europe coloured by cluster id.

    Args:
        idx: Sequence of cluster ids, one per location.
        locations: ISO-3 codes in the same order as ``idx``. When omitted, the
            row labels of the module-level ``corr`` matrix are used, because
            those are the rows the ids returned by ``getIdx(corr, ...)``
            refer to.

    Raises:
        ValueError: If ``locations`` and ``idx`` have different lengths.
    """
    if locations is None:
        # idx[i] describes row i of `corr`. Those labels come from the pivoted
        # LOCATION columns, whose order (and content) differs from
        # country_list_iso, so pairing idx with country_list_iso coloured
        # countries with another country's cluster.
        locations = list(corr.index)
    else:
        locations = list(locations)
    if len(locations) != len(idx):
        raise ValueError(
            "locations and idx must have the same length, got %d and %d"
            % (len(locations), len(idx))
        )

    fig = go.Figure(data=go.Choropleth(
        locations=locations, # Codes matched against the ISO-3 location mode
        z = idx, # Data to be color-coded
        locationmode = 'ISO-3', # set of locations match entries in `locations`
        #colorscale=[            [0, "rgb(12,140,113)"],[0.5, "rgb(12,140,113)"],[0.5, "rgb(26,188,156)"],[1, "rgb(26,188,156)"]],
        colorscale="Blugrn",
        colorbar_title = "Cluster",
        showscale = False,
    ))

    fig.update_layout(
        title_text = 'Grouping countries according to similarity in unemployment',
        geo_scope='europe', # limit map scope to Europe
        autosize=False,
        width=800,
        height=800,
        dragmode = False,
    )
    fig.show()

# Draw the cluster map from a copy of the cluster ids, so `idx` itself is not
# handed to the plotting call.
idx2=idx.copy()  
plotMapStatic(idx2)