Import the relevant libraries.



In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
# from scipy import optimize
# import statsmodels.api as sm
# import statsmodels.formula.api as smf
# import matplotlib.pyplot as plt
# import tensorflow as tf
# import altair as alt
# from linearmodels.iv import IV2SLS
# from tensorflow.keras import datasets, layers, models
# from tensorflow import keras
# import theano
# import pymc3 as pm
# import arviz as az
# import seaborn as sn


# import covid19pandas as cod
from country_codes import eurostat_dictionary
import eurostat
import matplotlib.pyplot as plt


from warnings import filterwarnings
filterwarnings('ignore')
from sklearn import datasets
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

## Dealing with data



In this section we work with the API of Eurostat to get data directly into the notebook.

We look at the data for each variable, decide on the shape we want to have for the dataframe and then merge a number of data sets together. For this we will be using pandas. Further, we use the python file `country_codes.py` which should be in the same folder as this notebook.



### Countries



We focus on the following countries when looking at the data. For the Eurostat data this does not matter so much. But if you want to combine Eurostat data with OECD data, this selection can be useful.



In [None]:
# Country names used to restrict the data sets in this notebook.
# The first Germany entry is the long label used for the pre-1990 territory.
EU_countries = [
    'Belgium',
    'Bulgaria',
    'Czechia',
    'Denmark',
    'Germany (until 1990 former territory of the FRG)',
    'Germany',
    'Estonia',
    'Ireland',
    'Greece',
    'Spain',
    'France',
    'Croatia',
    'Italy',
    'Cyprus',
    'Latvia',
    'Lithuania',
    'Luxembourg',
    'Hungary',
    'Malta',
    'Netherlands',
    'Austria',
    'Poland',
    'Portugal',
    'Romania',
    'Slovenia',
    'Slovakia',
    'Finland',
    'Sweden',
    'United Kingdom',
    'Iceland',
    'Liechtenstein',
    'Norway',
    'Switzerland',
    'Bosnia and Herzegovina',
]

### Mortality



The [Eurostat website](https://ec.europa.eu/eurostat/data/database?node_code=hlth_cd_apr) has a browser where you can look for data. Here we are looking for data on mortality. You can click on the link to the data browser to see the [details of the variable](https://ec.europa.eu/eurostat/databrowser/view/hlth_cd_apr/default/table?lang=en). At the top-left of the screen you can see the name of the variable in the line "`online data code: HLTH_CD_APR`". We pass this name (in lower case) to the `get_data_df` function below.

So we call this method and collect the information in the dataframe `df`. Then we check what `df` looks like:



In [None]:
# Download the Eurostat table with online data code 'hlth_cd_apr' into `df`.
df = eurostat.get_data_df('hlth_cd_apr')
# Display the first rows to inspect the columns.
df.head()

So we have a number of columns with abbreviations in them and then we have data for the years 2011-2017. Use the website of the variable to figure out what the abbreviations mean. To illustrate, the column `mortalit` gives three measures of mortality:



In [None]:
# List the distinct codes that occur in the `mortalit` column.
df.mortalit.unique()

We will be interested in preventable ('PRVT') and treatable ('TRT') mortality.

First, let's rename the country column `geo\TIME_PERIOD` and use country names instead of abbreviations. The column name contains a backslash `\`, which Python normally treats as the start of an escape sequence. To make sure the `\` is read literally (not as a special symbol) we need to "escape" it, which in a regular string is done by writing `\\`. We use the `eurostat_dictionary` to turn the country abbreviations into country names.

Note that to change the column name we use `.rename`; to change values in a row, we use `.replace`. The replacements are provided using a python dictionary: `{'old_name':'new_name'}`.

If you are wondering why we use `inplace=True`, just run the code block without this to see the difference.



In [None]:
# The Eurostat column header contains a literal backslash. It is written as
# '\\' so that Python does not interpret '\T' as an (invalid) escape sequence.
df.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)
# Add a `country` column by mapping the values of `geo` through
# `eurostat_dictionary`; values without an entry are left unchanged.
df['country'] = df['geo'].replace(eurostat_dictionary)
df.head()

Now we will select the values that we are interested in: only EU countries, both males and females, both preventable and treatable mortality, unit of measurement rate 'RT' (not number 'NR') and all diseases (e.g. not the subset "[A00-A09] Intestinal infectious diseases").

For selection, we can use `==` or `.isin()`. With numbers we can also use smaller/greater than `<,>` etc.

After this selection, we can drop some columns to make the dataframe a bit easier to handle.



In [None]:
# Row selection: listed countries, sex codes M and F, mortality codes PRVT and
# TRT, unit RT and the icd10 aggregate TOTAL.
keep_rows = (
    df.country.isin(EU_countries)
    & df.sex.isin(["M", "F"])
    & df.mortalit.isin(["PRVT", "TRT"])
    & (df.unit == "RT")
    & (df.icd10 == "TOTAL")
)
# Drop the now-constant columns on the filtered result and rebind `df`.
# Dropping in place on a filtered selection is what triggers pandas'
# SettingWithCopyWarning, so a new frame is created instead.
df = df[keep_rows].drop(["unit", "icd10", "geo"], axis=1)
df.head()

In [None]:
# Reshape from wide (one column per year) to long (one row per year).
id_columns = ['country', 'sex', 'mortalit']
# Pick the year columns by name rather than by position: a positional slice
# silently selects the wrong columns when the column order is not what was
# expected.
# NOTE(review): assumes `freq` is the only remaining non-year column besides
# the identifiers - confirm against df.columns.
year_columns = [col for col in df.columns if col not in id_columns + ['freq']]
df = pd.melt(df,
             id_vars=id_columns,
             value_vars=year_columns,
             var_name='year',
             value_name='rate')
df.head()

Instead of one column `rate` we want to have two columns (i.e. two variables); one corresponds to PRVT, the other to TRT. For this we use [unstack](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html). We put the identifying columns in an order such that the last column refers to `mortalit` which contains the two values PRVT and TRT. These columns become the index of the dataframe. Then we unstack the dataframe on the last column of the index, which is the default value of `unstack()`. This pivots the column `mortalit` into two separate columns PRVT and TRT.



In [None]:
# Index by the identifying columns with `mortalit` as the innermost level,
# then pivot that innermost level into columns (the default of unstack()).
df = df.set_index(['country', 'year', 'sex', 'mortalit']).unstack()
df.head()

Finally, we reset the index (such that it no longer features the hierarchy, with 'rate' and 'mortalit', shown above) and rename the columns to make them easier to read/understand.



In [None]:
# Turn the index levels back into ordinary columns.
df = df.reset_index()
# Collapse every two-level column label into a single string, e.g.
# ('rate', 'PRVT') -> 'rate PRVT' and ('country', '') -> 'country'.
flat_labels = []
for label_parts in df.columns.values:
    flat_labels.append(' '.join(label_parts).strip())
df.columns = flat_labels
# Give the two rate columns readable names.
df = df.rename(columns={'rate PRVT': 'Preventable mortality',
                        'rate TRT': 'Treatable mortality'})
df.head()

Now we consider a number of other variables. There you can practice the steps above to get these dataframes into the right shape. Finally, we merge each dataframe with the one previously created.



### GDP per capita



Go to this [page](https://ec.europa.eu/eurostat/data/database?node_code=nama_10_pc) to find the variable name for "Main GDP aggregates per capita"; fill in the "dots" in the following code:



In [None]:
def download_oecd_data(data_code): # 'nama_10_pc'
    """Download a Eurostat table and reshape it to long format.

    Despite its name, this function fetches the data from Eurostat through
    ``eurostat.get_data_df``.

    Parameters
    ----------
    data_code : str
        Eurostat online data code, e.g. ``'nama_10_pc'``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``country``, ``unit``, ``year``
        and ``gdp``. Any other column of the downloaded table is not carried
        over by the melt.
    """
    df_n = eurostat.get_data_df(data_code)
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)
    # Drop columns that contain only NaN values.
    df_n.dropna(axis=1, how='all', inplace=True)
    # NOTE(review): the positional slice assumes the first four columns are
    # non-year columns and the last one is `country` - confirm for other
    # data codes than 'nama_10_pc'.
    df_n = pd.melt(df_n,
                   id_vars=['country', 'unit'],
                   value_vars=df_n.columns[4:-1],
                   var_name='year',
                   value_name='gdp')
    return df_n

# Fetch the table with data code 'nama_10_pc' in long format and display it.
df_n = download_oecd_data('nama_10_pc')
df_n

### Income quantiles



Use [this page](https://ec.europa.eu/eurostat/databrowser/view/icw_res_02/default/table?lang=en) to get the variable name for "Mean and median economic resources of households by income, consumption and wealth quantiles - experimental statistics".



We drop the year 2010 as it does not lie in the period for which we have the other data that we download.



In [None]:
def get_mean_median_income():
    """Download Eurostat table 'icw_res_02' and return one column per income quintile.

    Rows are restricted to ``indic_il == 'INC_DISP'``, ``statinfo == 'AVG'``,
    income quantiles QU1-QU5 and the 'TOTAL' aggregate for both the
    expenditure and the wealth quantile. The year 2010 is dropped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(country, year)``; the columns are a two-level index
        ``('mean/median income', <quant_inc>)``.
    """
    df_n = eurostat.get_data_df('icw_res_02')
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    wanted_rows = (
        (df_n.indic_il == 'INC_DISP')
        & (df_n.statinfo == 'AVG')
        & df_n.quant_inc.isin(['QU1', 'QU2', 'QU3', 'QU4', 'QU5'])
        & (df_n.quant_expn == 'TOTAL')
        & (df_n.quant_wlth == 'TOTAL')
    )
    # drop() returns a new frame, so later column assignments do not act on
    # a view of the downloaded table.
    df_n = df_n[wanted_rows].drop(
        ['unit', 'quant_expn', 'quant_wlth', 'indic_il', 'statinfo'], axis=1)
    # 2010 lies outside the period covered by the other data sets; tolerate
    # its absence instead of raising KeyError.
    df_n = df_n.drop(columns=['2010'], errors='ignore')

    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)
    df_n = df_n.drop(['geo'], axis=1)

    id_columns = ['country', 'quant_inc']
    # Select the year columns by name; a positional slice can accidentally
    # include an identifier column.
    # NOTE(review): assumes `freq` is the only remaining non-year column
    # besides the identifiers - confirm against the downloaded table.
    year_columns = [col for col in df_n.columns if col not in id_columns + ['freq']]
    df_n = pd.melt(df_n,
                   id_vars=id_columns,
                   value_vars=year_columns,
                   var_name='year',
                   value_name='mean/median income')

    df_n = df_n.set_index(['country', 'year', 'quant_inc'])

    # Remove duplicate index entries, keeping the last occurrence. The mask
    # must be negated: `duplicated` marks the rows to discard, so indexing
    # with it directly would keep only the duplicates and lose unique rows.
    df_n = df_n[~df_n.index.duplicated(keep='last')]

    return df_n.unstack(level='quant_inc')

# Build the income-by-quintile table and display it.
df_n = get_mean_median_income()
df_n


In [None]:
# Bring `country` and `year` back as columns of the right-hand table.
income = df_n.reset_index()
# After an unstack the right-hand table has two-level column labels while
# `df` has plain ones; pandas does not merge frames with a different number
# of column levels, so flatten the labels first
# (e.g. ('mean/median income', 'QU1') -> 'mean/median income QU1').
if isinstance(income.columns, pd.MultiIndex):
    income.columns = [' '.join(col).strip() for col in income.columns.values]
# Left join: keep every row of `df`, adding income columns where available.
merged_df = df.merge(income, on=['country', 'year'], how='left')
merged_df.head()

In [None]:
# Download the Eurostat table with data code 'lfsa_urgan' and inspect its first rows.
df_n = eurostat.get_data_df('lfsa_urgan') 
df_n.head()

In [None]:
# Download the Eurostat table with data code 'yth_demo_080'.
df_n = eurostat.get_data_df('yth_demo_080')
# '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

# Replace the geo codes by the names from `eurostat_dictionary`.
df_n['country'] = df_n['geo'].replace(eurostat_dictionary)

df_n.drop(['geo'], axis=1, inplace=True)
# Show the last rows of the table.
df_n.tail()

In [None]:

def get_3rd_country_nationals_returned_annualty(country):
    '''Third country nationals returned following an order to leave - annual data (rounded)

    Downloads Eurostat table 'migr_eirtn', keeps the rows for `country` with
    age 'TOTAL', sex 'T' and c_dest 'THRD', sums the remaining rows per year,
    prints and plots the result.

    Parameters
    ----------
    country : str
        Country name as produced by `eurostat_dictionary`, e.g. 'Germany'.

    Returns
    -------
    pandas.Series
        Sum over the selected rows, indexed by the year columns.
    '''
    df_n = eurostat.get_data_df('migr_eirtn')
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)

    # One combined row filter instead of several successive ones.
    selected_rows = (
        (df_n['age'] == 'TOTAL')
        & (df_n['country'] == country)
        & (df_n['sex'] == 'T')
        & (df_n['c_dest'] == 'THRD')
    )
    country_data = df_n[selected_rows]

    # Every column that is not a dimension is treated as a year column.
    non_year_columns = ['country', 'freq', 'c_dest', 'age', 'sex',
                        'citizen', 'unit', 'geo']
    year_columns = [col for col in country_data.columns
                    if col not in non_year_columns]

    # Summing over rows for each year
    summed_data = country_data[year_columns].sum()
    print(summed_data)

    # Plotting the summed data
    plt.figure(figsize=(10, 6))
    plt.plot(summed_data.index, summed_data.values, marker='o', linestyle='-')
    plt.xlabel('Years')
    plt.ylabel('Summed Migration Data')
    plt.title(f'Third country nationals returned following an order to leave - annual data  (Summed Migration Data for {country} (Sex=T, c_dest=THRD))')
    plt.grid(True)
    plt.show()

    return summed_data

# Print, plot and keep the yearly sums for Germany.
df_n = get_3rd_country_nationals_returned_annualty('Germany')



In [None]:

# NOTE(review): `get_percentage_youth_at_home` is not defined anywhere in the
# visible part of this notebook; unless it is defined elsewhere this cell
# raises NameError - confirm whether `youth_at_home` was meant.
df_n = get_percentage_youth_at_home('France')

In [None]:

def youth_at_home(COUNTRY, SEX='T', AGE='Y20-29'):
    """Plot the share of young people living with their parents.

    Downloads Eurostat table 'ilc_lvps08', keeps the rows matching the
    requested country, sex and age group, and plots the values per year.

    Parameters
    ----------
    COUNTRY : str
        Country name as produced by `eurostat_dictionary`.
    SEX : str, optional
        Value of the `sex` column to keep. Defaults to 'T'.
    AGE : str, optional
        Value of the `age` column to keep. Defaults to 'Y20-29'. The
        available values are printed on every call.

    Returns
    -------
    pandas.DataFrame
        Transposed selection: one row per year column, one column per
        selected ``(sex, age, country)`` row (at most the last three).
    """
    df_n = eurostat.get_data_df('ilc_lvps08') # YOUNG PEOPLE LIVING AT HOME
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    df_n = df_n.drop(['unit', 'freq'], axis=1)
    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)
    df_n = df_n.drop(['geo'], axis=1)

    # Show which age groups exist, to help choosing AGE.
    print(df_n['age'].unique())

    selected_rows = (
        (df_n['country'] == COUNTRY)
        & (df_n['sex'] == SEX)
        & (df_n['age'] == AGE)
    )
    df_n = df_n[selected_rows]

    # The years 2003 and 2004 are excluded from the plot; do not fail when
    # the table does not contain them.
    df_n = df_n.drop(['2003', '2004'], axis=1, errors='ignore')

    df_n = df_n.set_index(['sex', 'age', 'country'])

    # Keep at most the last three matching rows and put the years on the rows.
    df_n = df_n.tail(3).T

    plt.figure(figsize=(10, 6))

    for column in df_n.columns:
        plt.plot(df_n.index, df_n[column], marker='o', linestyle='-', label=column)

    plt.xlabel('Years')
    plt.ylabel('Percentage living with parents')
    plt.title(f'Percentage of youth ({SEX}) living with parents in {COUNTRY}')
    plt.legend()  # Add a legend to differentiate the columns
    plt.grid(True)
    plt.show()

    return df_n

# One figure per sex code ('M', 'F', 'T') for the Netherlands, age group 'Y20-29'.
# `df_n` is overwritten by each call, so only the last result is kept.
df_n = youth_at_home('Netherlands', 'M', 'Y20-29')
df_n = youth_at_home('Netherlands', 'F', 'Y20-29')
df_n = youth_at_home('Netherlands', 'T', 'Y20-29')

 

In [None]:
# Pass the sex and age group explicitly; calling with the country alone omits
# the other arguments of youth_at_home.
df_n = youth_at_home('Netherlands', 'T', 'Y20-29')

In [None]:
# Pass the sex and age group explicitly; calling with the country alone omits
# the other arguments of youth_at_home.
df_n = youth_at_home('France', 'T', 'Y20-29')

In [None]:

def long_term_youth_unemployment(COUNTRY, SEX):
    """Plot long-term youth unemployment for one country and sex.

    Downloads Eurostat table 'yth_empl_130', keeps the rows for the given
    country and sex, and plots one line per remaining row.

    Parameters
    ----------
    COUNTRY : str
        Country name as produced by `eurostat_dictionary`.
    SEX : str
        Value of the `sex` column to keep, e.g. 'T'.

    Returns
    -------
    pandas.DataFrame
        Transposed selection: one row per year column, one column per
        selected ``(sex, age, country)`` row.
    """
    df_n = eurostat.get_data_df('yth_empl_130') # Long-term youth unemployment
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    df_n = df_n.drop(['unit', 'freq'], axis=1)
    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)
    df_n = df_n.drop(['geo'], axis=1)
    print(df_n)

    df_n = df_n[(df_n['country'] == COUNTRY) & (df_n['sex'] == SEX)]

    # Identify each remaining row by (sex, age, country), then put the years
    # on the rows so that every selected row becomes one plotted line.
    df_n = df_n.set_index(['sex', 'age', 'country']).T

    plt.figure(figsize=(10, 6))

    for column in df_n.columns:
        plt.plot(df_n.index, df_n[column], marker='o', linestyle='-', label=column)

    plt.xlabel('Years')
    # The axis label previously read 'Percentage living with parents', a
    # leftover from the youth-at-home plot.
    plt.ylabel('Long-term youth unemployment')
    plt.title(f'Long-term youth unemployment in {COUNTRY}')
    plt.legend()  # Add a legend to differentiate the columns
    plt.grid(True)
    plt.show()

    return df_n

# Plot the series for the Netherlands with sex code 'T'.
df_n = long_term_youth_unemployment('Netherlands', 'T')

#### https://www.rtlnieuws.nl/nieuws/nederland/artikel/5406259/minder-studenten-wonen-op-kamers-bij-ouders-thuis-basisbeurs

"Het totale aantal uitwonende studenten is wel toegenomen, maar dat komt enkel en alleen door een toename van het aantal buitenlandse studenten. Een op de zes studenten komt uit een ander land. In het collegejaar 2015/2016 was minder dan 10 procent van buitenlandse herkomst."


#### https://www.cbs.nl/nl-nl/longread/statistische-trends/2023/de-groeiende-groep-jongvolwassen-thuiswonenden/3-data-en-methode

In [None]:
 # Mobile students from abroad enrolled by education level, sex and country of origin



def mobile_students_from_abroad(COUNTRY, SEX):
    """Plot mobile students from abroad enrolled at levels ED6 and ED7.

    Downloads Eurostat table 'educ_uoe_mobs02', keeps the rows for the given
    country and sex with ``isced11`` equal to 'ED6' or 'ED7', and sums all
    remaining rows per year.

    Parameters
    ----------
    COUNTRY : str
        Country name as produced by `eurostat_dictionary`.
    SEX : str
        Value of the `sex` column to keep, e.g. 'T'.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) with one row per year column.
    """
    # Mobile students from abroad enrolled by education level, sex and country of origin
    df_n = eurostat.get_data_df('educ_uoe_mobs02')
    # '\\' keeps the backslash literal; '\T' is not a valid escape sequence.
    df_n.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)

    df_n = df_n.drop(['unit', 'freq'], axis=1)
    df_n['country'] = df_n['geo'].replace(eurostat_dictionary)
    df_n = df_n.drop(['geo'], axis=1)

    selected_rows = (
        (df_n['country'] == COUNTRY)
        & (df_n['sex'] == SEX)
        & df_n['isced11'].isin(['ED6', 'ED7'])
    )
    df_n = df_n[selected_rows]

    df_n = df_n.set_index(['sex', 'country', 'partner', 'isced11'])

    # Sum over all rows, i.e. over every partner and both education levels.
    # NOTE(review): if `partner` also contains aggregate codes, this sum
    # counts students more than once - confirm against the table.
    df_n = pd.DataFrame(df_n.sum())
    print(df_n)

    plt.figure(figsize=(10, 6))

    for column in df_n.columns:
        plt.plot(df_n.index, df_n[column], marker='o', linestyle='-', label=column)

    plt.xlabel('Years')
    # Label and title previously described the youth-at-home plot they were
    # copied from; they now describe the data that is actually plotted.
    plt.ylabel('Mobile students from abroad (sum over ED6 and ED7)')
    plt.title(f'Mobile students from abroad ({SEX}) enrolled in {COUNTRY}')
    plt.legend()  # Add a legend to differentiate the columns
    plt.grid(True)
    plt.show()

    return df_n

# Print and plot the yearly sums for the Netherlands with sex code 'T'.
df_n = mobile_students_from_abroad('Netherlands', 'T')



 

##  Formulating the causal model
Suppose that policy X was introduced in year XXXX. To measure its effect, the outcome variable is the XXXX the following year. 

POSSIBLE VARIABLES;
1. Percentage of foreign students 
2. Rent prices (average)
3. ...

### The importance of time

1) Activity prior to the treatment (causes the treatment)
2) Activity after the treatment (is the outcome of applying treatment)


In [None]:
# Causal graph in DOT syntax: `treatment` affects `post_spends`; `pre_spends`,
# `signup_month`, `Z` and the unobserved confounders `U` point into `treatment`.
# NOTE(review): this cell uses `i`, `dowhy` and the columns `signup_month`,
# `user_id`, `treatment`, `month` and `spend`, none of which are defined,
# imported or created in the visible part of this notebook - confirm before
# running.
causal_graph = """digraph {
treatment[label="Program Signup in month i"];
pre_spends;
post_spends;
Z->treatment;
U[label="Unobserved Confounders"]; 
pre_spends -> treatment;
treatment->post_spends;
signup_month->post_spends; signup_month->pre_spends;
signup_month->treatment;
U->treatment; U->pre_spends; U->post_spends;
}"""

# Post-process the data based on the graph and the month of the treatment (signup)
# Keep rows with signup_month 0 or i, then compute per
# (user_id, signup_month, treatment) group the mean spend over months before i
# (`pre_spends`) and over months after i (`post_spends`).
df_i_signupmonth = df[df.signup_month.isin([0,i])].groupby(["user_id", "signup_month", "treatment"]).apply(
    lambda x: pd.Series({'pre_spends': np.sum(np.where(x.month < i, x.spend,0))/np.sum(np.where(x.month<i, 1,0)),
                        'post_spends': np.sum(np.where(x.month > i, x.spend,0))/np.sum(np.where(x.month>i, 1,0)) })
).reset_index()
# print(df_i_signupmonth)
# Build the causal model; the graph is passed as a single line (newlines
# replaced by spaces).
model = dowhy.CausalModel(data=df_i_signupmonth,
                     graph=causal_graph.replace("\n", " "),
                     treatment="treatment",
                     outcome="post_spends")
# NOTE(review): presumably view_model() writes "causal_model.png", which is
# displayed below - confirm.
model.view_model()
from IPython.display import Image, display
display(Image(filename="causal_model.png"))