In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
df = pd.read_csv('estat_hlth_cd_aro.csv')
df.columns = df.columns.str.strip()
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11610 entries, 0 to 11609
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   freq    11610 non-null  object
 1   unit    11610 non-null  object
 2   sex     11610 non-null  object
 3   age     11610 non-null  object
 4   icd10   11610 non-null  object
 5   resid   11610 non-null  object
 6   geo     11610 non-null  object
 7   2016    11610 non-null  object
 8   2017    11610 non-null  object
 9   2018    11610 non-null  object
 10  2019    11610 non-null  object
 11  2020    11610 non-null  object
 12  2021    11610 non-null  object
 13  2022    11610 non-null  object
 14  2023    11610 non-null  object
dtypes: object(15)
memory usage: 1.3+ MB


In [11]:
path = '../../ESTAT_CODELISTS/*.tsv'
files = glob.glob(path)
dataframes = {file.split('/')[-1].replace('.tsv', ''): pd.read_csv(
    file, sep='\t') for file in files}

In [13]:
for col in ["freq", "unit", "sex", "age", "icd10", "resid", "geo"]:
    curr_codelist = dataframes[f"ESTAT_{col.upper()}"]
    df[col] = df[col].map(curr_codelist.set_index('CODE')['Label'])

df.rename(columns={'geo': 'country'}, inplace=True)
df.head()

Unnamed: 0,freq,unit,sex,age,icd10,resid,country,2016,2017,2018,2019,2020,2021,2022,2023
0,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Austria,1164,1830,2011,1980,1850,2047,2242,2444
1,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Belgium,3334,3544,3493,3679,3527,2984,3461,:
2,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Bulgaria,40,33,35,40,57,51,39,:
3,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Switzerland,3347,3781,3798,3856,3789,3452,3978,:
4,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Cyprus,108,117,164,179,155,194,221,:


In [14]:
df.replace(": z", np.nan, inplace=True)
df.replace(":", np.nan, inplace=True)

In [15]:
year_cols = [col for col in df.columns if col.isdigit()]

for col in year_cols:
    df[col] = df[col].astype(str).str.extract(r'(\d+)').astype(float)

In [16]:
df.head()

Unnamed: 0,freq,unit,sex,age,icd10,resid,country,2016,2017,2018,2019,2020,2021,2022,2023
0,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Austria,1164.0,1830.0,2011.0,1980.0,1850.0,2047.0,2242.0,2444.0
1,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Belgium,3334.0,3544.0,3493.0,3679.0,3527.0,2984.0,3461.0,
2,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Bulgaria,40.0,33.0,35.0,40.0,57.0,51.0,39.0,
3,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Switzerland,3347.0,3781.0,3798.0,3856.0,3789.0,3452.0,3978.0,
4,Annual,Number,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Cyprus,108.0,117.0,164.0,179.0,155.0,194.0,221.0,


In [17]:
for col in df.columns:
    if not col.isdigit() and df[col].nunique() == 1:
        df.drop(columns=[col], inplace=True)

In [18]:
df.head()

Unnamed: 0,sex,age,icd10,resid,country,2016,2017,2018,2019,2020,2021,2022,2023
0,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Austria,1164.0,1830.0,2011.0,1980.0,1850.0,2047.0,2242.0,2444.0
1,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Belgium,3334.0,3544.0,3493.0,3679.0,3527.0,2984.0,3461.0,
2,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Bulgaria,40.0,33.0,35.0,40.0,57.0,51.0,39.0,
3,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Switzerland,3347.0,3781.0,3798.0,3856.0,3789.0,3452.0,3978.0,
4,Females,Total,Mental and behavioural disorders (F00-F99),All deaths reported in the country,Cyprus,108.0,117.0,164.0,179.0,155.0,194.0,221.0,


In [19]:
df.to_csv("./clean/estat_hlth_cd_aro.csv", index=False)

In [2]:
df = pd.read_csv('./clean/estat_hlth_cd_aro.csv')

covid_conditions = ['COVID-19, virus identified', 'COVID-19, virus not identified', 'COVID-19, other']
covid_df = df[df['icd10'].isin(covid_conditions)].groupby(['sex', 'age', 'resid', 'country']).sum().reset_index()
covid_df['icd10'] = 'COVID-19'

# Drop the original rows and append the new aggregated row
df = df[~df['icd10'].isin(covid_conditions)]
df = pd.concat([df, covid_df], ignore_index=True)
df.icd10.unique()

array(['Mental and behavioural disorders (F00-F99)',
       'Mental and behavioural disorders due to use of alcohol',
       'Other mental and behavioural disorders (remainder of F00-F99)',
       'Total', 'Drug dependence, toxicomania (F11-F16, F18-F19)',
       'Intentional self-harm', 'COVID-19'], dtype=object)

In [3]:
mental = ['Mental and behavioural disorders (F00-F99)', 'Other mental and behavioural disorders (remainder of F00-F99)']
mental_df = df[df['icd10'].isin(mental)].groupby(['sex', 'age', 'resid', 'country']).sum().reset_index()
mental_df['icd10'] = 'Mental and behavioural disorders'

# Drop the original rows and append the new aggregated row
df = df[~df['icd10'].isin(mental)]
df = pd.concat([df, mental_df], ignore_index=True)
df.icd10.unique()

array(['Mental and behavioural disorders due to use of alcohol', 'Total',
       'Drug dependence, toxicomania (F11-F16, F18-F19)',
       'Intentional self-harm', 'COVID-19',
       'Mental and behavioural disorders'], dtype=object)

In [4]:
df = df.rename(columns={'Drug dependence, toxicomania (F11-F16, F18-F19)': 'Drug dependence, toxicomania'})
df.head()

Unnamed: 0,sex,age,icd10,resid,country,2016,2017,2018,2019,2020,2021,2022,2023
0,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Austria,87.0,104.0,123.0,104.0,108.0,143.0,135.0,144.0
1,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Belgium,99.0,112.0,90.0,93.0,111.0,79.0,103.0,
2,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Bulgaria,1.0,3.0,3.0,3.0,6.0,9.0,1.0,
3,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Switzerland,52.0,48.0,63.0,44.0,58.0,42.0,59.0,
4,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Cyprus,,0.0,2.0,1.0,1.0,1.0,0.0,


In [5]:
df.to_csv("./clean/estat_hlth_cd_aro.csv", index=False)

In [6]:
mental_only = df[df['icd10'] != 'COVID-19']
mental_only

Unnamed: 0,sex,age,icd10,resid,country,2016,2017,2018,2019,2020,2021,2022,2023
0,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Austria,87.0,104.0,123.0,104.0,108.0,143.0,135.0,144.0
1,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Belgium,99.0,112.0,90.0,93.0,111.0,79.0,103.0,
2,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Bulgaria,1.0,3.0,3.0,3.0,6.0,9.0,1.0,
3,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Switzerland,52.0,48.0,63.0,44.0,58.0,42.0,59.0,
4,Females,Total,Mental and behavioural disorders due to use of...,All deaths reported in the country,Cyprus,,0.0,2.0,1.0,1.0,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7807,Total,Total,Mental and behavioural disorders,All deaths reported in the country,Spain,21535.0,22196.0,22851.0,23426.0,22292.0,21094.0,23319.0,22650.0
7808,Total,Total,Mental and behavioural disorders,All deaths reported in the country,Sweden,6715.0,6790.0,7024.0,6683.0,6771.0,5958.0,6370.0,0.0
7809,Total,Total,Mental and behavioural disorders,All deaths reported in the country,Switzerland,5561.0,6232.0,6242.0,6321.0,6243.0,5772.0,6492.0,0.0
7810,Total,Total,Mental and behavioural disorders,All deaths reported in the country,Türkiye,837.0,829.0,779.0,552.0,541.0,725.0,645.0,0.0


In [7]:
mental_only.to_csv("./clean/mental_grouped_only.csv", index=False)