In [None]:
import textwrap
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
import sqlalchemy

import python_functions_sp as pf
import sql_functions_sp as sf

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

In [None]:
# Floats (decimal numbers) should be displayed rounded with 2 decimal places
pd.options.display.float_format = "{:,.2f}".format
# Set style for plots
plt.style.use('fivethirtyeight') 

---

Import tables as DataFrames:

In [None]:
# Load part 1 of the filtered HFA data as a DataFrame.
schema, table = 'capstone_health_education', 'HFA_data_p1_filtered'

# Build the full-table SELECT; the table name is double-quoted inside the statement.
sql_query = 'SELECT * FROM {}."{}";'.format(schema, table)
HFA_data_p1 = sf.get_dataframe(sql_query)

In [None]:
# Load part 2 of the filtered HFA data as a DataFrame.
schema, table = 'capstone_health_education', 'HFA_data_p2_filtered'

# Build the full-table SELECT; the table name is double-quoted inside the statement.
sql_query = 'SELECT * FROM {}."{}";'.format(schema, table)
HFA_data_p2 = sf.get_dataframe(sql_query)

In [None]:
# Load part 3 of the filtered HFA data as a DataFrame.
schema, table = 'capstone_health_education', 'HFA_data_p3_filtered'

# Build the full-table SELECT; the table name is double-quoted inside the statement.
sql_query = 'SELECT * FROM {}."{}";'.format(schema, table)
HFA_data_p3 = sf.get_dataframe(sql_query)

Put column names in lower case and snake case:

In [None]:
# Run the same column-name normalisation helper over each of the three parts.
HFA_data_p1, HFA_data_p2, HFA_data_p3 = (
    pf.columns_lower_snake_case_2(part)
    for part in (HFA_data_p1, HFA_data_p2, HFA_data_p3)
)

In [None]:
HFA_data_p1.shape

In [None]:
HFA_data_p2.shape

In [None]:
HFA_data_p3.shape

Join the three DataFrames:

In [None]:
# Stack the three parts row-wise. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating each part's own row labels, so label-based
# lookups on HFA_df stay unambiguous.
frames = [HFA_data_p1, HFA_data_p2, HFA_data_p3]
HFA_df = pd.concat(frames, ignore_index=True)

In [None]:
# The row count should equal the sum of the row counts of the three parts shown above.
# NOTE(review): the original note read "569+88++442 = 1900"; that only adds up as
# 569 + 889 + 442 = 1900 — confirm against the three shapes.
HFA_df.shape

---

# Exploratory Data Analysis - EDA
- understand the data
- drop unneeded columns
- duplicates
- missing values
- descriptive statistics
- extreme values / outliers

### Understand the Data
remember: dtype 'object' means string or mixed data-types

In [None]:
HFA_df.info()

In [None]:
HFA_df['country_region'].unique()

---

## Explore and Clean the Data

### **Unneeded columns** have already been dropped!

### Check for **duplicates**

In [None]:
HFA_df.duplicated().value_counts()

No duplicates!

### Check for **missing values**

In [None]:
HFA_df.isnull().sum()

For 2020, 2021 and 2022 we have data for less than 50% of the countries / regions OR measure variables => drop these years?!

In [None]:
HFA_df.groupby(['measure_code']).count().iloc[:, :12]

In [None]:
HFA_df.groupby(['measure_code']).count().iloc[:, 12:]

Less data for:
- HFA_411
- some years in HFA_417, especially since 2016
- **HFA_625**: only data for two years => drop it!
- a lot of missing values for years **2020**, **2021**, **2022** => drop them?

Dropping HFA_625: "Number cigarettes consumed per person per year" 

In [None]:
HFA_df['measure_code'].unique()

In [None]:
# Keep every row whose measure code is anything other than HFA_625.
HFA_df = HFA_df.loc[HFA_df['measure_code'].ne('HFA_625')]

In [None]:
HFA_df['measure_code'].unique()

### Rename values, i.e. give values meaningful names

Join various tables to...
- rename the measure variables
- give full name to countries and regions

Rename the measure variables:

In [None]:
# Fetch the measure code / label pairs from the HFA_metadata_sh4 table.
schema = 'capstone_health_education'
table = 'HFA_metadata_sh4'

# sql_query = f'SELECT * FROM {schema}."{table}";'
# Only rows whose "Measure labels" value starts with 'HFA' are selected.
# NOTE(review): the doubled '%%' is presumably there so that a %-style formatting step
# inside sf.get_dataframe leaves a single '%' wildcard — confirm against that helper.
sql_query = f'SELECT "Measure labels", "Unnamed: 1" FROM {schema}."{table}" WHERE "Measure labels" LIKE \'HFA%%\';'
measure_names = sf.get_dataframe(sql_query)

In [None]:
# schema = 'capstone_health_education'
# table = 'HFA_metadata_sh4'

# # sql_query = f'SELECT * FROM {schema}."{table}";'
# sql_query = f'SELECT "Measure labels", "Unnamed: 1" FROM {schema}."{table}";'
# measure_names = sf.get_dataframe(sql_query)

In [None]:
measure_names

In [None]:
# Give the two lookup columns the names used in HFA_df so they can serve as merge key and label.
measure_names = measure_names.rename(
    columns={
        'Measure labels': 'measure_code',
        'Unnamed: 1': 'measure_label',
    }
)

In [None]:
# Attach the human-readable label to every row via its measure code.
# - A 'measure_label' column left over from an earlier run of this cell is dropped first,
#   so re-running the cell does not produce measure_label_x / measure_label_y.
# - validate='many_to_one' makes pandas raise a MergeError if a code occurs more than once
#   in measure_names, which would otherwise silently duplicate rows of HFA_df.
HFA_df = (
    HFA_df
    .drop(columns='measure_label', errors='ignore')
    .merge(measure_names, on='measure_code', how='left', validate='many_to_one')
)

Give full names to countries and regions:

In [None]:
# Load the HFA_metadata_sh5 table, which is used below as the country lookup.
schema, table = 'capstone_health_education', 'HFA_metadata_sh5'

sql_query = 'SELECT * FROM {}."{}";'.format(schema, table)
country_names = sf.get_dataframe(sql_query)

In [None]:
# Discard the identifier columns that are not needed for the code -> name lookup.
unused_country_columns = ['ISO 2', 'ISO 3', 'WHO code', 'Short name']
country_names = country_names.drop(columns=unused_country_columns)

In [None]:
# Rename the code column to the merge key used in HFA_df and the full name to 'name'.
country_names = country_names.rename(
    columns={
        'Code': 'country_region',
        'Full name': 'name',
    }
)

In [None]:
# Load the HFA_metadata_sh6 table, which is used below as the region lookup.
schema, table = 'capstone_health_education', 'HFA_metadata_sh6'

sql_query = 'SELECT * FROM {}."{}";'.format(schema, table)
region_names = sf.get_dataframe(sql_query)

In [None]:
# For regions the short name is kept (see the rename below), so the full name is discarded.
region_names = region_names.drop(columns=['Full name'])

In [None]:
# Rename the code column to the merge key used in HFA_df and the short name to 'name'.
region_names = region_names.rename(
    columns={
        'Code': 'country_region',
        'Short name': 'name',
    }
)

In [None]:
# Stack the country and region lookups into one table with a fresh 0..n-1 index.
cat = pd.concat((country_names, region_names), axis=0, ignore_index=True)

In [None]:
# Attach the country / region name to every row via its country_region code.
# - A 'name' column left over from an earlier run of this cell is dropped first, so
#   re-running the cell does not produce name_x / name_y.
# - merge() already returns a new DataFrame, so no extra .copy() is needed.
HFA_df = (
    HFA_df
    .drop(columns='name', errors='ignore')
    .merge(cat, on='country_region', how='left')
)

In [None]:
HFA_df

In [None]:
# Put the identifying columns first, followed by the year columns '2000' .. '2022'.
year_columns = [str(year) for year in range(2000, 2023)]
HFA_df = HFA_df[['measure_code', 'measure_label', 'country_region', 'name'] + year_columns]

In [None]:
HFA_df #.sample(10)

### Do some **descriptive statistics**

In [None]:
HFA_df.iloc[:, :15].describe()

In [None]:
HFA_df.iloc[:, 15:].describe()

Short explanation of the reported measures:
- count: Indication of how many values are present in the columns (NaNs/missing values are not counted).
- mean: average value of the data
- std: standard deviation of the data
- min: the smallest value in the data set
- 25%: 25 % of the data are below this value
- 50%: 50% of the data are below this value. This value is called the median.
- 75%: 75% of the data are below this value
- max: the largest value in the data set

### Check for **extreme values / outliers**

In [None]:
HFA_df.sample(10)

To work with the numbers in the year columns, **split the huge DataFrame** into smaller ones, one per measure code:

In [None]:
HFA_df['measure_code'].unique()

In [None]:
# Measure codes to split out; one DataFrame is created per code, in this order.
measure_codes = ['HFA_1', 'HFA_43', 'HFA_10', 'HFA_167', 'HFA_13', 'HFA_16',
               'HFA_19', 'HFA_101', 'HFA_194', 'HFA_260', 'HFA_417', 'HFA_275',
               'HFA_293', 'HFA_357', 'HFA_411', 'HFA_436', 'HFA_440', 'HFA_441',
               'HFA_442', 'HFA_443', 'HFA_444', 'HFA_445', 'HFA_446', 'HFA_454',
               'HFA_546', 'HFA_566', 'HFA_617', 'HFA_618', 'HFA_627', 'HFA_636',
               'HFA_637']

# For every code, take an independent copy of the rows that carry that measure code.
new_dataframes = [
    HFA_df.loc[HFA_df['measure_code'] == measure_code].copy()
    for measure_code in measure_codes
]

# Bind each per-code DataFrame to a variable named after its code.
# The order of the names must match the order of measure_codes.
(HFA_1, HFA_43, HFA_10, HFA_167, HFA_13, HFA_16,
 HFA_19, HFA_101, HFA_194, HFA_260, HFA_417, HFA_275,
 HFA_293, HFA_357, HFA_411, HFA_436, HFA_440, HFA_441,
 HFA_442, HFA_443, HFA_444, HFA_445, HFA_446, HFA_454,
 HFA_546, HFA_566, HFA_617, HFA_618, HFA_627, HFA_636,
 HFA_637) = new_dataframes

Reset the index:

In [None]:
# All per-measure DataFrames, collected so they can be processed in one loop.
dataframes = [
    HFA_1, HFA_43, HFA_10, HFA_167, HFA_13, HFA_16,
    HFA_19, HFA_101, HFA_194, HFA_260, HFA_417, HFA_275,
    HFA_293, HFA_357, HFA_411, HFA_436, HFA_440, HFA_441,
    HFA_442, HFA_443, HFA_444, HFA_445, HFA_446, HFA_454,
    HFA_546, HFA_566, HFA_617, HFA_618, HFA_627, HFA_636,
    HFA_637,
]

# Renumber the rows of each frame from 0. This is done in place so that the HFA_* variables,
# which refer to the same objects, see the new index as well.
for measure_frame in dataframes:
    measure_frame.reset_index(drop=True, inplace=True)

Check that only one code is saved in each DataFrame:

In [None]:
# Print the distinct measure codes of every per-measure DataFrame on one line,
# separated by spaces. `dataframes` is the list built in the previous cell and holds
# the frames in the order HFA_1 ... HFA_637.
print(*(measure_frame['measure_code'].unique() for measure_frame in dataframes))


---

Plot the DataFrames:

In [None]:
# Draw one boxplot panel per measure: every per-measure DataFrame gets its own subplot,
# titled with the (wrapped) measure label.
# pandas, matplotlib and textwrap are imported in the import cell at the top of the notebook.

# Titles for the panels, one per measure, in the same order as `dataframe_keys`
measure_labels = ['Mid-year population, by sex',
       'Life expectancy at birth (years), by sex',
       '% of population aged 0–14 years, by sex',
       'Motor vehicle traffic accidents, all ages, per 100 000, by sex (age-standardized death rate)',
       '% of population aged 65+ years, by sex',
       'Live births per 1000 population, by sex',
       'Number of live births, by sex',
       'Diseases of circulatory system, all ages, per 100 000, by sex (age-standardized death rate)',
       'All causes, all ages, per 100 000, by sex (age-standardized death rate)',
       'Mental disorders, diseases of nervous system and sense organs, all ages, per 100 000, by sex (age-standardized death rate)',
       '% population self-assessing health as good, by sex',
       'Symptoms, signs and ill-defined conditions, all ages, per 100 000, by sex (age-standardized death rate)',
       'Selected alcohol-related causes, per 100 000, by sex (age-standardized death rate)',
       'Incidence of cancer per 100 000, by sex',
       'Absenteeism from work due to illness, days per employee per year',
       'Road traffic accidents with injury per 100 000',
       'Average number of calories available per person per day (kcal)',
       '% of total energy available from fat',
       'Fat available per person per day (g)',
       '% of total energy available from protein',
       'Protein available per person per day (g)',
       'Average amount of cereal available per person per year (kg)',
       'Average amount of fruits and vegetables available per person per year (kg)',
       'People injured due to work-related accidents per 100 000',
       'Surgical wound infection rate (%), all operations',
       'Total health expenditure as % of GDP',
       'GINI coefficient (income distribution)',
       'Proportion of children of official primary school age not enrolled, by sex',
       'Age-standardized prevalence of overweight (defined as BMI = 25 kg/m2) in people aged 18 years and over, WHO estimates (%), by sex',
       'Healthy life expectancy (HALE) at birth',
       'Youth unemployment rate, % of total labor force ages 15-25']

# Variable names of the per-measure DataFrames created earlier in the notebook
dataframe_keys = ['HFA_1', 'HFA_43', 'HFA_10', 'HFA_167', 'HFA_13', 'HFA_16', 'HFA_19', 'HFA_101', 'HFA_194', 'HFA_260', 
                  'HFA_417', 'HFA_275', 'HFA_293', 'HFA_357', 'HFA_411', 'HFA_436', 'HFA_440', 'HFA_441', 'HFA_442', 
                  'HFA_443', 'HFA_444', 'HFA_445', 'HFA_446', 'HFA_454', 'HFA_546', 'HFA_566', 'HFA_617', 'HFA_618', 
                  'HFA_627', 'HFA_636', 'HFA_637']

# Make sure that there is exactly one label per DataFrame name
assert len(measure_labels) == len(dataframe_keys), "Die Anzahl der measure labels muss mit der Anzahl der DataFrame-Namen übereinstimmen."

# Grid layout: fixed number of columns, as many rows as needed (ceiling division)
n_cols = 3
n_rows = (len(dataframe_keys) + n_cols - 1) // n_cols

# squeeze=False always returns a 2-D array of axes, so flatten() works for any grid shape
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5), squeeze=False)
axes = axes.flatten()

# Maximum number of characters per title line before the title is wrapped
max_title_width = 30

for ax, label, key in zip(axes, measure_labels, dataframe_keys):
    measure_frame = globals()[key]  # look the DataFrame up by its variable name
    measure_frame.boxplot(ax=ax)
    ax.set_title("\n".join(textwrap.wrap(label, max_title_width)))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90, fontsize=10)

# Remove the unused axes at the end of the grid; slicing by the number of plotted
# frames avoids relying on a loop index that leaks out of the plotting loop
for unused_ax in axes[len(dataframe_keys):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


---

Write the DataFrames to the database:

In [None]:
# dataframes = [HFA_1, HFA_43, HFA_10, HFA_167, HFA_13, HFA_16,
#               HFA_19, HFA_101, HFA_194, HFA_260, HFA_417, HFA_275,
#               HFA_293, HFA_357, HFA_411, HFA_436, HFA_440, HFA_441,
#               HFA_442, HFA_443, HFA_444, HFA_445, HFA_446, HFA_454,
#               HFA_546, HFA_566, HFA_617, HFA_618, HFA_627, HFA_636,
#               HFA_637]

# table_names = ['hfa_1', 'hfa_43', 'hfa_10', 'hfa_167', 'hfa_13', 'hfa_16',
#                'hfa_19', 'hfa_101', 'hfa_194', 'hfa_260', 'hfa_417', 'hfa_275',
#                'hfa_293', 'hfa_357', 'hfa_411', 'hfa_436', 'hfa_440', 'hfa_441',
#                'hfa_442', 'hfa_443', 'hfa_444', 'hfa_445', 'hfa_446', 'hfa_454',
#                'hfa_546', 'hfa_566', 'hfa_617', 'hfa_618', 'hfa_627', 'hfa_636',
#                'hfa_637']

# for df, table_name in zip(dataframes, table_names):
#     sf.push_to_cloud(df, table_name)

In [None]:
# sf.push_to_cloud(HFA_df, 'hfa_filtered')

---

# Doing a flip-flop to turn the year-columns into rows and the measure codes into columns

Turning the year-columns into rows:

In [None]:
HFA_df.head(10)

In [None]:
# Wide -> long: every year column '2000' .. '2022' becomes a row, with the year in
# 'years' and the cell content in 'value'; the four identifying columns are repeated.
year_columns = [str(year) for year in range(2000, 2023)]
HFA_melted = HFA_df.melt(
    id_vars=['country_region', 'name', 'measure_code', 'measure_label'],
    value_vars=year_columns,
    var_name='years',
    value_name='value',
)

In [None]:
HFA_melted.head(10)

In [None]:
HFA_melted.columns

In [None]:
# Long -> wide again, but now with one column per measure code and one row per
# (name, years) combination.
HFA_melted_pivoted = HFA_melted.pivot(
    index=['name', 'years'],
    columns='measure_code',
    values='value',
)

In [None]:
HFA_melted_pivoted.head()

In [None]:
# Turn the ('name', 'years') row index back into ordinary columns.
HFA_melted_pivoted = HFA_melted_pivoted.reset_index()

In [None]:
HFA_melted_pivoted.head()

In [None]:
# Clear the name of the columns index (set to 'measure_code' by the pivot above) so it no
# longer shows up as a header label when the frame is displayed.
HFA_melted_pivoted.columns.name=None

In [None]:
HFA_melted_pivoted.head()

In [None]:
HFA_melted_pivoted.sample(5)

Write this melted and pivoted DataFrame into the database:

In [None]:
# sf.push_to_cloud(HFA_melted_pivoted, 'hfa_melted_pivoted')

---
---

# Playground

In [None]:
HFA_df.columns

In [None]:
# Display the whole string of the columns
pd.set_option('display.max_colwidth', None)

# List every distinct (measure_code, measure_label) pair. Taking the pairs row-wise keeps
# each code next to its own label; two independent unique() arrays would raise a
# ValueError (or pair the wrong values) as soon as codes and labels are not one-to-one.
HFA_df[['measure_code', 'measure_label']].drop_duplicates().reset_index(drop=True)

In [None]:
HFA_df['name'].unique()