In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Read every raw CSV used by this notebook from the relative ../data directory.

# Health insurance coverage table
oecd_healthcare = pd.read_csv('../data/OECD_healthcare_coverage.csv')

# Hypertension tables: adults, female, male
hypertension_adults, hypertension_female, hypertension_male = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/hypertension_adults.csv',
        '../data/hypertension_female.csv',
        '../data/hypertension_male.csv',
    )
)

# Overweight tables: adults, female, male
overweight_adults, overweight_female, overweight_male = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/overweight_adults.csv',
        '../data/overweight_female.csv',
        '../data/overweight_male.csv',
    )
)

# Spending and urban population tables
oop_spend_25pct = pd.read_csv('../data/spending_25_oop.csv')
urban_pop = pd.read_csv('../data/urban_population.csv')
urbpop_as_pct_total = pd.read_csv('../data/urban_pop_pct_tot.csv')

# Petersen-KFF table
petersen_kff_data = pd.read_csv('../data/petersen_KFF_data.csv')

In [3]:
# Reshape the adult hypertension table from wide (one column per year) to long
# (one row per Country/Year) and plot prevalence over time.

# Wide-format year labels look like '1990 [YR1990]'. Build them for 1990 through
# 2019 inclusive (the upper bound is inclusive here, hence the `+ 1`).
first_year, last_year = 1990, 2019
year_columns = [f'{year} [YR{year}]' for year in range(first_year, last_year + 1)]
# Map each wide-format label to its integer year, e.g. '1990 [YR1990]' -> 1990
new_col_names = {col: int(col.split(' ')[0]) for col in year_columns}
year_columns_numeric = list(new_col_names.values())

# Keep only the adult prevalence series, with the country plus one column per year,
# and rename the columns to 'Country' and the integer years in a single pass.
is_adult_series = hypertension_adults['Series Name'] == "Prevalence of hypertension (% of adults ages 30-79)"
filtered_adult_hypertension = (
    hypertension_adults
    .loc[is_adult_series, ['Country Name'] + year_columns]
    .rename(columns={**new_col_names, 'Country Name': 'Country'})
)

# Convert only the year columns to numeric; values that cannot be parsed become NaN
filtered_adult_hypertension[year_columns_numeric] = (
    filtered_adult_hypertension[year_columns_numeric].apply(pd.to_numeric, errors='coerce')
)

# Convert the DataFrame to a long format: one row per (Country, Year)
filtered_adult_hypertension = filtered_adult_hypertension.melt(
    id_vars='Country',
    var_name='Year',
    value_name='Hypertension Prevalence (%)',
)
# melt builds 'Year' from the column labels, which can leave it as a generic
# object column; make it explicitly numeric so it lines up with other numeric
# 'Year' keys when this frame is merged later.
filtered_adult_hypertension['Year'] = pd.to_numeric(filtered_adult_hypertension['Year'])

# Plot the data: one colour per country
fig = px.scatter(
    filtered_adult_hypertension,
    x='Year',
    y='Hypertension Prevalence (%)',
    color='Country',
    title='Hypertension Prevalence (%) by Country',
)
fig.show()


In [4]:
# Keep the rows measured as a percentage of population for the
# 'Public and primary voluntary health insurance' type, reduce them to
# country / year / value, rename those columns, and make Year numeric.
is_pct_of_population = oecd_healthcare['Unit of measure'] == 'Percentage of population'
is_public_and_primary = oecd_healthcare['Insurance type'] == 'Public and primary voluntary health insurance'

# Source column -> name used in the rest of the notebook (order defines column order)
coverage_columns = {
    'Reference area': 'Country',
    'TIME_PERIOD': 'Year',
    'OBS_VALUE': 'Healthcare Coverage (%)',
}

oecd_healthcare_filtered = (
    oecd_healthcare
    .loc[is_pct_of_population & is_public_and_primary, list(coverage_columns)]
    .rename(columns=coverage_columns)
    .assign(Year=lambda frame: pd.to_numeric(frame['Year']))
)
oecd_healthcare_filtered

Unnamed: 0,Country,Year,Healthcare Coverage (%)
0,Netherlands,1960,71.0
1,Netherlands,1961,71.0
2,Netherlands,1962,71.0
3,Netherlands,1963,71.0
4,Netherlands,1964,71.0
...,...,...,...
11324,Bulgaria,2019,88.6
11325,Bulgaria,2020,88.0
11326,Bulgaria,2021,88.2
11327,Bulgaria,2022,93.5


In [5]:
# Scatter of healthcare coverage over time, one colour per country
fig = px.scatter(
    data_frame=oecd_healthcare_filtered,
    x='Year',
    y='Healthcare Coverage (%)',
    color='Country',
    title='Healthcare Coverage (%) by Country',
)
fig.show()

In [6]:
# Inner-join hypertension prevalence with healthcare coverage on (Country, Year),
# then drop rows that are exact duplicates. Only Country/Year pairs present in
# both frames survive the join.
# NOTE(review): countries whose names are spelled differently in the two sources
# would be dropped silently here -- confirm the spellings agree.
merged_data = (
    filtered_adult_hypertension
    .merge(oecd_healthcare_filtered, how='inner', on=['Country', 'Year'])
    .drop_duplicates()
)

In [7]:
# Rows of the merged table belonging to Israel only
israel = merged_data.loc[merged_data['Country'].eq('Israel')]

In [8]:
# Scatter hypertension prevalence (y) against healthcare coverage (x),
# one colour per country, with the year and both measures shown on hover.
hover_columns = ['Year', 'Country', 'Hypertension Prevalence (%)', 'Healthcare Coverage (%)']
fig = px.scatter(
    merged_data,
    x='Healthcare Coverage (%)',
    y='Hypertension Prevalence (%)',
    color='Country',
    hover_data=hover_columns,
    title='Hypertension Prevalence (%) vs Healthcare Coverage (%) by Country',
)

# Start with every trace hidden but still listed in the legend, so the reader
# switches on only the countries of interest.
for trace in fig.data:
    trace.visible = 'legendonly'

fig.show()

In [9]:
# List the distinct values of the 'Insurance type' column
pd.unique(oecd_healthcare['Insurance type'])

array(['Public and primary voluntary health insurance',
       'Total voluntary health insurance',
       'Primary voluntary health insurance',
       'Duplicate voluntary health insurance',
       'Complementary voluntary health insurance',
       'Supplementary voluntary health insurance',
       'Government/compulsory health insurance'], dtype=object)