In [10]:
import pandas as pd
import os

# Folder path containing CSV files
folder_path = '/home/jakob/Uni/data_visualization/Rastetter-Dietrich-Denig/data' # "Medium" predictions are used

# Get list of all CSV files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# List to store DataFrames
dfs = []

# Read each CSV file into a DataFrame and rename columns
for file_path in file_paths:
    df = pd.read_csv(file_path)
    
    # Rename column 'Year(s)' to 'Year'
    df.rename(columns={'Year(s)': 'Year', 'Time Period': 'Year', 'Reference Area': 'Country or Area', 'Observation Value': 'Value'}, 
              inplace=True)

    df = df[(df['Year'] != 2101)]
    
    if 'Sex' in df.columns:
        # Filter rows where 'sex' column is 'all genders'
        df = df[df['Sex'] == 'All genders']
    
    # Filter columns
    df = df[['Country or Area', 'Year', 'Value']]
    
    # Extract the last part of the file path as the column suffix
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    
    # Create custom column names
    custom_names = [f'{file_name}' if col == 'Value' else col for col in ['Country or Area', 'Year', 'Value']]
    
    # Rename columns
    df.columns = custom_names
    dfs.append(df)

# Merge the DataFrames based on 'Country or Area' and 'Year'
combined_df = dfs[0]
for df in dfs[1:]:
    combined_df = pd.merge(combined_df, df, on=['Country or Area', 'Year'], how='outer')

In [11]:
# regions to filter out (NOT READY, ONLY SOME SELECTED), no list found to differ from country and area
regions_to_exclude = ['Africa', 'Asia', 'Australia/New Zealand', 'Australia/New Zealand', 'World', 'Eastern Africa', 
                      'Eastern Africa', 'Eastern and South-Eastern Asia', 'Eastern Europe', 'High-income countries', 
                      'Land-locked Developing Countries (LLDC)', 'Less developed regions, excluding China', 'Less developed regions', 
                      'Less developed regions, excluding least developed countries', 'Small Island Developing States (SIDS)', 
                      'No income group available', 'Northern Africa', 'Northern Africa and Western Asia', 'Northern America', 
                      'Northern Europe', 'Low-income countries', 'Middle-income countries', 'More developed regions', 'Middle Africa', 
                      'Eastern Asia', 'Soutern Asia', 'Western Europe', 'Upper-middle-income countries', 'Lower-middle-income countries', 
                      'Europe', 'South-Eastern Asia', 'Southern Europe', 'Southern Asia', 'Southern Africa', 'Europe and Northern America', 
                      'Central Asia', 'Central America', 'Central and Southern Asia', 'Sub-Saharan Africa', 'Least developed countries', 
                      'Latin America and the Caribbean', 'Western Africa', 'South America', 'Western Asia']

def filter_regions(df, regions_to_exclude):
    filtered_df = df[~df['Country or Area'].isin(regions_to_exclude)]
    return filtered_df

# filter out regions
filtered_combined_df = filter_regions(combined_df, regions_to_exclude)

display(filtered_combined_df.head(10))

Unnamed: 0,Country or Area,Year,total_fertility_rate,school_life_expextancy_ISCED_1-8,total_population,life_expectancy,infant_mortality_rate,GDP_per_capita_constant_dollar_2017
0,Afghanistan,2100,1.872,,110854.784,77.5384,7.4881,
1,Afghanistan,2099,1.8764,,110621.216,77.3909,7.6078,
2,Afghanistan,2098,1.8849,,110366.921,77.2354,7.7567,
3,Afghanistan,2097,1.8816,,110094.664,77.0851,7.8799,
4,Afghanistan,2096,1.8949,,109802.281,76.937,8.0198,
5,Afghanistan,2095,1.8984,,109487.14,76.7848,8.1644,
6,Afghanistan,2094,1.9005,,109153.078,76.6453,8.291,
7,Afghanistan,2093,1.901,,108801.262,76.4932,8.4333,
8,Afghanistan,2092,1.9066,,108430.416,76.3537,8.5686,
9,Afghanistan,2091,1.9044,,108041.524,76.2051,8.7096,


In [12]:
def visualize_data(df, year):
    # Filter out countries/areas with a population under 1000
    df_filtered = df[df['total_population'] >= 10000]

    # Extract relevant columns
    fertility_rate = df_filtered['total_fertility_rate']
    infant_mortality_rate = df_filtered['infant_mortality_rate']
    total_population = df_filtered['total_population']
    gdp_per_capita = df_filtered['GDP_per_capita_constant_dollar_2017']  # Adding GDP per capita data

    # Calculate max values for scaling from the entire DataFrame
    max_fertility = df_filtered['total_fertility_rate'].max()
    max_infant_mortality = df_filtered['infant_mortality_rate'].max()
    max_total_population = df_filtered['total_population'].max()
    min_gdp_per_capita = df_filtered['GDP_per_capita_constant_dollar_2017'].min()
    max_gdp_per_capita = df_filtered['GDP_per_capita_constant_dollar_2017'].max()

    # Filter data for the specified year
    df_year = df_filtered[df_filtered['Year'] == year]

    # Extract relevant columns for the specified year
    fertility_rate_year = df_year['total_fertility_rate']
    infant_mortality_rate_year = df_year['infant_mortality_rate']
    total_population_year = df_year['total_population']
    gdp_per_capita_year = df_year['GDP_per_capita_constant_dollar_2017']

    # Calculate size of circles based on total population for the specified year
    circle_size_year = total_population_year / max_total_population * 500

    # Create figure using Plotly Express
    fig = px.scatter(df_year, x=fertility_rate_year, y=infant_mortality_rate_year, size=circle_size_year,
                     labels={'x': 'Fertility Rate', 'y': 'Infant Mortality Rate'},
                     title=f'Visualization of Fertility Rate vs Infant Mortality Rate ({year})',
                     size_max=30, hover_name=df_year['Country or Area'],
                     hover_data={'total_population': True, 'GDP_per_capita_constant_dollar_2017': True},  # Add GDP per capita to hover data
                     color=gdp_per_capita_year,  # Color based on GDP per capita for the specified year
                     color_continuous_scale='Inferno',  # Choose a color scale
                     range_color=[min_gdp_per_capita, max_gdp_per_capita])  # Set the range of color scale

    # Set x-axis and y-axis limits based on min and max values from entire DataFrame
    fig.update_xaxes(range=[0, max_fertility])
    fig.update_yaxes(range=[0, max_infant_mortality])

    # Dimension
    fig.update_layout(width=1024, height=512)

    # Show plot
    fig.show()

In [13]:
import plotly.express as px

def visualize_life_expectancy(df, year):
    # Filter data for the specified year
    df_year = df[df['Year'] == year]
    
    # Calculate overall range of life expectancy
    min_life_expectancy = df['life_expectancy'].min()
    max_life_expectancy = df['life_expectancy'].max()
    
    # Create figure using Plotly Express choropleth map
    fig = px.choropleth(df_year, 
                        locations='Country or Area',
                        locationmode='country names',
                        color='life_expectancy',
                        hover_name='Country or Area',
                        title=f'Life Expectancy by Country ({year})',
                        color_continuous_scale='Viridis',
                        range_color=[min_life_expectancy, max_life_expectancy])
    
    # Show plot
    fig.show()

def visualize_school_life_expectancy(df, year):
    # Filter data for the specified year
    df_year = df[df['Year'] == year]
    
    # Calculate overall range of life expectancy
    min_life_expectancy = df['school_life_expextancy_ISCED_1-8'].min()
    max_life_expectancy = df['school_life_expextancy_ISCED_1-8'].max()
    
    # Create figure using Plotly Express choropleth map
    fig = px.choropleth(df_year, 
                        locations='Country or Area',
                        locationmode='country names',
                        color='school_life_expextancy_ISCED_1-8',
                        hover_name='Country or Area',
                        title=f'School Life Expectancy by Country ({year})',
                        color_continuous_scale='Viridis',
                        range_color=[min_life_expectancy, max_life_expectancy])
    
    # Show plot
    fig.show()

In [14]:
def visualize_all(df, years):
    for year in years:
        visualize_data(df, year)
    for year in years:
        visualize_life_expectancy(df, year)
    for year in years:
        visualize_school_life_expectancy(df, year)

In [15]:
years_to_display = [1964, 2014, 2064]
visualize_all(filtered_combined_df, years_to_display)

In [16]:
# school life expectancy already loaded
# ideas: vaccine, poverty, education