# PYRAMID CHART 📊
## EVOLUTION OF THE POPULATION PYRAMID IN SPAIN (1971-2024)

*Data sources:*
- Population: https://www.ine.es/jaxiT3/Tabla.htm?t=56934
- Life expectancy: https://datosmacro.expansion.com/demografia/esperanza-vida/espana

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patheffects as pe
import matplotlib.font_manager as fm
import re
import imageio
import os

In [2]:
# Read the semicolon-separated population table and, in the same chained
# expression, shorten the "Edad simple" column name to "Edad"
df = pd.read_csv('data/raw_pop_data.csv', sep=';').rename(
    columns={"Edad simple": "Edad"}
)

# Report the dataframe dimensions as a (rows, columns) tuple
print(f"The size of the dataframe is: {df.shape}")

The size of the dataframe is: (69651, 4)


In [3]:
# Preview the top five rows of the raw dataframe
df.head(n=5)

Unnamed: 0,Edad,Sexo,Periodo,Total
0,Todas las edades,Total,1 de enero de 2024,48.619.695
1,Todas las edades,Total,1 de octubre de 2023,48.486.865
2,Todas las edades,Total,1 de julio de 2023,48.320.520
3,Todas las edades,Total,1 de abril de 2023,48.205.962
4,Todas las edades,Total,1 de enero de 2023,48.085.361


In [4]:
# Load life expectancy data from a tab-separated file
life_exp = pd.read_csv('data/life_expectancy.csv', sep='\t')
# Keep only the year and life expectancy columns
life_exp = life_exp[["Año", "Esperanza de vida"]]
# Sort chronologically so that the tail holds the most recent observations
life_exp = life_exp.sort_values(by="Año").reset_index(drop=True)

# Last year the life expectancy series has to cover
target_year = 2024

# If the source does not reach the target year, extend the series by linear
# extrapolation of the trend observed over the last (up to) 5 data points
last_5_years = life_exp.tail(5)

# Average annual change over that window. The divisor is the real span in
# years between the first and last row of the window, so the slope stays
# correct even if the window has gaps or fewer than 5 rows
year_span = last_5_years["Año"].iloc[-1] - last_5_years["Año"].iloc[0]
value_change = (last_5_years["Esperanza de vida"].iloc[-1]
                - last_5_years["Esperanza de vida"].iloc[0])
avg_annual_increase = value_change / year_span if year_span else 0.0

# Last known year and its life expectancy
last_year = life_exp["Año"].max()
last_life_exp = life_exp.loc[life_exp["Año"] == last_year, "Esperanza de vida"].values[0]

# Append a row for the target year only when it is actually missing;
# otherwise the real observation would be duplicated by an estimated one
years_to_extrapolate = max(target_year - last_year, 0)
if years_to_extrapolate > 0:
    extrapolated_value = last_life_exp + (avg_annual_increase * years_to_extrapolate)
    life_exp = pd.concat(
        [life_exp,
         pd.DataFrame({"Año": [target_year], "Esperanza de vida": [extrapolated_value]})],
        ignore_index=True,
    )

In [5]:
# Keep only the rows that are broken down by sex and by single age and that
# refer to 1 January ("enero") of each year:
#   - drop the "Total" aggregate in the "Sexo" column
#   - drop the "Todas las edades" aggregate in the "Edad" column
#   - drop every non-January reference date in the "Periodo" column
keep_rows = (
    (df['Sexo'] != 'Total')
    & (df['Edad'] != 'Todas las edades')
    & df['Periodo'].str.contains('enero', regex=False, na=False)
)
# Take an explicit copy so the column assignments below modify an independent
# dataframe instead of a slice of the original (avoids SettingWithCopyWarning)
df = df[keep_rows].copy()

# Remove the "1 de enero de " prefix from the "Periodo" column and convert the
# remaining year to integer
df["Periodo"] = df["Periodo"].str.replace("1 de enero de ", "", regex=False).astype(int)

# Keep the first run of digits of the "Edad" column as an integer age
# (expand=False returns a Series rather than a one-column DataFrame)
df["Edad"] = df["Edad"].str.extract(r'(\d+)', expand=False).astype(int)

# "Total" uses dots as thousands separators: strip them, then convert to
# integer. Values that cannot be parsed (missing or empty) become 0
df["Total"] = df["Total"].str.replace(".", "", regex=False)
df["Total"] = pd.to_numeric(df["Total"], errors='coerce').fillna(0).astype(int)

# Make the counts for "Mujeres" negative so they are drawn to the left of the
# pyramid's central axis
df.loc[df["Sexo"] == "Mujeres", "Total"] *= -1

# Sort the dataframe by "Periodo" and "Edad"
df = df.sort_values(by=["Periodo", "Edad"])

In [6]:
# Preview the top five rows of the cleaned dataframe
df.head(n=5)

Unnamed: 0,Edad,Sexo,Periodo,Total
1064,0,Hombres,1971,332704
1277,0,Mujeres,1971,-314740
1703,1,Hombres,1971,322690
1916,1,Mujeres,1971,-306134
2342,2,Hombres,1971,326470


In [7]:
# Render one population-pyramid PNG per year into the "plots" directory.

# Birth years (1958-1976 inclusive) highlighted as the baby-boom generation
boomer_years = range(1958, 1977)
boomer_birth_years = list(boomer_years)
# Central birth year of that range, used to place the generation label
boomer_mid_year = sum(boomer_years) // len(boomer_years)

# Set the color palette for the pyramid chart
color_men = "#E6B89C"
color_women = "#554971"

# Symmetric x-axis limit: the largest single-age count among ages below 84,
# rounded up to the next multiple of 10,000
max_total = int(np.ceil(df.loc[df["Edad"] < 84, "Total"].abs().max() / 10_000) * 10_000)

# The x ticks are the same for every frame, so they are computed once.
# Positions include 0 in the middle; labels show absolute values in thousands
num_ticks = 3  # Number of ticks on each side
tick_positions = np.linspace(-max_total, max_total, 2 * num_ticks + 1, dtype=int)
tick_labels = [f'{int(abs(x)/1e3):.0f} mil' if x != 0 else '0' for x in tick_positions]

# savefig does not create missing directories, so make sure the target exists
os.makedirs("plots", exist_ok=True)

# Iterate over the years in the dataframe
for year in range(df["Periodo"].min(), df["Periodo"].max() + 1):

    # Filter the dataframe for the specified year
    df_year = df[df["Periodo"] == year].copy()

    # Cast to float before inserting NaN: writing NaN into an integer column
    # relies on an implicit upcast that pandas has deprecated
    df_year["Total"] = df_year["Total"].astype(float)
    # Blank out the values for age 85 so they can be re-estimated below
    df_year.loc[df_year["Edad"] == 85, "Total"] = np.nan
    # As age 85 is repeated, keep only the first row per sex and age
    df_year = df_year.drop_duplicates(subset=["Sexo", "Edad"])

    # Birth year of each age group in the current year
    df_year["Birth_Year"] = year - df_year["Edad"]

    # Prepare separate dataframes for men and women
    men = df_year[df_year["Sexo"] == "Hombres"].copy()
    women = df_year[df_year["Sexo"] == "Mujeres"].copy()

    # Interpolate the missing values for age 85 from the neighbouring ages
    men["Total"] = men["Total"].interpolate(method='linear', limit_direction='both')
    women["Total"] = women["Total"].interpolate(method='linear', limit_direction='both')

    # Create the pyramid chart
    fig, ax = plt.subplots(figsize=(9, 8))

    # Plot the bars: one barh call per sex and opacity group instead of one
    # call per row. Boomer cohorts are fully opaque, all others use alpha 0.7
    for group, color in ((men, color_men), (women, color_women)):
        is_boomer = group["Birth_Year"].isin(boomer_birth_years)
        for subset, alpha in ((group[is_boomer], 1.0), (group[~is_boomer], 0.7)):
            if subset.empty:
                continue
            ax.barh(subset["Edad"], subset["Total"], height=1,
                    color=color, alpha=alpha, zorder=2)

    # Get the life expectancy for the specified year
    life_exp_year = life_exp.loc[life_exp["Año"] == year, "Esperanza de vida"].values[0]
    # Plot the life expectancy line
    ax.axhline(y=life_exp_year, color='red', lw=0.75)
    # Plot previous life expectancy lines in gray
    for past_value in life_exp.loc[life_exp["Año"] < year, "Esperanza de vida"]:
        ax.axhline(y=past_value, color='gray', lw=0.5, alpha=0.4, zorder=1)

    # Set the background color to light gray
    ax.set_facecolor((0.9, 0.9, 0.9, 0.7))

    # Add labels and customize the chart
    ax.set_xlabel("Población")
    ax.set_ylabel("Edad")
    ax.set_title(r"$\bf{Transformación\ Demográfica\ en\ España:}$" +
                 "\nEvolución de la Pirámide Poblacional a lo Largo del Tiempo",
                 horizontalalignment="left",
                 x=0,
                 y=1.075,
                 fontsize=14)
    ax.grid(axis="x", linestyle="--")

    # Add label for life expectancy (x in axes fraction, y in data units)
    ax.text(0.15, life_exp_year + 0.5,
            f"Esperanza de vida: {life_exp_year:.1f} años",
            ha="center", color="red", fontsize=8,
            transform=ax.get_yaxis_transform())

    # Add gender labels above each half of the pyramid
    ax.text(0.25, 1.025, "Mujeres", ha="center", color=color_women,
            va="center", transform=ax.transAxes, fontsize=10, weight="bold")
    ax.text(0.75, 1.025, "Hombres", ha="center", color=color_men,
            va="center", transform=ax.transAxes, fontsize=10, weight="bold")

    # Add label for the boomer generation at the age its central birth year
    # has in the current year. No transform is passed, so x=0.5 is in data
    # units, which is effectively on the central axis at this scale
    boomer_mid_age = year - boomer_mid_year
    ax.text(0.5, boomer_mid_age, "Generación\nBaby Boom", ha="center",
            path_effects=[pe.withStroke(linewidth=3, foreground='white')],
            va="center", fontsize=9, weight="bold")

    # Add year label
    ax.text(0.95, 0.95, f"{year}", ha="right", va="top",
            path_effects=[pe.withStroke(linewidth=6, foreground='white')],
            transform=ax.transAxes, fontsize=24, weight="bold")

    # Add a text for the source of the data
    ax.text(1, -0.1,
            r'$\it{Fuentes\ de\ datos:\ INE\ y\ datos.macro}$',
            ha="right", va="center", transform=ax.transAxes, fontsize=8)

    # Fix the x-axis: symmetric limits with absolute-value tick labels
    ax.set_xlim(-max_total, max_total)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)

    # Add a vertical line at x=0
    ax.axvline(x=0, color='black', linestyle='-', lw=1)

    # Save the figure with a dynamic filename and release the figure
    plt.savefig(f"plots/pyramid_{year}.png", dpi=300, bbox_inches='tight')
    plt.close(fig)

In [8]:
# Create an animated GIF from the per-year frames saved in "plots"

# Select only the files named like the generated frames (pyramid_<year>.png).
# Anything else in the directory is left out of the GIF and, more importantly,
# out of the clean-up step below
frame_pattern = re.compile(r"pyramid_(\d+)\.png")
frame_years = {}
for name in os.listdir("plots"):
    match = frame_pattern.fullmatch(name)
    if match:
        frame_years[name] = int(match.group(1))

# Frame filenames in chronological order
image_files = sorted(frame_years, key=frame_years.get)

# Fail with a clear message instead of writing an empty animation
if not image_files:
    raise FileNotFoundError(
        "No pyramid_<year>.png frames found in 'plots'; run the plotting cell first"
    )

# Read each frame into memory
# NOTE(review): newer imageio releases may warn that imageio.imread is
# deprecated in favour of imageio.v2.imread - confirm against the installed version
images = [imageio.imread(os.path.join("plots", filename)) for filename in image_files]

# Save the frames as a looping GIF
imageio.mimsave('plots/pop_pyramid_evolution.gif', images, duration=0.1, loop=0)

# Delete the individual frames now that the GIF has been written
for filename in image_files:
    os.remove(os.path.join("plots", filename))

  images.append(imageio.imread(filepath))
