# Load raw data

In [10]:
import pandas as pd
# Workbook to read; the path is relative to the notebook's working directory.
file_name = "WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_FULL.xlsx"

# Only these columns are pulled from the sheet. The labels must match the
# sheet's header row exactly, since they are selected by name.
wanted_columns = [
    "Region, subregion, country or area *",
    "Type",
    "Year",
    "Total Population, as of 1 July (thousands)",
    "Total Fertility Rate (live births per woman)",
    "Life Expectancy at Birth, both sexes (years)",
    "Net Migration Rate (per 1,000 population)",
    "Median Age, as of 1 July (years)",
    "Crude Death Rate (deaths per 1,000 population)",
    "Sex Ratio at Birth (males per 100 female births)",
]

# The first 16 rows of the "Estimates" sheet are skipped, so the row after
# them is used as the header.
df = pd.read_excel(
    file_name,
    engine="openpyxl",
    sheet_name="Estimates",
    skiprows=16,
    usecols=wanted_columns,
)


# Filter to countries only

In [11]:
# Keep only rows whose "Type" is exactly "Country/Area"; every other row type
# (and rows with a missing "Type") is dropped. The copy detaches the result
# from the unfiltered frame.
is_country_row = df["Type"].eq("Country/Area")
df = df.loc[is_country_row].copy()

# Rename columns

In [12]:
# Map the long spreadsheet headers to short snake_case names.
# "Type" and "Year" are not listed and therefore keep their original names.
short_names = {
    "Region, subregion, country or area *": "country",
    "Total Population, as of 1 July (thousands)": "population_thousands",
    "Total Fertility Rate (live births per woman)": "tfr",
    "Life Expectancy at Birth, both sexes (years)": "life_exp",
    "Net Migration Rate (per 1,000 population)": "net_mig_rate",
    "Median Age, as of 1 July (years)": "median_age",
    "Crude Death Rate (deaths per 1,000 population)": "crude_death_rate",
    "Sex Ratio at Birth (males per 100 female births)": "sex_ratio_birth",
}

df = df.rename(columns=short_names)


# Convert to numeric & clean

In [13]:
# Columns that must hold numbers: the year plus every indicator.
num_cols = [
    "Year", "population_thousands", "tfr", "life_exp",
    "net_mig_rate", "median_age", "crude_death_rate", "sex_ratio_birth",
]

# Coerce each column to a numeric dtype; any value that cannot be parsed
# becomes NaN instead of raising.
for column in num_cols:
    df[column] = pd.to_numeric(df[column], errors="coerce")

# A row is only kept when its country, year and population are all present.
# Missing values in the remaining indicator columns are tolerated.
required_columns = ["country", "Year", "population_thousands"]
df = df.dropna(subset=required_columns).copy()


# Final training dataset

In [17]:
# Columns carried into the final dataset, in output order
# (this selection leaves out "Type").
output_columns = [
    "country",
    "Year",
    "population_thousands",
    "tfr",
    "life_exp",
    "net_mig_rate",
    "median_age",
    "crude_death_rate",
    "sex_ratio_birth",
]
sort_keys = ["country", "Year"]

# Order rows by country, then year, and renumber the index from 0.
training_data = df.loc[:, output_columns].sort_values(by=sort_keys, ignore_index=True)

training_data.head()

Unnamed: 0,country,Year,population_thousands,tfr,life_exp,net_mig_rate,median_age,crude_death_rate,sex_ratio_birth
0,Afghanistan,1950.0,7776.176,7.248,28.156,0.792,18.395,37.418,104.9
1,Afghanistan,1951.0,7879.339,7.26,28.584,0.622,18.37,36.647,104.9
2,Afghanistan,1952.0,7987.783,7.26,29.014,0.018,18.333,36.063,104.9
3,Afghanistan,1953.0,8096.698,7.266,29.452,-1.095,18.289,35.534,104.9
4,Afghanistan,1954.0,8207.95,7.254,29.698,-0.833,18.239,35.233,104.9


# Export to CSV

In [15]:
# Write the dataset next to the notebook; index=False keeps the row index
# out of the file so only the data columns are saved.
training_data.to_csv(path_or_buf="training_data.csv", index=False)