# Extracting the required data

In [21]:
import pandas as pd

# Build training_data.csv from the "Estimates" sheet of the WPP 2024
# demographic-indicators workbook: keep country/area rows only, select and
# rename eight columns, force the indicators to numeric, drop incomplete
# rows, and rescale the two "thousands" columns to absolute counts.

# Load the Excel file
file_path = "WPP2024_GEN_F01_DEMOGRAPHIC_INDICATORS_FULL.xlsx"

# Read the "Estimates" sheet. `header` is zero-based, so header=16 takes the
# column names from the 17th row of the sheet.
df = pd.read_excel(file_path, sheet_name="Estimates", header=16)

# Keep only countries/areas (exclude regions like World, Asia, etc.)
df = df[df["Type"] == "Country/Area"]

# Single source of truth for both the column selection and the renaming:
# workbook header -> short name used in the CSV. Insertion order defines the
# output column order.
column_map = {
    "Region, subregion, country or area *": "country",
    "Year": "year",
    "Total Fertility Rate (live births per woman)": "tfr",
    "Life Expectancy at Birth, both sexes (years)": "life_expectancy",
    "Net Number of Migrants (thousands)": "net_migrants",
    "Population Growth Rate (percentage)": "pop_growth_rate",
    "Crude Birth Rate (births per 1,000 population)": "crude_birth_rate",
    "Total Population, as of 1 July (thousands)": "population",
}

# Extract required columns and rename them
training_data = df[list(column_map)].rename(columns=column_map)

# Coerce every indicator column to a numeric dtype. If a column was loaded
# as generic objects (e.g. because some cell holds placeholder text instead
# of a number -- NOTE(review): presumably "..." in this workbook, confirm),
# dropna() would not treat that text as missing and `* 1000` below would
# repeat the string instead of scaling a number. errors="coerce" turns any
# non-numeric cell into NaN so the next step removes the row.
numeric_columns = [name for name in training_data.columns if name != "country"]
for name in numeric_columns:
    training_data[name] = pd.to_numeric(training_data[name], errors="coerce")

# Drop rows with missing actual values
training_data = training_data.dropna()

# Convert population and migrants from thousands → full numbers
training_data["net_migrants"] = training_data["net_migrants"] * 1000
training_data["population"] = training_data["population"] * 1000

# Save as CSV
training_data.to_csv("training_data.csv", index=False)

print("✅ training_data.csv created successfully!")
print(training_data.head())

training_data.shape

✅ training_data.csv created successfully!
      country    year    tfr life_expectancy net_migrants pop_growth_rate  \
2594  Burundi  1950.0  6.923          40.938     -13334.0           2.272   
2595  Burundi  1951.0  6.914          41.229     -13202.0           2.185   
2596  Burundi  1952.0    6.9           41.48     -13691.0           2.111   
2597  Burundi  1953.0  6.915          41.743     -14930.0           2.039   
2598  Burundi  1954.0  6.917          41.973     -14570.0           2.028   

     crude_birth_rate population  
2594            51.98  2254938.0  
2595           51.195  2305746.0  
2596           50.442  2355804.0  
2597           49.929  2405186.0  
2598           49.321  2454586.0  


(17538, 8)